In [1]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from time import time

In [2]:
# Download the IMDb dataset dumps (uncomment and run once before executing the notebook):
# ! wget https://datasets.imdbws.com/name.basics.tsv.gz
# ! wget https://datasets.imdbws.com/title.ratings.tsv.gz
# ! wget https://datasets.imdbws.com/title.akas.tsv.gz
# ! wget https://datasets.imdbws.com/title.basics.tsv.gz
# ! wget https://datasets.imdbws.com/title.crew.tsv.gz
# ! wget https://datasets.imdbws.com/title.episode.tsv.gz
# ! wget https://datasets.imdbws.com/title.principals.tsv.gz

# Choose the IMDb ids of the most popular titles

In [3]:
# Load the ratings and the basic title metadata, join them on the title id
# and normalise the year / adult columns to plain 32-bit integers.
fpath = "./title.ratings.tsv.gz"

df_ratings = pd.read_table(fpath)

# "\N" is treated as missing in the three columns that are cast to int below.
df = pd.read_table(
    "./title.basics.tsv.gz",
    na_values={"startYear": ["\\N"], "endYear": ["\\N"], "isAdult": ["\\N"]},
)

# Default (inner) merge: titles without a ratings row are dropped here.
df = df.merge(df_ratings, on="tconst")

# Fill missing start years. The previous version filled the column with
# itself, which is a no-op; the end year is the only other year available,
# so it is used as the fallback, and 0 marks "no year known at all".
df["startYear"] = df["startYear"].fillna(value=df["endYear"])
df["startYear"] = df["startYear"].fillna(value=0)

# A missing end year falls back to the (now complete) start year.
df["endYear"] = df["endYear"].fillna(value=df["startYear"])

df["isAdult"] = df["isAdult"].fillna(value=0)

# No NaN is left in these columns, so the integer cast is safe.
df = df.astype(dtype={"startYear": np.int32,
                      "endYear": np.int32,
                      "isAdult": np.int32})

In [4]:
# Keep only the title types listed in `target`; every other row is removed.
target = ["movie", "tvMovie", "tvSeries", "tvMiniSeries"]
is_target_type = df["titleType"].isin(target)
df = df.drop(index=df.index[~is_target_type])
print(df.shape)

(429971, 11)


In [5]:
# Two alternative ways of choosing the title ids; each enabled strategy
# writes its selection to a text file with one id per line.
collect_movies_based_on_rating = False
collect_movies_based_on_numvotes = True

if collect_movies_based_on_rating:

    # (start year strictly after, start year up to and including, rating strictly above)
    rating_buckets = [
        (None, 1990, 8),
        (1990, 2005, 7),
        (2005, 2015, 6),
        (2015, 2022, 5),
    ]

    movies_ids = []
    for year_after, year_upto, min_rating in rating_buckets:
        selected = (df["startYear"] <= year_upto) & (df["averageRating"] > min_rating)
        if year_after is not None:
            selected &= df["startYear"] > year_after
        movies_ids.extend(df.loc[selected, "tconst"].values)
        # Running total after each bucket.
        print(len(movies_ids))

    with open("all_imdb_ids.txt", "w") as f:
        f.writelines(f"{movie_id}\n" for movie_id in movies_ids)

if collect_movies_based_on_numvotes:

    movies_ids = []

    movies_ids.extend(df.loc[df["numVotes"] > 1000, "tconst"].values)
    print(len(movies_ids))

    movies_ids.extend(df.loc[df["averageRating"] > 8., "tconst"].values)
    print(len(movies_ids))

    # A title can satisfy both criteria, so de-duplicate. sorted() makes the
    # file content reproducible: the iteration order of a set of strings
    # changes from one interpreter run to the next.
    movies_ids = sorted(set(movies_ids))
    print(f"Total length of the database: {len(movies_ids)}")

    with open("most_popular_imdb_ids.txt", "w") as f:
        f.writelines(f"{movie_id}\n" for movie_id in movies_ids)

49880
98133
Total length of the database: 94733


In [6]:
# Read the selected ids back from disk (one per line) and drop repeated lines.
with open("most_popular_imdb_ids.txt", "r") as ids_file:
    unique_ids = set(ids_file.read().splitlines())

all_movies_ids = list(unique_ids)
print(f"Total number of considered movies: {len(all_movies_ids)}")

Total number of considered movies: 94733


# Collect titles and ratings

In [7]:
t0 = time()


def _genre_list(genres, title_type):
    """Return the genre tags of one title as a list of strings.

    `genres` is the raw comma-separated genre cell; the "\\N" placeholder,
    empty entries and non-string cells contribute no tag. "Series" is appended
    when the title type contains "Series" (tvSeries, tvMiniSeries).
    """
    tags = []
    if isinstance(genres, str):
        tags = [tag for tag in genres.split(",") if tag and tag != "\\N"]
    if "Series" in title_type:
        tags.append("Series")
    return tags


# Ratings, restricted to the selected titles.
fpath = "./title.ratings.tsv.gz"
df_ratings = pd.read_table(fpath)
df_ratings = df_ratings[df_ratings["tconst"].isin(all_movies_ids)]
# The message has to be a string: `assert cond, print(...)` attaches None,
# because print() returns None.
assert df_ratings.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

# Basic title metadata, restricted to the same titles.
fpath = "./title.basics.tsv.gz"
df = pd.read_table(fpath, na_values={"startYear": ["\\N"], "endYear": ["\\N"], "isAdult": ["\\N"]})
df = df[df["tconst"].isin(all_movies_ids)]

df = df.merge(df_ratings, on="tconst")

df = df.rename(columns={"originalTitle": "original title",
                        "primaryTitle": "title",
                        "genres": "genre",
                        "averageRating": "imdb_rating",
                        "tconst": "imdb_id"})

df = df.drop_duplicates()

# Build the genre lists in one step. Previously "Series" was joined onto the
# raw string before the "\N" check, so a series without genres ended up as
# ["\\N", "Series"] and a non-series without genres as [""].
df["genre"] = [_genre_list(genres, title_type)
               for genres, title_type in zip(df["genre"], df["titleType"])]

# 0 stands for "year unknown" so that the columns can be stored as integers.
df = df.fillna({"startYear": 0, "endYear": 0})
df = df.astype({"startYear": "int", "endYear": "int"})
df = df.drop(columns=["titleType", "isAdult", "runtimeMinutes"])
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")

Total time: 26.564144134521484


# Collect names of actors, directors, producers and writers

In [8]:
t0 = time()
fpath = "./title.principals.tsv.gz"

# Only these columns of the credits table are used further down.
principal_columns = ["tconst", "nconst", "ordering", "category", "characters"]
df_principals = pd.read_table(fpath)[principal_columns].rename(columns={"tconst": "imdb_id"})
print(df_principals.head())


     imdb_id     nconst  ordering         category characters
0  tt0000001  nm1588970         1             self   ["Self"]
1  tt0000001  nm0005690         2         director         \N
2  tt0000001  nm0374658         3  cinematographer         \N
3  tt0000002  nm0721526         1         director         \N
4  tt0000002  nm1335271         2         composer         \N


In [9]:
# Discard the credits of titles that are not in the selected id list.
is_selected_title = df_principals["imdb_id"].isin(all_movies_ids)
df_principals = df_principals.drop(index=df_principals.index[~is_selected_title])
print(df_principals.head())


        imdb_id     nconst  ordering             category  \
9953  tt0002130  nm1376180        10  production_designer   
9954  tt0002130  nm0660139         1                actor   
9955  tt0002130  nm0685283         2                actor   
9956  tt0002130  nm0209738         3                actor   
9957  tt0002130  nm3942815         4                actor   

                                             characters  
9953                                                 \N  
9954                                ["Dante Alighieri"]  
9955                                       ["Virgilio"]  
9956  ["Farinata degli Uberti","Pier delle Vigne","I...  
9957                               ["Il conte Ugolino"]  


In [10]:

# Keep only the credits whose `ordering` value is between 1 and 6.
has_kept_ordering = df_principals["ordering"].isin(list(range(1, 7)))
df_principals = df_principals.drop(index=df_principals.index[~has_kept_ordering])
print(df_principals.head())


        imdb_id     nconst  ordering  category  \
9954  tt0002130  nm0660139         1     actor   
9955  tt0002130  nm0685283         2     actor   
9956  tt0002130  nm0209738         3     actor   
9957  tt0002130  nm3942815         4     actor   
9958  tt0002130  nm0078205         5  director   

                                             characters  
9954                                ["Dante Alighieri"]  
9955                                       ["Virgilio"]  
9956  ["Farinata degli Uberti","Pier delle Vigne","I...  
9957                               ["Il conte Ugolino"]  
9958                                                 \N  


In [11]:

# Treat "actress" as "actor" so both share one category.
df_principals["category"] = [
    "actor" if category == "actress" else category
    for category in df_principals["category"]
]

# Professions that are exported; every other credit category is removed.
target_profs = ["director", "producer", "actor", "writer"]
has_target_prof = df_principals["category"].isin(target_profs)
df_principals = df_principals.drop(index=df_principals.index[~has_target_prof])
print(df_principals.head())
    

        imdb_id     nconst  ordering  category  \
9954  tt0002130  nm0660139         1     actor   
9955  tt0002130  nm0685283         2     actor   
9956  tt0002130  nm0209738         3     actor   
9957  tt0002130  nm3942815         4     actor   
9958  tt0002130  nm0078205         5  director   

                                             characters  
9954                                ["Dante Alighieri"]  
9955                                       ["Virgilio"]  
9956  ["Farinata degli Uberti","Pier delle Vigne","I...  
9957                               ["Il conte Ugolino"]  
9958                                                 \N  


In [12]:

# Person table: only the display name and the person id are needed.
fpath = "./name.basics.tsv.gz"

df_names = pd.read_table(fpath)[["primaryName", "nconst"]]
print(df_names.head())


       primaryName     nconst
0     Fred Astaire  nm0000001
1    Lauren Bacall  nm0000002
2  Brigitte Bardot  nm0000003
3     John Belushi  nm0000004
4   Ingmar Bergman  nm0000005


In [13]:

# Attach the person name to every credit (default inner merge on the person id).
df_principals = pd.merge(df_principals, df_names, on="nconst")
print(df_principals["characters"])

0                                       ["Dante Alighieri"]
1                                              ["Virgilio"]
2         ["Farinata degli Uberti","Pier delle Vigne","I...
3                                      ["Il conte Ugolino"]
4                                                        \N
                                ...                        
451193                                                   \N
451194                                            ["Sinta"]
451195                                           ["Vikash"]
451196                                             ["Dewi"]
451197                                                   \N
Name: characters, Length: 451198, dtype: object


In [15]:
# Placeholder stored in `characters` when a credit has no character names.
# It is spelled out here instead of being read from row 4 of the frame, which
# only works while that particular row happens to hold the placeholder.
special_char = "\\N"


def _parse_characters(raw):
    """Decode one `characters` cell (a JSON array string) into a Python list.

    The placeholder, an empty string and non-string cells give an empty list.
    """
    if not isinstance(raw, str) or raw == special_char or len(raw) == 0:
        return []
    return json.loads(raw)


df_principals["characters"] = df_principals["characters"].apply(_parse_characters)
print(df_principals.head())

print(f"Total time: {time() - t0}")

     imdb_id     nconst  ordering  category  \
0  tt0002130  nm0660139         1     actor   
1  tt0002130  nm0685283         2     actor   
2  tt0002130  nm0209738         3     actor   
3  tt0002130  nm3942815         4     actor   
4  tt0002130  nm0078205         5  director   

                                          characters          primaryName  
0                                  [Dante Alighieri]       Salvatore Papa  
1                                         [Virgilio]      Arturo Pirovano  
2  [Farinata degli Uberti, Pier delle Vigne, Il c...  Giuseppe de Liguoro  
3                                 [Il conte Ugolino]     Pier Delle Vigne  
4                                                 []  Francesco Bertolini  
Total time: 111.03914666175842


# Collect persons

In [16]:
t0 = time()

def collect_movie_persons(x):
    """Collect the credited people of one title into lists, one per role.

    Parameters
    ----------
    x : pandas.DataFrame
        Credit rows of a single title with the columns "ordering",
        "category", "primaryName" and "characters".

    Returns
    -------
    pandas.Series
        Entries "directors", "producers", "actors" and "writers" (lists of
        names) plus "characters" (the `characters` cells of the actor rows),
        each ordered by ascending "ordering".
    """
    # Sort first and select from the sorted frame. Selecting from the unsorted
    # `x` with a mask built on a sorted copy discards the sort, because .loc
    # aligns a boolean Series to the frame's own index.
    credits = x.sort_values(by=["ordering"])
    return pd.Series({f"{role}s": credits.loc[credits["category"] == prof, name].values.tolist()
                      for prof, role, name in zip(
                          ["director", "producer", "actor", "writer", "actor"],
                          ["director", "producer", "actor", "writer", "character"],
                          ["primaryName", "primaryName", "primaryName", "primaryName", "characters"]
                      )
                      })
    
def _flatten_characters(per_credit):
    """Concatenate the per-credit character lists of one title into one list."""
    if not isinstance(per_credit, list):
        return []
    flat = []
    for names in per_credit:
        flat = flat + names
    return flat


# One row per title, holding the role lists built by collect_movie_persons.
persons_by_title = df_principals.groupby('imdb_id').apply(collect_movie_persons)
df_principals = pd.DataFrame(persons_by_title)
print(df_principals.head())

# "characters" holds one list per actor credit; merge them into a single list.
df_principals["characters"] = df_principals["characters"].apply(_flatten_characters)
print(df_principals.head())


                                       directors producers  \
imdb_id                                                      
tt0002130  [Francesco Bertolini, Adolfo Padovan]        []   
tt0002844                      [Louis Feuillade]        []   
tt0003014                      [Victor Sjöström]        []   
tt0003037                      [Louis Feuillade]        []   
tt0003165                      [Louis Feuillade]        []   

                                                      actors          writers  \
imdb_id                                                                         
tt0002130  [Salvatore Papa, Arturo Pirovano, Giuseppe de ...               []   
tt0002844  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   
tt0003014  [Hilda Borgström, Georg Grönroos, Aron Lindgre...      [Nils Krok]   
tt0003037  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   
tt0003165  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]  

In [17]:

# Attach the per-title person lists; `on="imdb_id"` matches the column of `df`
# against the index of `df_principals` (left join, every title is kept).
df = df.join(df_principals, on="imdb_id").reset_index(drop=True)

# Titles without any retained credit have NaN in all person columns. Replace
# it with an empty list in every one of them, including "characters", which
# was previously left as NaN.
person_columns = [f"{prof}s" for prof in target_profs] + ["characters"]
for column in person_columns:
    df[column] = df[column].apply(lambda people: people if isinstance(people, list) else [])

# The message has to be a string: `assert cond, print(...)` attaches None.
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")

Total time: 470.4389228820801


# Collect alternative titles

In [18]:
t0 = time()

fpath = "./title.akas.tsv.gz"

# Alternative titles: keep the rows of region "US" that belong to the
# selected titles.
df_akas = pd.read_table(fpath)
df_akas = df_akas.loc[df_akas["region"] == "US", :]
df_akas = df_akas.rename(columns={"titleId": "imdb_id"})
df_akas = df_akas[df_akas["imdb_id"].isin(all_movies_ids)]
print(df_akas.shape)
print(df_akas.head())

# Join all alternative titles of a title into one "::"-separated string.
# Missing titles are skipped and the rest coerced to str, because str.join
# raises on anything that is not a string.
known_titles = df_akas.dropna(subset=["title"])
grouped_data = known_titles.groupby('imdb_id')['title'].apply(lambda x: '::'.join(x.astype(str)))
df_titles = pd.DataFrame(grouped_data)
df_titles = df_titles.rename(columns={"title": "all_titles"})

# `on="imdb_id"` matches the column of `df` against the index of `df_titles`.
df = df.join(df_titles, on="imdb_id").reset_index(drop=True)

# Titles without a US alternative title get an empty string.
df = df.fillna(value={"all_titles": ""})

# The message has to be a string: `assert cond, print(...)` attaches None.
assert df.shape[0] == len(all_movies_ids), "Number of samples less than number of movies"

print(f"Total time: {time() - t0}")

  if self.run_code(code, result):


(81178, 8)
        imdb_id  ordering                                      title region  \
6436  tt0002130        15                            Dante's Inferno     US   
8439  tt0002844        16              Fantomas: The Beltham Mystery     US   
8454  tt0002844         5                          The Phantom Crook     US   
8456  tt0002844         7  Fantômas: In the Shadow of the Guillotine     US   
8961  tt0003037        12                 Fantomas: The Man in Black     US   

     language        types     attributes isOriginalTitle  
6436       \N           \N             \N               0  
8439       \N           \N   review title               0  
8454       \N           \N  reissue title               0  
8456       \N  imdbDisplay             \N               0  
8961       \N  imdbDisplay             \N               0  
Total time: 43.85930609703064


In [19]:
# Show the first rows of the combined table as this cell's output.
df.head()

Unnamed: 0,imdb_id,title,original title,startYear,endYear,genre,imdb_rating,numVotes,directors,producers,actors,writers,characters,all_titles
0,tt0002130,Dante's Inferno,L'Inferno,1911,0,"[Adventure, Drama, Fantasy]",7.0,2987,"[Francesco Bertolini, Adolfo Padovan]",[],"[Salvatore Papa, Arturo Pirovano, Giuseppe de ...",[],"[Dante Alighieri, Virgilio, Farinata degli Ube...",Dante's Inferno
1,tt0002844,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,1913,0,"[Crime, Drama]",6.9,2328,[Louis Feuillade],[],"[René Navarre, Edmund Breon, Georges Melchior,...",[Marcel Allain],"[Fantômas, Gurn, Inspector Juve, Jérôme Fandor...",Fantomas: The Beltham Mystery::The Phantom Cro...
2,tt0003014,Ingeborg Holm,Ingeborg Holm,1913,0,[Drama],7.0,1249,[Victor Sjöström],[],"[Hilda Borgström, Georg Grönroos, Aron Lindgre...",[Nils Krok],"[Ingeborg Holm, Poorhouse Superintendant, Sven...",
3,tt0003037,Fantomas: The Man in Black,Juve contre Fantômas,1913,0,"[Crime, Drama]",6.9,1584,[Louis Feuillade],[],"[René Navarre, Edmund Breon, Georges Melchior,...",[Marcel Allain],"[Fantômas, Dr Chaleck, Le Loupart, Inspector J...",Fantomas: The Man in Black::Fantômas: Juve ver...
4,tt0003165,Fantômas: The Dead Man Who Killed,Le mort qui tue,1913,0,"[Crime, Drama, Mystery]",6.9,1251,[Louis Feuillade],[],"[René Navarre, Edmund Breon, Georges Melchior,...",[Marcel Allain],"[Fantômas, le banquier Nanteuil, Inspector Juv...",Fantômas: The Dead Man Who Killed


In [20]:
def _as_list(value):
    """Return `value` as a list; anything that is not a list/tuple (NaN, "") gives []."""
    return list(value) if isinstance(value, (list, tuple)) else []


# One dict per title; placeholder values are replaced before writing JSON.
database = df.to_dict("records")
for el in database:
    # "genre" is a list here, so comparing it with "" was always true and the
    # None branch was never taken. Drop empty / "\N" entries and use None when
    # no genre is left.
    genres = [tag for tag in _as_list(el["genre"]) if tag and tag != "\\N"]
    el["genre"] = genres if genres else None
    # 0 is the "unknown year" placeholder.
    el["startYear"] = el["startYear"] if el["startYear"] != 0 else None
    el["endYear"] = el["endYear"] if el["endYear"] != 0 else None
    titles = el["all_titles"]
    el["all_titles"] = titles.split("::") if isinstance(titles, str) and titles != "" else []
    # "characters" is handled as well: a missing value would otherwise be
    # written as NaN, which is not valid JSON.
    for key in ["directors", "producers", "actors", "writers", "characters"]:
        el[key] = _as_list(el[key])

print(len(database))
with open("database_most_popular_main_info.json", "w") as f:
    json.dump(database, f, indent=2)

94733


In [21]:
# Display one exported record (index 100) as a spot check of the final structure.
database[100]

{'actors': ['Rudolph Valentino', 'Lila Lee', 'Nita Naldi', 'George Field'],
 'all_titles': ['Blood and Sand'],
 'characters': ['Juan Gallardo', 'Carmen', 'Doña Sol', 'El Nacional'],
 'directors': ['Fred Niblo', 'Dorothy Arzner'],
 'endYear': None,
 'genre': ['Drama', 'Romance', 'Sport'],
 'imdb_id': 'tt0012952',
 'imdb_rating': 6.4,
 'numVotes': 1434,
 'original title': 'Blood and Sand',
 'producers': [],
 'startYear': 1922,
 'title': 'Blood and Sand',
 'writers': []}