In [54]:
# Allow python to import modules in our src folders
import sys
sys.path.append("src/")

from preprocessing.load_dataset import *

In [55]:
characters_df = load_characters_df()
movies_df = load_movies_df()
plot_df = load_plot_df()
tvtropes_tf = load_tvtropes_df()

# I suppose that we do not have cameras in 1010...
# movies_df.iloc[62836] = movies_df.iloc[62836].replace(to_replace='1010-12-02', value='2010-12-02')

In [56]:
WIKIDATA_PATH = "data/wikidata/"
WIKIDATA_TRANSLATION_ID = "id-translation.wikidata.json"

In [None]:
import json

with open(WIKIDATA_PATH + WIKIDATA_TRANSLATION_ID, 'r') as file:
    table = json.load(file)



In [3]:
movies_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres
49542,1966809,/m/069mcz,Too Much Johnson,1938,,40.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/01z4y': 'Comed..."
32473,33099604,/m/0bbr93q,The Unpardonable Sin,1919,,,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/07s9rl0': 'Drama', '/m/082gq': 'War film'}"
13479,239587,/m/01jrbb,Finding Nemo,2003-05-30,921111197.0,100.0,{'/m/02h40lc': 'English Language'},"{'/m/09c7w0': 'United States of America', '/m/...","{'/m/03k9fj': 'Adventure', '/m/0hj3myq': 'Chil..."
46682,6550586,/m/0g9wv5,Wilbur Wants to Kill Himself,2002-11-08,,104.0,{'/m/02h40lc': 'English Language'},"{'/m/0f8l9c': 'France', '/m/0k6nt': 'Denmark',...","{'/m/06cvj': 'Romantic comedy', '/m/01lrrt': '..."
44146,17697199,/m/047cdy7,Emma's Bliss,2006,,99.0,{'/m/04306rv': 'German Language'},{'/m/0345h': 'Germany'},"{'/m/068d7h': 'Romantic drama', '/m/02l7c8': '..."


In [5]:
print(f"Number of movies loaded: {len(movies_df)}")

Number of movies loaded: 81741


In [6]:
plot_df.sample(5)

Unnamed: 0,wiki_movie_id,plot_summary
9020,9409095,Jericho Hudson is a street smart kid with a gi...
11481,9310559,"It is divided into ten scenes, each of which d..."
3708,32549690,"A young woman, Janice, is living with her rest..."
22252,31304530,Young noblewoman Elisabeth von Salmenau encou...
36304,28559151,"In Gdansk, Poland, in 1989, Alexander Reschke ..."


In [7]:
tvtropes_tf.sample(5)

Unnamed: 0,trope_name,character_data
442,stoner,"{'char': 'Thadeous', 'movie': 'Your Highness',..."
429,slacker,"{'char': 'John Winger', 'movie': 'Stripes', 'i..."
178,dirty_cop,"{'char': 'Colin Sullivan', 'movie': 'The Depar..."
173,dean_bitterman,"{'char': 'Ed Rooney', 'movie': 'Ferris Bueller..."
417,romantic_runnerup,"{'char': 'Proteus', 'movie': 'Sinbad: Legend o..."


In [8]:
# extract years from the release date of movies
movies_df.loc[:, 'movie_release_year'] = movies_df['movie_release_date'].astype(str).str[:4]
movies_df = movies_df[movies_df['movie_release_year'].str.contains("nan")==False]
movies_df['movie_release_year'] = movies_df['movie_release_year'].astype(int)


In [9]:
wrong_dates = movies_df[movies_df['movie_release_year'] < 1800].index
# there is only one movie that has the release date 1010, so we can change 1010 to 2010
movies_df.loc[wrong_dates, 'movie_release_year']= 2010
movies_df.loc[wrong_dates, 'movie_release_date'] ='2010-12-02'

## Perfomance of the movie in terms of revenue

In [10]:
# how many missing values in the column box_office
percentage_missing = movies_df['box_office'].isna().sum() / len(movies_df) * 100
print("{:.2f}% values for the box office revenue are missing.".format(percentage_missing))

88.87% values for the box office revenue are missing.


In [11]:
# There is no revenue from IMDB dataset ?
# https://developer.imdb.com/non-commercial-datasets/

## Extract ratings from the IMDB rating dataset

file: title.rating.tsv
-    tconst: unique identifier of the movie
-    averageRating: average of user ratings
-    numVotes: number of ratings submitted for the movie


In [42]:
# REVIEWS_METADATA = "title.ratings.tsv/data.tsv"
imdb_ratings_df = load_imdb_ratings()
imdb_names_df = load_imdb_basics()
# imdb_crew_df = load_imdb_crew()
# imdb_akas_df = load_imdb_akas()
# imdb_episodes_df = load_imdb_episodes()
# imdb_principals_df = load_imdb_principals()

In [44]:
print(movies_df.count())
print('==')
print(imdb_names_df.count())

wiki_movie_id         81741
freebase_movie_id     81741
movie_name            81741
movie_release_date    74839
box_office             8401
movie_runtime         61291
movie_languages       81741
movie_countries       81741
movie_genres          81741
dtype: int64
==
tconst            661815
titleType         661815
primaryTitle      661813
originalTitle     661813
isAdult           661815
startYear         661815
endYear           661815
runtimeMinutes    661815
genres            661815
dtype: int64


In [53]:
X = imdb_names_df.merge(movies_df, how='inner', left_on='primaryTitle', right_on='movie_name', copy=True, indicator=True)
X
# X[X['_merge'] != 'both'].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres,_merge
0,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance,10109752,/m/02q23xk,Miss Jerry,1894-10-09,,,{'/m/06ppq': 'Silent film'},{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",both
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport",28703057,/m/0czdh_n,The Corbett-Fitzsimmons Fight,1897-05-22,100000.0,,{},{},{'/m/01z02hx': 'Sports'},both
2,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",142995,/m/0120y4,The Story of the Kelly Gang,1906-12-26,,70.0,{'/m/02h40lc': 'English Language'},{'/m/0chghy': 'Australia'},"{'/m/0lsxr': 'Crime Fiction', '/m/06ppq': 'Sil...",both
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,27543288,/m/0bhcws6,Robbery Under Arms,1985-03-28,,141.0,{'/m/02h40lc': 'English Language'},{'/m/0chghy': 'Australia'},"{'/m/03g3w': 'History', '/m/0hfjk': 'Western',...",both
4,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,26167223,/m/0b6gcsf,Robbery Under Arms,1920-10-02,,,{},{'/m/0chghy': 'Australia'},{'/m/07s9rl0': 'Drama'},both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141129,tt9850468,movie,Food,Food,0,\N,\N,\N,\N,6459981,/m/0g66kb,Food,1993-09-11,,17.0,{},"{'/m/01mk6': 'Czechoslovakia', '/m/07ssc': 'Un...","{'/m/02hmvc': 'Short Film', '/m/05p553': 'Come...",both
141130,tt9855214,movie,Kisan,Kisan,0,2006,\N,114,"Drama,Sport",7316179,/m/0kv4t6,Kisan,2006,,,{'/m/0999q': 'Malayalam Language'},{'/m/03rk0': 'India'},{},both
141131,tt9860522,movie,The Unafraid,The Unafraid,0,2018,\N,87,Documentary,14616560,/m/03gqqg_,The Unafraid,1915-04-01,,40.0,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",both
141132,tt9881572,movie,Noose,Noose,0,2019,\N,100,"Comedy,Drama,Mystery",13863650,/m/03cll9g,Noose,1948,,95.0,{'/m/02h40lc': 'English Language'},{'/m/07ssc': 'United Kingdom'},"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",both


In [51]:
dp = imdb_names_df[['primaryTitle']].duplicated()
X = imdb_names_df[dp][['primaryTitle']].to_numpy()[0]

imdb_names_df[imdb_names_df['primaryTitle'] == X[0]]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
1228,tt0001240,movie,Hamlet,Hamlet,0,1911,\N,\N,Drama
2895,tt0002922,movie,Hamlet,Hamlet,0,1913,\N,64,Drama
4006,tt0004049,movie,Hamlet,Amleto,0,1914,\N,\N,Drama
7932,tt0008040,movie,Hamlet,Amleto,0,1917,\N,\N,Drama
12076,tt0012249,movie,Hamlet,Hamlet,0,1921,\N,131,Drama
39713,tt0040416,movie,Hamlet,Hamlet,0,1948,\N,154,Drama
46214,tt0047060,movie,Hamlet,Hamlet,0,1954,\N,\N,Drama
57016,tt0058126,movie,Hamlet,Gamlet,0,1964,\N,140,Drama
57063,tt0058174,movie,Hamlet,Hamile,0,1964,\N,120,Drama


In [14]:
# Extract the movies from the title.basics imbd dataset
imdb_names_df.sample(50)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
5762160,tt2249058,movie,Amateurs,Amateurs,0,2013,\N,80,"Comedy,Drama,Romance"
7457934,tt3645464,movie,Aquilo que Fazemos com as Nossas Desgraças,Aquilo que Fazemos com as Nossas Desgraças,0,2014,\N,63,\N
8382000,tt5733506,movie,The Tie,The Tie,0,\N,\N,85,Drama
3970803,tt15507456,movie,The Weight of Darkness,The Weight of Darkness,0,\N,\N,\N,Horror
3223892,tt1412772,movie,The Secret World of Comic Book Artists,The Secret World of Comic Book Artists,0,2007,\N,\N,Comedy
2814660,tt13374736,movie,The Cold War,The Cold War,0,\N,\N,\N,Comedy
179248,tt0186028,movie,Together Again,Din nou împreuna,0,1978,\N,75,Romance
9761321,tt8787834,movie,The Whispering Man,The Surreal Project,0,2019,\N,74,"Horror,Mystery,Thriller"
114771,tt0117494,movie,Robin Goodfellow,Robin Goodfellow,0,1996,\N,\N,\N
6115273,tt2473740,movie,The Circuit,The Circuit,0,\N,\N,\N,Drama


In [6]:
movies_df.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/02n4kr': 'Mystery', '/m/03bxz7': 'Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,{'/m/05f_3': 'Norwegian Language'},{'/m/05b4w': 'Norway'},"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,{'/m/02h40lc': 'English Language'},{'/m/07ssc': 'United Kingdom'},"{'/m/01jfsb': 'Thriller', '/m/0glj9q': 'Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,{'/m/04306rv': 'German Language'},{'/m/0345h': 'Germany'},{'/m/07s9rl0': 'Drama'}


In [14]:
# merge with ratings
imdb_ratings_meta = imdb_ratings_df.merge(imdb_names_df, on='tconst', how='inner')
imdb_ratings_meta.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,5.3,207,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
1,tt0000147,5.3,482,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
2,tt0000502,4.1,15,movie,Bohemios,Bohemios,0,1905,\N,100,\N
3,tt0000574,6.0,850,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
4,tt0000591,4.4,20,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


## Merge the CMU dataset with the IMDB

In [15]:
merged_df = movies_df.merge(imdb_ratings_meta, left_on='movie_name', right_on = 'primaryTitle', how='inner')
print("Size of merged dataset:", merged_df.shape[0])
merged_df.head()

Size of merged dataset: 89255


Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres,movie_release_year,...,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...",2001,...,4.9,56673,movie,Ghosts of Mars,Ghosts of Mars,0,2001,\N,98,"Action,Horror,Sci-Fi"
1,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,{'/m/05f_3': 'Norwegian Language'},{'/m/05b4w': 'Norway'},"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",1988,...,5.6,40,movie,Brun bitter,Brun bitter,0,1988,\N,83,"Crime,Drama"
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,{'/m/04306rv': 'German Language'},{'/m/0345h': 'Germany'},{'/m/07s9rl0': 'Drama'},1983,...,6.0,616,movie,A Woman in Flames,Die flambierte Frau,0,1983,\N,106,Drama
3,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",1913,...,5.6,33,movie,The Gangsters,Les truands,0,1957,\N,105,"Comedy,Crime"
4,23070193,/m/064mmt5,The Gangsters,1956,,,{'/m/064_8sq': 'French Language'},{'/m/0f8l9c': 'France'},{'/m/05p553': 'Comedy film'},1956,...,5.6,33,movie,The Gangsters,Les truands,0,1957,\N,105,"Comedy,Crime"
