In [1]:
# Allow python to import modules in our src folders.
# The path is relative, so it is resolved against the current working
# directory: the notebook must be run from the directory that contains src/.
import sys
sys.path.append("src/")

# NOTE(review): wildcard import; the load_* helpers called in the cells below
# presumably come from this module. Confirm, and consider importing them by name.
from preprocessing.load_dataset import *

## Importing the CMU dataset

We load the 4 `.tsv` and `.txt` files of the CMU dataset into 4 different dataframes

In [2]:
# Load each CMU source file into its own dataframe via the project loaders.
characters_df = load_characters_df()
movies_df = load_movies_df()
plot_df = load_plot_df()
# NOTE(review): the "_tf" suffix looks like a typo for "_df"; the name is kept
# because later cells refer to `tvtropes_tf`.
tvtropes_tf = load_tvtropes_df()

In [3]:
# Preview five randomly drawn movie rows. No random_state is passed, so the
# rows shown change from run to run.
movies_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres
8007,61171,/m/0glpb,The House of Rothschild,1934,,88.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/03bxz7': 'Biographical film', '/m/07s9rl0..."
67343,9878665,/m/06_y5lw,La Culpa la tuvo el otro,1950,,,{'/m/06nm1': 'Spanish Language'},{'/m/0jgd': 'Argentina'},{'/m/01g6gs': 'Black-and-white'}
79230,6086462,/m/0fph6r,Flodder in Amerika,1992,,95.0,"{'/m/05qqm': 'Polish Language', '/m/02bv9': 'D...",{'/m/059j2': 'Netherlands'},{'/m/01z4y': 'Comedy'}
21212,14134377,/m/03cvn6_,In Bad Taste,2000-03-23,,100.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/0219x_': 'Indie', '/m/0jtdp': 'Documentary'}"
74102,20989785,/m/05b44_g,100 Million BC,2008-07-29,,85.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/06n90': 'Science Fiction', '/m/03npn': 'H..."


In [4]:
# Report how many rows the movie metadata frame contains right after loading.
print(f"Number of movies loaded: {len(movies_df)}")

Number of movies loaded: 81741


In [5]:
# Preview five randomly drawn plot-summary rows (unseeded, so the selection
# differs between runs).
plot_df.sample(5)

Unnamed: 0,wiki_movie_id,plot_summary
11095,19325619,Forty years after directing and starring in a ...
11130,17291040,Cornman is a superhero whose powers include th...
33662,33790053,The film is set in early 11th century China du...
12369,3787708,"Tommy Spinelli , is a wiseguy hired by a pair ..."
36781,33656303,"Devastated by the death of her younger sister,..."


In [6]:
# Preview five randomly drawn TV-tropes rows (unseeded, so the selection
# differs between runs).
tvtropes_tf.sample(5)

Unnamed: 0,trope_name,character_data
390,psycho_for_hire,"{'char': 'Major Arnold Toht', 'movie': 'Indian..."
54,bromantic_foil,"{'char': 'Brodie Bruce', 'movie': 'Mallrats', ..."
430,slacker,"{'char': 'Dewey Finn', 'movie': 'School of Roc..."
431,slacker,"{'char': 'Aram Finklestein', 'movie': 'The Reb..."
405,retired_outlaw,"{'char': 'Roy O'Bannon', 'movie': 'Shanghai No..."


In [7]:
# Extract the release year from the release date of each movie.
# The year is the first four characters of the date string. A missing date
# stringifies to "nan", which is how rows without a release date are detected.
release_year = movies_df['movie_release_date'].astype(str).str[:4]
has_release_year = ~release_year.str.contains("nan")

# Keep only the movies with a release date. The explicit copy makes the result
# an independent frame, so adding the new column below does not write to a
# slice of the unfiltered frame (which raises pandas' SettingWithCopyWarning).
movies_df = movies_df.loc[has_release_year].copy()
# to_numpy() assigns positionally; the values are already in the same row order.
movies_df['movie_release_year'] = release_year.loc[has_release_year].astype(int).to_numpy()


In [8]:
# Rows whose release year is earlier than 1800 are treated as data-entry errors.
wrong_dates = movies_df[movies_df['movie_release_year'] < 1800].index
# There is only one movie that has the release date 1010, so we can change 1010 to 2010.
# The correction below hard-codes that single movie's date, so fail loudly if
# more than one row matches instead of silently stamping the same date on
# unrelated movies. (Zero matches is allowed so the cell can be re-run.)
assert len(wrong_dates) <= 1, (
    f"expected at most one implausible release year, found {len(wrong_dates)}"
)
movies_df.loc[wrong_dates, 'movie_release_year'] = 2010
movies_df.loc[wrong_dates, 'movie_release_date'] = '2010-12-02'

## Performance of the movies in terms of revenue

In [9]:
# Percentage of movies for which the box_office column has no value.
n_missing_box_office = movies_df['box_office'].isna().sum()
n_movies = len(movies_df)
percentage_missing = n_missing_box_office / n_movies * 100

message = "{:.2f}% values for the box office revenue are missing.".format(percentage_missing)
print(message)

88.87% values for the box office revenue are missing.


## Extract ratings from the IMDB rating dataset

file: title.ratings.tsv
-    tconst: unique identifier of the movie
-    averageRating: average of user ratings
-    numVotes: number of ratings submitted for the movie


In [10]:
# Load the IMDb tables through the project loaders. Judging by the loader
# names these are ratings, title basics, title principals and person basics.
# TODO confirm against preprocessing.load_dataset.
imdb_ratings_df = load_imdb_ratings()
imdb_title_df = load_imdb_title_basics()
imdb_principals_df = load_imdb_title_principals()
imdb_people_df = load_imdb_person_basics()

In [11]:
# Non-null counts per column for the CMU movies and the IMDb titles,
# printed one after the other with a separator line in between.
print(
    movies_df.count(),
    '==========',
    imdb_title_df.count(),
    sep='\n',
)

wiki_movie_id         74839
freebase_movie_id     74839
movie_name            74839
movie_release_date    74839
box_office             8328
movie_runtime         58631
movie_languages       74839
movie_countries       74839
movie_genres          74839
movie_release_year    74839
dtype: int64
tconst            661815
titleType         661815
primaryTitle      661815
originalTitle     661815
isAdult           661815
startYear         661815
endYear           661815
runtimeMinutes    661815
genres            661815
dtype: int64


In [12]:
# Show that primaryTitle is not a unique key: pick the first title that repeats
# an earlier one and list every row carrying that title.
title_column = imdb_title_df[['primaryTitle']]

# True for each row whose title already appeared in an earlier row.
dp = title_column.duplicated()
# First repeated row, as a one-element array holding the title.
X = title_column[dp].to_numpy()[0]

has_same_title = imdb_title_df['primaryTitle'] == X[0]
imdb_title_df[has_same_title]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama
1228,tt0001240,movie,Hamlet,Hamlet,0,1911,\N,\N,Drama
2895,tt0002922,movie,Hamlet,Hamlet,0,1913,\N,64,Drama
4006,tt0004049,movie,Hamlet,Amleto,0,1914,\N,\N,Drama
7932,tt0008040,movie,Hamlet,Amleto,0,1917,\N,\N,Drama
12076,tt0012249,movie,Hamlet,Hamlet,0,1921,\N,131,Drama
39713,tt0040416,movie,Hamlet,Hamlet,0,1948,\N,154,Drama
46214,tt0047060,movie,Hamlet,Hamlet,0,1954,\N,\N,Drama
57016,tt0058126,movie,Hamlet,Gamlet,0,1964,\N,140,Drama
57063,tt0058174,movie,Hamlet,Hamile,0,1964,\N,120,Drama


In [13]:
# Preview five randomly drawn rows of the IMDb title.basics dataframe
# (unseeded, so the selection differs between runs).
imdb_title_df.sample(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
7682219,tt4152238,movie,Wadjemup: Black Prison White Playground,Wadjemup: Black Prison White Playground,0,2014,\N,58,"Documentary,Drama"
3930570,tt1543314,movie,Kennzeichen Kohl,Kennzeichen Kohl,0,2009,\N,88,Documentary
6558569,tt27524995,movie,Le complot,Le complot,0,2016,\N,\N,Drama
1798828,tt11498846,movie,A Lively Experiment,A Lively Experiment,0,\N,\N,\N,"Drama,History"
57960,tt0059084,movie,Darling,Darling,0,1965,\N,128,"Drama,Romance"


In [14]:
# Attach the title metadata to each rating. The inner join on tconst keeps
# only the titles present in both frames.
imdb_ratings_meta = imdb_ratings_df.merge(
    right=imdb_title_df,
    how='inner',
    on='tconst',
)
imdb_ratings_meta.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,5.3,207,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
1,tt0000147,5.3,484,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
2,tt0000502,4.1,15,movie,Bohemios,Bohemios,0,1905,\N,100,\N
3,tt0000574,6.0,854,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
4,tt0000591,5.0,21,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


## Merge the CMU dataset with the IMDb dataset

We first import the translation dataframe, which includes both the IMDB id and the Freebase id, enabling us to combine the IMDB dataset with the CMU dataset.

In [15]:
# Load the table that maps IMDb ids to Freebase ids, and display its column
# names to confirm which keys are available for the joins below.
translation_id = load_translation_df()
translation_id.columns

Index(['imdb_id', 'freebase_id'], dtype='object')

Then we merge the IMDB ratings with the CMU dataset

In [16]:
# Step 1: give every IMDb rating its Freebase id (tconst <-> imdb_id).
imdb_ratings_translated_df = imdb_ratings_df.merge(
    right=translation_id,
    how='inner',
    left_on='tconst',
    right_on='imdb_id',
)

# Step 2: join the translated ratings onto the CMU movies through the
# Freebase id. Inner join, so movies without a rating are dropped.
merged_df = imdb_ratings_translated_df.merge(
    right=movies_df,
    how='inner',
    left_on='freebase_id',
    right_on='freebase_movie_id',
)

In [17]:
# Report how many rows survived the two inner joins above.
print(f"The merged dataframe contains {len(merged_df)} movies")

The merged dataframe contains 52003 movies
