In [1]:
# Allow python to import modules in our src folders
import sys
sys.path.append("src/")


from preprocessing.load_dataset import *

In [2]:
characters_df = load_characters_df()
movies_df = load_movies_df()
plot_df = load_plot_df()
tvtropes_tf = load_tvtropes_df()

# I suppose that we do not have cameras in 1010...
# movies_df.iloc[62836] = movies_df.iloc[62836].replace(to_replace='1010-12-02', value='2010-12-02')

In [3]:
characters_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,release_actor_age,freebase_map_id,freebase_character_id,freebase_actor_id
255746,26351240,/m/0bbv9mc,1985,,,M,,,Beau Leland,,/m/0gd5fps,,/m/0gd5fpw
237685,3169283,/m/08wsvg,2000-02-21,,1982-09-04,F,,,Lou Doillon,17.0,/m/0j88p6z,,/m/0h1mqg
142629,13348042,/m/03c2gyh,2001-09-12,,1971-04-16,M,1.778,,Max Beesley,30.0,/m/04hv6n6,,/m/09rw9k
214960,2158246,/m/06r3yf,1961,,1955-10-19,F,,,Sabine Haudepin,5.0,/m/0b_8yjz,,/m/04g290y
75690,1089042,/m/044_wc,1922-10-18,,1883-05-23,M,1.75,,Douglas Fairbanks,,/m/0k2l_3,,/m/01hdht


In [4]:
movies_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres
818,1132138,/m/0490_m,It's a Very Merry Muppet Christmas Movie,2002-11-29,,85.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/0hj3myq': 'Children's/Family', '/m/0hj3n6..."
41450,212612,/m/0glsn39,Urbanized,2011-09-09,,82.0,"{'/m/02h40lc': 'English Language', '/m/0t_2': ...","{'/m/09c7w0': 'United States of America', '/m/...",{'/m/0jtdp': 'Documentary'}
64494,34014071,/m/0hrdn0y,The Gruffalo's Child,2011-12,,27.0,{'/m/02h40lc': 'English Language'},"{'/m/07ssc': 'United Kingdom', '/m/0345h': 'Ge...","{'/m/02hmvc': 'Short Film', '/m/0hqxf': 'Famil..."
33292,26553693,/m/0bh7m9d,A Talent for Loving,1969,,110.0,{'/m/02h40lc': 'English Language'},"{'/m/09c7w0': 'United States of America', '/m/...","{'/m/0gf28': 'Parody', '/m/03btsm8': 'Action/A..."
68357,32960328,/m/0h5387n,Baat Ban Jaye,,,135.0,{'/m/03k50': 'Hindi Language'},{},"{'/m/02kdv5l': 'Action', '/m/01z4y': 'Comedy'}"


In [5]:
print(f"Number of movies loaded: {len(movies_df)}")

Number of movies loaded: 81741


In [6]:
plot_df.sample(5)

Unnamed: 0,wiki_movie_id,plot_summary
90,32137084,The film CHAMPION is a tale about a man who ha...
11894,19286426,The story is based on the story of sea-pirates...
9035,9442685,The scenes of this film do not appear in chron...
6155,11186374,Kamini Gupta and Vinod Gupta with their two ...
29007,29458938,Nadan who find himself for falling in love wit...


In [7]:
tvtropes_tf.sample(5)

Unnamed: 0,trope_name,character_data
58,bruiser_with_a_soft_center,"{'char': 'Tron', 'movie': 'Tron', 'id': '/m/0h..."
222,egomaniac_hunter,"{'char': 'Charles Muntz', 'movie': 'Up', 'id':..."
302,heartbroken_badass,"{'char': 'Bruce Wayne', 'movie': 'The Dark Kni..."
410,revenge,"{'char': 'Harmonica', 'movie': 'Once Upon a Ti..."
469,tranquil_fury,"{'char': 'John Preston', 'movie': 'Equilibrium..."


In [8]:
# extract years from the release date of movies
movies_df.loc[:, 'movie_release_year'] = movies_df['movie_release_date'].astype(str).str[:4]
movies_df = movies_df[movies_df['movie_release_year'].str.contains("nan")==False]
movies_df['movie_release_year'] = movies_df['movie_release_year'].astype(int)


In [9]:
wrong_dates = movies_df[movies_df['movie_release_year']<  1800].index
# there is only one movie that has the release date 1010, so we can change 1010 to 2010
movies_df.loc[wrong_dates, 'movie_release_year']= 2010
movies_df.loc[wrong_dates, 'movie_release_date'] ='2010-12-02'

**Perfomance of the movie in terms of revenue**

In [10]:
# how many missing values in the column box_office
percentage_missing = movies_df['box_office'].isna().sum() / len(movies_df) * 100
print("{:.2f}% values for the box office revenue are missing.".format(percentage_missing))

88.87% values for the box office revenue are missing.


In [11]:
# There is no revenue from IMDB dataset ?
# https://developer.imdb.com/non-commercial-datasets/

**Extract ratings from the IMDB rating dataset**

file: title.rating.tsv
-    tconst: unique identifier of the movie
-    averageRating: average of user ratings
-    numVotes: number of ratings submitted for the movie


In [12]:
IMDB_PATH = "imdb/"
REVIEWS_METADATA = "title.ratings.tsv/data.tsv"
imdb_ratings = pd.read_csv(os.path.join(DATASET_PATH+IMDB_PATH, REVIEWS_METADATA), sep='	')
imdb_ratings.head()


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2004
1,tt0000002,5.8,269
2,tt0000003,6.5,1902
3,tt0000004,5.5,178
4,tt0000005,6.2,2685


In [13]:
# Extract the movies from the title.basics imbd dataset
VIDEO_METADATA = 'title.basics.tsv/data.tsv'   # move to load_dataset file 
imdb_names_df = pd.read_csv(os.path.join(DATASET_PATH+IMDB_PATH, VIDEO_METADATA), dtype={4: str}, sep='	')
# Only keep the ones labeled as movies
imdb_names_df = imdb_names_df[imdb_names_df['titleType'] == 'movie']
imdb_names_df.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


In [14]:
# merge with ratings
imdb_ratings_meta = imdb_ratings.merge(imdb_names_df, on='tconst', how='inner')
imdb_ratings_meta.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,5.3,207,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
1,tt0000147,5.3,484,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
2,tt0000502,4.1,15,movie,Bohemios,Bohemios,0,1905,\N,100,\N
3,tt0000574,6.0,854,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
4,tt0000591,5.0,21,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


**Merge the CMU dataset with the IMDB**

In [18]:
merged_df = movies_df.merge(imdb_ratings_meta, left_on='movie_name', right_on = 'primaryTitle', how='inner')
print("Size of merged dataset:", merged_df.shape[0])
merged_df.head()

Size of merged dataset: 89595


Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres,movie_release_year,...,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/01jfsb': 'Thriller', '/m/06n90': 'Science...",2001,...,4.9,56854,movie,Ghosts of Mars,Ghosts of Mars,0,2001,\N,98,"Action,Horror,Sci-Fi"
1,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,{'/m/05f_3': 'Norwegian Language'},{'/m/05b4w': 'Norway'},"{'/m/0lsxr': 'Crime Fiction', '/m/07s9rl0': 'D...",1988,...,5.6,40,movie,Brun bitter,Brun bitter,0,1988,\N,83,"Crime,Drama"
2,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,{'/m/04306rv': 'German Language'},{'/m/0345h': 'Germany'},{'/m/07s9rl0': 'Drama'},1983,...,6.0,621,movie,A Woman in Flames,Die flambierte Frau,0,1983,\N,106,Drama
3,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",1913,...,5.6,33,movie,The Gangsters,Les truands,0,1957,\N,105,"Comedy,Crime"
4,13696889,/m/03cfc81,The Gangsters,1913-05-29,,35.0,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",1913,...,5.8,5,movie,The Gangsters,The Gangsters,0,2019,\N,\N,Action
