In [106]:
import ast

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)

In [107]:
# Column names for the header-less CMU metadata file: start from the
# human-readable field names and normalise them to snake_case
# (lower-case, spaces replaced by underscores).
movie_headers = [
    "_".join(header.lower().split(" "))
    for header in (
        "Wikipedia movie ID",
        "Freebase movie ID",
        "Movie name",
        "Movie release date",
        "Movie box office revenue",
        "Movie runtime",
        "Movie languages",
        "Movie countries",
        "Movie genres",
    )
]

## Loading the "_CMU Movie Summary Corpus_" dataset

In [108]:
# The CMU metadata file is tab-separated and has no header row, so the
# column names are supplied explicitly through `movie_headers`.
CMU_df = pd.read_csv(
    "./data/MovieSummaries/movie.metadata.tsv",
    sep='\t',
    header=None,
    names=movie_headers,
)
print("CMU data shape:", CMU_df.shape)
# Preview the first three rows.
CMU_df.head(n=3)

CMU data shape: (81741, 9)


Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


### Cleaning columns `movie_languages`, `movie_countries` and `movie_genres`

In [109]:
# Work on a deep copy so the raw CMU frame stays untouched and this cell can
# be re-run on its own.
CMU_cleaned_df = CMU_df.copy(deep=True)
cols_to_clean = ["movie_languages", "movie_countries", "movie_genres"]

# Each of these columns stores a dict literal as text, mapping a Freebase ID to
# a readable name, e.g. '{"/m/02h40lc": "English Language"}'. Only the names
# are kept, as a list.
# ast.literal_eval parses Python literals only, so text read from the data
# file can never execute code (which eval() would allow).
for col in cols_to_clean:
    CMU_cleaned_df[col] = CMU_cleaned_df[col].apply(
        lambda raw: list(ast.literal_eval(raw).values())
    )

# Persist the cleaned frame; pickle keeps the list-valued cells as lists.
CMU_cleaned_df.to_pickle("./data/CMU_cleaned.pkl")

## Loading the "_TMDB_" dataset

In [110]:
# The TMDB export is a regular comma-separated file whose first row holds the
# column names.
TMDB_df = pd.read_csv(
    "./data/TMDBMovies/TMDB_movie_dataset_v11.csv",
    sep=',',
    header=0,
)
print("TMDB data shape:", TMDB_df.shape)
# Preview the first three rows.
TMDB_df.head(n=3)

TMDB data shape: (1133053, 24)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."


In [None]:
# Work on a deep copy so the raw TMDB frame stays untouched and this cell can
# be re-run on its own.
TMDB_cleaned_df = TMDB_df.copy(deep=True)
cols_to_clean = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']

# Turn each ", "-separated string into a list of values.
# The .str accessor propagates missing entries as NaN, which avoids the
# fragile `x is not np.nan` identity test: that test only recognises the
# np.nan singleton and would let None / pd.NA / other NaN objects reach
# `.split` and raise.
# NOTE(review): a value that itself contains ", " (e.g. a company name with a
# comma) is split into several items; the source format offers no escaping.
for col in cols_to_clean:
    TMDB_cleaned_df[col] = TMDB_cleaned_df[col].str.split(', ')

In [112]:
# Preview the first three rows to check that the multi-valued columns now
# hold lists.
TMDB_cleaned_df.head(n=3)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,160000000,https://www.warnerbros.com/movies/inception,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"[Action, Science Fiction, Adventure]","[Legendary Pictures, Syncopy, Warner Bros. Pic...","[United Kingdom, United States of America]","[English, French, Japanese, Swahili]","[rescue, mission, dream, airplane, paris, fran..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,165000000,http://www.interstellarmovie.net/,tt0816692,en,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"[Adventure, Drama, Science Fiction]","[Legendary Pictures, Syncopy, Lynda Obst Produ...","[United Kingdom, United States of America]",[English],"[rescue, future, spacecraft, race against time..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,185000000,https://www.warnerbros.com/movies/dark-knight/,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Syncopy, Isobe...","[United Kingdom, United States of America]","[English, Mandarin]","[joker, sadism, chaos, secret identity, crime ..."


## Merging the two datasets on their Freebase ID (`freebase_movie_id` in CMU, `freebase_id` looked up from TMDB's `imdb_id`)

In [123]:
# Lookup table used to attach a `freebase_id` to TMDB rows via their `imdb_id`.
query_df = pd.read_csv("./data/Converter/query.csv", sep=',', header=0)

# Keep only TMDB rows that have an IMDb ID and a match in the lookup table.
TMDB_ids_df = (
    TMDB_cleaned_df
    .dropna(subset=['imdb_id'])
    .merge(query_df, how='inner', on='imdb_id')
)

# Inner join: only movies present in both datasets survive.
combined_df = CMU_cleaned_df.merge(
    TMDB_ids_df,
    how='inner',
    left_on='freebase_movie_id',
    right_on='freebase_id',
)
combined_df.head(n=3)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,budget,homepage,imdb_id,original_language,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,freebase_id
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,...",10016,Ghosts of Mars,5.127,977,Released,2001-08-24,14010832,98,False,/anSbunnEMI0TSmizqUSRACoe18l.jpg,28000000,http://www.theofficialjohncarpenter.com/ghost-...,tt0228333,en,Ghosts of Mars,"In 2176, a Martian police unit is sent to pick...",14.189,/i2zztssCIbahGES1fdfWFmDXian.jpg,Terror is the same on any planet.,"[Action, Horror, Science Fiction]","[Animationwerks, Screen Gems, Storm King Produ...",[United States of America],[English],"[future, planet mars, anti hero, possession, h...",/m/03vyhn
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,[English Language],[United States of America],"[Mystery, Biographical film, Drama, Crime Drama]",784579,Getting Away with Murder: The JonBenét Ramsey ...,0.0,0,Released,2000-02-16,0,60,False,,0,,tt0245916,en,Getting Away with Murder: The JonBenét Ramsey ...,Dramatization of the story behind the murder o...,0.6,,,"[Drama, Crime]",,[United States of America],[English],"[colorado, jonbenet]",/m/08yl5d
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,[Norwegian Language],[Norway],"[Crime Fiction, Drama]",396302,Hair of the Dog,0.0,0,Released,1988-11-17,0,83,False,/6nGFaOiHOorBAIqSDuUQlMtD6sQ.jpg,0,,tt0094806,no,Brun bitter,A stolen bicycle case ends with drunken detect...,0.898,/rtFEOQQ0LVDv7y2bCgVVg3wWpXK.jpg,A film about murder and love,"[Mystery, Crime, Drama]","[Filmeffekt AS, Norsk Film]",[Norway],[Norwegian],"[bicycle, private detective]",/m/0crgdbh


## Cleaning the merged dataset

In [124]:
# Collapse the duplicated runtime / release-date columns into one "final"
# column each: take the primary source and fall back to the other one where
# the primary value is missing.
#   runtime_final      : CMU `movie_runtime`, else TMDB `runtime`
#   release_date_final : TMDB `release_date`, else CMU `movie_release_date`
for merged_col, primary_col, fallback_col in [
    ('runtime_final', 'movie_runtime', 'runtime'),
    ('release_date_final', 'release_date', 'movie_release_date'),
]:
    combined_df[merged_col] = combined_df[primary_col].fillna(combined_df[fallback_col])

In [125]:
# Columns removed from the merged frame. The runtime and release-date source
# columns are listed here because `runtime_final` / `release_date_final`
# replace them.
columns_to_drop = [
    "wikipedia_movie_id",
    "freebase_movie_id",
    "movie_box_office_revenue",
    "id",
    "status",
    "adult",
    "backdrop_path",
    "homepage",
    "poster_path",
    "tagline",
    "movie_runtime",
    "runtime",
    "movie_release_date",
    "release_date",
    "movie_languages",
    "movie_countries",
    # "movie_genres" is not listed, so the CMU genre column is kept.
]

combined_df = combined_df.drop(columns=columns_to_drop)

In [126]:
# Preview the first five rows of the trimmed, merged frame.
combined_df.head(n=5)

Unnamed: 0,movie_name,movie_genres,title,vote_average,vote_count,revenue,budget,imdb_id,original_language,original_title,overview,popularity,genres,production_companies,production_countries,spoken_languages,keywords,freebase_id,runtime_final,release_date_final
0,Ghosts of Mars,"[Thriller, Science Fiction, Horror, Adventure,...",Ghosts of Mars,5.127,977,14010832,28000000,tt0228333,en,Ghosts of Mars,"In 2176, a Martian police unit is sent to pick...",14.189,"[Action, Horror, Science Fiction]","[Animationwerks, Screen Gems, Storm King Produ...",[United States of America],[English],"[future, planet mars, anti hero, possession, h...",/m/03vyhn,98.0,2001-08-24
1,Getting Away with Murder: The JonBenét Ramsey ...,"[Mystery, Biographical film, Drama, Crime Drama]",Getting Away with Murder: The JonBenét Ramsey ...,0.0,0,0,0,tt0245916,en,Getting Away with Murder: The JonBenét Ramsey ...,Dramatization of the story behind the murder o...,0.6,"[Drama, Crime]",,[United States of America],[English],"[colorado, jonbenet]",/m/08yl5d,95.0,2000-02-16
2,Brun bitter,"[Crime Fiction, Drama]",Hair of the Dog,0.0,0,0,0,tt0094806,no,Brun bitter,A stolen bicycle case ends with drunken detect...,0.898,"[Mystery, Crime, Drama]","[Filmeffekt AS, Norsk Film]",[Norway],[Norwegian],"[bicycle, private detective]",/m/0crgdbh,83.0,1988-11-17
3,White Of The Eye,"[Thriller, Erotic thriller, Psychological thri...",White of the Eye,5.742,64,0,0,tt0094320,en,White of the Eye,"In a wealthy and isolated desert community, a ...",8.297,"[Horror, Thriller]",[Mrs. White's Productions],[United Kingdom],[English],"[based on novel or book, gas station, psychopa...",/m/0285_cd,110.0,1987-06-19
4,A Woman in Flames,[Drama],A Woman in Flames,5.3,13,0,0,tt0083949,de,Die flambierte Frau,"Eva, an upper-class housewife, frustratedly le...",2.801,[Drama],[Dieter Geissler Filmproduktion],[Germany],[German],"[jealousy, eroticism, gigolo, longing, dominat...",/m/01mrr1,106.0,1983-05-11


In [127]:
# Report the final size, then persist the merged dataset; pickle keeps the
# list-valued cells as lists.
print("Data shape:", combined_df.shape)
combined_df.to_pickle(path='./data/combined_dataset.pkl')

Data shape: (68396, 20)
