In [1]:
import pandas as pd

In [2]:
def extract_year(date):
    if date:
        return str(date).split('-')[0]
    return None

#### CMU dataset

Link to download the data: https://www.cs.cmu.edu/~ark/personas/

In [3]:
df_cmu_movie_metadata = pd.read_csv('../../data/cmu/movie.metadata.tsv', sep='\t', header=None)
df_cmu_movie_metadata.columns = ['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
df_cmu_movie_metadata.sample(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,name,release_date,revenue,runtime,languages,countries,genres
8047,7643463,/m/0kvd1p,Double Dragon,1994-11-04,2341309.0,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06n90"": ""Science Fiction"", ""/m/04t2t"": ""M..."
18846,30731965,/m/0gfgnn5,An Angel Named Billy,,,120.0,{},{},"{""/m/07s9rl0"": ""Drama""}"
80859,4405649,/m/0c0jgy,The Little Vampire,2000,27965865.0,95.0,{},"{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0hj3myq"": ""Children's/Family"", ""/m/0bj8m2..."
25493,2631788,/m/07t0lc,Arabesque,1966-05-05,,106.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/0c3351"": ""Suspen..."
27196,32127870,/m/0gwz_3r,Baader,2002-02-15,,110.0,{},"{""/m/0345h"": ""Germany""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th..."


In [4]:
df_cmu_movie_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   wikipedia_movie_id  81741 non-null  int64  
 1   freebase_movie_id   81741 non-null  object 
 2   name                81741 non-null  object 
 3   release_date        74839 non-null  object 
 4   revenue             8401 non-null   float64
 5   runtime             61291 non-null  float64
 6   languages           81741 non-null  object 
 7   countries           81741 non-null  object 
 8   genres              81741 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 5.6+ MB


In [5]:
print("CMU Movie Metadata shape before filtering: ", df_cmu_movie_metadata.shape)

CMU Movie Metadata shape before filtering:  (81741, 9)


In [6]:
print("Number of movies with revenue information: ", df_cmu_movie_metadata[df_cmu_movie_metadata['revenue'].notnull()].shape)
# We need to update this column with updated information

Number of movies with revenue information:  (8401, 9)


In [7]:
print("Number of movies with release date information: ", df_cmu_movie_metadata[df_cmu_movie_metadata['release_date'].notnull()].shape)
df_cmu_movie_metadata.dropna(subset=['release_date'], inplace=True)

Number of movies with release date information:  (74839, 9)


In [8]:
df_cmu_movie_metadata['release_year'] = df_cmu_movie_metadata['release_date'].apply(extract_year)
df_cmu_movie_metadata.head()

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,name,release_date,revenue,runtime,languages,countries,genres,release_year
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",2000
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",1988
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",1987
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",1983


In [9]:
df_cmu_plot_summaries = pd.read_csv('../../data/cmu/plot_summaries.txt', sep='\t', header=None)
df_cmu_plot_summaries.columns = ['wiki_id', 'summary']
df_cmu_plot_summaries.sample(5)

Unnamed: 0,wiki_id,summary
10675,11614763,"In a Trieste gambling casino, the cynical Coun..."
38255,9179879,The big high school dance in Santa Barbara is ...
12409,34982215,"One night, a young traveller seeks shelter in ..."
21357,8476961,"Anna, a modern day Parisian psychologist, is r..."
4869,36252599,Sivankutty loses his job as a police officer w...


In [10]:
df_cmu_character_metadata = pd.read_csv('../../data/cmu/character.metadata.tsv', sep='\t', header=None)
df_cmu_character_metadata.columns = [
    "wikipedia_movie_id", "freebase_movie_id", "movie_release_date", "character_name",
    "actor_date_of_birth", "actor_gender", "actor_height_in_meters", "actor_ethnicity_freebase_id",
    "actor_name", "actor_age_at_movie_release", "freebase_character_actor_map_id",
    "freebase_character_id", "freebase_actor_id"
]

df_cmu_character_metadata.sample(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_date_of_birth,actor_gender,actor_height_in_meters,actor_ethnicity_freebase_id,actor_name,actor_age_at_movie_release,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
43688,346196,/m/01yr53,1984-05-14,,1946-09-19,M,,,Michael Elphick,37.0,/m/0k4_3t,,/m/07rqcj
115432,32367366,/m/0ch5kjd,1974,,,,,,Sylvio Dieu,,/m/0gc74gm,,/m/0gc74gr
238821,231829,/m/01hqhm,1999-12-08,Gwenovier,1962-05-12,F,,/m/0x67,April Grace,37.0,/m/0k0bcx,/m/0cj5y2f,/m/0dm44t
386454,2062208,/m/06jfpb,1989-08-11,Pandit,1920-06-14,M,,,Bharat Bhushan,69.0,/m/0h77s7d,/m/0h77s7h,/m/02vmzb
190072,33026594,/m/0h526ws,,,1876-05-08,M,,,Monroe Salisbury,,/m/0h588vk,,/m/04ctj37


#### Tropes dataset

Link to download the data: https://drive.google.com/file/d/1Duyz5ATlLHzwMidj15bWVnWHpdE4aRXn/view?usp=sharing

In [11]:
df_tropes = pd.read_csv('../../data/tropes/tropes.csv', index_col=0)
df_tropes.columns = ['trope_id', 'trope', 'description']
df_tropes.sample(5)

Unnamed: 0,trope_id,trope,description
12131,t12132,JudgeJuryAndExecutioner,"In modern legal systems, the power to render j..."
25846,t25847,WhatHappenedToMommy,One of the protagonists has a friend or loved ...
20111,t20112,Shipping,Rooting for fictional romance to happen.\nThe ...
1781,t01782,BalancedHarem,"A Subtrope of the Harem Anime genre, a Romanti..."
13095,t13096,LockedRoomMystery,A seemingly impossible crime. The standard exa...


In [12]:
df_imdb_movie_tropes = pd.read_csv('../../data/tropes/film_imdb_match.csv', index_col=0)
df_imdb_movie_tropes.columns = ['title', 'trope', 'example', 'clean_title', 'tconst', 'trope_id', 'title_id']
df_imdb_movie_tropes = df_imdb_movie_tropes.drop(columns=['trope'])
df_imdb_movie_tropes.head()

Unnamed: 0,title,example,clean_title,tconst,trope_id,title_id
0,ABBATheMovie,The concert segments make it clear that Agnet...,abbathemovie,tt0075617,t14656,f0
1,ABBATheMovie,The radio station manager that tasks Ashley w...,abbathemovie,tt0075617,t11527,f0
2,ABBATheMovie,A rare male example; when Benny reads one new...,abbathemovie,tt0075617,t23019,f0
3,ABBATheMovie,During Ashley's fantasy sequence in which he ...,abbathemovie,tt0075617,t09016,f0
4,ABBATheMovie,"Frida's outfit in the ""Why Did It Have to Be ...",abbathemovie,tt0075617,t25994,f0


In [13]:
df_imdb_movie_tropes = df_imdb_movie_tropes.merge(df_tropes, how='inner', left_on='trope_id', right_on='trope_id')
df_imdb_movie_tropes = df_imdb_movie_tropes[['tconst', 'title_id', 'clean_title', 'trope_id', 'trope', 'description', 'example']]
df_imdb_movie_tropes.rename(columns={'tconst': 'imdb_id'}, inplace=True)
df_imdb_movie_tropes.head()

Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ..."


In [14]:
K = 10
top_k_tropes = df_imdb_movie_tropes['trope'].value_counts().reset_index().head(K)
top_k_tropes

Unnamed: 0,trope,count
0,ShoutOut,1151
1,HorrorFilms,1072
2,FilmsOfThe1980s,771
3,OhCrap,719
4,ChekhovsGun,716
5,BigBad,635
6,Foreshadowing,627
7,DeadpanSnarker,555
8,TheCameo,552
9,BittersweetEnding,513


#### IMDB dataset

Link to download the data: https://developer.imdb.com/non-commercial-datasets/, download the title.basics.tsv.gz file

In [15]:
df_imdb = pd.read_csv('../../data/imdb/title.basics.tsv', sep='\t')
df_imdb.sample(5)

  df_imdb = pd.read_csv('../../data/imdb/title.basics.tsv', sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
341338,tt0356488,short,Cousin of Sleep,Cousin of Sleep,0,2001,\N,27,"Drama,Short"
6547648,tt2752332,tvEpisode,New Kid in Town,New Kid in Town,0,2013,\N,44,"Adventure,Reality-TV"
3211382,tt14112194,tvEpisode,Episode #1.20,Episode #1.20,0,\N,\N,\N,Drama
4947250,tt1908264,tvEpisode,Episode dated 19 July 2007,Episode dated 19 July 2007,0,2007,\N,\N,Reality-TV
6601086,tt27669641,tvEpisode,Better than Mario Party. JK It sucks. (with Ra...,Better than Mario Party. JK It sucks. (with Ra...,0,2023,\N,\N,Comedy


#### TMDB dataset

Link to download the data: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies?resource=download

In [16]:
df_tmdb = pd.read_csv('../../data/tmdb/TMDB_movie_dataset_v11.csv')
df_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [17]:
df_tmdb.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


In [18]:
df_tmdb['release_year'] = df_tmdb['release_date'].apply(extract_year)
df_tmdb.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,release_year
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",2010
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",2014
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",2008
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",2009
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",2012


In [19]:
# Check tmdb dataset shape before filtering
df_tmdb.shape

(1131296, 25)

In [20]:
# Clean tmdb dataset before merging it with the cmu dataset

# Filter released movies
df_tmdb = df_tmdb[df_tmdb['status'] == 'Released']
print("Number of released movies in tmdb dataset: ", df_tmdb.shape)

Number of released movies in tmdb dataset:  (1105740, 25)


In [21]:
# Drop movies with missing release date
df_tmdb.dropna(subset=['release_date'], inplace=True)
print("Number of movies with release date information: ", df_tmdb.shape)

Number of movies with release date information:  (938049, 25)


##### Merge IMDB and Tropes datasets

In [22]:
# tropes with imdb ids matched
df_imdb_movie_tropes.head()

Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ..."


In [23]:
# merge imdb titles basics information with tropes
df_movie_tropes = pd.merge(df_imdb_movie_tropes, df_imdb, how='inner', left_on='imdb_id', right_on='tconst')

In [24]:
print("-------" * 10)
print(f"imdb shape: {df_imdb.shape}")
print(f"movie tropes imdb shape: {df_imdb_movie_tropes.shape}")
print(f"movie tropes merged with imdb dataset shape: {df_movie_tropes.shape}")
print("-------" * 10)

df_movie_tropes.head()

----------------------------------------------------------------------
imdb shape: (11230548, 9)
movie tropes imdb shape: (390511, 7)
movie tropes merged with imdb dataset shape: (390408, 16)
----------------------------------------------------------------------


Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ...",tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"


In [25]:
df_movie_tropes.to_csv('../../data/movie_tropes.csv', index=False)

#### Merge CMU and TMDB datasets

In [26]:
df_cmu_movie_metadata.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_date',
       'revenue', 'runtime', 'languages', 'countries', 'genres',
       'release_year'],
      dtype='object')

In [27]:
df_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'release_year'],
      dtype='object')

Example before merging:

In [28]:
MOVIE = 'Avatar'

In [29]:
df_cmu_movie_metadata[df_cmu_movie_metadata['name'] == MOVIE]

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,name,release_date,revenue,runtime,languages,countries,genres,release_year
23702,4273140,/m/0bth54,Avatar,2009-12-10,2782275000.0,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2009
52274,15945267,/m/03qhwlm,Avatar,2004,,90.0,"{""/m/02h40lc"": ""English Language""}","{""/m/06t2t"": ""Singapore""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2004


In [30]:
df_tmdb[df_tmdb['title'] == MOVIE]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,release_year
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",2009
45668,1096978,Avatar,6.022,23,Released,2006-04-11,0,15,False,/pzgzH4LJoFmQKVPgKs0VZPWUoFZ.jpg,...,Tension mounts between a quadraplegic man and ...,4.098,/gmnD2e1RvMdCl9D1rsDEQaQlJxK.jpg,A bizarre love story...,Drama,Just Films,Spain,Spanish,,2006
69784,282908,Avatar,5.875,12,Released,2011-04-30,0,93,False,/gO81IsgrhwUS6aChXCjnjFKUS3S.jpg,...,Michiko lost her dad in a car accident when sh...,3.005,/xEAhV18F5Ej78qL0EneyPqa2a20.jpg,,Horror,,Japan,Japanese,,2011
105594,421403,Avatar,6.3,6,Released,1941-09-16,0,114,False,,...,"Indranath’s son is ill, Guru Omkarananda sugge...",0.771,,,,,,,,1941
515600,1295181,Avatar,0.0,0,Released,1916-03-06,0,50,False,/mljnZyk8gwO2YH9EDC5SMy4XgeP.jpg,...,Based on Théophile Gautier's novel of the same...,1.4,/nUTlHxnwomoIwojD0AF0OMzkonw.jpg,,"Drama, Fantasy",Società Italiana Cines,Italy,No Language,,1916


Merging the CMU and TMDB datasets by movie name and release year

In [31]:
# merge cmu movie metadata with tmdb dataset to fill in missing information such as revenue which has a lot of missing values
df_cmu_movie_metadata = df_cmu_movie_metadata[['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_year']]
df_cmu_tmdb = pd.merge(
    df_tmdb,
    df_cmu_movie_metadata,
    how='inner',
    left_on=['title','release_year'],
    right_on=['name', 'release_year']
)

print("-------" * 10)
print(f"CMU Movie Summary Corpus shape: {df_cmu_movie_metadata.shape}")
print(f"TMDB shape: {df_tmdb.shape}")
print(f"CMU TMDB merged dataframe shape: {df_cmu_tmdb.shape}")
print("-------" * 10)

df_cmu_tmdb.head()

----------------------------------------------------------------------
CMU Movie Summary Corpus shape: (74839, 4)
TMDB shape: (938049, 25)
CMU TMDB merged dataframe shape: (50630, 28)
----------------------------------------------------------------------


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,tagline,genres,production_companies,production_countries,spoken_languages,keywords,release_year,wikipedia_movie_id,freebase_movie_id,name
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",2010,23270459,/m/0661ql3,Inception
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",2014,6009939,/m/0fkf28,Interstellar
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",2008,4276475,/m/0btpm6,The Dark Knight
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",2009,4273140,/m/0bth54,Avatar
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com...",2012,22114132,/m/062zm5h,The Avengers


Check the results of the merge for the following movie:

In [32]:
df_cmu_tmdb[df_cmu_tmdb['title'] == MOVIE]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,tagline,genres,production_companies,production_countries,spoken_languages,keywords,release_year,wikipedia_movie_id,freebase_movie_id,name
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",2009,4273140,/m/0bth54,Avatar


Before saving the data, inspect that column names, and non-null values are correct

In [33]:
df_cmu_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'release_year', 'wikipedia_movie_id', 'freebase_movie_id',
       'name'],
      dtype='object')

In [34]:
df_cmu_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50630 entries, 0 to 50629
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    50630 non-null  int64  
 1   title                 50630 non-null  object 
 2   vote_average          50630 non-null  float64
 3   vote_count            50630 non-null  int64  
 4   status                50630 non-null  object 
 5   release_date          50630 non-null  object 
 6   revenue               50630 non-null  int64  
 7   runtime               50630 non-null  int64  
 8   adult                 50630 non-null  bool   
 9   backdrop_path         34516 non-null  object 
 10  budget                50630 non-null  int64  
 11  homepage              4285 non-null   object 
 12  imdb_id               49516 non-null  object 
 13  original_language     50630 non-null  object 
 14  original_title        50630 non-null  object 
 15  overview           

In [35]:
# Remove movies with missing imdb because we need it for the tropes analysis
df_cmu_tmdb.dropna(subset=['imdb_id'], inplace=True)
df_cmu_tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49516 entries, 0 to 50629
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    49516 non-null  int64  
 1   title                 49516 non-null  object 
 2   vote_average          49516 non-null  float64
 3   vote_count            49516 non-null  int64  
 4   status                49516 non-null  object 
 5   release_date          49516 non-null  object 
 6   revenue               49516 non-null  int64  
 7   runtime               49516 non-null  int64  
 8   adult                 49516 non-null  bool   
 9   backdrop_path         34367 non-null  object 
 10  budget                49516 non-null  int64  
 11  homepage              4220 non-null   object 
 12  imdb_id               49516 non-null  object 
 13  original_language     49516 non-null  object 
 14  original_title        49516 non-null  object 
 15  overview              48

In [36]:
df_cmu_tmdb.to_csv('../../data/cmu_tmdb.csv', index=False)