In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### CMU dataset

Link to download the data: https://www.cs.cmu.edu/~ark/personas/

In [2]:
df_cmu_movie_metadata = pd.read_csv('../data/cmu/movie.metadata.tsv', sep='\t', header=None)
df_cmu_movie_metadata.columns = ['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
df_cmu_movie_metadata.sample(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,name,release_date,revenue,runtime,languages,countries,genres
34909,22644373,/m/05zk59k,Pinkeltje,1978,,,"{""/m/02bv9"": ""Dutch Language""}","{""/m/059j2"": ""Netherlands""}","{""/m/0hqxf"": ""Family Film""}"
17329,25040223,/m/09gj_3z,The Christmas Secret,2000,,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0bwgnb"": ""Christmas movie"", ""/m/0hj3n26"":..."
54362,7013113,/m/0h07hg,My Architect,2003,,116.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/03g3w"": ""History"", ""/m/0jtdp"": ""Documenta..."
38593,33268309,/m/0h7m0jx,Back to Your Arms,2010-11-26,,90.0,"{""/m/0k7zj"": ""Lithuanian language"", ""/m/06b_j""...","{""/m/05qhw"": ""Poland"", ""/m/04gzd"": ""Lithuania""...","{""/m/07s9rl0"": ""Drama""}"
1566,22683998,/m/05z_qhr,Nocturna,2007-10-11,,88.0,"{""/m/064_8sq"": ""French Language"", ""/m/06nm1"": ...","{""/m/0f8l9c"": ""France"", ""/m/06mkj"": ""Spain""}","{""/m/01hmnh"": ""Fantasy"", ""/m/0hcr"": ""Animation""}"


In [3]:
df_cmu_plot_summaries = pd.read_csv('../data/cmu/plot_summaries.txt', sep='\t', header=None)
df_cmu_plot_summaries.columns = ['wiki_id', 'summary']
df_cmu_plot_summaries.sample(5)

Unnamed: 0,wiki_id,summary
23093,2354462,The 1992-96 Siege of Sarajevo by the Bosnian S...
33899,13093495,"The movie starts with a narrator, Captain Robe..."
20793,1607765,"Steve Prefontaine comes from Coos Bay, Oregon ..."
12091,12231155,When Pope John Paul II is visiting Canada in 2...
11342,13164574,The film begins in 1980's Spain during the pol...


In [4]:
df_cmu_character_metadata = pd.read_csv('../data/cmu/character.metadata.tsv', sep='\t', header=None)
df_cmu_character_metadata.columns = [
    "wikipedia_movie_id", "freebase_movie_id", "movie_release_date", "character_name",
    "actor_date_of_birth", "actor_gender", "actor_height_in_meters", "actor_ethnicity_freebase_id",
    "actor_name", "actor_age_at_movie_release", "freebase_character_actor_map_id",
    "freebase_character_id", "freebase_actor_id"
]

df_cmu_character_metadata.sample(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_date_of_birth,actor_gender,actor_height_in_meters,actor_ethnicity_freebase_id,actor_name,actor_age_at_movie_release,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
274406,321496,/m/01vksx,2003-07-09,Hector Barbossa,1951-07-06,M,1.83,,Geoffrey Rush,52.0,/m/0k1x_2,/m/02v9k1,/m/0170pk
6577,2267722,/m/06_hzg,1954-01-15,,1923-09-07,M,1.816,,Peter Lawford,30.0,/m/02tbf0n,,/m/0202p_
252202,2724555,/m/07_466,2005-09-23,Helium 3 Commander,1969-08-17,M,1.78,/m/065b6q,Donnie Wahlberg,36.0,/m/0gyxn64,/m/0gyxn68,/m/02stwg
443803,6563151,/m/0gbr99,1992,,,,,,Terrie Snell,,/m/0gcg335,,/m/0gc5lcn
218820,3557194,/m/09lcyt,2003-05-17,,1938-11-20,M,,,Colin Fox,64.0,/m/0gc7yvm,,/m/0chmb_


#### Tropes dataset

Link to download the data: https://drive.google.com/file/d/1Duyz5ATlLHzwMidj15bWVnWHpdE4aRXn/view?usp=sharing

In [5]:
df_tropes = pd.read_csv('../data/tropes/tropes.csv', index_col=0)
df_tropes.columns = ['trope_id', 'trope', 'description']
df_tropes.sample(5)

Unnamed: 0,trope_id,trope,description
17170,t17171,PlayerPersonalityQuiz,"In some video games, particularly at the start..."
23270,t23271,TheOnlyOneITrust,Alice is in a difficult situation: perhaps she...
6470,t06471,EarthShatteringPoster,A common Film Poster meant to shock and attrac...
19768,t19769,SelfInsertFic,"As the name implies, a Self-Insert Fic is one ..."
4244,t04245,ConceptAlbum,Some albums are just a random assortment of so...


In [6]:
df_imdb_movie_tropes = pd.read_csv('../data/tropes/film_imdb_match.csv', index_col=0)
df_imdb_movie_tropes.columns = ['title', 'trope', 'example', 'clean_title', 'tconst', 'trope_id', 'title_id']
df_imdb_movie_tropes = df_imdb_movie_tropes.drop(columns=['trope'])
df_imdb_movie_tropes.head()

Unnamed: 0,title,example,clean_title,tconst,trope_id,title_id
0,ABBATheMovie,The concert segments make it clear that Agnet...,abbathemovie,tt0075617,t14656,f0
1,ABBATheMovie,The radio station manager that tasks Ashley w...,abbathemovie,tt0075617,t11527,f0
2,ABBATheMovie,A rare male example; when Benny reads one new...,abbathemovie,tt0075617,t23019,f0
3,ABBATheMovie,During Ashley's fantasy sequence in which he ...,abbathemovie,tt0075617,t09016,f0
4,ABBATheMovie,"Frida's outfit in the ""Why Did It Have to Be ...",abbathemovie,tt0075617,t25994,f0


In [7]:
df_imdb_movie_tropes = df_imdb_movie_tropes.merge(df_tropes, how='inner', left_on='trope_id', right_on='trope_id')
df_imdb_movie_tropes = df_imdb_movie_tropes[['tconst', 'title_id', 'clean_title', 'trope_id', 'trope', 'description', 'example']]
df_imdb_movie_tropes.rename(columns={'tconst': 'imdb_id'}, inplace=True)
df_imdb_movie_tropes.head()

Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ..."


In [8]:
K = 10
top_k_tropes = df_imdb_movie_tropes['trope'].value_counts().reset_index().head(K)
top_k_tropes

Unnamed: 0,trope,count
0,ShoutOut,1151
1,HorrorFilms,1072
2,FilmsOfThe1980s,771
3,OhCrap,719
4,ChekhovsGun,716
5,BigBad,635
6,Foreshadowing,627
7,DeadpanSnarker,555
8,TheCameo,552
9,BittersweetEnding,513


#### IMDB dataset

Link to download the data: https://developer.imdb.com/non-commercial-datasets/, download the title.basics.tsv.gz file

In [9]:
df_imdb = pd.read_csv('../data/imdb/title.basics.tsv', sep='\t')
df_imdb.sample(5)

  df_imdb = pd.read_csv('../data/imdb/title.basics.tsv', sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
10361174,tt8023690,tvEpisode,Episode #1.2,Episode #1.2,0,2018,\N,50,"Adventure,Comedy,Drama"
8023642,tt33097670,tvEpisode,Episode dated 1 October 2024,Episode dated 1 October 2024,0,2024,\N,\N,News
10141018,tt7542546,tvEpisode,Chapter 5,Chapter 5,0,2017,\N,\N,Mystery
9875297,tt6957532,short,Floating in the Air,Floating in the Air,0,2012,\N,\N,"Drama,Short"
4304955,tt16231950,tvEpisode,The Detective and the Thief,The Detective and the Thief,1,2018,\N,42,Adult


#### TMDB dataset

Link to download the data: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies?resource=download

In [10]:
df_tmdb = pd.read_csv('../data/TMDB_movie_dataset_v11.csv')
df_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [11]:
df_tmdb.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


##### Merge IMDB and Tropes datasets

In [12]:
# tropes with imdb ids matched
df_imdb_movie_tropes.head()

Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ..."


In [13]:
# merge imdb titles basics information with tropes
df_movie_tropes = pd.merge(df_imdb_movie_tropes, df_imdb, how='inner', left_on='imdb_id', right_on='tconst')

In [14]:
print("-------" * 10)
print(f"imdb shape: {df_imdb.shape}")
print(f"movie tropes imdb shape: {df_imdb_movie_tropes.shape}")
print(f"movie tropes merged with imdb dataset shape: {df_movie_tropes.shape}")
print("-------" * 10)

df_movie_tropes.head()

----------------------------------------------------------------------
imdb shape: (11230548, 9)
movie tropes imdb shape: (390511, 7)
movie tropes merged with imdb dataset shape: (390408, 16)
----------------------------------------------------------------------


Unnamed: 0,imdb_id,title_id,clean_title,trope_id,trope,description,example,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0075617,f0,abbathemovie,t14656,MsFanservice,\nA female character who provides a significan...,The concert segments make it clear that Agnet...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
1,tt0075617,f0,abbathemovie,t11527,InsistentTerminology,"\nHey, we aren't ""describing"" Insistent Termin...",The radio station manager that tasks Ashley w...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
2,tt0075617,f0,abbathemovie,t23019,TheIngenue,\nThe Ingenuenote pronounced ON-jeh-noo is a y...,A rare male example; when Benny reads one new...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
3,tt0075617,f0,abbathemovie,t09016,GettingCrapPastTheRadar,\n\nGetting Crap Past the Radar refers to inst...,During Ashley's fantasy sequence in which he ...,tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"
4,tt0075617,f0,abbathemovie,t25994,WhoWearsShortShorts,"\nHot pants, mini shorts, or daisy dukes? What...","Frida's outfit in the ""Why Did It Have to Be ...",tt0075617,movie,ABBA: The Movie,ABBA: The Movie,0,1977,\N,95,"Comedy,Documentary,Music"


In [15]:
df_movie_tropes.to_csv('../data/movie_tropes.csv', index=False)

#### Merge CMU and TMDB datasets

In [16]:
df_cmu_movie_metadata.columns

Index(['wikipedia_movie_id', 'freebase_movie_id', 'name', 'release_date',
       'revenue', 'runtime', 'languages', 'countries', 'genres'],
      dtype='object')

In [17]:
df_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [18]:
# merge cmu movie metadata with tmdb dataset to fill in missing information such as revenue which has a lot of missing values
df_cmu_movie_metadata = df_cmu_movie_metadata[['wikipedia_movie_id', 'freebase_movie_id', 'name']]
df_cmu_tmdb = pd.merge(df_tmdb, df_cmu_movie_metadata, how='inner', left_on='title', right_on='name')

print("-------" * 10)
print(f"CMU Movie Summary Corpus shape: {df_cmu_movie_metadata.shape}")
print(f"TMDB shape: {df_tmdb.shape}")
print(f"CMU TMDB merged dataframe shape: {df_cmu_tmdb.shape}")
print("-------" * 10)

df_cmu_tmdb.head()

----------------------------------------------------------------------
CMU Movie Summary Corpus shape: (81741, 3)
TMDB shape: (1131296, 24)
CMU TMDB merged dataframe shape: (201275, 27)
----------------------------------------------------------------------


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,wikipedia_movie_id,freebase_movie_id,name
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",23270459,/m/0661ql3,Inception
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,...",6009939,/m/0fkf28,Interstellar
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f...",4276475,/m/0btpm6,The Dark Knight
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",4273140,/m/0bth54,Avatar
4,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ...",29448256,/m/0dr_b26,Avatar


In [19]:
df_cmu_tmdb.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'wikipedia_movie_id', 'freebase_movie_id', 'name'],
      dtype='object')

In [20]:
df_cmu_tmdb.to_csv('../data/cmu_tmdb.csv', index=False)