# Exploration and initial analyses

In [1]:
# Allow python to import modules from src folders
import sys
sys.path.append("src/")

from preprocessing.load_dataset import *

## CMU dataset

We load the 4 `.tsv` and `.txt` files of the CMU dataset into 4 different dataframes

In [2]:
characters_df = load_characters_df()
movies_df = load_movies_df()
plot_df = load_plot_df()
tvtropes_df = load_tvtropes_df()

### Characters

In [3]:
characters_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,release_actor_age,freebase_map_id,freebase_character_id,freebase_actor_id
125073,18716756,/m/04gq4z4,1976,,1953-08-19,M,,/m/0222qb,Nanni Moretti,22.0,/m/04lrpkd,,/m/06dx0w
159237,28694282,/m/0czd_vx,1930-10-16,Scotty,1895-07-16,M,,,Hay Petrie,,/m/0l1s393,/m/0l1s396,/m/03mht_b
266482,13053911,/m/04czgnm,2009-02-09,Trent DeMarco,1982-11-04,M,1.83,,Travis Van Winkle,26.0,/m/052dnlj,/m/0h5p1vg,/m/0ks898
292142,4566666,/m/0c9755,1982,,1962-05-31,M,,,Tommy Redmond Hicks,19.0,/m/0bwb30n,,/m/0k35y5
155578,28308054,/m/0cp0w24,1957-07,,1907-04-23,M,,,James Hayter,,/m/0gcsvrm,,/m/03k_xc


In [39]:
total_characters = len(characters_df)
print(f"Number of characters loaded: {total_characters}")
print(f"Number of unique movies: {len(characters_df['wiki_movie_id'].unique())}")
print(f"Number of unique actors: {len(characters_df['freebase_actor_id'].unique())}", end="\n\n")

print(f"Percentage of missing gender: {characters_df['actor_gender'].isna().sum()/total_characters*100:.2f}%")
print(f"Percentage of missing height: {characters_df['actor_height'].isna().sum()/total_characters*100:.2f}%")
print(f"Percentage of missing ethnicity: {characters_df['actor_ethnicity'].isna().sum()/total_characters*100:.2f}%")
print(f"Percentage of missing age upon release: {characters_df['release_actor_age'].isna().sum()/total_characters*100:.2f}%")

Number of characters loaded: 450669
Number of unique movies: 64330
Number of unique actors: 135761

Percentage of missing gender: 10.12%
Percentage of missing height: 65.65%
Percentage of missing ethnicity: 76.47%
Percentage of missing age upon release: 35.08%


In [35]:
print(f"Number of character records for which ethnicity is known: {len(characters_df[~characters_df['actor_ethnicity'].isna()])}")
print(f"Number of unique actors for which ethnicity is known: {len(characters_df[~characters_df['actor_ethnicity'].isna()]['freebase_actor_id'].unique())}")

Number of character records for which ethnicity is known: 106058
Number of unique actors for which ethnicity is known: 8151


### Movies

In [4]:
movies_df.sample(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,box_office,movie_runtime,movie_languages,movie_countries,movie_genres
46401,13806062,/m/05p0pmh,Secret of Giving,1999,,87.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/0hfjk': 'Western', '/m/07s9rl0': 'Drama',..."
11698,4541896,/m/0c7z7q,Next Stop Wonderland,1998-08-21,3386698.0,96.0,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/06cvj': 'Romantic comedy', '/m/0219x_': '..."
1026,14613957,/m/03gqmqv,Rose of the Rancho,1914-11-15,87028.0,,"{'/m/06ppq': 'Silent film', '/m/02h40lc': 'Eng...",{'/m/09c7w0': 'United States of America'},"{'/m/0hfjk': 'Western', '/m/06ppq': 'Silent fi..."
28643,31156503,/m/0gh6tgn,Inthu Manushyano,1973,,,{'/m/0999q': 'Malayalam Language'},{'/m/03rk0': 'India'},{}
78238,32790121,/m/0h3wxss,Atrocious,,,75.0,{'/m/06nm1': 'Spanish Language'},{'/m/06mkj': 'Spain'},{'/m/03npn': 'Horror'}


In [5]:
total_movies = len(movies_df)
print(f"Number of movies loaded: {total_movies}")
print(f"Percentage of box office revenue: {movies_df['box_office'].isna().sum()/total_movies*100:.2f}%")

Number of movies loaded: 81741
Percentage of box office revenue: 89.72%


In [50]:
print(f"Number of movies for which we have matching character data: {len(movies_df.merge(characters_df, how='inner', on=['wiki_movie_id', 'freebase_movie_id'])['freebase_movie_id'].unique())}")

Number of movies for which we have matching character data: 64330


### Plot

In [6]:
plot_df.sample(5)

Unnamed: 0,wiki_movie_id,plot_summary
20777,6851920,Bobby Deol is a young man from an upper-middl...
19408,25681893,Víctor Martínez is a shy 36-year-old man work...
38357,13267586,A film maker thinks that he is HIV positive an...
20493,1931788,Hannie Caulder is a frontier wife whose husba...
1297,25899488,Haridas moves to the city after his twin siste...


In [55]:
total_plot = len(plot_df)
print(f"Number of plot summaries loaded: {total_plot}")

Number of plot summaries loaded: 42303


### Tropes

In [57]:
tvtropes_df.sample(5)

Unnamed: 0,trope_name,character_data
288,grumpy_old_man,"{'char': 'Judge Elihu Smails', 'movie': 'Caddy..."
360,morally_bankrupt_banker,"{'char': 'Patrick Bateman', 'movie': 'American..."
357,master_swordsman,"{'char': 'Blade', 'movie': 'Blade', 'id': '/m/..."
120,corrupt_corporate_executive,"{'char': 'B.Z.', 'movie': 'Santa Claus: The Mo..."
261,final_girl,"{'char': 'Laurie Strode', 'movie': 'Halloween'..."


In [58]:
total_tropes = len(tvtropes_df)
print(f"Number of tropes loaded: {total_tropes}")

Number of tropes loaded: 501


# -- Luca stopped cleanup here --

In [8]:
# extract years from the release date of movies
movies_df.loc[:, 'movie_release_year'] = movies_df['movie_release_date'].astype(str).str[:4]
movies_df = movies_df[movies_df['movie_release_year'].str.contains("nan")==False]
movies_df['movie_release_year'] = movies_df['movie_release_year'].astype(int)


In [9]:
wrong_dates = movies_df[movies_df['movie_release_year'] < 1800].index
# there is only one movie that has the release date 1010, so we can change 1010 to 2010
movies_df.loc[wrong_dates, 'movie_release_year']= 2010
movies_df.loc[wrong_dates, 'movie_release_date'] ='2010-12-02'

## Perfomance of the movie in terms of revenue

In [9]:
# how many missing values in the column box_office
percentage_missing = movies_df['box_office'].isna().sum() / len(movies_df) * 100
print("{:.2f}% values for the box office revenue are missing.".format(percentage_missing))

88.87% values for the box office revenue are missing.


## Extract ratings from the IMDB rating dataset

file: title.rating.tsv
-    tconst: unique identifier of the movie
-    averageRating: average of user ratings
-    numVotes: number of ratings submitted for the movie


In [None]:
imdb_ratings_df = load_imdb_ratings()
imdb_title_df = load_imdb_title_basics()
imdb_principals_df = load_imdb_title_principals()
imdb_people_df = load_imdb_person_basics()

In [17]:
print(movies_df.count())
print('==========')
print(imdb_title_df.count())

wiki_movie_id         74839
freebase_movie_id     74839
movie_name            74839
movie_release_date    74839
box_office             8328
movie_runtime         58631
movie_languages       74839
movie_countries       74839
movie_genres          74839
movie_release_year    74839
dtype: int64
tconst            661815
titleType         661815
primaryTitle      661815
originalTitle     661815
isAdult           661815
startYear         661815
endYear           661815
runtimeMinutes    661815
genres            661815
dtype: int64


In [3]:
# Extract the movies from the title.basics imbd dataset
imdb_title_df.sample(5)

NameError: name 'imdb_title_df' is not defined

In [19]:
imdb_people_df.sample(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
5565840,nm14995081,Tau Braun,\N,\N,,tt22696252
252954,nm0267003,Tim Faraday,\N,\N,"actor,director,producer","tt0808096,tt3569230,tt0208092,tt1139328"
1061000,nm10150070,Ivan Aguirre,\N,\N,actor,tt7933566
6520076,nm2185071,Patrick Queen,\N,\N,"miscellaneous,editorial_department,camera_depa...","tt9814218,tt3097248,tt6223802,tt5976986"
12805530,nm9792012,Skrotes,\N,\N,composer,tt8321498


In [20]:
imdb_principals_df.sample(5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
24420608,tt15564846,7,nm12374146,writer,\N,\N
38145263,tt27609766,2,nm0110828,writer,head writer,\N
18788392,tt13648152,2,nm6209025,self,\N,"[""Self - Co-Host""]"
13270950,tt11703382,5,nm1168198,director,\N,\N
5412367,tt0725520,3,nm0429809,self,\N,"[""Self""]"


In [21]:
# merge with ratings
imdb_ratings_meta = imdb_ratings_df.merge(imdb_title_df, on='tconst', how='inner')
imdb_ratings_meta.head()

Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000009,5.3,207,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
1,tt0000147,5.3,484,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
2,tt0000502,4.1,15,movie,Bohemios,Bohemios,0,1905,\N,100,\N
3,tt0000574,6.0,854,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
4,tt0000591,5.0,21,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama


## Merge the CMU dataset with the IMDB

We first import the translation dataframe, which includes both the IMDB id and the Freebase id, enabling us to combine the IMDB dataset with the CMU dataset.

In [15]:
translation_id = load_translation_df()
translation_id.columns

Index(['imdb_id', 'freebase_id'], dtype='object')

Then we merge the IMDB ratings with the CMU dataset

In [16]:
imdb_ratings_translated_df = imdb_ratings_df.merge(translation_id, how='inner', left_on='tconst', right_on='imdb_id')
merged_df = imdb_ratings_translated_df.merge(movies_df, how='inner', left_on='freebase_id', right_on='freebase_movie_id')

In [17]:
print(f"The merged dataframe contains {len(merged_df)} movies")

The merged dataframe contains 52003 movies


## Load personas

In [10]:
personas_df = load_personas()

In [11]:
personas_df

Unnamed: 0,freebase_id,movie_name,secondary_name,full_name,token_occurences,estimated_trope,trope_distrib
0,e7,The Trap,she,her,5,1,"[0.0, 0.63265, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
1,e8,The Trap,she,her,5,1,"[0.0, 0.77551, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
2,e12,The Trap,he,his,7,19,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,e19,The Trap,she,her,6,1,"[0.0, 0.67347, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
4,e4,The Trap,Eve,Eve,11,23,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
178729,e11,Una Aventura Llamada Menudo,Concepci,Danny Concepci,3,1,"[0.0, 0.22449, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
178730,e10,Una Aventura Llamada Menudo,he,the son of C sar Concepci n Jr.,4,44,"[0.0, 0.02041, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
178731,e1,Una Aventura Llamada Menudo,they,their,6,10,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
178732,e0,Plaza de Almas,he,his,6,1,"[0.0, 0.46939, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."


In [22]:
personas_df.sample(5)

Unnamed: 0,freebase_id,movie_name,secondary_name,full_name,token_occurences,estimated_trope,trope_distrib
105638,e6,The Four-Faced Liar,girlfriend,a girlfriend because no one can hold her inter...,5,26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
178427,e5,The War Between Men and Women,she,her,8,26,"[0.0, 0.34694, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
166965,/m/0bvsnb5,Stomp the Yard,Jackie,Jackie,6,26,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
137223,e23,Kohtalon kirja,friend,his friend who was on the line of fire,3,9,"[0.0, 0.02041, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."
8902,e11,That's My Mommy,Jerry,Jerry,17,24,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
