In [1]:
import pandas as pd
import gc

from helpers.readers import read_dataframe

<a id="Contents"></a> <br>
# Contents
* [1 - Loading default dataframes](#default)
<br>
* [2 - Merged Dataframes](#merged)
<br>

- **Q1: How impactful is the team surrounding the director on the success of a movie?**
- **Q2: To what extent does the director’s choice of movie genre affect the success of the movie?**
- **Q3: What is the impact of the director’s character choices on the success of the movie?**
- **Q4 (opt.): Can we spot bright underrated directors who are in the early stages of their career?**

<img src="https://i.postimg.cc/d3X75w5g/cmu-with-nlp.png" width="600"/>
<img src="https://i.postimg.cc/NjvFFNS8/imdb.png" width="400"/>
<img src="https://i.postimg.cc/yNVNXDVk/mappings.png" width="500"/>
<img src="https://i.postimg.cc/KvTnQPp9/movielens.png" width="550"/>

<a class="anchor" id="default"></a>
## Loading default dataframes
[Back to Table of Contents](#Contents)

### CMU Metadata

In [2]:
# Load the CMU movie metadata, requesting only the columns listed in `usecols`.
# preprocess=True is passed to the reader; see the "Preprocess logs" in this
# cell's output for what it reported doing.
cmu_movies = read_dataframe(
    name="cmu/movies",
    preprocess=True,
    usecols=[
        "Wikipedia movie ID",
        "Freebase movie ID",
        "Movie name",
        "Movie release date",
        "Movie box office revenue",
        "Movie runtime",
        "Movie languages",
        "Movie countries",
        "Movie genres",
    ],
)

# Schema summary first, then one sample row as the cell's rich output.
cmu_movies.info()
cmu_movies.head(n=1)

Preprocess logs:
✅ Fixed Movie Languages inside Movie Countries
✅ Removed Deseret characters
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Seperated freebase identifiers from Movie Languages, Movie Countries and Movie Genres
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wikipedia movie ID        81741 non-null  int32  
 1   Freebase movie ID         81741 non-null  string 
 2   Movie name                81741 non-null  string 
 3   Movie box office revenue  8401 non-null   float64
 4   Movie runtime             61291 non-null  float32
 5   Movie release Year        74839 non-null  Int16  
 6   Movie release Month       42667 non-null  Int8   
 7   Movie release Day         39373 non-null  Int8   
 8   Movie languages           81741 non-null  string

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe..."


In [3]:
# Load the CMU character/actor metadata, requesting only the columns listed in
# `usecols`. preprocess=True is passed to the reader; see the "Preprocess logs"
# in this cell's output for what it reported doing.
cmu_characters = read_dataframe(
    name="cmu/characters",
    preprocess=True,
    usecols=[
        "Wikipedia movie ID",
        "Freebase movie ID",
        "Movie release date",
        "Character name",
        "Actor DOB",
        "Actor gender",
        "Actor height",
        "Actor ethnicity",
        "Actor name",
        "Actor age at movie release",
        "Freebase character/actor map ID",
        "Freebase character ID",
        "Freebase actor ID",
    ],
)

# Schema summary first, then one sample row as the cell's rich output.
cmu_characters.info()
cmu_characters.head(n=1)

Preprocess logs:
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Actor DOB splitted to three columns: Actor DOB Year, Actor DOB Month, Actor DOB Day
✅ Dropped Freebase character/actor map ID and Freebase character ID
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450669 entries, 0 to 450668
Data columns (total 15 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Wikipedia movie ID               450669 non-null  int32  
 1   Freebase movie ID                450669 non-null  string 
 2   Character name                   192794 non-null  string 
 3   Actor gender                     405060 non-null  string 
 4   Actor height                     154824 non-null  float32
 5   Actor ethnicity                  106058 non-null  string 
 6   Actor name                       449441 non-null  string 
 7   Actor age at movie release       292556

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Character name,Actor gender,Actor height,Actor ethnicity,Actor name,Actor age at movie release,Freebase character/actor map ID,Movie release Year,Movie release Month,Movie release Day,Actor DOB Year,Actor DOB Month,Actor DOB Day
0,975900,/m/03vyhn,Akooshay,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,2001,8,24,1958,8,26


### IMDb (https://developer.imdb.com/non-commercial-datasets/) (4/7)

In [4]:
# Load the 'imdb/names' dataset, then show its schema and a single sample row.
imdb_people = read_dataframe(name="imdb/names")
imdb_people.info()
imdb_people.head(n=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12904751 entries, 0 to 12904750
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             string
 1   primaryName        string
 2   birthYear          Int16 
 3   deathYear          Int16 
 4   primaryProfession  string
 5   knownForTitles     string
dtypes: Int16(2), string(4)
memory usage: 467.7 MB


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0072308,tt0031983"


In [None]:
# Load the 'imdb/movies' dataset with the reader's preprocessing enabled,
# then show its schema and a single sample row.
imdb_info = read_dataframe(name="imdb/movies", preprocess=True)
imdb_info.info()
imdb_info.head(n=1)

In [None]:
# Load the 'imdb/principals' dataset, then show its schema and a single sample row.
imdb_principals = read_dataframe(name="imdb/principals")
imdb_principals.info()
imdb_principals.head(n=1)

In [None]:
# Load the 'imdb/ratings' dataset, then show its schema and a single sample row.
imdb_ratings = read_dataframe(name="imdb/ratings")
imdb_ratings.info()
imdb_ratings.head(n=1)

### Unused IMDb (3/7):

In [None]:
# Load the 'imdb/akas' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references imdb_akas.
imdb_akas = read_dataframe(name="imdb/akas")
imdb_akas.info()
imdb_akas.head(n=1)

In [None]:
# Load the 'imdb/crew' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references imdb_crew.
imdb_crew = read_dataframe(name="imdb/crew")
imdb_crew.info()
imdb_crew.head(n=1)

In [None]:
# Load the 'imdb/episode' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references imdb_episode.
imdb_episode = read_dataframe(name="imdb/episode")
imdb_episode.info()
imdb_episode.head(n=1)

### Mappings

In [None]:
# Load the Wikipedia / IMDb / Freebase identifier mapping, then show its schema
# and a single sample row.
mapping_w_i_f = read_dataframe(name="mapping_wikipedia_imdb_freebase")
mapping_w_i_f.info()
mapping_w_i_f.head(n=1)

In [None]:
# Load the Wikipedia / IMDb identifier mapping, then show its schema and a
# single sample row.
mapping_w_i = read_dataframe(name="mapping_wikipedia_imdb")
mapping_w_i.info()
mapping_w_i.head(n=1)

In [None]:
# Load the Freebase / IMDb identifier mapping, then show its schema and a
# single sample row.
mapping_f_i = read_dataframe(name="mapping_freebase_imdb")
mapping_f_i.info()
mapping_f_i.head(n=1)

### MovieLens (https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset) (1/5)

In [None]:
# Load the 'movieLens/movies' dataset with the reader's preprocessing enabled,
# then show its schema and a single sample row.
movieLens_movies = read_dataframe(name="movieLens/movies", preprocess=True)
movieLens_movies.info()
movieLens_movies.head(n=1)

### Unused MovieLens (4/5)

In [None]:
# Load the 'movieLens/credits' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references movieLens_credits.
movieLens_credits = read_dataframe("movieLens/credits")
movieLens_credits.info()
movieLens_credits.head(n=1)

In [None]:
# Load the 'movieLens/keywords' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references movieLens_keywords.
movieLens_keywords = read_dataframe("movieLens/keywords")
movieLens_keywords.info()
movieLens_keywords.head(n=1)

In [None]:
# Load the 'movieLens/links' dataset with the reader's preprocessing enabled,
# then show its schema and a single sample row.
movieLens_links = read_dataframe("movieLens/links", preprocess=True)
movieLens_links.info()
movieLens_links.head(n=1)

In [None]:
# Load the 'movieLens/ratings' dataset for inspection only (schema + one sample row).
# NOTE(review): no later cell visible in this notebook references movieLens_ratings.
movieLens_ratings = read_dataframe("movieLens/ratings")
movieLens_ratings.info()
movieLens_ratings.head(n=1)

### CMU Summaries NLP

In [None]:
# Plot summaries, keyed by Wikipedia movie ID.
cmu_summaries = read_dataframe(
    name="cmu/summaries",
    usecols=["Wikipedia movie ID", "Plot Summary"],
)
cmu_summaries.info()

# Character names paired with their Freebase character/actor map ID.
cmu_nameclusters = read_dataframe(
    name="cmu/nameclusters",
    usecols=["Character name", "Freebase character/actor map ID"],
)
cmu_nameclusters.info()

# TV Tropes table, loaded with the reader's default columns.
cmu_tvtropes = read_dataframe(name="cmu/tvtropes")
cmu_tvtropes.info()

In [None]:
# Load the 'cmu/characters_2023' dataset, then show its schema and one sample row.
# NOTE(review): this rebinds `cmu_characters`, replacing the frame loaded from
# 'cmu/characters' earlier in the notebook — confirm the overwrite is intended.
cmu_characters = read_dataframe("cmu/characters_2023")
cmu_characters.info()
cmu_characters.head(n=1)

## Merged Dataframes

### Mappings preliminary analysis

In [None]:
# Per-column flag: does the Wikipedia/IMDb/Freebase mapping contain any missing value?
mapping_w_i_f.isna().any(axis=0)

In [None]:
# Per-column flag: does the Wikipedia/IMDb mapping contain any missing value?
mapping_w_i.isna().any(axis=0)

In [None]:
# Per-column flag: does the Freebase/IMDb mapping contain any missing value?
mapping_f_i.isna().any(axis=0)

In [None]:
# Row counts of the three mapping tables, in the order (w_i_f, w_i, f_i).
tuple(len(mapping) for mapping in (mapping_w_i_f, mapping_w_i, mapping_f_i))

In [None]:
# Collect the identifiers present in each mapping table, one set per
# (table, identifier kind). The 'imdb' columns hold IMDb tconst values.
w_ids_from_w_i_f = set(mapping_w_i_f['wikipedia'])
i_ids_from_w_i_f = set(mapping_w_i_f['imdb'])
f_ids_from_w_i_f = set(mapping_w_i_f['freebase'])

w_ids_from_w_i = set(mapping_w_i['wikipedia'])
i_ids_from_w_i = set(mapping_w_i['imdb'])

f_ids_from_f_i = set(mapping_f_i['freebase'])
i_ids_from_f_i = set(mapping_f_i['imdb'])

# For each identifier kind, keep the ids shared by every table that carries it:
# Wikipedia ids appear in two tables, IMDb ids in all three, Freebase ids in two.
intersection_w = w_ids_from_w_i_f & w_ids_from_w_i
intersection_i = i_ids_from_w_i_f & i_ids_from_w_i & i_ids_from_f_i
intersection_f = f_ids_from_w_i_f & f_ids_from_f_i

# Sizes of the three overlaps, displayed as the cell output.
len(intersection_w), len(intersection_i), len(intersection_f)

In [None]:
# Number of rows of the three-way mapping with no missing value in any column.
# Author's note: equivalent to
# pd.merge(mapping_w_i, mapping_f_i, left_on="imdb", right_on="imdb", how="inner")
mapping_w_i_f.dropna().shape[0]

<a class="anchor" id="merged"></a>
## CMU IMDb movie merge

[Back to Table of Contents](#Contents)

### 1) Using "wikipedia" and "imdb" of mapping_w_i

In [None]:
# Widen both Wikipedia-ID key columns to int64 so the two sides of the join
# share a dtype.
cmu_movies['Wikipedia movie ID'] = cmu_movies['Wikipedia movie ID'].astype('int64')
mapping_w_i['wikipedia'] = mapping_w_i['wikipedia'].astype('int64')

# Step 1: attach the IMDb identifier to each CMU movie found in mapping_w_i.
merged_df = cmu_movies.merge(
    mapping_w_i,
    how='inner',
    left_on='Wikipedia movie ID',
    right_on='wikipedia',
)

# Step 2: bring in the IMDb columns via tconst, then discard the two mapping
# columns that were only needed as join keys.
cmu_imdb_movies = merged_df.merge(
    imdb_info,
    how='inner',
    left_on='imdb',
    right_on='tconst',
).drop(columns=['wikipedia', 'imdb'])

# Shrink the key columns back to the smallest integer dtype that fits their values.
cmu_movies["Wikipedia movie ID"] = pd.to_numeric(cmu_movies["Wikipedia movie ID"], downcast='integer')
mapping_w_i["wikipedia"] = pd.to_numeric(mapping_w_i["wikipedia"], downcast='integer')
cmu_imdb_movies["Wikipedia movie ID"] = pd.to_numeric(cmu_imdb_movies["Wikipedia movie ID"], downcast='integer')

# Release the intermediate join result.
del merged_df
gc.collect()

cmu_imdb_movies.info()
cmu_imdb_movies.head(n=1)

### 2) Using "freebase" and "imdb" of mapping_f_i

In [None]:
# Step 1: attach the IMDb identifier to each CMU movie found in mapping_f_i,
# matching on the Freebase movie ID.
merged_df = cmu_movies.merge(
    mapping_f_i,
    how='inner',
    left_on='Freebase movie ID',
    right_on='freebase',
)

# Step 2: bring in the IMDb columns via tconst, then discard the two mapping
# columns that were only needed as join keys.
cmu_imdb_movies_v2 = merged_df.merge(
    imdb_info,
    how='inner',
    left_on='imdb',
    right_on='tconst',
).drop(columns=['freebase', 'imdb'])

# Release the intermediate join result.
del merged_df
gc.collect()

cmu_imdb_movies_v2.info()
cmu_imdb_movies_v2.head(n=1)

### 3) Using "wikipedia" and "imdb" of mapping_w_i_f

In [None]:
# Build the CMU + IMDb frame through the 'wikipedia' column of mapping_w_i_f.
#
# Fix: the previous version cast and downcast mapping_w_i['wikipedia'], although
# the merge below uses mapping_w_i_f. The key of the frame actually joined was
# never aligned, and an unrelated frame was mutated instead.
#
# The key dtype is aligned on temporary copies, so cmu_movies and the mapping
# frames are left untouched and the cell is safe to re-run.
cmu_movies_int64 = cmu_movies.astype({'Wikipedia movie ID': 'int64'})

# Rows without a Wikipedia ID cannot match any CMU movie in an inner join.
# Dropping them first also keeps the int64 cast from failing in case the
# column holds missing values.
mapping_w_i_f_wiki = (
    mapping_w_i_f
    .dropna(subset=['wikipedia'])
    .astype({'wikipedia': 'int64'})
)

# Step 1: attach the IMDb identifier to each CMU movie found in mapping_w_i_f.
merged_df = pd.merge(
    cmu_movies_int64,
    mapping_w_i_f_wiki,
    left_on='Wikipedia movie ID',
    right_on='wikipedia',
    how='inner',
)

# Step 2: bring in the IMDb columns via tconst, then discard the two mapping
# columns that were only needed as join keys.
cmu_imdb_movies_v3 = pd.merge(merged_df, imdb_info, left_on='imdb', right_on='tconst', how='inner')
cmu_imdb_movies_v3 = cmu_imdb_movies_v3.drop(columns=['wikipedia', 'imdb'])

# Shrink the key column back to the smallest integer dtype that fits its values.
cmu_imdb_movies_v3["Wikipedia movie ID"] = pd.to_numeric(cmu_imdb_movies_v3["Wikipedia movie ID"], downcast='integer')

# Release the intermediates, including the temporary key-aligned copies.
del merged_df, cmu_movies_int64, mapping_w_i_f_wiki
gc.collect()

cmu_imdb_movies_v3.info()
cmu_imdb_movies_v3.head(1)

### 4) Using "freebase" and "imdb" of mapping_w_i_f

In [None]:
# Step 1: attach the IMDb identifier to each CMU movie found in mapping_w_i_f,
# matching on the Freebase movie ID.
merged_df = cmu_movies.merge(
    mapping_w_i_f,
    how='inner',
    left_on='Freebase movie ID',
    right_on='freebase',
)

# Step 2: bring in the IMDb columns via tconst, then discard the two mapping
# columns that were only needed as join keys.
cmu_imdb_movies_v4 = merged_df.merge(
    imdb_info,
    how='inner',
    left_on='imdb',
    right_on='tconst',
).drop(columns=['freebase', 'imdb'])

# Release the intermediate join result.
del merged_df
gc.collect()

cmu_imdb_movies_v4.info()
cmu_imdb_movies_v4.head(n=1)

## CMU IMDb MovieLens movie merge

### 1) Direct approach 

In [None]:
# arbitrary choice of using cmu_imdb_movies_v4
cmu_imdb_movieLens_movies = pd.merge(cmu_imdb_movies_v4, movieLens_movies, left_on="tconst", right_on="imdb_id", how="inner")
cmu_imdb_movieLens_movies.drop(['imdb_id'], axis=1, inplace=True)

cmu_imdb_movieLens_movies.info()
cmu_imdb_movieLens_movies.head(1)

### 2) Use movieLens_links

In [None]:
# Join the MovieLens link table to the MovieLens movie metadata on the IMDb
# identifier (imdbId on the left, imdb_id on the right).
movieLens_movies_merge = movieLens_links.merge(
    movieLens_movies,
    how="inner",
    left_on="imdbId",
    right_on="imdb_id",
)
# TODO: finish this approach. The follow-up steps (dropping 'imdbId', then
# inspecting a cmu_imdb_movieLens_movies_v2 frame with .info()/.head(1)) were
# left commented out, and that v2 frame is never built in this cell.

## CMU Movies IMDb NLP characters  (+ MovieLens?)

In [None]:
# TODO: add the merged dataframes that we will use to answer our research questions