In [1]:
import pandas as pd
import gc

from helpers.readers import read_dataframe

<a id="Contents"></a> <br>
# Contents
* [1 - Loading default dataframes](#default)
<br>
* [2 - Merged Dataframes](#merged)
<br>

- **Q1: How impactful is the team surrounding the director on the success of a movie?**
- **Q2: To what extent does the director’s choice of movie genre affect the success of the movie?**
- **Q3: What is the impact of the director’s character choices on the success of the movie?**
- **Q4 (opt.): Can we spot bright underrated directors who are in the early stages of their career?**

<img src="https://i.postimg.cc/d3X75w5g/cmu-with-nlp.png" width="600"/>
<img src="https://i.postimg.cc/NjvFFNS8/imdb.png" width="400"/>
<img src="https://i.postimg.cc/yNVNXDVk/mappings.png" width="500"/>
<img src="https://i.postimg.cc/KvTnQPp9/movielens.png" width="550"/>

<a class="anchor" id="default"></a>
## Loading default dataframes
[Back to Table of Contents](#Contents)

### CMU Metadata

In [2]:
# Load the CMU movie metadata through the project reader, restricted to the
# listed columns and with the helper's preprocessing enabled, then show the
# resulting schema and the first row.
cmu_movies = read_dataframe(name='cmu/movies', preprocess=True, usecols=[
    "Wikipedia movie ID", 
    "Freebase movie ID", 
    "Movie name", 
    "Movie release date", 
    "Movie box office revenue", 
    "Movie runtime", 
    "Movie languages", 
    "Movie countries", 
    "Movie genres",
])

cmu_movies.info()
cmu_movies.head(1)

Preprocess logs:
✅ Fixed Movie Languages inside Movie Countries
✅ Removed Deseret characters
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Seperated freebase identifiers from Movie Languages, Movie Countries and Movie Genres
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81741 entries, 0 to 81740
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wikipedia movie ID        81741 non-null  int32  
 1   Freebase movie ID         81741 non-null  string 
 2   Movie name                81741 non-null  string 
 3   Movie box office revenue  8401 non-null   float64
 4   Movie runtime             61291 non-null  float32
 5   Movie release Year        74839 non-null  Int16  
 6   Movie release Month       42667 non-null  Int8   
 7   Movie release Day         39373 non-null  Int8   
 8   Movie languages           81741 non-null  string

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,Movie genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe..."


In [3]:
# Load the CMU character/actor metadata through the project reader, restricted
# to the listed columns and with the helper's preprocessing enabled, then show
# the resulting schema and the first row.
cmu_characters = read_dataframe(name='cmu/characters', preprocess=True, usecols=[
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor DOB",
    "Actor gender",
    "Actor height",
    "Actor ethnicity",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
])

cmu_characters.info()
cmu_characters.head(1)

Preprocess logs:
✅ Movie release date splitted to three columns: Movie release Year, Movie release Month, Movie release Day
✅ Actor DOB splitted to three columns: Actor DOB Year, Actor DOB Month, Actor DOB Day
✅ Dropped Freebase character/actor map ID and Freebase character ID
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450669 entries, 0 to 450668
Data columns (total 15 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Wikipedia movie ID               450669 non-null  int32  
 1   Freebase movie ID                450669 non-null  string 
 2   Character name                   192794 non-null  string 
 3   Actor gender                     405060 non-null  string 
 4   Actor height                     154824 non-null  float32
 5   Actor ethnicity                  106058 non-null  string 
 6   Actor name                       449441 non-null  string 
 7   Actor age at movie release       292556

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Character name,Actor gender,Actor height,Actor ethnicity,Actor name,Actor age at movie release,Freebase character/actor map ID,Movie release Year,Movie release Month,Movie release Day,Actor DOB Year,Actor DOB Month,Actor DOB Day
0,975900,/m/03vyhn,Akooshay,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,2001,8,24,1958,8,26


### IMDb ([non-commercial datasets](https://developer.imdb.com/non-commercial-datasets/)) — 4 of 7 tables used

In [4]:
# Load the IMDb names table (one row per person id `nconst`) and show its
# schema and first row.
imdb_people = read_dataframe(name='imdb/names')
imdb_people.info()
imdb_people.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12904751 entries, 0 to 12904750
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   nconst             string
 1   primaryName        string
 2   birthYear          Int16 
 3   deathYear          Int16 
 4   primaryProfession  string
 5   knownForTitles     string
dtypes: Int16(2), string(4)
memory usage: 467.7 MB


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0053137,tt0050419,tt0072308,tt0031983"


In [5]:
# Load the IMDb title table (keyed by `tconst`) with the helper's
# preprocessing enabled, then show its schema and first row.
imdb_info = read_dataframe(name='imdb/movies', preprocess=True)
imdb_info.info()
imdb_info.head(1)

Preprocess logs:
✅ Moved genres from runtimeMinutes to genres column
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10218119 entries, 0 to 10218118
Data columns (total 9 columns):
 #   Column          Dtype   
---  ------          -----   
 0   tconst          string  
 1   titleType       category
 2   primaryTitle    string  
 3   originalTitle   string  
 4   isAdult         Int16   
 5   startYear       Int16   
 6   endYear         Int16   
 7   runtimeMinutes  Int32   
 8   genres          string  
dtypes: Int16(3), Int32(1), category(1), string(4)
memory usage: 458.0 MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"


In [6]:
# Load the IMDb principals table (title id `tconst` x person id `nconst`) and
# show its schema and first row.
imdb_principals = read_dataframe(name='imdb/principals')
imdb_principals.info()
imdb_principals.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58535121 entries, 0 to 58535120
Data columns (total 6 columns):
 #   Column      Dtype   
---  ------      -----   
 0   tconst      string  
 1   ordering    int8    
 2   nconst      string  
 3   category    category
 4   job         string  
 5   characters  string  
dtypes: category(1), int8(1), string(4)
memory usage: 1.9 GB


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"


In [7]:
# Load the IMDb ratings table (average rating and vote count per `tconst`)
# and show its schema and first row.
imdb_ratings = read_dataframe(name='imdb/ratings')
imdb_ratings.info()
imdb_ratings.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1356511 entries, 0 to 1356510
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1356511 non-null  string 
 1   averageRating  1356511 non-null  float32
 2   numVotes       1356511 non-null  int32  
dtypes: float32(1), int32(1), string(1)
memory usage: 20.7 MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1997


### Unused IMDb (3/7):

In [8]:
# Load the IMDb alternative-titles table and show its schema and first row.
# Listed under "Unused IMDb": no later cell in this notebook reads it.
imdb_akas = read_dataframe(name='imdb/akas')
imdb_akas.info()
imdb_akas.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37422067 entries, 0 to 37422066
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          string
 1   ordering         int16 
 2   title            string
 3   region           string
 4   language         string
 5   types            string
 6   attributes       string
 7   isOriginalTitle  Int8  
dtypes: Int8(1), int16(1), string(6)
memory usage: 1.8 GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0


In [9]:
# Load the IMDb crew table (directors / writers per `tconst`) and show its
# schema and first row.
imdb_crew = read_dataframe(name='imdb/crew')
imdb_crew.info()
imdb_crew.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10218119 entries, 0 to 10218118
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   tconst     string
 1   directors  string
 2   writers    string
dtypes: string(3)
memory usage: 233.9 MB


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,


In [10]:
# Load the IMDb episode table and show its schema and first row.
imdb_episode = read_dataframe(name='imdb/episode')
imdb_episode.info()
imdb_episode.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7788792 entries, 0 to 7788791
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   tconst         string
 1   parentTconst   string
 2   seasonNumber   Int16 
 3   episodeNumber  Int32 
dtypes: Int16(1), Int32(1), string(2)
memory usage: 178.3 MB


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9


### Mappings

In [11]:
# Load the three-way id mapping (Wikipedia id, IMDb id, Freebase id) and show
# its schema and first row.
mapping_w_i_f = read_dataframe(name='mapping_wikipedia_imdb_freebase')
mapping_w_i_f.info()
mapping_w_i_f.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76954 entries, 0 to 76953
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   wikipedia  72189 non-null  Int32 
 1   imdb       76954 non-null  string
 2   freebase   73947 non-null  string
dtypes: Int32(1), string(2)
memory usage: 1.5 MB


Unnamed: 0,wikipedia,imdb,freebase
0,975900,tt0228333,/m/03vyhn


In [12]:
# Load the Wikipedia id -> IMDb id mapping and show its schema and first row.
mapping_w_i = read_dataframe(name='mapping_wikipedia_imdb')
mapping_w_i.info()
mapping_w_i.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72180 entries, 0 to 72179
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   wikipedia  72180 non-null  int32 
 1   imdb       72180 non-null  string
dtypes: int32(1), string(1)
memory usage: 846.0 KB


Unnamed: 0,wikipedia,imdb
0,975900,tt0228333


In [13]:
# Load the Freebase id -> IMDb id mapping and show its schema and first row.
mapping_f_i = read_dataframe(name='mapping_freebase_imdb')
mapping_f_i.info()
mapping_f_i.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73894 entries, 0 to 73893
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   freebase  73894 non-null  string
 1   imdb      73894 non-null  string
dtypes: string(2)
memory usage: 1.1 MB


Unnamed: 0,freebase,imdb
0,/m/0kcn7,tt0058331


### MovieLens ([The Movies Dataset on Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)) — 1 of 5 tables used

In [14]:
# Load the MovieLens movie metadata with the helper's preprocessing enabled,
# then show its schema and first row.
movieLens_movies = read_dataframe(name='movieLens/movies', preprocess=True)
movieLens_movies.info()
movieLens_movies.head(1)

Preprocess logs:
✅ Aligned bad rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   adult                  45463 non-null  category
 1   belongs_to_collection  4491 non-null   string  
 2   budget                 45463 non-null  Int32   
 3   genres                 45463 non-null  string  
 4   homepage               7779 non-null   string  
 5   id                     45463 non-null  Int32   
 6   imdb_id                45446 non-null  string  
 7   original_language      45452 non-null  string  
 8   original_title         45463 non-null  string  
 9   overview               44512 non-null  string  
 10  popularity             45463 non-null  float32 
 11  poster_path            45080 non-null  string  
 12  production_companies   45463 non-null  string  
 13  production_countries   45463 non-null  string  
 14  re

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415


### Unused MovieLens (4/5)

In [15]:
# Load the MovieLens credits table (cast / crew per movie `id`) and show its
# schema and first row.
movieLens_credits = read_dataframe('movieLens/credits')
movieLens_credits.info()
movieLens_credits.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  string
 1   crew    45476 non-null  string
 2   id      45476 non-null  int32 
dtypes: int32(1), string(2)
memory usage: 888.3 KB


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [16]:
# Load the MovieLens keywords table (keywords per movie `id`) and show its
# schema and first row.
movieLens_keywords = read_dataframe('movieLens/keywords')
movieLens_keywords.info()
movieLens_keywords.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int32 
 1   keywords  46419 non-null  string
dtypes: int32(1), string(1)
memory usage: 544.1 KB


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [17]:
# Load the MovieLens links table (movieId, imdbId, tmdbId) with the helper's
# preprocessing enabled, then show its schema and first row.
movieLens_links = read_dataframe('movieLens/links', preprocess=True)
movieLens_links.info()
movieLens_links.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  45843 non-null  int32 
 1   imdbId   45843 non-null  string
 2   tmdbId   45624 non-null  Int32 
dtypes: Int32(1), int32(1), string(1)
memory usage: 761.2 KB


Unnamed: 0,movieId,imdbId,tmdbId
0,1,tt0114709,862


In [18]:
# Load the MovieLens user ratings table and show its schema and first row.
movieLens_ratings = read_dataframe('movieLens/ratings')
movieLens_ratings.info()
movieLens_ratings.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int32  
 1   movieId    int32  
 2   rating     float32
 3   timestamp  int32  
dtypes: float32(1), int32(3)
memory usage: 397.1 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529


### CMU Summaries NLP

In [19]:
# Load the three CMU text/NLP side tables and print each schema.

# Plot summaries, keyed by Wikipedia movie id.
cmu_summaries = read_dataframe(name='cmu/summaries', usecols=[
    "Wikipedia movie ID", 
    "Plot Summary"
])
cmu_summaries.info()

# Character-name clusters, keyed by the Freebase character/actor map id.
cmu_nameclusters = read_dataframe(name='cmu/nameclusters', usecols=['Character name', 'Freebase character/actor map ID'])
cmu_nameclusters.info()

# TV Tropes character-type annotations (all columns).
cmu_tvtropes = read_dataframe(name='cmu/tvtropes')
cmu_tvtropes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42303 entries, 0 to 42302
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Wikipedia movie ID  42303 non-null  int32 
 1   Plot Summary        42303 non-null  string
dtypes: int32(1), string(1)
memory usage: 495.9 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Character name                   2666 non-null   string
 1   Freebase character/actor map ID  2666 non-null   string
dtypes: string(2)
memory usage: 41.8 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Character type              

In [20]:
# Load the 'cmu/characters_2023' table and show its schema and first row.
# NOTE(review): this rebinds `cmu_characters`, which an earlier cell assigned
# from 'cmu/characters'; after this cell the earlier frame is no longer
# reachable under that name. Confirm the shadowing is intended, or give this
# frame its own name.
cmu_characters = read_dataframe('cmu/characters_2023')
cmu_characters.info()
cmu_characters.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229420 entries, 0 to 229419
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Wikipedia_movie_id  229420 non-null  int64 
 1   Character           229420 non-null  object
 2   AV                  229420 non-null  object
 3   PV                  229420 non-null  object
 4   Att                 229420 non-null  object
dtypes: int64(1), object(4)
memory usage: 8.8+ MB


Unnamed: 0,Wikipedia_movie_id,Character,AV,PV,Att
0,11784534,Ingrid Bergman,[],[],"[Ingrid, Bergman]"


## Merged Dataframes

### Mappings preliminary analysis

In [21]:
# Per column: does the three-way mapping contain any missing id?
mapping_w_i_f.isna().any()

wikipedia     True
imdb         False
freebase      True
dtype: bool

In [22]:
# Per column: does the Wikipedia -> IMDb mapping contain any missing id?
mapping_w_i.isna().any()

wikipedia    False
imdb         False
dtype: bool

In [23]:
# Per column: does the Freebase -> IMDb mapping contain any missing id?
mapping_f_i.isna().any()

freebase    False
imdb        False
dtype: bool

In [24]:
# Row counts of the three mapping tables, in the order w_i_f, w_i, f_i.
len(mapping_w_i_f), len(mapping_w_i), len(mapping_f_i)

(76954, 72180, 73894)

In [25]:
# Distinct ids found in each mapping table, grouped by source table.
# Three-way mapping (wikipedia / imdb / freebase):
w_ids_from_w_i_f = set(mapping_w_i_f['wikipedia'])
i_ids_from_w_i_f = set(mapping_w_i_f['imdb'])  # IMDb ids are tconst values
f_ids_from_w_i_f = set(mapping_w_i_f['freebase'])

# Wikipedia -> IMDb mapping:
w_ids_from_w_i = set(mapping_w_i['wikipedia'])
i_ids_from_w_i = set(mapping_w_i['imdb'])  # tconst

# Freebase -> IMDb mapping:
f_ids_from_f_i = set(mapping_f_i['freebase'])
i_ids_from_f_i = set(mapping_f_i['imdb'])  # tconst

# Ids shared by every table that carries that id system.
intersection_w = w_ids_from_w_i_f & w_ids_from_w_i
intersection_i = i_ids_from_w_i_f & i_ids_from_w_i & i_ids_from_f_i
intersection_f = f_ids_from_w_i_f & f_ids_from_f_i

# Displayed as (wikipedia, imdb, freebase) overlap sizes.
tuple(len(shared_ids) for shared_ids in (intersection_w, intersection_i, intersection_f))

(72180, 69121, 73759)

In [26]:
# Number of three-way mapping rows in which no id is missing.
len(mapping_w_i_f.dropna())
# Author's note (not verified in this notebook):
# equivalent to pd.merge(mapping_w_i, mapping_f_i, left_on="imdb", right_on="imdb", how="inner")

69182

<a class="anchor" id="merged"></a>
## CMU IMDb movie merge

[Back to Table of Contents](#Contents)

### 1) Using "wikipedia" and "imdb" of mapping_w_i

In [27]:
# Link CMU movies to IMDb titles through the Wikipedia -> IMDb mapping.
#
# Both join keys ('Wikipedia movie ID' and 'wikipedia') are already int32 (see
# the .info() output of the load cells), so they are joined directly. The
# previous version cast both shared input frames to int64 in place and
# downcast them again afterwards; that round-trip was unnecessary and left the
# inputs in a different state if the cell was interrupted midway.
merged_df = pd.merge(cmu_movies, mapping_w_i, left_on='Wikipedia movie ID', right_on='wikipedia', how='inner')

# Attach the IMDb title metadata, then drop the mapping-only key columns
# ('Wikipedia movie ID' and 'tconst' already carry the same ids).
cmu_imdb_movies = pd.merge(merged_df, imdb_info, left_on='imdb', right_on='tconst', how='inner')
cmu_imdb_movies = cmu_imdb_movies.drop(columns=['wikipedia', 'imdb'])

# Keep the id column at the smallest integer dtype that fits its values,
# exactly as the previous version did for the merged frame.
cmu_imdb_movies["Wikipedia movie ID"] = pd.to_numeric(cmu_imdb_movies["Wikipedia movie ID"], downcast='integer')

# Release the intermediate join result.
del merged_df
gc.collect()

cmu_imdb_movies.info()
cmu_imdb_movies.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72003 entries, 0 to 72002
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Wikipedia movie ID        72003 non-null  int32   
 1   Freebase movie ID         72003 non-null  string  
 2   Movie name                72003 non-null  string  
 3   Movie box office revenue  8329 non-null   float64 
 4   Movie runtime             56527 non-null  float32 
 5   Movie release Year        67169 non-null  Int16   
 6   Movie release Month       39293 non-null  Int8    
 7   Movie release Day         36235 non-null  Int8    
 8   Movie languages           72003 non-null  string  
 9   Movie countries           72003 non-null  string  
 10  Movie genres              72003 non-null  string  
 11  tconst                    72003 non-null  string  
 12  titleType                 72003 non-null  category
 13  primaryTitle              72003 non-null  stri

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,Movie genres,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe...",tt0228333,movie,Ghosts of Mars,Ghosts of Mars,0,2001,,98,"Action,Horror,Sci-Fi"


### 2) Using "freebase" and "imdb" of mapping_f_i

In [28]:
# Variant 2: link CMU movies to IMDb titles through the Freebase -> IMDb
# mapping, then attach the IMDb title metadata. The mapping-only columns
# ('freebase', 'imdb') are removed from the result.
cmu_imdb_movies_v2 = (
    cmu_movies
    .merge(mapping_f_i, left_on='Freebase movie ID', right_on='freebase', how='inner')
    .merge(imdb_info, left_on='imdb', right_on='tconst', how='inner')
    .drop(columns=['freebase', 'imdb'])
)

# The intermediate join result is never bound to a name; still trigger a
# collection pass as before.
gc.collect()

cmu_imdb_movies_v2.info()
cmu_imdb_movies_v2.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73881 entries, 0 to 73880
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Wikipedia movie ID        73881 non-null  int32   
 1   Freebase movie ID         73881 non-null  string  
 2   Movie name                73881 non-null  string  
 3   Movie box office revenue  8329 non-null   float64 
 4   Movie runtime             56932 non-null  float32 
 5   Movie release Year        68860 non-null  Int16   
 6   Movie release Month       39996 non-null  Int8    
 7   Movie release Day         36964 non-null  Int8    
 8   Movie languages           73881 non-null  string  
 9   Movie countries           73881 non-null  string  
 10  Movie genres              73881 non-null  string  
 11  tconst                    73881 non-null  string  
 12  titleType                 73881 non-null  category
 13  primaryTitle              73881 non-null  stri

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,Movie genres,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,"Thriller,Science Fiction,Horror,Adventure,Supe...",tt0228333,movie,Ghosts of Mars,Ghosts of Mars,0,2001,,98,"Action,Horror,Sci-Fi"


### 3) Using "wikipedia" and "imdb" of mapping_w_i_f

In [29]:
# Variant 3: link CMU movies to IMDb titles through the three-way mapping,
# joining on the Wikipedia id.
#
# Fix: the previous version cast (and later downcast) mapping_w_i['wikipedia'],
# a frame this merge never reads -- a leftover from variant 1. It also widened
# cmu_movies' key in place. The left key is still widened to int64 for the
# join, but on a temporary copy, so no shared input frame is mutated.
merged_df = pd.merge(
    cmu_movies.astype({'Wikipedia movie ID': 'int64'}),
    mapping_w_i_f,
    left_on='Wikipedia movie ID',
    right_on='wikipedia',
    how='inner',
)

# Attach the IMDb title metadata and drop the mapping's 'wikipedia' / 'imdb'
# key columns.
# NOTE(review): the mapping's 'freebase' column is kept in the result (it is
# visible in the .info() output); it looks redundant with 'Freebase movie ID'
# -- confirm whether it should be dropped as well.
cmu_imdb_movies_v3 = pd.merge(merged_df, imdb_info, left_on='imdb', right_on='tconst', how='inner')
cmu_imdb_movies_v3 = cmu_imdb_movies_v3.drop(columns=['wikipedia', 'imdb'])

# Bring the widened id column back to the smallest integer dtype that fits.
cmu_imdb_movies_v3["Wikipedia movie ID"] = pd.to_numeric(cmu_imdb_movies_v3["Wikipedia movie ID"], downcast='integer')

# Release the intermediate join result.
del merged_df
gc.collect()

cmu_imdb_movies_v3.info()
cmu_imdb_movies_v3.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72012 entries, 0 to 72011
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Wikipedia movie ID        72012 non-null  int32   
 1   Freebase movie ID         72012 non-null  string  
 2   Movie name                72012 non-null  string  
 3   Movie box office revenue  8330 non-null   float64 
 4   Movie runtime             56532 non-null  float32 
 5   Movie release Year        67178 non-null  Int16   
 6   Movie release Month       39299 non-null  Int8    
 7   Movie release Day         36241 non-null  Int8    
 8   Movie languages           72012 non-null  string  
 9   Movie countries           72012 non-null  string  
 10  Movie genres              72012 non-null  string  
 11  freebase                  69177 non-null  string  
 12  tconst                    72012 non-null  string  
 13  titleType                 72012 non-null  cate

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,...,freebase,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,...,/m/03vyhn,tt0228333,movie,Ghosts of Mars,Ghosts of Mars,0,2001,,98,"Action,Horror,Sci-Fi"


### 4) Using "freebase" and "imdb" of mapping_w_i_f

In [30]:
# Variant 4: link CMU movies to IMDb titles through the three-way mapping,
# joining on the Freebase id, then attach the IMDb title metadata.
# Only 'freebase' and 'imdb' are removed afterwards; the mapping's 'wikipedia'
# column stays in the result.
cmu_imdb_movies_v4 = (
    cmu_movies
    .merge(mapping_w_i_f, left_on='Freebase movie ID', right_on='freebase', how='inner')
    .merge(imdb_info, left_on='imdb', right_on='tconst', how='inner')
    .drop(columns=['freebase', 'imdb'])
)

# The intermediate join result is never bound to a name; still trigger a
# collection pass as before.
gc.collect()

cmu_imdb_movies_v4.info()
cmu_imdb_movies_v4.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73934 entries, 0 to 73933
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Wikipedia movie ID        73934 non-null  int32   
 1   Freebase movie ID         73934 non-null  string  
 2   Movie name                73934 non-null  string  
 3   Movie box office revenue  8338 non-null   float64 
 4   Movie runtime             56980 non-null  float32 
 5   Movie release Year        68911 non-null  Int16   
 6   Movie release Month       40030 non-null  Int8    
 7   Movie release Day         36996 non-null  Int8    
 8   Movie languages           73934 non-null  string  
 9   Movie countries           73934 non-null  string  
 10  Movie genres              73934 non-null  string  
 11  wikipedia                 69177 non-null  Int32   
 12  tconst                    73934 non-null  string  
 13  titleType                 73934 non-null  cate

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,...,wikipedia,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,...,975900,tt0228333,movie,Ghosts of Mars,Ghosts of Mars,0,2001,,98,"Action,Horror,Sci-Fi"


## CMU IMDb MovieLens movie merge

### 1) Direct approach 

In [31]:
# Join the CMU+IMDb frame with the MovieLens metadata on the IMDb title id.
# cmu_imdb_movies_v4 is an arbitrary pick among the four merge variants.
# 'imdb_id' is dropped afterwards because 'tconst' holds the same id.
cmu_imdb_movieLens_movies = (
    cmu_imdb_movies_v4
    .merge(movieLens_movies, left_on="tconst", right_on="imdb_id", how="inner")
    .drop(columns=['imdb_id'])
)

cmu_imdb_movieLens_movies.info()
cmu_imdb_movieLens_movies.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27468 entries, 0 to 27467
Data columns (total 44 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Wikipedia movie ID        27468 non-null  int32   
 1   Freebase movie ID         27468 non-null  string  
 2   Movie name                27468 non-null  string  
 3   Movie box office revenue  7577 non-null   float64 
 4   Movie runtime             26192 non-null  float32 
 5   Movie release Year        26804 non-null  Int16   
 6   Movie release Month       17455 non-null  Int8    
 7   Movie release Day         16537 non-null  Int8    
 8   Movie languages           27468 non-null  string  
 9   Movie countries           27468 non-null  string  
 10  Movie genres              27468 non-null  string  
 11  wikipedia                 26882 non-null  Int32   
 12  tconst                    27468 non-null  string  
 13  titleType                 27468 non-null  cate

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie box office revenue,Movie runtime,Movie release Year,Movie release Month,Movie release Day,Movie languages,Movie countries,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,975900,/m/03vyhn,Ghosts of Mars,14010832.0,98.0,2001,8,24,English,United States of America,...,2001-08-24,14010832,98,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror is the same on any planet.,Ghosts of Mars,False,4.8,299


### 2) Use movieLens_links

In [43]:
# Same target as the direct approach, but routed through movieLens_links:
#   1. links x MovieLens metadata on the IMDb id (dropping the redundant
#      'id' and 'imdbId' columns),
#   2. that result x the CMU+IMDb frame (variant 4) on the IMDb id.
cmu_imdb_movieLens_movies_v2 = (
    movieLens_links
    .merge(movieLens_movies, left_on="imdbId", right_on="imdb_id", how="inner")
    .drop(columns=['id', 'imdbId'])
    .merge(cmu_imdb_movies_v4, left_on="imdb_id", right_on="tconst", how="inner")
)

# The links x metadata intermediate is never bound to a name; still trigger a
# collection pass as before.
gc.collect()

cmu_imdb_movieLens_movies_v2.info()
cmu_imdb_movieLens_movies_v2.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27449 entries, 0 to 27448
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   movieId                   27449 non-null  int32   
 1   tmdbId                    27449 non-null  Int32   
 2   adult                     27449 non-null  category
 3   belongs_to_collection     3315 non-null   string  
 4   budget                    27449 non-null  Int32   
 5   genres_x                  27449 non-null  string  
 6   homepage                  3713 non-null   string  
 7   imdb_id                   27449 non-null  string  
 8   original_language         27445 non-null  string  
 9   original_title            27449 non-null  string  
 10  overview                  27279 non-null  string  
 11  popularity                27449 non-null  float32 
 12  poster_path               27408 non-null  string  
 13  production_companies      27449 non-null  stri

Unnamed: 0,movieId,tmdbId,adult,belongs_to_collection,budget,genres_x,homepage,imdb_id,original_language,original_title,...,wikipedia,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_y
0,1,862,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,tt0114709,en,Toy Story,...,53085,tt0114709,movie,Toy Story,Toy Story,0,1995,,81,"Adventure,Animation,Comedy"


## IMDb People Info Principals Ratings merge

In [52]:
# Build one wide IMDb frame: principals joined with ratings (per title),
# then with people (per person), then with title metadata (per title).
# All joins are inner joins on identically named key columns.
imdb_merged = (
    imdb_principals
    .merge(imdb_ratings, on="tconst", how="inner")
    .merge(imdb_people, on="nconst", how="inner")
    .merge(imdb_info, on="tconst", how="inner")
)

# Intermediates are never bound to a name; still trigger a collection pass.
gc.collect()

# Force the per-column listing.
imdb_merged.info(verbose=True)
imdb_merged.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11295521 entries, 0 to 11295520
Data columns (total 21 columns):
 #   Column             Dtype   
---  ------             -----   
 0   tconst             string  
 1   ordering           int8    
 2   nconst             string  
 3   category           category
 4   job                string  
 5   characters         string  
 6   averageRating      float32 
 7   numVotes           int32   
 8   primaryName        string  
 9   birthYear          Int16   
 10  deathYear          Int16   
 11  primaryProfession  string  
 12  knownForTitles     string  
 13  titleType          category
 14  primaryTitle       string  
 15  originalTitle      string  
 16  isAdult            Int16   
 17  startYear          Int16   
 18  endYear            Int16   
 19  runtimeMinutes     Int32   
 20  genres             string  
dtypes: Int16(5), Int32(1), category(2), float32(1), int32(1), int8(1), string(10)
memory usage: 1.2 GB


Unnamed: 0,tconst,ordering,nconst,category,job,characters,averageRating,numVotes,primaryName,birthYear,...,primaryProfession,knownForTitles,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,1,nm1588970,self,,"[""Self""]",5.7,1997,Carmencita,1868,...,soundtrack,"tt0000001,tt0057728",short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"


In [53]:
# Column names of the merged IMDb frame.
imdb_merged.columns

Index(['tconst', 'ordering', 'nconst', 'category', 'job', 'characters',
       'averageRating', 'numVotes', 'primaryName', 'birthYear', 'deathYear',
       'primaryProfession', 'knownForTitles', 'titleType', 'primaryTitle',
       'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes',
       'genres'],
      dtype='object')

## MovieLens merge

In [None]:
# Build one wide MovieLens frame: links x ratings x keywords x credits x
# movie metadata.
#
# Both 'movieId' key columns are already int32 (see the .info() output of the
# load cells), so they are joined directly. The previous version cast the key
# of movieLens_links and of the ~26M-row movieLens_ratings to int64 in place
# and downcast them again afterwards; that round-trip was unnecessary, cost
# extra memory, and left the shared inputs altered if the cell was interrupted.
merged_df = pd.merge(movieLens_links, movieLens_ratings, on="movieId", how="inner")

# Keywords and credits are both keyed by the TMDB id ('id' on their side);
# the duplicate key column is dropped after each join.
# NOTE(review): if 'id' is not unique in movieLens_keywords or
# movieLens_credits, these joins multiply the rating rows -- confirm key
# uniqueness before running on the full data.
merged_df = pd.merge(merged_df, movieLens_keywords, left_on="tmdbId", right_on="id", how="inner")
merged_df = merged_df.drop(columns=['id'])

merged_df = pd.merge(merged_df, movieLens_credits, left_on="tmdbId", right_on="id", how="inner")
merged_df = merged_df.drop(columns=['id'])

# Movie metadata must match on both the TMDB id and the IMDb id.
movieLens_merged = pd.merge(merged_df, movieLens_movies, left_on=['tmdbId', 'imdbId'], right_on=['id', 'imdb_id'], how="inner")
movieLens_merged = movieLens_merged.drop(columns=['id', 'imdb_id'])

# Release the intermediate join result.
del merged_df
gc.collect()

# Keep the id column at the smallest integer dtype that fits its values,
# exactly as the previous version did for the merged frame.
movieLens_merged["movieId"] = pd.to_numeric(movieLens_merged["movieId"], downcast='integer')

movieLens_merged.info()
movieLens_merged.head(1)