In [1]:
import pandas as pd

from helpers.readers import read_dataframe

<img src="https://i.postimg.cc/d3X75w5g/cmu-with-nlp.png" width="600"/>
<img src="https://i.postimg.cc/NjvFFNS8/imdb.png" width="400"/>
<img src="https://i.postimg.cc/yNVNXDVk/mappings.png" width="500"/>
<img src="https://i.postimg.cc/KvTnQPp9/movielens.png" width="500"/>

<a id="Contents"></a> <br>
# Contents
* [1 - Loading default dataframes](#default)
<br>
* [2 - Merged Dataframes](#merged)
<br>

<a class="anchor" id="default"></a>
## Loading default dataframes
[Back to Table of Contents](#Contents)

### CMU Metadata

In [None]:
# Columns to keep from the CMU movie metadata dataset.
cmu_movie_columns = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]

# Load 'cmu/movies' through the project helper with its preprocessing step
# enabled, restricted to the columns listed above.
cmu_movies = read_dataframe(
    name='cmu/movies',
    preprocess=True,
    usecols=cmu_movie_columns,
)

# Print the frame summary, then show the first row as the cell output.
cmu_movies.info()
cmu_movies.head(1)

In [None]:
# Columns to keep from the CMU character metadata dataset.
cmu_character_columns = [
    "Wikipedia movie ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor DOB",
    "Actor gender",
    "Actor height",
    "Actor ethnicity",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

# Load 'cmu/characters' through the project helper with its preprocessing step
# enabled, restricted to the columns listed above.
cmu_characters = read_dataframe(
    name='cmu/characters',
    preprocess=True,
    usecols=cmu_character_columns,
)

# Print the frame summary, then show the first row as the cell output.
cmu_characters.info()
cmu_characters.head(1)

### IMDb ([non-commercial datasets](https://developer.imdb.com/non-commercial-datasets/)) — 4 of 7 tables used

In [None]:
# Load the 'imdb/names' dataset through the project helper, print its
# summary, and show the first row as the cell output.
imdb_people = read_dataframe(name='imdb/names')
imdb_people.info()
imdb_people.head(1)

In [None]:
# Load the 'imdb/movies' dataset with the helper's preprocessing step enabled,
# print its summary, and show the first row as the cell output.
imdb_info = read_dataframe(name='imdb/movies', preprocess=True)
imdb_info.info()
imdb_info.head(1)

In [None]:
# Load the 'imdb/principals' dataset through the project helper, print its
# summary, and show the first row as the cell output.
imdb_principals = read_dataframe(name='imdb/principals')
imdb_principals.info()
imdb_principals.head(1)

In [None]:
# Load the 'imdb/ratings' dataset through the project helper, print its
# summary, and show the first row as the cell output.
imdb_ratings = read_dataframe(name='imdb/ratings')
imdb_ratings.info()
imdb_ratings.head(1)

### Unused IMDb (3/7)

In [None]:
# Listed under "Unused IMDb": loaded here only to inspect its summary and
# first row.
imdb_akas = read_dataframe(name='imdb/akas')
imdb_akas.info()
imdb_akas.head(1)

In [None]:
# Listed under "Unused IMDb": loaded here only to inspect its summary and
# first row.
imdb_crew = read_dataframe(name='imdb/crew')
imdb_crew.info()
imdb_crew.head(1)

In [None]:
# Listed under "Unused IMDb": loaded here only to inspect its summary and
# first row.
imdb_episode = read_dataframe(name='imdb/episode')
imdb_episode.info()
imdb_episode.head(1)

### Mappings

In [None]:
# Load the 'mapping_wikipedia_imdb_freebase' dataset, print its summary, and
# show the first row.
# NOTE(review): by its name this links Wikipedia, IMDb and Freebase IDs —
# confirm the key columns against the printed summary.
mapping_w_i_f = read_dataframe(name='mapping_wikipedia_imdb_freebase')
mapping_w_i_f.info()
mapping_w_i_f.head(1)

In [None]:
# Load the 'mapping_wikipedia_imdb' dataset, print its summary, and show the
# first row.
# NOTE(review): by its name this links Wikipedia and IMDb IDs — confirm the
# key columns against the printed summary.
mapping_w_i = read_dataframe(name='mapping_wikipedia_imdb')
mapping_w_i.info()
mapping_w_i.head(1)

In [None]:
# Load the 'mapping_freebase_imdb' dataset, print its summary, and show the
# first row.
# NOTE(review): by its name this links Freebase and IMDb IDs — confirm the
# key columns against the printed summary.
mapping_f_i = read_dataframe(name='mapping_freebase_imdb')
mapping_f_i.info()
mapping_f_i.head(1)

### MovieLens ([The Movies Dataset on Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)) — 1 of 5 tables used

In [6]:
# Load the 'movieLens/movies' dataset with the helper's preprocessing step
# enabled (the recorded output logs "Aligned bad rows"), print its summary,
# and show the first row as the cell output.
movieLens_movies = read_dataframe(name='movieLens/movies', preprocess=True)
movieLens_movies.info()
movieLens_movies.head(1)

Preprocess logs:
✅ Aligned bad rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   adult                  45463 non-null  category
 1   belongs_to_collection  4491 non-null   string  
 2   budget                 45463 non-null  Int32   
 3   genres                 45463 non-null  string  
 4   homepage               7779 non-null   string  
 5   id                     45463 non-null  Int32   
 6   imdb_id                45446 non-null  string  
 7   original_language      45452 non-null  string  
 8   original_title         45463 non-null  string  
 9   overview               44512 non-null  string  
 10  popularity             45463 non-null  float32 
 11  poster_path            45080 non-null  string  
 12  production_companies   45463 non-null  string  
 13  production_countries   45463 non-null  string  
 14  re

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415


### Unused MovieLens (4/5)

In [2]:
# Listed under "Unused MovieLens": loaded here only to inspect its summary
# and first row.
movieLens_credits = read_dataframe(name='movieLens/credits')
movieLens_credits.info()
movieLens_credits.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  string
 1   crew    45476 non-null  string
 2   id      45476 non-null  int32 
dtypes: int32(1), string(2)
memory usage: 888.3 KB


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [3]:
# Listed under "Unused MovieLens": loaded here only to inspect its summary
# and first row.
movieLens_keywords = read_dataframe(name='movieLens/keywords')
movieLens_keywords.info()
movieLens_keywords.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int32 
 1   keywords  46419 non-null  string
dtypes: int32(1), string(1)
memory usage: 544.1 KB


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [4]:
# Listed under "Unused MovieLens": loaded here only to inspect its summary
# and first row.
movieLens_links = read_dataframe(name='movieLens/links')
movieLens_links.info()
movieLens_links.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   movieId  45843 non-null  int32
 1   imdbId   45843 non-null  int32
 2   tmdbId   45624 non-null  Int32
dtypes: Int32(1), int32(2)
memory usage: 582.1 KB


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862


In [5]:
# Listed under "Unused MovieLens": loaded here only to inspect its summary
# and first row.
# Heavy cell: the recorded output reports about 26 million rows and roughly
# 397 MB in memory, so skip it when the ratings are not needed.
movieLens_ratings = read_dataframe(name='movieLens/ratings')
movieLens_ratings.info()
movieLens_ratings.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int32  
 1   movieId    int32  
 2   rating     float32
 3   timestamp  int32  
dtypes: float32(1), int32(3)
memory usage: 397.1 MB


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529


### CMU Summaries NLP

In [None]:
# Plot summaries: keep only the movie ID and the summary text.
summary_columns = ["Wikipedia movie ID", "Plot Summary"]
cmu_summaries = read_dataframe(name='cmu/summaries', usecols=summary_columns)
cmu_summaries.info()

# Name clusters: keep the character name and the character/actor map ID.
namecluster_columns = ['Character name', 'Freebase character/actor map ID']
cmu_nameclusters = read_dataframe(name='cmu/nameclusters', usecols=namecluster_columns)
cmu_nameclusters.info()

# TV tropes: no column restriction is passed to the helper.
cmu_tvtropes = read_dataframe(name='cmu/tvtropes')
cmu_tvtropes.info()

In [None]:
# Load the 'cmu/characters_2023' dataset, print its summary, and show the
# first row as the cell output.
# NOTE(review): this rebinds `cmu_characters`, which an earlier cell already
# assigned from 'cmu/characters'; after this cell the earlier frame is no
# longer reachable under that name. Confirm the overwrite is intended, or
# give this frame a distinct name before the merged section relies on it.
cmu_characters = read_dataframe(name='cmu/characters_2023')
cmu_characters.info()
cmu_characters.head(1)

<a class="anchor" id="merged"></a>
## Merged Dataframes
[Back to Table of Contents](#Contents)

### CMU Movies IMDb merge (+ MovieLens?)

### CMU Movies IMDb NLP characters  (+ MovieLens?)

In [None]:
# TODO: add the merged dataframes that we will use to answer our questions