In [13]:
# auto-reload modules: with mode 2, IPython re-imports all loaded modules
# before executing each cell, so edits to imported .py files take effect
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [14]:
import os

# Change the working directory: move one level up unless the kernel is already
# sitting in the project root directory.
# The last path component is taken with os.path.basename instead of splitting
# on "/": on Windows the separator is "\", so a "/" split returns the whole
# path, the comparison never matches, and the cell would step one directory up
# even when it is already at the project root.
if os.path.basename(os.getcwd()) != "ada-2024-project-c1n3mada":
    os.chdir("..")

# Data Loading, Preprocessing, and Merging

## 1. Introduction

- What is the purpose of this notebook? 
- Which datasets are being used?
- What is the end goal of the data preparation process?

## 2. Imports and Setup

In [16]:
import pandas as pd

# custom modules
from src.utils.data_utils import *

## 3. Loading the Data

In [4]:
# Data directories, resolved relative to the current working directory.
# The setup cell at the top of the notebook moves the kernel to the project
# root, so "../data/" only resolves while the kernel is still in the notebook's
# own sub-directory. Prefer "data/" when it exists in the working directory and
# fall back to "../data/" otherwise, so both launch locations keep working.
# NOTE(review): assumes the data folder lives at <project root>/data — confirm
# against the repository layout.
DATA_ROOT = "data/" if os.path.isdir("data") else "../data/"

PATH_CMU = f"{DATA_ROOT}MovieSummaries/"
PATH_TMDB = f"{DATA_ROOT}TMDB/"
PATH_IMDB = f"{DATA_ROOT}IMDB/"

### 3.1 CMU Movie Summary Corpus Dataset

#### 3.1.1 Movie Metadata

In [8]:
# column labels for the header-less movie metadata file, in file order
movie_metadata_columns = [
    "wiki_movie_id",
    "freebase_movie_id",
    "movie_name",
    "movie_release_date",
    "movie_box_office_revenue",
    "movie_runtime",
    "movie_languages",
    "movie_countries",
    "movie_genres",
]

# read the tab-separated file, then label its columns
# (set_axis raises if the number of labels does not match the number of columns)
df_movies_cmu = pd.read_csv(
    f"{PATH_CMU}movie.metadata.tsv", sep="\t", header=None
).set_axis(movie_metadata_columns, axis="columns")

# preview the first 5 rows
df_movies_cmu.head()

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


#### 3.1.2 Character Metadata

In [10]:
# column labels for the header-less character metadata file, in file order
character_metadata_columns = [
    "wiki_movie_id",
    "freebase_movie_id",
    "movie_release_date",
    "character_name",
    "actor_dob",
    "actor_gender",
    "actor_height",
    "actor_ethnicity",
    "actor_name",
    "actor_age_at_release",
    "freebase_character_actor_map_id",
    "freebase_character_id",
    "freebase_actor_id",
]

# read the tab-separated file, then label its columns
# (set_axis raises if the number of labels does not match the number of columns)
df_characters_cmu = pd.read_csv(
    f"{PATH_CMU}character.metadata.tsv", sep="\t", header=None
).set_axis(character_metadata_columns, axis="columns")

# preview the first 5 rows
df_characters_cmu.head(5)

Unnamed: 0,wiki_movie_id,freebase_movie_id,movie_release_date,character_name,actor_dob,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age_at_release,freebase_character_actor_map_id,freebase_character_id,freebase_actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


#### 3.1.3 Plot Summaries

In [12]:
# plot summaries: tab-separated text file without a header row
plot_summaries_path = f"{PATH_CMU}plot_summaries.txt"
plot_summaries_columns = ["wiki_movie_id", "plot_summary"]

df_plots_cmu = pd.read_csv(
    plot_summaries_path,
    sep="\t",
    header=None,
    names=plot_summaries_columns,
)

# preview the first 5 rows
df_plots_cmu.head()

Unnamed: 0,wiki_movie_id,plot_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


### 3.2 TMDb Dataset

In [None]:
# load TMDb dataset
# TODO: loading is currently disabled (the read below is commented out)
# df_tmdb = pd.read_csv(f"{PATH_TMDB}movies_metadata.csv", low_memory=False)

### 3.3 IMDb Dataset

In [None]:
# load IMDb datasets (gzip-compressed, tab-separated files)
# TODO: loading is currently disabled (the reads below are commented out)
# df_title_basics = pd.read_csv(f"{PATH_IMDB}title.basics.tsv.gz", sep="\t", low_memory=False)
# df_title_ratings = pd.read_csv(f"{PATH_IMDB}title.ratings.tsv.gz", sep="\t", low_memory=False)
# df_title_crew = pd.read_csv(f"{PATH_IMDB}title.crew.tsv.gz", sep="\t", low_memory=False)
# df_name_basics = pd.read_csv(f"{PATH_IMDB}name.basics.tsv.gz", sep="\t", low_memory=False)

## 4. Initial Exploration

- `describe()` for each dataset
- Missing values
- Basic distributions
- Keep it relatively simple and short 

## 5. Data Cleaning 



### 5.1 Data Type Conversion

In [None]:
# CMU Movie Summary dataset: convert raw columns to usable types
# NOTE(review): columns are overwritten in place, so re-running this cell feeds
# already-converted values back into parse_date / parse_dict — confirm those
# helpers tolerate that.

# numeric columns: entries that cannot be parsed as numbers become NaN
for numeric_col in ("movie_box_office_revenue", "movie_runtime"):
    df_movies_cmu[numeric_col] = pd.to_numeric(df_movies_cmu[numeric_col], errors="coerce")

# date columns: each entry goes through the project's parse_date helper
df_movies_cmu["movie_release_date"] = df_movies_cmu["movie_release_date"].apply(parse_date)
for date_col in ("movie_release_date", "actor_dob"):
    df_characters_cmu[date_col] = df_characters_cmu[date_col].apply(parse_date)

# dictionary columns: each entry goes through the project's parse_dict helper
for dict_col in ("movie_languages", "movie_countries", "movie_genres"):
    df_movies_cmu[dict_col] = df_movies_cmu[dict_col].apply(parse_dict)

### 5.2 Handling Missing Values

### 5.3 Removing Duplicates or Irrelevant Data

- Check for and remove data that doesn't make sense (e.g. negative age)

## 6. Data Preprocessing

### 6.1 Feature Engineering 

- Add new features if necessary (e.g. parsing dates)

## 7. Dataset Merging

## 8. Inflation Correction

## 9. Final Checks and Export