# Section 0: Importing libraries

In [1]:
import pandas as pd
import datetime as dt

# Section 1: Data pre-processing
---
(fix broken links and add numbers)
[Loading the data](#loading-the-data) \
[Dataset properties](#dataset-properties)

### Loading the data

In [76]:
# Root folder of the raw data files; every dataset key maps to its file path under that root.
DATA_PATH = "./data/"
DATASET_PATH = {
    dataset: DATA_PATH + file_name
    for dataset, file_name in (
        ("movie_metadata", "movie.metadata.tsv"),
        ("character_metadata", "character.metadata.tsv"),
        ("movie_budget", "movies_metadata.csv"),
    )
}

In [77]:
# Column lists per dataset, handed to pandas.read_csv when loading:
# as `names` for the two TSV files and as `usecols` for the budget CSV.
DATASET_COLUMNS = {
    "movie_metadata": [
        "wikipedia_id",
        "freebase_movie_id",
        "name",
        "release_date",
        "revenue",
        "runtime",
        "languages",
        "countries",
        "genres",
    ],
    "character_metadata": [
        "wikipedia_id",
        "freebase_movie_id",
        "release_date",
        "character_name",
        "birthday",
        "gender",
        "height",
        "ethnicity",
        "name",
        "age_at_release",
        "freebase_map",
        "freebase_char_id",
        "freebase_actor_id",
    ],
    "movie_budget": [
        "budget",
        "imdb_id",
        "original_title",
        "popularity",
        "release_date",
        "revenue",
        "runtime",
        "title",
        "vote_average",
        "vote_count",
    ],
}

TODO: Add dtypes for all datasets (`character_metadata` has no entry in `DATASET_TYPES` yet)

In [78]:
# Per-dataset column -> dtype mappings, meant for the `dtype` argument of pandas.read_csv.
# Every value must be a dtype specifier pandas understands ("string", "float64", "object", ...).
# There is no entry for "character_metadata" yet.
DATASET_TYPES = {
    "movie_metadata": {
        "wikipedia_id": "string",
        "freebase_movie_id": "string",
        "name": "string",
        "release_date": "string",
        "revenue": "float64",
        "runtime": "float64",
        "languages": "object",
        "countries": "object",
        "genres": "object"
    },
    "movie_budget": {
        # Was "o", which is not a valid dtype specifier (the NumPy object code is uppercase "O")
        # and would raise a TypeError once this mapping is passed to read_csv.
        # NOTE(review): presumably kept as object because the raw column is not purely numeric - confirm.
        "budget": "object",
        "imdb_id": "string",
        "original_title": "object",
        "popularity": "float64",
        "release_date": "object",
        "revenue": "float64",
        "runtime": "float64",
        "title": "object",
        "vote_average": "float64",
        "vote_count": "float64"
    }
}

In [92]:
# Read every raw file into its own pandas DataFrame.

# Tab-separated, read with header=None so the column names come from DATASET_COLUMNS;
# explicit dtypes are applied.
movie_metadata = pd.read_csv(
    DATASET_PATH["movie_metadata"],
    sep='\t',
    header=None,
    names=DATASET_COLUMNS["movie_metadata"],
    index_col=False,
    dtype=DATASET_TYPES["movie_metadata"],
)

# Same layout as above, but dtypes are left to pandas' inference.
character_metadata = pd.read_csv(
    DATASET_PATH["character_metadata"],
    sep='\t',
    header=None,
    names=DATASET_COLUMNS["character_metadata"],
    index_col=False,
)

# Comma-separated with a header row; only the columns listed in DATASET_COLUMNS are kept.
movie_budget = pd.read_csv(
    DATASET_PATH["movie_budget"],
    sep=',',
    header=0,
    usecols=DATASET_COLUMNS["movie_budget"],
    index_col=False,
)

  movie_budget = pd.read_csv(DATASET_PATH["movie_budget"], sep=',', index_col=False, header=0, usecols=DATASET_COLUMNS["movie_budget"])


### Dataset properties

In [80]:
# Report the size of the movie table, then preview one randomly drawn row.
print("Number of movie rows: ", len(movie_metadata))
print("Number of features per movie: ", len(movie_metadata.columns))
movie_metadata.sample(1)

Number of movie rows:  81741
Number of features per movie:  9


Unnamed: 0,wikipedia_id,freebase_movie_id,name,release_date,revenue,runtime,languages,countries,genres
14414,22841607,/m/0dysp_,Half-Life,2006,,106.0,{},"{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama"", ""/m/06n90"": ""Science F..."


In [81]:
# Report the size of the movie budget table, then preview one randomly drawn row.
print("Number of movie budget rows: ", len(movie_budget))
print("Number of features per movie budget: ", len(movie_budget.columns))
movie_budget.sample(1)

Number of movie budget rows:  45466
Number of features per movie budget:  10


Unnamed: 0,budget,imdb_id,original_title,popularity,release_date,revenue,runtime,title,vote_average,vote_count
3742,0,tt0178050,Sunset Strip,1.172956,2000-08-18,0.0,90.0,Sunset Strip,6.0,1.0


In [82]:
# Report the size of the character table, then preview one randomly drawn row.
print("Number of character rows: ", len(character_metadata))
print("Number of features per character: ", len(character_metadata.columns))
character_metadata.sample(1)

Number of character rows:  450669
Number of features per character:  13


Unnamed: 0,wikipedia_id,freebase_movie_id,release_date,character_name,birthday,gender,height,ethnicity,name,age_at_release,freebase_map,freebase_char_id,freebase_actor_id
268569,5566343,/m/0dswnn,1932,,1883-04-05,M,1.83,,Walter Huston,,/m/02vd68w,,/m/012dtw


#### NaN values

In [73]:
# Summarise missing values per column: absolute count and share of rows in percent.
# The NaN mask is computed once and reused for both columns of the summary.
movie_metadata.isna().pipe(
    lambda nan_mask: pd.DataFrame({
        'nan_sum': nan_mask.sum(),
        'nan_percentage': nan_mask.mean() * 100,
    })
)

Unnamed: 0,nan_sum,nan_percentage
wikipedia_id,0,0.0
freebase_movie_id,0,0.0
name,0,0.0
release_date,6902,8.443743
revenue,73340,89.722416
runtime,20450,25.018045
languages,0,0.0
countries,0,0.0
genres,0,0.0


TODO: Explain why we drop the rows whose revenue is NaN

In [74]:
# Keep only the movies that have a revenue value, as an independent copy of those rows.
movies = movie_metadata.loc[movie_metadata['revenue'].notna()].copy()
# Sanity check: kept rows plus rows with a missing revenue must add up to the full table.
assert len(movies) + movie_metadata['revenue'].isna().sum() == len(movie_metadata)

print("Number of movies before filtering: ", len(movie_metadata))
print("Number of movies after filtering: ", len(movies))

Number of movies before filtering:  81741
Number of movies after filtering:  8401
