In [1]:
# Imports: data wrangling, plotting and statistics
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

## DATA WRANGLING

In [2]:
# Folders holding the input files: the original corpus and the locally modified copies
data_folder = './data/'
modified_data_folder = './modified_data/'

# Column names for character.metadata.tsv. They are passed via `names=`, so pandas
# reads the first line of the file as data rather than as a header.
headers_character_metadata = [
    'Wikipedia Movie ID',
    'Freebase Movie ID',
    'Movie release date',
    'Character name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor name',
    'Actor age at movie release',
    'Freebase character map',
    'Freebase character ID',
    'Freebase actor ID',
]
character_metadata = pd.read_csv(
    data_folder + 'character.metadata.tsv',
    sep='\t',
    names=headers_character_metadata,
)

# Movie table: load the CSV and expose the enriched revenue column under the
# plain 'Movie box office revenue' name.
movie_metadata = pd.read_csv(
    modified_data_folder + 'movie_metadata_TMDB_FINAL.csv',
    sep=',',
).rename(columns={'Movie box office revenue enriched': 'Movie box office revenue'})

  movie_metadata = pd.read_csv(modified_data_folder +'movie_metadata_TMDB_FINAL.csv', sep=',')


In [3]:
# Give both tables a numeric 'Movie release year': the text before the first '-'
# of 'Movie release date', cast to float.
for table in (character_metadata, movie_metadata):
    release_year_text = table['Movie release date'].str.split('-').str.get(0)
    table['Movie release year'] = release_year_text.astype(float)

# Same extraction applied to the actors' date of birth -> 'Actor birth year'
birth_year_text = character_metadata['Actor DOB'].str.split('-').str.get(0)
character_metadata['Actor birth year'] = birth_year_text.astype(float)

In [4]:
# Release years earlier than 1800 are replaced by NaN in both tables
# (presumably they are data-entry errors -- TODO confirm the cut-off).
condition1 = character_metadata['Movie release year'].lt(1800)
condition2 = movie_metadata['Movie release year'].lt(1800)

character_metadata['Movie release year'] = character_metadata['Movie release year'].mask(condition1)
movie_metadata['Movie release year'] = movie_metadata['Movie release year'].mask(condition2)

In [5]:
# Birth years below 1700 or above 2016 are replaced by NaN
birth_year = character_metadata['Actor birth year']
condition = birth_year.lt(1700) | birth_year.gt(2016)
character_metadata['Actor birth year'] = birth_year.mask(condition)

In [6]:
# Heights above 2.4 are replaced by NaN (the unit is presumably metres -- TODO confirm)
actor_height = character_metadata['Actor height']
condition = actor_height.gt(2.4)
character_metadata.loc[condition, 'Actor height'] = np.nan

In [7]:
# Ages at release that are negative or above 110 are replaced by NaN
age_at_release = character_metadata['Actor age at movie release']
condition = age_at_release.lt(0) | age_at_release.gt(110)
character_metadata.loc[condition, 'Actor age at movie release'] = np.nan

In [8]:
# Runtimes above 5*60 (five hours, if the column is in minutes -- TODO confirm) are replaced by NaN
movie_runtime = movie_metadata['Movie runtime']
condition = movie_runtime.gt(5 * 60)
movie_metadata.loc[condition, 'Movie runtime'] = np.nan

In [9]:
# Function to extract genre names without Freebase codes
def extract_genre_names(genre_dict):
    """Join the values of a ``{code: name}`` mapping into one string.

    Parameters
    ----------
    genre_dict : dict
        Mapping whose values are the names to keep; the keys (the codes)
        are ignored. All values must be strings.

    Returns
    -------
    str
        The values in the mapping's iteration order, separated by ``", "``.
        An empty mapping yields an empty string.

    Raises
    ------
    TypeError
        If any value is not a string (raised by ``str.join``).
    """
    # Only the names are needed, so join the values directly instead of
    # unpacking (code, name) pairs into a temporary list.
    return ", ".join(genre_dict.values())

# Clean the "Movie genres", "Movie languages" and "Movie countries" columns.
# Each cell is the string representation of a {code: name} dictionary (the
# languages and countries columns are assumed to share the genres structure);
# it is parsed and replaced by the names joined with ", ".
#
# ast.literal_eval is used instead of eval: it only accepts Python literals, so
# text read from the CSV can never be executed as code.
#
# NOTE: this cell is not idempotent. After one run the columns hold plain
# "a, b, c" strings that no longer parse as dictionaries, so re-running it
# without reloading movie_metadata raises an error.
for column in ("Movie genres", "Movie languages", "Movie countries"):
    parsed = movie_metadata[column].apply(ast.literal_eval)  # string -> dict
    movie_metadata[column] = parsed.apply(extract_genre_names)  # dict -> "name, name, ..."