In [1]:
from src.data.dataloader import DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

dataloader = DataLoader()
characters = dataloader.load_characters()
movies_with_characters = dataloader.load_movies_with_characters()

# Function to create dummy variables from a list-like column
def create_dummies_from_list_column(df, column_name):
    # Split the string entries into lists
    split_series = df[column_name].str.split(', ')
    # Create a new DataFrame with dummy variables
    dummy_df = pd.get_dummies(split_series.apply(pd.Series).stack(), prefix=column_name).groupby(level=0).sum()
    # Merge the dummy variables into the original DataFrame
    df = pd.concat([df, dummy_df], axis=1)
    # Drop the original column
    df = df.drop(column_name, axis=1)
    return df

def replace_with_mean_median_std(df, column_name):
    # Split the ages into lists, cleaning up any whitespace or invalid entries
    df[column_name] = df[column_name].str.split(', ').apply(
        lambda x: [float(age.strip()) for age in x if age.strip().replace('.', '', 1).isdigit()]
    )
    # Calculate mean, median, and standard deviation, handling empty lists
    df[f'{column_name}_mean'] = df[column_name].apply(
        lambda x: sum(x) / len(x) if len(x) > 0 else None
    )
    df[f'{column_name}_median'] = df[column_name].apply(
        lambda x: sorted(x)[len(x) // 2] if len(x) % 2 == 1 else 
        (sorted(x)[len(x) // 2 - 1] + sorted(x)[len(x) // 2]) / 2 if len(x) > 0 else None
    )
    df[f'{column_name}_std'] = df[column_name].apply(
        lambda x: (sum((xi - sum(x) / len(x)) * 2 for xi in x) / len(x)) * 0.5 if len(x) > 0 else None
    )
    # Drop the original column
    df = df.drop(column_name, axis=1)
    return df

  tmdb_df = pd.read_csv(self.paths["tmdb_movies"])
  tmdb_df = pd.read_csv(self.paths["tmdb_movies"])
  tmdb_df = pd.read_csv(self.paths["tmdb_movies"])


In [2]:
df = movies_with_characters.drop(["wikipedia_movie_id", "wikidata_movie_id", "Movie name", "character_name", "plot"], axis=1)
df = df.dropna()
df = df[df["Movie box office revenue"] != 0]

df = create_dummies_from_list_column(df, 'actor_gender')
df = df.drop("actor_gender_", axis=1)
df["F ratio"] = df["actor_gender_F"] / (df["actor_gender_M"]+df["actor_gender_F"])


df = replace_with_mean_median_std(df, 'actor_age_at_release')
df = replace_with_mean_median_std(df, 'actor_height_meters')


df_with_countries = create_dummies_from_list_column(df, 'Movie countries')
df = df[df_with_countries["Movie countries_United States of America"] == 1]

df_with_countries = create_dummies_from_list_column(df, 'Movie languages')
df = df[df_with_countries["Movie languages_English Language"] == 1]

df = create_dummies_from_list_column(df, 'ethnicity')
df = create_dummies_from_list_column(df, 'Movie genres')

df = df.drop(["Movie languages", "ethnicity_", "Movie genres_", "Movie countries"], axis=1)
df.dropna(inplace=True)
df.head()

Unnamed: 0,Movie release date,Movie box office revenue,actor_gender_F,actor_gender_M,F ratio,actor_age_at_release_mean,actor_age_at_release_median,actor_age_at_release_std,actor_height_meters_mean,actor_height_meters_median,...,Movie genres_Vampire movies,Movie genres_War film,Movie genres_Werewolf fiction,Movie genres_Western,Movie genres_Whodunit,Movie genres_Women in prison films,Movie genres_Workplace Comedy,Movie genres_World cinema,Movie genres_Wuxia,Movie genres_Zombie Film
1,1992,21502796.0,3,10,0.230769,33.5,34.0,0.0,1.815556,1.85,...,0,0,0,0,0,0,0,0,0,1
2,1915,50000000.0,24,32,0.428571,31.142857,29.0,1.586033e-15,1.665,1.63,...,0,1,0,0,0,0,0,0,0,0
3,1982,33139618.0,3,11,0.214286,38.923077,38.0,3.279428e-15,1.796444,1.78,...,0,0,0,0,0,0,0,0,0,0
4,1974,119500000.0,2,18,0.1,46.5,43.0,0.0,1.791818,1.78,...,0,0,0,1,0,0,0,0,0,0
5,1986,8551228.0,5,10,0.333333,44.909091,50.0,2.583792e-15,1.728571,1.75,...,0,0,0,0,0,0,0,0,0,0


In [6]:
pd.set_option('display.max_columns', None)
print("\nAll columns in dataframe:")
for col in df.columns:
    print(col)





All columns in dataframe:
Movie release date
Movie box office revenue
actor_gender_F
actor_gender_M
F ratio
actor_age_at_release_mean
actor_age_at_release_median
actor_age_at_release_std
actor_height_meters_mean
actor_height_meters_median
actor_height_meters_std
ethnicity_Acadians
ethnicity_African Americans
ethnicity_African people
ethnicity_Afro-Asians
ethnicity_Afro-Cuban
ethnicity_Akan people
ethnicity_Albanian American
ethnicity_Albanians
ethnicity_American Jews
ethnicity_Americans
ethnicity_Anglo-Celtic Australians
ethnicity_Anglo-Irish people
ethnicity_Apache
ethnicity_Arab Americans
ethnicity_Arabs in Bulgaria
ethnicity_Argentines
ethnicity_Armenian American
ethnicity_Armenians
ethnicity_Ashkenazi Jews
ethnicity_Asian Americans
ethnicity_Assyrian people
ethnicity_Australian Americans
ethnicity_Australians
ethnicity_Austrian Americans
ethnicity_Austrians
ethnicity_Aymara
ethnicity_Bahamian Americans
ethnicity_Baltic Russians
ethnicity_Belarusians
ethnicity_Belgians
ethnicity_Be