# Imports and loading

In [None]:
import pandas as pd
import numpy as np
# NOTE(review): this binds the top-level `matplotlib` package to the name `plt`.
# The conventional plotting alias is `import matplotlib.pyplot as plt`; confirm which
# one is intended before calling functions such as `plt.plot` through this name.
import matplotlib as plt
import os
import json
# Star-import: the `load_*` helpers called in the loading cell presumably come from here.
from loads import *

# Presumably the dataset folder; not referenced by the cells visible in this section.
PATH_FOLDER = "MovieSummaries/"

In [None]:
# --- Corpus tables -------------------------------------------------------------
# Each loader is called without arguments (presumably provided by `from loads import *`).
character_df = load_character_metadata()     # character metadata
movie_df = load_movie_metadata()             # movie metadata
plot_summaries_df = load_plot_summaries()    # plot summaries
tvtropes_df = load_tvtropes()                # TVTropes
name_clusters_df = load_name_clusters()      # name clusters

# --- External inflation table --------------------------------------------------
# header=11 takes the column names from row index 11 of the sheet; the rows
# above it are not read as data.
inflation = pd.read_excel(io="external_dataset/inflation.xlsx", header=11)

# Preprocessing

## CLEANING

#### MOVIE METADATA

Extract variables from (ID : variable) tuples

In [None]:
# Each raw column holds a JSON object string mapping Freebase ID -> name.
# Keep only the names, as a Python list, under a shorter column name.
freebase_tuple_columns = {
    'Movie Languages': 'Movie languages (Freebase ID:name tuples)',
    'Movie Countries': 'Movie countries (Freebase ID:name tuples)',
    'Movie Genres': 'Movie genres (Freebase ID:name tuples)',
}
for clean_column, raw_column in freebase_tuple_columns.items():
    movie_df[clean_column] = movie_df[raw_column].apply(
        lambda raw_json: list(json.loads(raw_json).values())
    )

Drop useless columns (IDs)

In [None]:
# Remove the three raw Freebase tuple columns and the Freebase movie ID.
movie_columns_to_drop = [
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
    'Freebase movie ID',
]
movie_df.drop(columns=movie_columns_to_drop, inplace=True)

Drop movies with a missing (NaN) box office revenue, since revenue is our main variable of interest

In [None]:
# Discard, in place, every row of movie_df whose revenue is NaN.
movie_df.dropna(
    axis='index',
    subset=['Movie box office revenue'],
    inplace=True,
)

Extract Year and Month from the movie release date (the column mixes `YYYY` and `YYYY-MM-DD` values, which cannot be parsed with a single date format)

In [None]:
# Rows without a release date cannot provide a Year, so drop them first.
movie_df.dropna(subset=["Movie release date"], inplace=True)

# Work on the string form of the date so that year-only and full dates are
# handled the same way.
release_date_text = movie_df["Movie release date"].astype(str)

# Year: the first four characters of the date string.
movie_df['Year'] = release_date_text.str[:4].astype(int)

# Month: read the two digits after "YYYY-" directly from the string instead of
# going through pd.to_datetime. On a column mixing "YYYY" and "YYYY-MM-DD",
# to_datetime is pandas-version dependent: older versions parse "YYYY" as
# January 1st (a spurious month 1), while pandas >= 2 infers one format from the
# first element and coerces every non-matching row to NaT.
# Year-only dates get NaN here; only months 01-12 are accepted.
movie_df['Month'] = pd.to_numeric(
    release_date_text.str.extract(r'^\d{4}-(0[1-9]|1[0-2])', expand=False)
)

### Inflation

In [None]:
# Ratio between the 'Annual' value of the table's last row and each year's
# 'Annual' value (assumes the last row is 2022, as the column name says - TODO confirm).
latest_annual_value = inflation["Annual"].iloc[-1]
inflation["Cumulative Inflation to 2022"] = latest_annual_value / inflation["Annual"]

# Default (inner) merge on 'Year': movies whose year has no row in the inflation
# table are not kept in movie_with_inflation.
movie_with_inflation = movie_df.merge(inflation.drop(columns=['Annual']), on='Year')

# Revenue scaled by the cumulative inflation ratio of the release year.
movie_with_inflation['Revenue with inflation'] = (
    movie_with_inflation['Movie box office revenue']
    * movie_with_inflation['Cumulative Inflation to 2022']
)

#### CHARACTER METADATA

Drop useless columns (IDs and actor height)

In [None]:
# Remove the Freebase identifier columns and the actor height from character_df.
character_columns_to_drop = [
    'Freebase movie ID',
    'Actor ethnicity (Freebase ID)',
    'Freebase character ID',
    'Freebase actor ID',
    'Actor height (in meters)',
]
character_df.drop(columns=character_columns_to_drop, inplace=True)

## Splitting

Split the movie DataFrame into several small DataFrames, one per feature. Each small DataFrame holds one feature and 'Movie box office revenue'

In [None]:
# One two-column frame per feature: the feature plus the box office revenue.
# .copy() makes each frame independent of movie_df, so the in-place edits
# applied to them afterwards do not touch movie_df.
movie_runtime_df = movie_df.loc[:, ['Movie runtime', 'Movie box office revenue']].copy()
movie_languages_df = movie_df.loc[:, ['Movie Languages', 'Movie box office revenue']].copy()
movie_countries_df = movie_df.loc[:, ['Movie Countries', 'Movie box office revenue']].copy()
movie_genres_df = movie_df.loc[:, ['Movie Genres', 'Movie box office revenue']].copy()
movie_release_date_df = movie_df.loc[:, ['Movie release date', 'Movie box office revenue']].copy()

In each small DataFrame, drop only the movies whose value for that DataFrame's feature is missing (NaN)

In [None]:
# Scalar features: a missing value is a real NaN, so dropna is enough.
movie_runtime_df.dropna(subset=['Movie runtime'], inplace=True)
movie_release_date_df.dropna(subset=["Movie release date"], inplace=True)

# List-valued features: these columns were built with list(json.loads(x).values()),
# so a movie without any language/country/genre holds an empty list, never NaN,
# and dropna() alone would keep every row. .str.len() gives the list length
# (NaN for a non-list cell), so `> 0` removes both empty lists and any NaN.
# .copy() keeps each result an independent frame for later in-place edits.
movie_languages_df = movie_languages_df[
    movie_languages_df['Movie Languages'].str.len() > 0
].copy()
movie_countries_df = movie_countries_df[
    movie_countries_df['Movie Countries'].str.len() > 0
].copy()
movie_genres_df = movie_genres_df[
    movie_genres_df['Movie Genres'].str.len() > 0
].copy()

Extract Year and Month from the movie release date (the column mixes `YYYY` and `YYYY-MM-DD` values, which cannot be parsed with a single date format)

In [None]:
# Work on the string form of the date so that year-only and full dates are
# handled the same way.
split_release_date_text = movie_release_date_df["Movie release date"].astype(str)

# Year: the first four characters of the date string.
movie_release_date_df['Year'] = split_release_date_text.str[:4].astype(int)

# Month: read the two digits after "YYYY-" directly from the string instead of
# going through pd.to_datetime. On a column mixing "YYYY" and "YYYY-MM-DD",
# to_datetime is pandas-version dependent: older versions parse "YYYY" as
# January 1st (a spurious month 1), while pandas >= 2 infers one format from the
# first element and coerces every non-matching row to NaT.
# Year-only dates get NaN here; only months 01-12 are accepted.
movie_release_date_df['Month'] = pd.to_numeric(
    split_release_date_text.str.extract(r'^\d{4}-(0[1-9]|1[0-2])', expand=False)
)

Create new Year and Month feature DataFrames and drop their NaNs

In [None]:
# Year vs. revenue. 'Year' was produced by an integer cast, so no NaN is
# expected; the dropna is kept as a safeguard.
years_df = movie_release_date_df.loc[:, ['Year', 'Movie box office revenue']].copy()
years_df.dropna(subset=['Year'], inplace=True)

# Month vs. revenue. Rows for which no month could be derived are removed.
months_df = movie_release_date_df.loc[:, ['Month', 'Movie box office revenue']].copy()
months_df.dropna(subset=['Month'], inplace=True)

Merge the character and movie DataFrames on the Wikipedia movie ID

In [None]:
# The release date is dropped from the character side before joining, so the
# merged frame keeps a single 'Movie release date' column (the one from movie_df).
# Default (inner) join: only characters whose movie is still in movie_df remain.
character_movie_merged_df = (
    character_df
    .drop(columns=['Movie release date'])
    .merge(movie_df, on=['Wikipedia movie ID'])
)


In [None]:
# Preview the first five rows of the merged frame.
character_movie_merged_df.head(n=5)

Split on actor gender: the rows are first selected together with the actor name so that true duplicates can be dropped

In [None]:
# Keep the actor name at first so that exact duplicate rows (same gender, same
# actor name, same revenue) can be identified.
actor_gender_df = character_movie_merged_df[['Actor gender','Actor name','Movie box office revenue']].copy()

# drop_duplicates() returns a new frame by default; without inplace=True (or a
# reassignment) the call has no effect and the duplicates stay in gender_df.
# NOTE(review): rows with a missing actor name and identical gender/revenue are
# also treated as duplicates of each other - confirm this is acceptable.
actor_gender_df.drop_duplicates(inplace=True)

# Gender vs. revenue only, without the rows whose gender is unknown.
gender_df = actor_gender_df[['Actor gender','Movie box office revenue']].copy()
gender_df.dropna(subset=['Actor gender'], inplace=True)

In [None]:
# Actor name, age at release and revenue for every character row.
actor_df = character_movie_merged_df[['Actor name','Actor age at movie release','Movie box office revenue']].copy()

# dropna() returns a new frame by default; the original call discarded it, so
# actor_df kept every row with a missing value. Drop them in place, as the
# other cleaning cells do.
actor_df.dropna(inplace=True)

# Show a short preview of the cleaned frame.
actor_df.head()