# Imports and loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os
import json
from loads import *

# Relative path of the dataset folder. Not referenced by any cell visible here;
# NOTE(review): presumably consumed by the loaders in `loads` — confirm.
PATH_FOLDER = "MovieSummaries/"

In [2]:
# Each source table is read into its own DataFrame. The load_* helpers are not
# defined in this notebook (presumably they come from `from loads import *`).

# Character metadata (one row per character/actor appearance in a movie)
character_metadata_df = load_character_metadata()
# Movie metadata (one row per movie)
movie_metadata_df = load_movie_metadata()
# Plot summaries
plot_summaries_df = load_plot_summaries()
# TVTropes
tvtropes_df = load_tvtropes()
# Name clusters
name_clusters_df = load_name_clusters()

# Preprocessing

## CLEANING  

MOVIE METADATA

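The language, country and genre columns each store a JSON-encoded `{Freebase ID: name}` dictionary. We keep only the names, as a list per movie.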

In [3]:
def _names_from_freebase_json(raw_json):
    """Decode a JSON object string and return its values as a list."""
    return list(json.loads(raw_json).values())


# Each raw column holds a JSON object keyed by Freebase ID; only the values
# (the readable names) are kept, in a new column with a shorter label.
for clean_column, raw_column in (
    ('Movie Languages', 'Movie languages (Freebase ID:name tuples)'),
    ('Movie Countries', 'Movie countries (Freebase ID:name tuples)'),
    ('Movie Genres', 'Movie genres (Freebase ID:name tuples)'),
):
    movie_metadata_df[clean_column] = movie_metadata_df[raw_column].apply(_names_from_freebase_json)

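We drop the columns that are no longer needed: the raw Freebase `ID:name` columns (replaced by the name lists created above) and the Freebase movie ID.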

In [4]:
# Raw JSON columns (already expanded into name lists) plus the Freebase movie ID.
columns_to_drop = [
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
    'Freebase movie ID',
]
movie_metadata_df = movie_metadata_df.drop(columns=columns_to_drop)


We drop the movies whose box office revenue is missing.

In [5]:
# Keep only the movies that report a box office revenue.
movie_metadata_df = movie_metadata_df.dropna(subset=['Movie box office revenue'])
movie_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie Languages,Movie Countries,Movie Genres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
7,10408933,Alexander's Ragtime Band,1938-08-16,3600000.0,106.0,[English Language],[United States of America],"[Musical, Comedy, Black-and-white]"
13,171005,Henry V,1989-11-08,10161099.0,137.0,[English Language],[United Kingdom],"[Costume drama, War film, Epic, Period piece, ..."
17,77856,Mary Poppins,1964-08-27,102272727.0,139.0,[English Language],[United States of America],"[Children's/Family, Musical, Fantasy, Comedy, ..."
21,612710,New Rose Hotel,1999-10-01,21521.0,92.0,[English Language],[United States of America],"[Thriller, Science Fiction, Future noir, Indie..."


CHARACTER METADATA

We drop the columns we do not use: the Freebase movie, character and actor IDs, the actor ethnicity and the actor height.

In [6]:
# Freebase identifiers, ethnicity and height are removed from the character table.
columns_to_drop = [
    'Freebase movie ID',
    'Actor ethnicity (Freebase ID)',
    'Freebase character ID',
    'Freebase actor ID',
    'Actor height (in meters)',
]
character_metadata_df = character_metadata_df.drop(columns=columns_to_drop)
character_metadata_df.head()

Unnamed: 0,Wikipedia movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor name,Actor age at movie release,Freebase character/actor map ID
0,975900,2001-08-24,Akooshay,1958-08-26,F,Wanda De Jesus,42.0,/m/0bgchxw
1,975900,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,Natasha Henstridge,27.0,/m/0jys3m
2,975900,2001-08-24,Desolation Williams,1969-06-15,M,Ice Cube,32.0,/m/0jys3g
3,975900,2001-08-24,Sgt Jericho Butler,1967-09-12,M,Jason Statham,33.0,/m/02vchl6
4,975900,2001-08-24,Bashira Kincaid,1977-09-25,F,Clea DuVall,23.0,/m/02vbb3r


## Splitting

We split the movie DataFrame into several smaller DataFrames, each pairing one feature with the box office revenue.

In [7]:
def _feature_with_revenue(feature_column):
    """Return an independent two-column copy: the given feature plus the box office revenue."""
    return movie_metadata_df.loc[:, [feature_column, 'Movie box office revenue']].copy()


# One small frame per feature, each carrying the revenue alongside it.
movie_runtime_df = _feature_with_revenue('Movie runtime')
movie_languages_df = _feature_with_revenue('Movie Languages')
movie_countries_df = _feature_with_revenue('Movie Countries')
movie_genres_df = _feature_with_revenue('Movie Genres')

In [8]:
# Discard the rows whose runtime is missing.
movie_runtime_df = movie_runtime_df.dropna(subset=['Movie runtime'])

We extract the release year and month from the release date.

In [9]:
# Pair each movie's release date with its revenue; movies without any release
# date are discarded.
movie_release_date_df = (
    movie_metadata_df[['Movie release date', 'Movie box office revenue']]
    .dropna(subset=['Movie release date'])
    .copy()
)

# Year and month are read from the date *text* rather than from the parsed
# datetime. Some dates do not parse as full dates (they become NaT below while
# still having a valid year), and how partial dates are parsed depends on the
# pandas version. Reading the text keeps the month whenever it is written in
# the date and leaves it NaN when it is not.
# NOTE(review): assumes ISO-like strings "YYYY[-MM[-DD]]" — confirm against the loader.
raw_release_dates = movie_release_date_df['Movie release date'].astype(str)
movie_release_date_df['Year'] = raw_release_dates.str[:4].astype(int)
movie_release_date_df['Movie release date'] = pd.to_datetime(
    movie_release_date_df['Movie release date'], errors='coerce'
)
# Characters 5-6 hold the month; an empty slice (year-only date) is coerced to NaN.
movie_release_date_df['Month'] = pd.to_numeric(raw_release_dates.str[5:7], errors='coerce')

# Fixed random_state so that re-running the notebook shows the same sample.
movie_release_date_df.sample(10, random_state=0)

Unnamed: 0,Movie release date,Movie box office revenue,Year,Month
67520,NaT,228888.0,1992,
23534,2004-06-03,107212751.0,2004,6.0
53451,1987-02-27,18553948.0,1987,2.0
32465,2012-04-06,16863583.0,2012,4.0
41046,NaT,1450000.0,1951,
22866,1987-08-14,31623833.0,1987,8.0
22023,1949-05-26,2340336.0,1949,5.0
79551,2001-12-13,313542341.0,2001,12.0
31065,2007-07-10,15524680.0,2007,7.0
7571,2000-02-18,51880044.0,2000,2.0


In [10]:
# Year/revenue pairs: every row has a year, so nothing is filtered out here.
years_df = movie_release_date_df.loc[:, ['Year', 'Movie box office revenue']].copy()
# Month/revenue pairs: rows without a month are removed.
months_df = movie_release_date_df.loc[:, ['Month', 'Movie box office revenue']].dropna(subset=['Month'])

We merge the character and movie DataFrames.

In [11]:
# Attach each character row to its movie's metadata. The character table's own
# release-date column is removed first, so the merged frame carries a single
# 'Movie release date' column (the movie table's). The join is an inner join:
# characters whose movie is absent from movie_metadata_df are not kept.
characters_without_date = character_metadata_df.drop(columns=['Movie release date'])
character_movie_merged_df = characters_without_date.merge(
    movie_metadata_df,
    how='inner',
    on='Wikipedia movie ID',
)


In [12]:
# Preview the first five merged rows.
character_movie_merged_df.head(n=5)

Unnamed: 0,Wikipedia movie ID,Character name,Actor date of birth,Actor gender,Actor name,Actor age at movie release,Freebase character/actor map ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie Languages,Movie Countries,Movie Genres
0,975900,Akooshay,1958-08-26,F,Wanda De Jesus,42.0,/m/0bgchxw,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
1,975900,Lieutenant Melanie Ballard,1974-08-15,F,Natasha Henstridge,27.0,/m/0jys3m,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
2,975900,Desolation Williams,1969-06-15,M,Ice Cube,32.0,/m/0jys3g,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
3,975900,Sgt Jericho Butler,1967-09-12,M,Jason Statham,33.0,/m/02vchl6,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."
4,975900,Bashira Kincaid,1977-09-25,F,Clea DuVall,23.0,/m/02vbb3r,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],"[Thriller, Science Fiction, Horror, Adventure,..."


We split on gender. To do so, we first select the gender together with the actor name, so that true duplicates (the same actor listed several times for the same revenue) can be dropped.

In [13]:
# Select gender, actor and revenue, then remove exact duplicate rows (e.g. one
# actor credited several times for the same movie). drop_duplicates() returns
# a new frame, so its result must be assigned — the original call discarded it
# and the duplicates were never removed.
actor_gender_df = (
    character_movie_merged_df[['Actor gender', 'Actor name', 'Movie box office revenue']]
    .drop_duplicates()
)

# Gender/revenue pairs only, without the rows whose gender is unknown.
gender_df = (
    actor_gender_df[['Actor gender', 'Movie box office revenue']]
    .dropna(subset=['Actor gender'])
)

In [14]:
# Actor name, age at release and revenue, keeping only complete rows.
# dropna() returns a new frame; the original cell only displayed that result,
# so actor_df itself still contained the rows with missing values.
actor_df = (
    character_movie_merged_df[['Actor name', 'Actor age at movie release', 'Movie box office revenue']]
    .dropna()
)
actor_df

Unnamed: 0,Actor name,Actor age at movie release,Movie box office revenue
0,Wanda De Jesus,42.0,14010832.0
1,Natasha Henstridge,27.0,14010832.0
2,Ice Cube,32.0,14010832.0
3,Jason Statham,33.0,14010832.0
4,Clea DuVall,23.0,14010832.0
...,...,...,...
101023,Roland Culver,64.0,5400000.0
101024,Michael Hordern,53.0,5400000.0
101025,Reginald Beckwith,56.0,5400000.0
101026,Gong Yoo,32.0,30723856.0
