In [2]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import pickle

In [3]:
# Directory containing the CMU dataset files (e.g. movie.metadata.tsv).
folder = "../data/CMU/"
# Directory where the pickled intermediate DataFrames are written.
pickle_folder = "../data/pickle/"

In [4]:
# Column names applied to movie.metadata.tsv; they are supplied explicitly to read_csv.
movie_metadata_columns = [
    'Wikipedia_movie_ID',
    'Freebase_movie_ID',
    'Movie_name',
    'Movie_release_date',
    'Movie_box_office_revenue',
    'Movie_runtime',
    'Movie_languages_(Freebase_ID:name_tuples)',
    'Movie_countries_(Freebase_ID:name_tuples)',
    'Movie_genres_(Freebase_ID:name_tuples)',
]

# Load the tab-separated movie metadata into a DataFrame.
movie_original_data = pd.read_csv(
    folder + 'movie.metadata.tsv',
    sep='\t',
    names=movie_metadata_columns,
)

# 1. Movie Dataset Cleaning

First, we focus on cleaning the "movie.metadata" dataset. 
To do this, we first explore the dataset to identify any problems or missing data that could prevent us from answering our main project question. 

In [5]:
# Work on a copy so the DataFrame loaded from disk is left untouched.
movies = movie_original_data.copy()

# Persist the copy. The context manager closes (and flushes) the file handle
# even if pickling raises, instead of leaving an open handle behind.
with open(pickle_folder + "movies.p", "wb") as movies_file:
    pickle.dump(movies, movies_file)

In [6]:
# Number of rows (movies) in the working copy.
print(len(movies))

81741


## Part 1: Dropping movies with NaN box office revenue (not yet decided)

Most importantly, because our main question focuses on how movie characteristics affect overall box office performance, the first goal is to drop all movie rows where the box office revenue is missing. 

In [7]:
# Keep only the movies whose box office revenue is known.
movies_with_box_office = movies.dropna(subset=['Movie_box_office_revenue'])

# Persist the filtered frame; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_with_box_office_revenue.p", "wb") as box_office_file:
    pickle.dump(movies_with_box_office, box_office_file)

In [8]:
# Number of movies that have a box office revenue.
len(movies_with_box_office)

8401

## Part 2: Cleaning the data (keeping the NaN box office values; dataset: `movies`)

###  2. 1 Cleaning the years

- First, we noticed that the movie "Hunting Season" has its release date recorded as "1010-12-02", whereas the actual release date is "2010-12-02":

In [9]:
# Show every row whose title is exactly 'Hunting Season'.
is_hunting_season = movies['Movie_name'].eq('Hunting Season')
movies[is_hunting_season]

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages_(Freebase_ID:name_tuples),Movie_countries_(Freebase_ID:name_tuples),Movie_genres_(Freebase_ID:name_tuples)
62836,29666067,/m/0fphzrf,Hunting Season,1010-12-02,12160978.0,140.0,"{""/m/02hwyss"": ""Turkish Language"", ""/m/02h40lc...","{""/m/01znc_"": ""Turkey""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/02n4kr"": ""My..."


Hence, we replace this value with the correct one: 

In [10]:
# Select the mis-dated row by its Wikipedia ID rather than a hard-coded row label,
# so the correction still targets the right movie if the index ever changes.
hunting_season_mask = movies['Wikipedia_movie_ID'] == 29666067
assert hunting_season_mask.sum() == 1, "expected exactly one row with Wikipedia_movie_ID 29666067"

# Replace the erroneous "1010-12-02" with the correct release date and show the result.
movies.loc[hunting_season_mask, 'Movie_release_date'] = '2010-12-02'
print(movies.loc[hunting_season_mask, 'Movie_release_date'].iloc[0])

# NOTE(review): movies.p and movies_with_box_office_revenue.p are dumped in earlier
# cells, i.e. before this correction — confirm whether they should be re-saved.

2010-12-02


- Drop all movies that have no release date

In [11]:
# Keep only movies with a release date; copy so later column assignments
# act on an independent frame.
movies_date = movies.dropna(subset=['Movie_release_date']).copy()

In [12]:
# Number of movies that have a release date.
len(movies_date)

74839

- Create a column 'Year' containing only the release year of each movie (for year-by-year analysis): 

In [13]:
# Release dates are strings that start with the four-digit year
# (e.g. "1988" or "2001-08-24"), so slice the year off and cast it to int.
movies_date['Year'] = movies_date['Movie_release_date'].str[:4].astype(int)

# Single source of truth for the year buckets: bin edges and labels are both
# derived from this list instead of being hard-coded a second time.
intervals = [(1910, 1930), (1930, 1950), (1950, 1970), (1970, 1990), (1990, 2016)]
year_bins = [intervals[0][0]] + [upper for _, upper in intervals]
year_labels = [f"{lower}-{upper}" for lower, upper in intervals]

# Years falling outside every bin get no label and become the string 'nan'
# after the cast; those rows are removed by the range filter below.
movies_date['Year_Interval'] = pd.cut(
    movies_date['Year'], bins=year_bins, labels=year_labels
).astype(str)

# Keep only movies released strictly between the first and last bin edge.
first_year, last_year = year_bins[0], year_bins[-1]
year_in_range = (movies_date['Year'] > first_year) & (movies_date['Year'] < last_year)
movies_date = movies_date[year_in_range]

# Persist the result; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_date.p", "wb") as movies_date_file:
    pickle.dump(movies_date, movies_date_file)

- Next, because we will analyze movies by the season of their release date (summer, winter, fall, spring), we create a copy of `movies_date`, drop the movies for which only the year is given (keeping those whose month and day are known), and add a new column holding each movie's release season.

In [14]:
# Make the parent of the current working directory importable so the `src` package resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.utils.utils import categorize_release_season

In [15]:
# Parse the release dates (values that cannot be parsed become NaT because of
# errors='coerce') and keep only the rows that ended up with a valid datetime.
parsed_release_dates = pd.to_datetime(movies_date['Movie_release_date'], errors='coerce')
md_release_season = (
    movies_date
    .assign(Movie_release_date=parsed_release_dates)
    .dropna(subset=['Movie_release_date'])
)

In [16]:
# Derive the season of each movie from its parsed release date.
release_dates = md_release_season['Movie_release_date']
md_release_season['release_season'] = release_dates.apply(categorize_release_season)

# Number of movies that have a season assigned.
len(md_release_season)

release_season
Autumn    11168
Spring     9900
Winter     9599
Summer     8550
Name: count, dtype: int64

In [320]:
# Persist the season-annotated movies; `with` closes the file handle once the dump finishes.
# (The previous bare `md_release_season` expression displayed nothing, as it was not the
# last statement of the cell, so it has been removed.)
with open(pickle_folder + "movies_season.p", "wb") as season_file:
    pickle.dump(md_release_season, season_file)

### Clean 'Genres' Column 

In [321]:
from src.utils.utils import extract_info

# Raw column holding the Freebase {id: name} genre mapping of each movie.
column_name = 'Movie_genres_(Freebase_ID:name_tuples)'

# Work on a copy so `movies` itself is left untouched.
md_Genres = movies.copy()

# Read the raw column from md_Genres itself rather than from md_language, which is
# not defined until the "Languages" section and would raise NameError on a fresh kernel.
md_Genres['Genres'] = md_Genres[column_name].apply(extract_genres)
# NOTE(review): `extract_genres` is not imported in any visible cell (only
# `extract_info` is) — confirm where it is defined.

In [322]:
# One column per genre position, aligned on the movie index.
genre_lists = md_Genres["Genres"]
genres_split = pd.DataFrame(genre_lists.tolist(), index=md_Genres.index)

# Count the genres listed for each movie and summarise the distribution.
md_Genres['nb_of_Genres'] = genre_lists.apply(len)
md_Genres['nb_of_Genres'].describe()

count    81741.000000
mean         2.976829
std          2.107219
min          0.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         17.000000
Name: nb_of_Genres, dtype: float64

In [323]:
# Keep only the first three columns of genres_split.
n_genre_columns = 3
genres_split = genres_split.iloc[:, :n_genre_columns]
genres_split

Unnamed: 0,0,1,2
0,"""Thriller""","""Science Fiction""","""Horror"""
1,"""Mystery""","""Biographical film""","""Drama"""
2,"""Crime Fiction""","""Drama""",
3,"""Thriller""","""Erotic thriller""","""Psychological thriller"""
4,"""Drama""",,
...,...,...,...
81736,"""Drama""",,
81737,"""Biographical film""","""Drama""","""Documentary"""
81738,"""Satire""","""Comedy""",
81739,"""Science Fiction""","""Japanese Movies""","""Adventure"""


In [324]:
# Movies that list exactly 17 genres (the maximum reported by describe()).
has_17_genres = md_Genres['nb_of_Genres'] == 17
md_genres_equal17 = md_Genres.loc[has_17_genres]
md_genres_equal17

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages_(Freebase_ID:name_tuples),Movie_countries_(Freebase_ID:name_tuples),Movie_genres_(Freebase_ID:name_tuples),Genres,nb_of_Genres
63356,1304986,/m/04rg8v,Young Sherlock Holmes,1985-12-06,19739575.0,105.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/01jfsb"": ""Thriller"", ""/m/028v3"": ""Detecti...","[""Thriller"", ""Detective fiction"", ""Adventure"",...",17


In [325]:
# Name the columns "Genres_<position>". Unlike add_prefix, re-running this cell
# yields the same names again instead of stacking another "Genres_" prefix.
genres_split = genres_split.set_axis(
    [f"Genres_{position}" for position in range(genres_split.shape[1])],
    axis=1,
)
genres_split

Unnamed: 0,Genres_0,Genres_1,Genres_2
0,"""Thriller""","""Science Fiction""","""Horror"""
1,"""Mystery""","""Biographical film""","""Drama"""
2,"""Crime Fiction""","""Drama""",
3,"""Thriller""","""Erotic thriller""","""Psychological thriller"""
4,"""Drama""",,
...,...,...,...
81736,"""Drama""",,
81737,"""Biographical film""","""Drama""","""Documentary"""
81738,"""Satire""","""Comedy""",
81739,"""Science Fiction""","""Japanese Movies""","""Adventure"""


In [326]:
# Attach the per-position genre columns and drop the list column and the raw
# Freebase column they were derived from.
data_genres = md_Genres.join(genres_split).drop(
    columns=["Genres", "Movie_genres_(Freebase_ID:name_tuples)"]
)

# Persist the result; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_genres_split.p", "wb") as genres_split_file:
    pickle.dump(data_genres, genres_split_file)
data_genres

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages_(Freebase_ID:name_tuples),Movie_countries_(Freebase_ID:name_tuples),nb_of_Genres,Genres_0,Genres_1,Genres_2
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",7,"""Thriller""","""Science Fiction""","""Horror"""
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",4,"""Mystery""","""Biographical film""","""Drama"""
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}",2,"""Crime Fiction""","""Drama""",
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}",3,"""Thriller""","""Erotic thriller""","""Psychological thriller"""
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}",1,"""Drama""",,
...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",1,"""Drama""",,
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...",3,"""Biographical film""","""Drama""","""Documentary"""
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",2,"""Satire""","""Comedy""",
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}",7,"""Science Fiction""","""Japanese Movies""","""Adventure"""


In [327]:
# One row per (movie, genre) pair.
md_genres_exploded = md_Genres.explode('Genres')

# Persist the exploded frame; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_genres_exploded.p", "wb") as genres_exploded_file:
    pickle.dump(md_genres_exploded, genres_exploded_file)

# Number of movies per genre.
md_genres_exploded.value_counts('Genres')

Genres
"Drama"               34007
"Comedy"              16349
"Romance Film"        10234
"Black-and-white"      9094
"Action"               8798
                      ...  
"Comdedy"                 1
"New Queer Cinema"        1
"Linguistics"             1
"Statutory rape"          1
"C-Movie"                 1
Name: count, Length: 363, dtype: int64

### Clean 'Languages' Column

In [328]:
from src.utils.utils import extract_info

# Raw column holding the Freebase {id: name} language mapping of each movie.
column_name = 'Movie_languages_(Freebase_ID:name_tuples)'

# Build a new frame from `movies` with an extra 'Language' column extracted
# from the raw column; `movies` itself is not modified.
md_language = movies.assign(
    Language=movies[column_name].apply(extract_language_names)
)
# NOTE(review): `extract_language_names` is not imported in any visible cell
# (only `extract_info` is) — confirm where it is defined.

In [329]:
# One column per language position, aligned on the movie index.
language_lists = md_language["Language"]
language_split = pd.DataFrame(language_lists.tolist(), index=md_language.index)

# Count the languages listed for each movie and summarise the distribution.
md_language['nb_of_Languages'] = language_lists.apply(len)
md_language['nb_of_Languages'].describe()

count    81741.000000
mean         0.992550
std          0.687478
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         15.000000
Name: nb_of_Languages, dtype: float64

In [330]:
# Movies that list exactly 15 languages (the maximum reported by describe()).
has_15_languages = md_language['nb_of_Languages'] == 15
md_languages_equal15 = md_language.loc[has_15_languages]
md_languages_equal15

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages_(Freebase_ID:name_tuples),Movie_countries_(Freebase_ID:name_tuples),Movie_genres_(Freebase_ID:name_tuples),Language,nb_of_Languages
9268,21869986,/m/05p5bn8,Jai Jagannatha,2007-07-12,,127.0,"{""/m/019vzt"": ""Assamese Language"", ""/m/09s02"":...","{""/m/03rk0"": ""India""}","{""/m/07s9rl0"": ""Drama""}","[""Assamese Language"", ""Telugu language"", ""Chha...",15


In [331]:
# Name the columns "Language_<position>". Unlike add_prefix, re-running this cell
# yields the same names again instead of stacking another "Language_" prefix.
language_split = language_split.set_axis(
    [f"Language_{position}" for position in range(language_split.shape[1])],
    axis=1,
)
language_split

Unnamed: 0,Language_0,Language_1,Language_2,Language_3,Language_4,Language_5,Language_6,Language_7,Language_8,Language_9,Language_10,Language_11,Language_12,Language_13,Language_14
0,"""English Language""",,,,,,,,,,,,,,
1,"""English Language""",,,,,,,,,,,,,,
2,"""Norwegian Language""",,,,,,,,,,,,,,
3,"""English Language""",,,,,,,,,,,,,,
4,"""German Language""",,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,"""English Language""",,,,,,,,,,,,,,
81737,"""English Language""",,,,,,,,,,,,,,
81738,"""English Language""",,,,,,,,,,,,,,
81739,"""Japanese Language""",,,,,,,,,,,,,,


In [332]:
# Attach the per-position language columns and drop the list column and the raw
# Freebase column they were derived from.
data_language = md_language.join(language_split).drop(
    columns=["Language", "Movie_languages_(Freebase_ID:name_tuples)"]
)

# Persist the result; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_languages_split.p", "wb") as languages_split_file:
    pickle.dump(data_language, languages_split_file)
data_language

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_countries_(Freebase_ID:name_tuples),Movie_genres_(Freebase_ID:name_tuples),nb_of_Languages,Language_0,...,Language_5,Language_6,Language_7,Language_8,Language_9,Language_10,Language_11,Language_12,Language_13,Language_14
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",1,"""English Language""",...,,,,,,,,,,
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",1,"""English Language""",...,,,,,,,,,,
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",1,"""Norwegian Language""",...,,,,,,,,,,
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",1,"""English Language""",...,,,,,,,,,,
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",1,"""German Language""",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",1,"""English Language""",...,,,,,,,,,,
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",1,"""English Language""",...,,,,,,,,,,
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",1,"""English Language""",...,,,,,,,,,,
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",1,"""Japanese Language""",...,,,,,,,,,,


In [333]:
# One row per (movie, language) pair.
md_languages_exploded = md_language.explode('Language')

# Persist the exploded frame; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_languages_exploded.p", "wb") as languages_exploded_file:
    pickle.dump(md_languages_exploded, languages_exploded_file)

# Number of movies per language.
md_languages_exploded.value_counts('Language')

Language
"English Language"          40872
"Hindi Language"             3743
"Spanish Language"           3668
"French Language"            3414
"Silent film"                3183
                            ...  
"Deutsch"                       1
"Sumerian"                      1
"Sunda Language"                1
"Chhattisgarhi Language"        1
"Guanzhong Hua"                 1
Name: count, Length: 193, dtype: int64

### Clean 'Countries' Column 

In [334]:
from src.utils.utils import extract_info

# Raw column holding the Freebase {id: name} country mapping of each movie.
countries_column = 'Movie_countries_(Freebase_ID:name_tuples)'

# Keep the movies whose raw country value is present, on an independent copy.
md_countries = movies.dropna(subset=[countries_column]).copy()
md_countries["Countries"] = md_countries[countries_column].apply(extract_country_names)
# NOTE(review): `extract_country_names` is not imported in any visible cell
# (only `extract_info` is) — confirm where it is defined.

In [335]:
# One column per country position, aligned on the movie index.
country_lists = md_countries["Countries"]
countries_split = pd.DataFrame(country_lists.tolist(), index=md_countries.index)

# Count the countries listed for each movie and summarise the distribution.
md_countries['nb_of_Countries'] = country_lists.apply(len)
md_countries['nb_of_Countries'].describe()

count    81741.000000
mean         1.057474
std          0.623572
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         14.000000
Name: nb_of_Countries, dtype: float64

In [336]:
# Movies that list more than 13 countries (14 is the maximum reported by describe()).
has_over_13_countries = md_countries['nb_of_Countries'] > 13
md_countries_equal14 = md_countries.loc[has_over_13_countries]
md_countries_equal14

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages_(Freebase_ID:name_tuples),Movie_countries_(Freebase_ID:name_tuples),Movie_genres_(Freebase_ID:name_tuples),Countries,nb_of_Countries
33207,31430255,/m/0g42sjq,My Reincarnation,2010,,82.0,"{""/m/01kbdv"": ""Tibetan languages"", ""/m/02bjrlw...","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/03bxz7"": ""Biographical film"", ""/m/017fp"":...","[""United States of America"", ""Austria"", ""Switz...",14
51422,9894962,/m/02pw5yr,The Mahabharata,1989,,,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America"", ""/m/...","{""/m/0gw5n2f"": ""Japanese Movies"", ""/m/07s9rl0""...","[""United States of America"", ""Belgium"", ""Irela...",14


In [337]:
# Name the columns "Countries_<position>". Unlike add_prefix, re-running this cell
# yields the same names again instead of stacking another "Countries_" prefix.
countries_split = countries_split.set_axis(
    [f"Countries_{position}" for position in range(countries_split.shape[1])],
    axis=1,
)
countries_split

Unnamed: 0,Countries_0,Countries_1,Countries_2,Countries_3,Countries_4,Countries_5,Countries_6,Countries_7,Countries_8,Countries_9,Countries_10,Countries_11,Countries_12,Countries_13
0,"""United States of America""",,,,,,,,,,,,,
1,"""United States of America""",,,,,,,,,,,,,
2,"""Norway""",,,,,,,,,,,,,
3,"""United Kingdom""",,,,,,,,,,,,,
4,"""Germany""",,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,"""United States of America""",,,,,,,,,,,,,
81737,"""Ireland""","""United Kingdom""",,,,,,,,,,,,
81738,"""United States of America""",,,,,,,,,,,,,
81739,"""Japan""",,,,,,,,,,,,,


In [338]:
# Attach the per-position country columns and drop the list column and the raw
# Freebase column they were derived from.
data_countries = md_countries.join(countries_split).drop(
    columns=["Countries", "Movie_countries_(Freebase_ID:name_tuples)"]
)

# Persist the result; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_countries_split.p", "wb") as countries_split_file:
    pickle.dump(data_countries, countries_split_file)

In [339]:
# One row per (movie, country) pair.
md_countries_exploded = md_countries.explode('Countries')

# Persist the exploded frame; `with` guarantees the file handle is closed.
with open(pickle_folder + "movies_countries_exploded.p", "wb") as countries_exploded_file:
    pickle.dump(md_countries_exploded, countries_exploded_file)

# Number of movies per country.
md_countries_exploded.value_counts('Countries')

Countries
"United States of America"    34408
"India"                        8411
"United Kingdom"               7868
"France"                       4395
"Italy"                        3163
                              ...  
"Ukranian SSR"                    1
"Iraqi Kurdistan"                 1
"Macau"                           1
"Palestinian Territories"         1
"Turkmenistan"                    1
Name: count, Length: 147, dtype: int64

### Movie Runtime: we will drop this column