In [1]:
# Some basic imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from statsmodels.stats import diagnostic
from scipy import stats
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
import plotly.express as px
import sys
import warnings
import ast
import re

In [2]:
# Import the project's own modules (data cleaning, diversity and success helpers)
import src.data.cleaning_data as cleandata
import src.data.diversity as diversity_calc
import src.data.success as success

## Clean the dataset

In [3]:
# Locations of the three raw input files handed to the cleaning step.
movie_path = 'data/raw_data/movie.metadata.tsv'
character_path = 'data/raw_data/character.metadata.tsv'
ethnicity_mapping_path = 'data/raw_data/fb_wiki_mapping.tsv'

# The cleaning entry point hands back two frames: movie_df is previewed just
# below, box_office_df is kept for the success definition built later on.
movie_df, box_office_df = cleandata.main(
    movie_path,
    character_path,
    ethnicity_mapping_path,
)

# Show the first rows of the cleaned data, unless no movie frame came back.
if movie_df is not None:
    display(movie_df.head())

Cleaned data saved to data/preprocess_data/clean_dataset.csv


Unnamed: 0,Wikipedia_movie_ID,Movie_release_date,Actor_ethnicity,Movie_name,Movie_runtime,Movie_languages,Movie_countries
10,3196793,2000,African Americans,Getting Away with Murder: The JonBenét Ramsey ...,95.0,English Language,United States of America
57,18768079,1938,Jewish people,Fast Company,75.0,English Language,United States of America
59,612710,1999,Italians,New Rose Hotel,92.0,English Language,United States of America
60,612710,1999,German Americans,New Rose Hotel,92.0,English Language,United States of America
83,156558,2001,African Americans,Baby Boy,123.0,English Language,United States of America


## Our definition of success

In [4]:
# Load the two extra tables that feed the success definition.
ratings_df = success.ratings_setup()
awards_df = success.nominations_setup()

# Combine box office, awards and ratings into one table, then filter it.
# NOTE(review): drop_NaN_on_success presumably removes rows that lack the
# fields needed for the success label; the displayed result still contains
# empty box-office values, so confirm exactly which columns it checks.
success_df = success.drop_NaN_on_success(
    success.merge_success_df(box_office_df, awards_df, ratings_df)
)
display(success_df)

Unnamed: 0,Movie_name,Movie_release_date,Ratings,Wikipedia_movie_ID,Movie_box_office_revenue,Nomination
0,!Women Art Revolution,2010,6.9,29988427.0,,False
1,$,1971,6.3,4213160.0,,False
3,$9.99,2008,6.7,20624798.0,,False
4,'68,1988,5.8,2250713.0,,False
5,'Neath the Arizona Skies,1934,5.0,3610422.0,,False
...,...,...,...,...,...,...
42927,È l'amor che mi rovina,1951,5.0,23687589.0,,False
42928,Échangistes,2007,4.1,27932113.0,,False
42929,Édes Anna,1958,7.4,21534981.0,,False
42930,Élisa,1995,6.6,1719500.0,,False


In [5]:
# Quantile thresholds handed to success.define_success: the first applies to
# ratings, the second to box-office revenue.
# NOTE(review): the displayed output labels a film without any box-office
# figure as a success, so the two thresholds are evidently not both required;
# see success.define_success for the exact rule.
ratings_quantile = 0.75
box_office_quantile = 0.75

# Attach the success label to the table.
success_df = success.define_success(
    success_df,
    ratings_quantile,
    box_office_quantile,
)

# Persist the labelled table under the name "success_movies" (the destination
# is decided inside success.save_success_df), then show it.
success.save_success_df(success_df, "success_movies")
display(success_df)

Proportion of success movies: 24.837229949689256
DataFrame saved successfully.


Unnamed: 0,Movie_name,Movie_release_date,Ratings,Wikipedia_movie_ID,Movie_box_office_revenue,Nomination,Success
0,!Women Art Revolution,2010,6.9,29988427.0,,False,0
1,$,1971,6.3,4213160.0,,False,0
3,$9.99,2008,6.7,20624798.0,,False,0
4,'68,1988,5.8,2250713.0,,False,0
5,'Neath the Arizona Skies,1934,5.0,3610422.0,,False,0
...,...,...,...,...,...,...,...
42927,È l'amor che mi rovina,1951,5.0,23687589.0,,False,0
42928,Échangistes,2007,4.1,27932113.0,,False,0
42929,Édes Anna,1958,7.4,21534981.0,,False,1
42930,Élisa,1995,6.6,1719500.0,,False,0


## Diversity definition

In [6]:
# Reload the cleaned dataset from disk.
# NOTE(review): the cleaning cell's printed output reports saving to
# data/preprocess_data/clean_dataset.csv, while this cell reads from
# data/processed_data/clean_dataset.csv. Confirm both refer to the same file,
# otherwise a stale copy is being loaded here.
actors_df = diversity_calc.load_df('data/processed_data/clean_dataset.csv')

# Build the ethnicity grouping and run the NaN check on it.
actors_diversity = diversity_calc.ethnic_groups(actors_df)
diversity_calc.check_nan_Ethnicity(actors_diversity)

# Two diversity measures: the naive one is computed first and then passed to
# ethnic_entropy together with the actor-level frame.
diversity = diversity_calc.ethnic_entropy(
    actors_df,
    diversity_calc.naive_diversity(actors_diversity),
)

# Attach the diversity measures to the actor rows, then reduce the frame to one
# row per film:
#   - keep only rows whose 'actor_number' differs from 1
#     (presumably the cast size per film; TODO confirm),
#   - drop the per-actor ethnicity column,
#   - keep the first row for each Wikipedia_movie_ID.
actors_df = (
    diversity_calc.merge_on_movies(actors_df, diversity)
    .loc[lambda frame: frame['actor_number'] != 1]
    .drop(columns='Actor_ethnicity')
    .drop_duplicates(subset='Wikipedia_movie_ID')
)

# Write the per-film table without the index; 'utf-8-sig' prepends a BOM.
actors_df.to_csv(
    "data/processed_data/clean_div_dataset.csv",
    index=False,
    encoding='utf-8-sig',
)