In [25]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from google.cloud import storage
from plotly.subplots import make_subplots

from tqdm import tqdm
import re

In [9]:
# project id - GCP project id
PROJECT_ID="le-wagon-bootcamp-328018"

# bucket name - GCP bucket name
BUCKET_NAME="diversity-in-cinema-735"

# train data file location
BUCKET_TRAIN_DATA_PATH = "data/training_data"

# Streamlit bucket name - GCP bucket name
BUCKET_NAME_STREAMLIT = "wagon-data-735-movie-diversity"

In [10]:
def get_movie_list(subfolders):

    """
    Function ro grab file names from a GCP bucket directory

    Parameters:

    bucket_name: Name of GCP bucket
    subfolders: complete subfolder path as a string where file names should
                be retrieved from in the format folder_1/folder_2/.../folder_n

    """

    # check if movie was already processed:

    client = storage.Client()
    file_names = [str(x).split(f"{subfolders}/")[1].\
        split("/statistics")[0]\
                for x in \
                    client.list_blobs(BUCKET_NAME_STREAMLIT, prefix=subfolders)]
    # replace first entry with empty string

    return file_names

In [11]:
def get_evolution_data():
    
    df_stats_list = []
    
    regex = re.compile(r'\((\d{4})\)')
    
    movie_list = get_movie_list("CSVs")[1:]
    
    for movie in tqdm(movie_list):
        
        if movie == "":
            continue

        year = regex.findall(movie)[0]

        df = pd.read_csv(f"gs://{BUCKET_NAME_STREAMLIT}/CSVs/{movie}/statistics", index_col=None)

        df["title"] = movie
        df["year"] = year

        df["year"] = pd.to_datetime(df["year"].values)
        df.sort_values("year", inplace=True)
        df_stats_list.append(df)
        
    df_stats_total = pd.concat(df_stats_list, axis=0)
    df_stats_total.sort_values("year", inplace=True)
        
    def add_revenue(column):
    
        column = column.replace("_", " ")
        return fetch_movie_details(column).get("revenue", None)
    
    def add_runtime(column):
    
        column = column.replace("_", " ")
        return fetch_movie_details(column).get("runtime", None)


    df_stats_total["revenue"] = df_stats_total["title"].apply(add_revenue)
    df_stats_total["runtime"] = df_stats_total["title"].apply(add_runtime)
    
    return df_stats_total.reset_index()

In [26]:
def plot_gender_timeline(df, plot_type="bar"):
    
    # group by decade
    df_grouped = df.groupby(pd.cut(df["year"], pd.date_range('1920', '2030', freq='5YS'), right=False)).mean()

    new_df = pd.DataFrame({'year':pd.date_range(start='01-01-1920', end='01-01-2030', freq='5YS')})
    df_stats_total = new_df.merge(df, on ='year', how='left')
    df_grouped = df_stats_total.groupby(pd.Grouper(key='year', freq = '5AS')).mean()
    df_grouped = df_grouped.dropna()
    
    if plot_type == "bar":
    # plot gender over time
        fig = px.bar(df_grouped, x=df_grouped.index,
                     y = ['man_screentime', 'woman_screentime', 'only_men', 'only_women'],
                     barmode="overlay", labels={"value": "Screentime [%]", "year":""} )
        
    elif plot_type == "line":
        fig = px.line(df_grouped, x=df_grouped.index,
             y = ['man_screentime', 'woman_screentime', 'only_men', 'only_women'],
                      labels={"value": "Screentime [%]", "year":""} )


In [91]:
def plot_race_timeline(df, plot_type="bar", step=5):
    
    # group by decade
    df_grouped = df.groupby(pd.cut(df["year"], pd.date_range('1920', '2030', freq=f'{step}YS'), right=False)).mean()

    new_df = pd.DataFrame({'year':pd.date_range(start='01-01-1920', end='01-01-2030', freq=f'{step}YS')})
    df_stats_total = new_df.merge(df, on ='year', how='left')
    df_grouped = df_stats_total.groupby(pd.Grouper(key='year', freq=f'{step}YS')).mean()
    df_grouped = df_grouped.dropna()
    
    if plot_type == "bar":
    # plot gender over time
        fig = px.bar(df_grouped, x=df_grouped.index,
                     y = ['asian_screentime', 'black_screentime', 'indian_screentime','latino_hispanic_screentime', 'middle_eastern_screentime', 'white_screentime', 'women_of_color'],
                     barmode="overlay", labels={"value": "Screentime [%]", "year":""} )
        
    elif plot_type == "line":
        fig = px.line(df_grouped, x=df_grouped.index,
             y = ['asian_screentime', 'black_screentime', 'indian_screentime', 'latino_hispanic_screentime', 'middle_eastern_screentime', 'white_screentime', 'women_of_color'],
                      labels={"value": "Screentime [%]", "year":""} )
    
    return fig

df_evo = get_evolution_data()

In [93]:
plot_race_timeline(df_evo,plot_type="bar",step=2)

In [82]:
df = df_evo

In [63]:
df_grouped.transpose()

year,1920-01-01,1925-01-01,1955-01-01,1960-01-01,1965-01-01,1980-01-01,1990-01-01,1995-01-01,2000-01-01,2005-01-01,2010-01-01,2015-01-01
index,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
man_screentime,97.765618,96.304719,94.309638,76.17049,94.62779,90.95636,93.1054,93.75573,89.674,88.78203,85.50824,84.41423
woman_screentime,2.234382,3.695281,5.690362,23.82951,5.372205,9.04364,6.8946,6.244268,10.326,11.21797,14.49176,15.58577
only_men,96.153846,90.707965,91.096481,70.38835,87.29201,83.94309,89.29093,90.88262,85.74261,82.67467,80.04515,74.26239
only_women,2.354788,6.342183,4.479236,22.93689,5.178869,6.605691,5.365456,4.124289,9.098536,11.87114,13.59484,12.72405
asian_screentime,3.009576,2.103468,5.539011,3.77785,5.126525,7.576602,9.210026,5.254223,4.88682,9.012093,7.560834,7.126583
black_screentime,1.003192,7.220011,6.425309,0.7103649,3.963639,2.414113,5.270071,13.74231,2.760193,3.038001,5.501045,8.878458
indian_screentime,3.237574,2.615122,4.35149,2.389409,5.601507,2.135562,4.536001,2.936752,1.631032,2.211741,2.472802,2.317073
latino_hispanic_screentime,5.882353,4.206936,14.55853,14.98224,6.756203,7.2052,11.06449,12.44137,5.189612,6.411443,6.958969,10.12222
middle_eastern_screentime,6.429549,2.728823,7.661231,5.230869,7.53419,4.475395,4.874685,3.237604,3.515877,2.896609,3.051099,4.108964


In [182]:
def women_revenue_scatter(total_stats_df):
    
    total_stats_df.sort_values(by="woman_screentime",
                               ascending=False,
                               inplace=True)

    y = total_stats_df["revenue"].values
    x = total_stats_df["total_Woman"].values
    
    
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=x,
        y=y,
        mode='markers',
        hovertext=total_stats_df["title"],
        marker=dict(color='rgb(255,185,15)')
    ))

    fig.update_layout(title_text='Number of women on screen VS. movie revenue', font=dict(size=18))

    fig.update_layout(
        autosize=True,
        width=700,
        height=500,
        yaxis=dict(
            title_text="Movie revenue [US$]",
            titlefont=dict(size=18),
        ),
        xaxis=dict(
            title_text="Cumulated number of women on screen",
            titlefont=dict(size=18),
        )
        
    )
    
    return fig

In [183]:
women_revenue_scatter(df_evo)

In [206]:
def women_movie_percentage(total_stats_df):
    
    total_stats_df.sort_values(by="woman_screentime",
                               ascending=False,
                               inplace=True)

    y = total_stats_df["title"].values
    x = total_stats_df["woman_screentime"].values
    
    
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=x,
        y=y,
        hovertext=total_stats_df["revenue"],
        orientation='h',
        marker=dict(color='rgb(24,116,205)')
    ))

    fig.update_layout(title_text='Screentime percentage of women in movies', font=dict(size=10))

    fig.update_layout(
        autosize=True,
        width=800,
        height=500,
        xaxis=dict(
            title_text="Screentime [%]",
            titlefont=dict(size=18),
        )
        
    )
    
    return fig

In [207]:
women_movie_percentage(df_evo)

In [178]:
def overall_gender_dash(total_stats_df):

    total_stats_df.sort_values(by="woman_screentime",
                               ascending=False,
                               inplace=True)

    y_saving = total_stats_df["woman_screentime"].values

    x2 = total_stats_df["revenue"].values

    x1 = total_stats_df["year"].values
    y_net_worth = total_stats_df["total_Woman"].values

    # Creating two subplots
    fig = make_subplots(rows=1,
                        cols=4,
                        specs=[[{}, {}, {}, {}]],
                        column_widths=[25, 5, 30, 2])

    fig.append_trace(
        go.Bar(
            x=y_saving,
            y=x1,
            marker=dict(
                color='rgb(24,116,205)',
                line=dict(color='rgb(0,191,255)', width=0.5),
            ),
            name='Screentime percentage of woman per movie',
            orientation='h',
        ), 1, 3)


    fig.append_trace(
        go.Scatter(
            x=y_net_worth,
            y=x2,
            mode='markers',
            hovertext=total_stats_df["title"],
            marker=dict(color='rgb(255,185,15)'),
            name='Number of women on screen VS. movie revenue',
        ), 1, 1)

    fig.update_layout(
        title='Women screentime and movie revenue',
        yaxis=dict(showgrid=False,
                   showline=False,
                   showticklabels=True,
                   domain=[0, 0.85]),
        yaxis2=dict(showgrid=False,
                    showline=True,
                    showticklabels=True,
                    domain=[0, 0.85]),
        xaxis=dict(zeroline=False,
                   showline=True,
                   showticklabels=True,
                   showgrid=True,
                   domain=[0, 0.42]),
        xaxis2=dict(zeroline=False,
                    showline=True,
                    showticklabels=True,
                    showgrid=True,
                    domain=[0.47, 1]),
        height=600,
        width=1500,
        legend=dict(x=0.029, y=1.038, font_size=15),
        margin=dict(l=100, r=20, t=70, b=70),
    )

    fig.update_layout(font_size=15)


    # y axis labels
    fig['layout'][f'yaxis{1}'].update(title=f'Movie revenue [US$]',
                                      title_font_size=19)

    # x axis labels
    fig['layout'][f'xaxis{1}'].update(
        title=f'Cumulated number of women on screen', title_font_size=19)
    fig['layout'][f'xaxis{3}'].update(
        title=f'Percantage of Screentime - Women', title_font_size=19)



    return fig

## Visualize

In [469]:
def man_woman_screentime_bar(df):
    
    """
    Input: Original movie overview dataframe
    """
    
    one_movie_gender = df[["man_screentime", "woman_screentime"]]
    one_movie_gender = one_movie_gender.T

    fig = px.bar(one_movie_gender,
                 x=one_movie_gender.index,
                 y=one_movie_gender[0],
                 labels={"index":"", "0": "Screentime [%]"},
                 color=one_movie_gender.index)
    return fig

In [531]:
import plotly.graph_objects as go

def race_screentime_bar(df):
    
    """
    Input: Original movie overview dataframe
    """
    
    one_movie_race = df[['asian_screentime', 'black_screentime', 'indian_screentime',
       'latino_hispanic_screentime', 'middle_eastern_screentime',
       'white_screentime']]
    
    one_movie_race = one_movie_race.T
    
    fig = go.Figure()
    fig.add_trace(go.Bar(x=one_movie_race.index,
                 y=one_movie_race[0],
                 marker=dict(color=one_movie_race[0])))
        
    fig.update_layout(legend_title_text = "Race")
    fig.update_xaxes(title_text="")
    fig.update_yaxes(title_text="Screentime [%]")
    
    fig.update_layout(
    title_text="Screentime Percentage Distribution - Gender")
    
    return fig

In [532]:
def race_screentime_bar(df):
    
    """
    Input: Original movie overview dataframe
    """
    
    one_movie_race = df[['asian_screentime', 'black_screentime', 'indian_screentime',
       'latino_hispanic_screentime', 'middle_eastern_screentime',
       'white_screentime']]
    
    one_movie_race = one_movie_race.T

    fig = px.bar(one_movie_race,
                 x=one_movie_race.index,
                 y=one_movie_race[0],
                 labels={"index":"", "0": "Screentime [%]"},
                 color=one_movie_race.index)
    
    fig.update_layout(
    title_text="Screentime Percentage Distribution - Race")
    
    return fig

In [533]:
race_screentime_bar(one_movie)

## Distribution over run time

In [395]:
def gcp_file_names(bucket_name, subfolders):

    """
    Function ro grab file names from a GCP bucket directory

    Parameters:

    bucket_name: Name of GCP bucket
    subfolders: complete subfolder path as a string where file names should
                be retrieved from in the format folder_1/folder_2/.../folder_n

    """

    client = storage.Client()
    file_names = [str(x).split(f"{subfolders}/")[1].\
        split(".csv")[0].\
            strip() + ".csv" for x in \
                client.list_blobs(bucket_name, prefix=subfolders)]

    return file_names

In [460]:
def run_time(movie_title, by="gender"):
    
    movie_title = movie_title.replace("_", " ").replace(".csv", "") + ".csv"
    
    df = pd.read_csv(
    f"gs://{BUCKET_NAME}/output/{movie_title}", index_col=None,)
    
    # add seconds column -> 1 frame = 0.5 seconds
    df["seconds"] = df["frame_number"] / 2

    # add minutes
    df["minutes"] = round((df["seconds"] / 60))
    
    df_grouped = df.groupby(["minutes", by], as_index=False).count()


    fig = px.scatter(df_grouped,
                     x="minutes",
                     y="face_id",
                     size="face_id",
                     color=by,
                     size_max=60, 
                     labels={"face_id": "Number of detected faces", "minutes": "Film length [minutes]"},
                    title=f"Distribution of {by.capitalize()} Over Film Run-time")
    return fig

In [461]:
run_time("2012 (2009)", by="gender")

## Dashboard

In [852]:
df_one = df_evo.iloc[0:1, :]

In [853]:
df_one

Unnamed: 0,index,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,...,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year,revenue,runtime,non_white_count,non_white_count_percent
0,0,94.101509,5.898491,91.47671,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,...,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01,0,113,197.0,0.156225


In [887]:
def only_women_screentime_bar(df):
    x = [df['only_women'].values[0], (100-df['only_women'].values[0])]
    names = ['Only women present', 'Both women and men present']

    fig = px.bar(df,
             x=names,
             y=x,
             labels={"x":"", "y": "Screentime [%]"},
             color=names)
    return fig

In [889]:
only_women_screentime_bar(df_one)

In [886]:
def only_men_screentime_bar(df):
    x = [df['only_men'].values[0], (100 - df['only_men'].values[0])]
    names = ['Only men present', 'Both men and women present']

    fig = px.bar(df,
                 x=names,
                 y=x,
                 labels={
                     "x": "",
                     "y": "Screentime [%]"
                 },
                 color=names)

    return fig

only_men_screentime_bar(df_one)

In [896]:
import plotly.graph_objects as go
import plotly.subplots as sp

def dashboard_gender(movie_title,df_one):
    
    # grab csvs
    
    movie_name = movie_title.replace(" ", "_")
    file_name = f'https://storage.googleapis.com/wagon-data-735-movie-diversity/CSVs/{movie_name}/statistics'
    df = pd.read_csv(file_name)

    # Create figures in Express
    figure1 = run_time(movie_name, by="gender")
    figure2 = only_women_screentime_bar(df_one)
    figure3 = man_woman_screentime_bar(df)
    figure4 = only_men_screentime_bar(df)

    # For as many traces that exist per Express figure, get the traces from each plot and store them in an array.
    # This is essentially breaking down the Express fig into it's traces

    figure1_traces = []
    figure2_traces = []
    figure3_traces = []
    figure4_traces = []


    for trace in range(len(figure1["data"])):
        figure1_traces.append(figure1["data"][trace])

    for trace in range(len(figure2["data"])):
        figure2_traces.append(figure2["data"][trace])


    for trace in range(len(figure3["data"])):
        figure3_traces.append(figure3["data"][trace])
    
    for trace in range(len(figure4["data"])):
        figure4_traces.append(figure4["data"][trace])

    #Create a 1x2 subplot
    this_figure = sp.make_subplots(rows=2, cols=3, specs=[[{'colspan': 3}, None, None],
                                               [{}, {}, {}]]) 

    # Get the Express fig broken down as traces and add the traces to the proper plot within in the subplot
    for traces in figure1_traces:
        this_figure.append_trace(traces, row=1, col=1)

    for traces in figure2_traces:
        this_figure.append_trace(traces, row=2, col=2)

    for traces in figure3_traces:
        this_figure.append_trace(traces, row=2, col=1)
        
    for traces in figure4_traces:
        this_figure.append_trace(traces, row=2, col=3)

    this_figure.update_layout(height=900, width=1100)
    this_figure.update_layout(uniformtext_minsize=15)

    this_figure.update_layout(
        title_text=f"{movie_name.replace('_', ' ')} - Gender Statistics")
        # Add annotations in the center of the donut pies

    #the subplot as shown in the above image
    return this_figure

In [897]:
dashboard_gender("102 Dalmatians (2000)", df_one)

## Horizontal bar plot of movies with the most number of women on screen

In [551]:
from sklearn.preprocessing import OneHotEncoder


In [552]:
def output_preproc(df):
    '''One Hot Encodes gender and race data from output dataframe'''
    ohe_g = OneHotEncoder(sparse=False)

    ohe_g.fit(df[['gender']])
    gender_encoded = ohe_g.transform(df[['gender']])
    results_g = gender_encoded.T

    for i, cat in enumerate(ohe_g.categories_[0]):
        df[cat] = results_g[i]

    ohe_r = OneHotEncoder(sparse=False)

    ohe_r.fit(df[['race']])
    race_encoded = ohe_r.transform(df[['race']])
    results_r = race_encoded.T

    for i, cat in enumerate(ohe_r.categories_[0]):
        df[cat] = results_r[i]

    return df


def woman_of_color(x):
    if 'Woman' in x and 'white' not in x:
        return 1
    return 0

In [588]:
def baseline_stats(df):
    '''Creates a dataframe of engineered/composite features from preprocessed output'''
    df_new = output_preproc(df)

    df_new['women_of_color'] = df_new['gender'] + ' ' + df_new['race']
    df_new['women_of_color'] = df_new['women_of_color'].apply(woman_of_color)

    df_new = df_new.groupby('frame_number').sum()

    df_new['face_count'] = df_new['Man'] + df_new['Woman']

    only_men = len(df_new[df_new['Woman'] == 0])
    only_women = len(df_new[df_new['Man'] == 0])


    dict_stats = {
        'total_frames': [len(df_new)],
        'total_seconds': [len(df_new) / 2],
        'total_faces': [df_new['face_count'].sum()],
        'total_men': [df_new['Man'].sum()],
        'total_women': [df_new['Woman'].sum()],
        'total_women_of_color': [df_new['women_of_color'].sum()],
        'only_men_count': only_men,
        'only_women_count': only_women
    }


    for cat in ["Man",
                "Woman",
                "asian",
                "black",
                "indian",
                "latino hispanic",
                "middle eastern",
                "white"] :

        new_key = "total" + "_" + cat.strip().replace(" ", "_")

        if cat.strip() not in df_new.columns:
            dict_stats[new_key] = [0]

        else:
            dict_stats[new_key] = [df_new[cat].sum()]


    df_stats = pd.DataFrame.from_dict(dict_stats)

    return df_stats


def final_stats(df):
    
    '''Creates final statistical dataframe for use in dashboard'''
    df_new = baseline_stats(df)

    dict_stats = {
        'man_screentime':
        df_new['total_men'] / df_new['total_faces'] * 100,
        'woman_screentime':
        df_new['total_women'] / df_new['total_faces'] * 100,
        'only_men':
        df_new['only_men_count'] / df_new['total_frames'] * 100,
        'only_women':
        df_new['only_women_count'] / df_new['total_frames'] * 100,
        'asian_screentime':
        df_new['total_asian'] / df_new['total_faces'] * 100,
        'black_screentime':
        df_new['total_black'] / df_new['total_faces'] * 100,
        'indian_screentime':
        df_new['total_indian'] / df_new['total_faces'] * 100,
        'latino_hispanic_screentime':
        df_new['total_latino_hispanic'] / df_new['total_faces'] * 100,
        'middle_eastern_screentime':
        df_new['total_middle_eastern'] / df_new['total_faces'] * 100,
        'white_screentime':
        df_new['total_white'] / df_new['total_faces'] * 100,
        'women_of_color':
        df_new['total_women_of_color'] / df_new['total_frames'] * 100
    }

    final_df = pd.concat([pd.DataFrame.from_dict(dict_stats), df_new], axis=1)

    return final_df

In [589]:
movie_title = "102 Dalmatians (2000)"

In [590]:
movie_name = movie_title.replace(" ", "_")
file_name = f'https://storage.googleapis.com/wagon-data-735-movie-diversity/CSVs/{movie_name}/statistics'
df = pd.read_csv(file_name)

In [591]:
df

Unnamed: 0,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,white_screentime,...,only_men_count,only_women_count,total_Man,total_Woman,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white
0,85.389134,14.610866,79.496592,12.742528,3.04699,1.321586,0.734214,4.038179,2.53304,88.325991,...,1516,243,2326.0,398.0,83.0,36.0,20.0,110.0,69.0,2406.0


# Horizontal barplot in decending order of number of woman on screen

In [738]:
df = get_evolution_data()

100%|████████████████████████████████████| 161/161 [00:21<00:00,  7.40it/s]


In [739]:
df

Unnamed: 0,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,white_screentime,...,total_Man,total_Woman,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year
0,94.101509,5.898491,91.476710,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,86.488340,...,1372.0,86.0,61.0,6.0,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01
0,96.304719,3.695281,90.707965,6.342183,2.103468,7.220011,2.615122,4.206936,2.728823,81.125640,...,1694.0,65.0,37.0,127.0,46.0,74.0,48.0,1427.0,The_Lost_World_(1925),1925-01-01
0,90.578735,9.421265,87.899942,8.260617,2.781516,2.198295,3.095559,4.665769,5.204127,82.054733,...,2019.0,210.0,62.0,49.0,69.0,104.0,116.0,1829.0,Dr._Jekyll_and_Mr._Hyde_(1931),1931-01-01
0,93.626943,6.373057,90.096618,6.843800,3.056995,1.036269,2.020725,4.455959,8.290155,81.139896,...,1807.0,123.0,59.0,20.0,39.0,86.0,160.0,1566.0,Frankenstein_(1931),1931-01-01
0,82.366864,17.633136,79.891304,13.179348,1.775148,2.366864,2.958580,5.325444,1.183432,86.390533,...,696.0,149.0,15.0,20.0,25.0,45.0,10.0,730.0,White_Zombie_(1932),1932-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,84.794397,15.205603,66.019726,16.958320,5.772707,3.886127,3.648893,8.269318,5.377316,73.045639,...,7506.0,1346.0,511.0,344.0,323.0,732.0,476.0,6466.0,Mamma_Mia!_Here_We_Go_Again_(2018),2018-01-01
0,83.738356,16.261644,74.544818,12.745098,6.156339,2.187120,1.964358,5.629810,2.531389,81.530984,...,4135.0,803.0,304.0,108.0,97.0,278.0,125.0,4026.0,Mary_Poppins_Returns_(2018),2018-01-01
0,80.009255,19.990745,72.528202,21.101526,18.695049,4.488663,2.591393,15.085609,3.146691,55.992596,...,1729.0,432.0,404.0,97.0,56.0,326.0,68.0,1210.0,Bumblebee_(2018),2018-01-01
0,95.707633,4.292367,93.356401,2.076125,7.410407,59.769184,3.158534,4.656813,1.801984,23.203078,...,4727.0,212.0,366.0,2952.0,156.0,230.0,89.0,1146.0,Black_Panther_(2018),2018-01-01


# GET REVENUE DATA

In [17]:
import requests

In [16]:
api_key = '87337046eaf9c07ce51c68d19a21041a'

In [15]:

def fetch_movie_basic_data(movie):
    """
    Get movie title ID from The Movie DB API. Returns error string if not found
    """
    remove_4k = movie.lower().replace('[4k]', '').strip()
    split_movie = remove_4k.split()
    year = split_movie[-1].replace('(', '').replace(')', '')
    search_terms = split_movie[:-1]
    title = '+'.join(search_terms)
    url = f'https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={title}'

    response = requests.get(url)

    if response.status_code != 200:
        return 'Error: status code not 200'

    data = response.json()

    keep_data = {}

    for index in range(len(data['results'])):
        if year in data['results'][index]['release_date']:
            keep_data['release_date'] = data['results'][index]['release_date']
            keep_data['original_language'] = data['results'][index][
                'original_language']
            keep_data['poster_path'] = data['results'][index]['poster_path']
            return data['results'][index]['id'], keep_data




def fetch_movie_details(movie):
    """
    Get desired movie details from The Movie DB API. Returns error string if not found
    """
    print(movie)

    keep_data = {}

    try: 
        
        movie_id = fetch_movie_basic_data(movie)
        url = f'https://api.themoviedb.org/3/movie/{movie_id[0]}?api_key={api_key}&language=en-US'
    
        response = requests.get(url)

        if response.status_code != 200:
            
            keep_data['genres'] = "None"
            keep_data['spoken_languages'] = "None"
            keep_data['runtime'] = 0
            keep_data['revenue'] = 0
            
            return keep_data
        
        else:
            
            data = response.json()
            keep_data['genres'] = data['genres']
            keep_data['spoken_languages'] = data['spoken_languages']
            keep_data['runtime'] = data['runtime']
            keep_data['revenue'] = data['revenue']

            return keep_data   
        
    except:
            keep_data['genres'] = "None"
            keep_data['spoken_languages'] = "None"
            keep_data['runtime'] = 0
            keep_data['revenue'] = 0
            
            return keep_data

In [780]:
fetch_movie_details("300_(2006)")

300_(2006)


{'genres': 'None', 'spoken_languages': 'None', 'runtime': 0, 'revenue': 0}

In [781]:
def add_revenue(column):
    
    column = column.replace("_", " ")
    
    return fetch_movie_details(column).get("revenue", None)
    
def add_runtime(column):
    
    column = column.replace("_", " ")
    
    return fetch_movie_details(column).get("runtime", None)

In [782]:
df["revenue"] = df["title"].apply(add_revenue)
df["runtime"] = df["title"].apply(add_runtime)

The Hunchback of Notre Dame (1923)
The Lost World (1925)
Dr. Jekyll and Mr. Hyde (1931)
Frankenstein (1931)
White Zombie (1932)
Son of Frankenstein (1939)
The Wolf Man (1941)
The Mummy's Tomb (1942)
Abbott and Costello Meet Frankenstein (1948)
Anne of the Indies (1951)
Roman Holiday (1953)
Abbott and Costello Meet the Mummy (1955)
This Island Earth (1955)
Vertigo (1958)
Psycho (1960)
Cape Fear (1962)
To Kill a Mockingbird (1962)
Doctor Zhivago (1965)
Bonnie and Clyde (1967)
The Love Bug (1968)
Willy Wonka & the Chocolate Factory (1971)
The Million Dollar Duck (1971)
The Godfather: Part II (1974)
Carrie (1976)
Superman (1978)
The Black Hole (1979)
Superman II (1980)
Raiders of the Lost Ark (1981)
Trading Places (1983)
Supergirl (1984)
A Nightmare on Elm Street (1984)
Stand by Me (1986)
The Princess Bride (1987)
A Nightmare on Elm Street 3: Dream Warriors (1987)
Masters of the Universe (1987)
Killer Klowns from Outer Space (1988)
Honey, I Shrunk the Kids (1989)
Dances with Wolves (1990)


Justice League (2017)
War for the Planet of the Apes (2017)
Ghost in the Shell (2017)
Deadpool 2 (2018)
A Quiet Place (2018)
Mamma Mia! Here We Go Again (2018)
Mary Poppins Returns (2018)
Bumblebee (2018)
Black Panther (2018)
Scary Stories to Tell in the Dark (2019)


In [783]:
df_all = df.copy()

In [785]:
df_all[df_all["revenue"]  == 0]

Unnamed: 0,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,white_screentime,...,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year,revenue,runtime
0,94.101509,5.898491,91.47671,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,86.48834,...,61.0,6.0,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01,0,113
0,96.304719,3.695281,90.707965,6.342183,2.103468,7.220011,2.615122,4.206936,2.728823,81.12564,...,37.0,127.0,46.0,74.0,48.0,1427.0,The_Lost_World_(1925),1925-01-01,0,93
0,82.366864,17.633136,79.891304,13.179348,1.775148,2.366864,2.95858,5.325444,1.183432,86.390533,...,15.0,20.0,25.0,45.0,10.0,730.0,White_Zombie_(1932),1932-01-01,0,67
0,96.739533,3.260467,95.696822,1.760391,4.038533,1.667284,2.185995,4.742497,16.08003,71.285661,...,109.0,45.0,59.0,128.0,434.0,1924.0,Son_of_Frankenstein_(1939),1939-01-01,0,99
0,90.473888,9.526112,86.661912,5.991441,4.01354,0.918762,1.160542,6.72147,3.384913,83.800774,...,83.0,19.0,24.0,139.0,70.0,1733.0,The_Wolf_Man_(1941),1941-01-01,0,70
0,95.721078,4.278922,94.06495,3.023516,5.942948,4.358162,1.664025,5.62599,7.210777,75.198098,...,75.0,55.0,21.0,71.0,91.0,949.0,The_Mummy's_Tomb_(1942),1942-01-01,0,61
0,94.008224,5.991776,86.367713,6.591928,5.208537,4.993147,9.594674,11.454866,9.203055,59.545722,...,266.0,255.0,490.0,585.0,470.0,3041.0,Anne_of_the_Indies_(1951),1951-01-01,0,81
0,92.057762,7.942238,88.32,7.946667,4.043321,1.841155,3.971119,10.252708,10.180505,69.711191,...,112.0,51.0,110.0,284.0,282.0,1931.0,Abbott_and_Costello_Meet_the_Mummy_(1955),1955-01-01,0,79
0,88.171501,11.828499,81.772053,7.338017,14.148802,2.471627,2.370744,19.848676,1.790668,59.369483,...,561.0,98.0,94.0,787.0,71.0,2354.0,The_Love_Bug_(1968),1968-01-01,0,107
0,96.3827,3.6173,94.663573,4.021655,6.946265,2.280472,1.782438,12.686763,3.407602,72.896461,...,265.0,87.0,68.0,484.0,130.0,2781.0,The_Million_Dollar_Duck_(1971),1971-01-01,0,89


In [842]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def overall_gender_dash(total_stats_df):

    df_all.sort_values(by="woman_screentime",ascending=False, inplace=True)

    y_saving = df_all["woman_screentime"].values

    x2 = df_all["revenue"].values

    x1 = df_all["title"].values
    y_net_worth = df_all["total_Woman"].values


    # Creating two subplots
    fig = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_xaxes=True,
                        shared_yaxes=False, vertical_spacing=0.02, horizontal_spacing = 0.1)

    fig.append_trace(go.Bar(
        x=y_saving,
        y=x1,
        marker=dict(
            color='rgb(24,116,205)',
            line=dict(
                color='rgb(0,191,255)',
                width=1),
        ),
        name='Screentime percentage of woman per movie',
        orientation='h',
    ), 1, 1)



    fig.append_trace(go.Scatter(
        x=y_net_worth, y=x2, mode='markers',hovertext=df_all["title"], marker=dict(
            color='rgb(255,185,15)'),
        name='Number of women on screen VS. movie revenue',
    ), 1, 2)

    fig.update_layout( 
        title='Women screentime and movie revenue',

        yaxis=dict(
            showgrid=False,
            showline=False,
            showticklabels=True,
            domain=[0, 0.85]
        ),
        yaxis2=dict(
            showgrid=False,
            showline=True,
            showticklabels=True,
            domain=[0, 0.85]
        ),
        xaxis=dict(
            zeroline=False,
            showline=True,
            showticklabels=True,
            showgrid=True,
            domain=[0, 0.42]
        ),
        xaxis2=dict(
            zeroline=False,
            showline=True,
            showticklabels=True,
            showgrid=True,
            domain=[0.47, 1]
        ),

        height=600,
        width=1500, 
        legend=dict(x=0.029, y=1.038, font_size=15),
        margin=dict(l=100, r=20, t=70, b=70),
    )
    
    fig.update_layout(
    shapes=[dict(type="line",
                 xref='paper',
                 yref='paper',
                 x0=0.433,
                 y0=0.89,
                 x1=0.433,
                 y1=0.002,
                 line=dict(color="black",
                           width=2),
                 ),
            ],
)

    return fig

overall_gender_dash(df)

## Race Overview

In [792]:
df_evo

Unnamed: 0,index,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,...,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year,revenue,runtime
0,0,94.101509,5.898491,91.476710,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,...,61.0,6.0,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01,0,113
1,0,96.304719,3.695281,90.707965,6.342183,2.103468,7.220011,2.615122,4.206936,2.728823,...,37.0,127.0,46.0,74.0,48.0,1427.0,The_Lost_World_(1925),1925-01-01,0,93
2,0,90.578735,9.421265,87.899942,8.260617,2.781516,2.198295,3.095559,4.665769,5.204127,...,62.0,49.0,69.0,104.0,116.0,1829.0,Dr._Jekyll_and_Mr._Hyde_(1931),1931-01-01,1300000,98
3,0,93.626943,6.373057,90.096618,6.843800,3.056995,1.036269,2.020725,4.455959,8.290155,...,59.0,20.0,39.0,86.0,160.0,1566.0,Frankenstein_(1931),1931-01-01,12000000,71
4,0,82.366864,17.633136,79.891304,13.179348,1.775148,2.366864,2.958580,5.325444,1.183432,...,15.0,20.0,25.0,45.0,10.0,730.0,White_Zombie_(1932),1932-01-01,0,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,0,84.794397,15.205603,66.019726,16.958320,5.772707,3.886127,3.648893,8.269318,5.377316,...,511.0,344.0,323.0,732.0,476.0,6466.0,Mamma_Mia!_Here_We_Go_Again_(2018),2018-01-01,395044706,113
157,0,83.738356,16.261644,74.544818,12.745098,6.156339,2.187120,1.964358,5.629810,2.531389,...,304.0,108.0,97.0,278.0,125.0,4026.0,Mary_Poppins_Returns_(2018),2018-01-01,348807090,131
158,0,80.009255,19.990745,72.528202,21.101526,18.695049,4.488663,2.591393,15.085609,3.146691,...,404.0,97.0,56.0,326.0,68.0,1210.0,Bumblebee_(2018),2018-01-01,467989645,114
159,0,95.707633,4.292367,93.356401,2.076125,7.410407,59.769184,3.158534,4.656813,1.801984,...,366.0,2952.0,156.0,230.0,89.0,1146.0,Black_Panther_(2018),2018-01-01,1346739107,134


In [None]:
plot_race_timeline(df_evo,plot_type="bar",step=10)

In [806]:
df_evo.columns

Index(['index', 'man_screentime', 'woman_screentime', 'only_men', 'only_women',
       'asian_screentime', 'black_screentime', 'indian_screentime',
       'latino_hispanic_screentime', 'middle_eastern_screentime',
       'white_screentime', 'women_of_color', 'total_frames', 'total_seconds',
       'total_faces', 'total_men', 'total_women', 'total_women_of_color',
       'only_men_count', 'only_women_count', 'total_Man', 'total_Woman',
       'total_asian', 'total_black', 'total_indian', 'total_latino_hispanic',
       'total_middle_eastern', 'total_white', 'title', 'year', 'revenue',
       'runtime'],
      dtype='object')

In [809]:
df_evo["non_white_count"] = df_evo[['total_asian', 'total_black', 'total_indian', 'total_latino_hispanic',
       'total_middle_eastern']].sum(axis=1)

In [813]:
df_evo["non_white_count_percent"] = df_evo["non_white_count"] / df_evo["total_white"]

In [810]:
df_evo

Unnamed: 0,index,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,...,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year,revenue,runtime,non_white_count
0,0,94.101509,5.898491,91.476710,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,...,6.0,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01,0,113,197.0
1,0,96.304719,3.695281,90.707965,6.342183,2.103468,7.220011,2.615122,4.206936,2.728823,...,127.0,46.0,74.0,48.0,1427.0,The_Lost_World_(1925),1925-01-01,0,93,332.0
2,0,90.578735,9.421265,87.899942,8.260617,2.781516,2.198295,3.095559,4.665769,5.204127,...,49.0,69.0,104.0,116.0,1829.0,Dr._Jekyll_and_Mr._Hyde_(1931),1931-01-01,1300000,98,400.0
3,0,93.626943,6.373057,90.096618,6.843800,3.056995,1.036269,2.020725,4.455959,8.290155,...,20.0,39.0,86.0,160.0,1566.0,Frankenstein_(1931),1931-01-01,12000000,71,364.0
4,0,82.366864,17.633136,79.891304,13.179348,1.775148,2.366864,2.958580,5.325444,1.183432,...,20.0,25.0,45.0,10.0,730.0,White_Zombie_(1932),1932-01-01,0,67,115.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,0,84.794397,15.205603,66.019726,16.958320,5.772707,3.886127,3.648893,8.269318,5.377316,...,344.0,323.0,732.0,476.0,6466.0,Mamma_Mia!_Here_We_Go_Again_(2018),2018-01-01,395044706,113,2386.0
157,0,83.738356,16.261644,74.544818,12.745098,6.156339,2.187120,1.964358,5.629810,2.531389,...,108.0,97.0,278.0,125.0,4026.0,Mary_Poppins_Returns_(2018),2018-01-01,348807090,131,912.0
158,0,80.009255,19.990745,72.528202,21.101526,18.695049,4.488663,2.591393,15.085609,3.146691,...,97.0,56.0,326.0,68.0,1210.0,Bumblebee_(2018),2018-01-01,467989645,114,951.0
159,0,95.707633,4.292367,93.356401,2.076125,7.410407,59.769184,3.158534,4.656813,1.801984,...,2952.0,156.0,230.0,89.0,1146.0,Black_Panther_(2018),2018-01-01,1346739107,134,3793.0


In [820]:
px.scatter(df_evo,
           x="non_white_count",
           y="revenue",
           hover_name="title",
           labels={"revenue": "Movie Revenue [US$]",
                   "non_white_count":"POC on screen count"}, color_discrete_sequence=['rgb(255,185,15)']
          )

In [923]:
import plotly.graph_objects as go
import plotly.subplots as sp

def overall_race_dash(df_evo):
    
    df_evo["non_white_count"] = df_evo[['total_asian', 'total_black', 'total_indian', 'total_latino_hispanic',
       'total_middle_eastern']].sum(axis=1)
    
    df_evo["non_white_count_percent"] = df_evo["non_white_count"] / df_evo["total_white"]


    # Create figures in Express
    figure2 = plot_race_timeline(df_evo,plot_type="bar",step=10)
    
    figure1 = px.scatter(df_evo,
           x="non_white_count",
           y="revenue",
           hover_name="title",
           labels={"revenue": "Movie Revenue [US$]",
                   "non_white_count":"POC on screen count"}, color_discrete_sequence=['rgb(255,185,15)']
          )

    # For as many traces that exist per Express figure, get the traces from each plot and store them in an array.
    # This is essentially breaking down the Express fig into it's traces

    figure1_traces = []
    figure2_traces = []


    for trace in range(len(figure1["data"])):
        figure1_traces.append(figure1["data"][trace])

    for trace in range(len(figure2["data"])):
        figure2_traces.append(figure2["data"][trace])

    #Create a 1x2 subplot
    this_figure = sp.make_subplots(rows=1, cols=2, specs=[[{}, {}]]) 

    # Get the Express fig broken down as traces and add the traces to the proper plot within in the subplot
    for traces in figure1_traces:
        this_figure.append_trace(traces, row=1, col=1)

    for traces in figure2_traces:
        this_figure.append_trace(traces, row=1, col=2)


    this_figure.update_layout(height=500, width=1100)
    this_figure.update_layout(uniformtext_minsize=15)

    this_figure.update_layout(
        title_text=f"Number of POC VS. revenue                                  Screenttime percentage evoultion")
        # Add annotations in the center of the donut pies

    #the subplot as shown in the above image
    return this_figure

In [924]:
overall_race_dash(df_evo)

In [906]:

def r_screentime_donut(df):
    x = [df['asian_screentime'].values[0],
        df['black_screentime'].values[0],
        df['indian_screentime'].values[0],
        df['latino_hispanic_screentime'].values[0],
        df['middle_eastern_screentime'].values[0],
        df['white_screentime'].values[0]
        ]
    names = ['Asian', 'Black', 'Indian', 'Latino Hispanic', 'Middle Eastern', 'White']
    fig = go.Pie(labels=names, values=x, hole=0.5)

    return fig


def woc_screentime_donut(df):
    x = [df['women_of_color'].values[0], (100-df['women_of_color'].values[0])]
    names = ['Women of color present', 'No women of color present']

    fig = go.Pie(labels=names, values=x, hole=0.5)
    
    return fig


In [912]:

def race_dash(movie_stats):

    fig = make_subplots(rows=1, cols=2, specs=[[{"type": "domain"}, {"type": "domain"}]])

    fig1 = r_screentime_donut(df)
    fig2 = woc_screentime_donut(df)

    fig.add_trace(fig1, 
         row=1, col=1)

    fig.add_trace(fig2,
        row=1, col=2)
    
    return fig

In [914]:
print(race_dash(df_one))

Figure({
    'data': [{'domain': {'x': [0.0, 0.45], 'y': [0.0, 1.0]},
              'hole': 0.5,
              'labels': [Asian, Black, Indian, Latino Hispanic, Middle Eastern,
                         White],
              'type': 'pie',
              'values': [4.183813443072702, 0.411522633744856, 2.05761316872428,
                         2.263374485596708, 4.595336076817558, 86.48834019204389]},
             {'domain': {'x': [0.55, 1.0], 'y': [0.0, 1.0]},
              'hole': 0.5,
              'labels': [Women of color present, No women of color present],
              'type': 'pie',
              'values': [0.1982160555004955, 99.8017839444995]}],
    'layout': {'template': '...'}
})


In [919]:
def get_evolution_data():

    df_stats_list = []

    regex = re.compile(r'\((\d{4})\)')

    movie_list = get_movie_list("CSVs")[1:]

    for movie in tqdm(movie_list):
        
        movie = movie.replace(" ","_")

        if movie == "":
            continue

        year = regex.findall(movie)[0]

        df = pd.read_csv(
            f"gs://{BUCKET_NAME_STREAMLIT}/CSVs/{movie}/statistics",
            index_col=None)

        df["title"] = movie
        df["year"] = year

        df["year"] = pd.to_datetime(df["year"].values)
        df.sort_values("year", inplace=True)
        df_stats_list.append(df)

    df_stats_total = pd.concat(df_stats_list, axis=0)
    df_stats_total.sort_values("year", inplace=True)

    def add_revenue(column):

        column = column.replace("_", " ")
        return fetch_movie_details(column).get("revenue", None)

    def add_runtime(column):

        column = column.replace("_", " ")
        return fetch_movie_details(column).get("runtime", None)

    df_stats_total["revenue"] = df_stats_total["title"].apply(add_revenue)
    df_stats_total["runtime"] = df_stats_total["title"].apply(add_runtime)

    return df_stats_total.reset_index()

In [920]:
get_evolution_data()

100%|████████████████████████████████████| 170/170 [00:24<00:00,  6.86it/s]


The Cabinet of Dr. Caligari (1920)
The Hunchback of Notre Dame (1923)
The Lost World (1925)
Frankenstein (1931)
Dr. Jekyll and Mr. Hyde (1931)
White Zombie (1932)
Son of Frankenstein (1939)
The Wolf Man (1941)
The Mummy's Tomb (1942)
Abbott and Costello Meet Frankenstein (1948)
Anne of the Indies (1951)
Roman Holiday (1953)
Abbott and Costello Meet the Mummy (1955)
This Island Earth (1955)
Vertigo (1958)
Psycho (1960)
Cape Fear (1962)
To Kill a Mockingbird (1962)
Doctor Zhivago (1965)
Bonnie and Clyde (1967)
The Love Bug (1968)
The Million Dollar Duck (1971)
Willy Wonka & the Chocolate Factory (1971)
The Godfather: Part II (1974)
Carrie (1976)
Superman (1978)
The Black Hole (1979)
Superman II (1980)
Raiders of the Lost Ark (1981)
Trading Places (1983)
Supergirl (1984)
A Nightmare on Elm Street (1984)
Stand by Me (1986)
The Princess Bride (1987)
A Nightmare on Elm Street 3: Dream Warriors (1987)
Masters of the Universe (1987)
Killer Klowns from Outer Space (1988)
Honey, I Shrunk the Kid

Big Eyes (2014)
X-Men: Days of Future Past (2014)
Need for Speed (2014)
The Inbetweeners 2 (2014)
Muppets Most Wanted (2014)
Teenage Mutant Ninja Turtles (2014)
The Giver (2014)
Fantastic Four (2015)
Avengers: Age of Ultron (2015)
Pitch Perfect 2 (2015)
Suicide Squad (2016)
Sausage Party (2016)
10 Cloverfield Lane (2016)
Deadpool (2016)
Miss Peregrine's Home for Peculiar Children (2016)
The Shallows (2016)
War for the Planet of the Apes (2017)
Ghost in the Shell (2017)
Justice League (2017)
Mary Poppins Returns (2018)
Mamma Mia! Here We Go Again (2018)
Deadpool 2 (2018)
Bumblebee (2018)
A Quiet Place (2018)
Black Panther (2018)
Scary Stories to Tell in the Dark (2019)


Unnamed: 0,index,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,...,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white,title,year,revenue,runtime
0,0,97.765618,2.234382,96.153846,2.354788,3.009576,1.003192,3.237574,5.882353,6.429549,...,66.0,22.0,71.0,129.0,141.0,1764.0,The_Cabinet_of_Dr._Caligari_(1920),1920-01-01,8811,77
1,0,94.101509,5.898491,91.476710,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,...,61.0,6.0,30.0,33.0,67.0,1261.0,The_Hunchback_of_Notre_Dame_(1923),1923-01-01,0,113
2,0,96.304719,3.695281,90.707965,6.342183,2.103468,7.220011,2.615122,4.206936,2.728823,...,37.0,127.0,46.0,74.0,48.0,1427.0,The_Lost_World_(1925),1925-01-01,0,93
3,0,93.626943,6.373057,90.096618,6.843800,3.056995,1.036269,2.020725,4.455959,8.290155,...,59.0,20.0,39.0,86.0,160.0,1566.0,Frankenstein_(1931),1931-01-01,12000000,71
4,0,90.578735,9.421265,87.899942,8.260617,2.781516,2.198295,3.095559,4.665769,5.204127,...,62.0,49.0,69.0,104.0,116.0,1829.0,Dr._Jekyll_and_Mr._Hyde_(1931),1931-01-01,1300000,98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0,92.476230,7.523770,91.089613,6.466395,6.242249,7.730467,5.167425,12.649855,5.994212,...,151.0,187.0,125.0,306.0,145.0,1505.0,Deadpool_2_(2018),2018-01-01,786365638,119
166,0,80.009255,19.990745,72.528202,21.101526,18.695049,4.488663,2.591393,15.085609,3.146691,...,404.0,97.0,56.0,326.0,68.0,1210.0,Bumblebee_(2018),2018-01-01,467989645,114
167,0,91.747573,8.252427,91.101322,7.841410,12.378641,3.478964,4.692557,14.563107,3.074434,...,153.0,43.0,58.0,180.0,38.0,764.0,A_Quiet_Place_(2018),2018-01-01,340677200,91
168,0,95.707633,4.292367,93.356401,2.076125,7.410407,59.769184,3.158534,4.656813,1.801984,...,366.0,2952.0,156.0,230.0,89.0,1146.0,Black_Panther_(2018),2018-01-01,1346739107,134
