In [282]:
import re

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from google.cloud import storage
from tqdm import tqdm

In [268]:
# project id - GCP project id
PROJECT_ID="le-wagon-bootcamp-328018"

# bucket name - GCP bucket whose output/ folder holds the per-movie CSV files
# read in the "Distribution over run time" cells
BUCKET_NAME="diversity-in-cinema-735"

# train data file location (not referenced by any other cell visible in this notebook)
BUCKET_TRAIN_DATA_PATH = "data/training_data"

# Streamlit bucket name - GCP bucket whose CSVs/<title>/statistics files
# feed the timeline and dashboard functions below
BUCKET_NAME_STREAMLIT = "wagon-data-735-movie-diversity"

In [269]:
def get_movie_list(subfolders):

    """
    List the movie folder names stored under a directory of the Streamlit
    bucket (BUCKET_NAME_STREAMLIT).

    Every blob whose name starts with `subfolders` is reduced to the part of
    its name between "<subfolders>/" and "/statistics".

    Parameters:

    subfolders: complete subfolder path as a string where file names should
                be retrieved from in the format folder_1/folder_2/.../folder_n

    Returns:

    list of str, one entry per listed blob, in listing order. A blob named
    exactly "<subfolders>/" (a folder placeholder) yields an empty string;
    callers are expected to skip it.

    """

    client = storage.Client()
    marker = f"{subfolders}/"

    # Read the blob's `name` attribute rather than parsing str(blob): the
    # textual repr also carries the bucket name and generation, which leaked
    # into entries that had no "/statistics" suffix.
    return [
        blob.name.split(marker)[1].split("/statistics")[0]
        for blob in client.list_blobs(BUCKET_NAME_STREAMLIT, prefix=subfolders)
    ]

In [345]:
def get_evolution_data():
    """
    Build one table with the statistics of every movie in the Streamlit bucket.

    For each title returned by get_movie_list("CSVs") (first entry skipped,
    empty entries ignored) the file CSVs/<title>/statistics is read and
    tagged with two extra columns:

    title: the folder name, e.g. "Frankenstein_(1931)"
    year:  the four digits of the first "(YYYY)" group in the title,
           converted to a datetime

    Returns:

    DataFrame sorted by year. reset_index() is called without drop, so the
    row labels each CSV was read with end up in an "index" column.

    Raises:

    ValueError: if a title has no "(YYYY)" part, or if no statistics file
                was found at all.
    """

    year_pattern = re.compile(r'\((\d{4})\)')

    # The first listed entry is skipped, as before (folder placeholder).
    movie_list = get_movie_list("CSVs")[1:]

    frames = []

    for movie in tqdm(movie_list):

        if movie == "":
            continue

        found = year_pattern.search(movie)
        if found is None:
            raise ValueError(f"no '(YYYY)' release year in movie title {movie!r}")

        frame = pd.read_csv(
            f"gs://{BUCKET_NAME_STREAMLIT}/CSVs/{movie}/statistics", index_col=None)

        frame["title"] = movie
        frame["year"] = found.group(1)
        frames.append(frame)

    if not frames:
        raise ValueError("no movie statistics found under CSVs/ in the bucket")

    df_stats_total = pd.concat(frames, axis=0)

    # Convert the year strings once for the whole table. `.values` keeps the
    # assignment positional, because the concatenated row labels repeat.
    df_stats_total["year"] = pd.to_datetime(df_stats_total["year"].values)

    return df_stats_total.sort_values("year").reset_index()

In [330]:
def plot_gender_timeline(df, plot_type="bar"):
    """
    Plot the average gender screentime per 5-year period.

    Parameters:

    df: frame with a datetime "year" column and the numeric columns
        man_screentime, woman_screentime, only_men and only_women.
    plot_type: "bar" or "line".

    The figure is displayed with fig.show(); nothing is returned.

    Raises:

    ValueError: if plot_type is neither "bar" nor "line".
    """

    if plot_type not in ("bar", "line"):
        raise ValueError(f"plot_type must be 'bar' or 'line', got {plot_type!r}")

    value_columns = ['man_screentime', 'woman_screentime', 'only_men', 'only_women']
    start_year, step_years = 1920, 5

    # Assign every movie to the period that contains its release year, on a
    # grid anchored at 1920. The previous left merge onto the grid dates kept
    # only movies released exactly in 1920, 1925, ... and dropped the rest.
    release_year = df["year"].dt.year
    period_year = start_year + (release_year - start_year) // step_years * step_years
    period_start = pd.to_datetime(
        {"year": period_year.to_numpy(), "month": 1, "day": 1})

    # Average only the plotted columns, so text columns such as "title"
    # never reach mean().
    df_grouped = (
        df[value_columns]
        .groupby(period_start.to_numpy())
        .mean()
        .rename_axis("year")
        .dropna()
    )

    labels = {"value": "Screentime [%]", "year": ""}

    if plot_type == "bar":
        fig = px.bar(df_grouped, x=df_grouped.index,
                     y=value_columns,
                     barmode="overlay", labels=labels)
    else:
        fig = px.line(df_grouped, x=df_grouped.index,
                      y=value_columns,
                      labels=labels)

    fig.show()

In [331]:
def plot_race_timeline(df, plot_type="bar", step="5YS"):
    """
    Plot the average screentime per race category per period of `step` years.

    Parameters:

    df: frame with a datetime "year" column and the numeric columns
        asian_screentime, black_screentime, indian_screentime,
        latino_hispanic_screentime, middle_eastern_screentime,
        white_screentime and women_of_color.
    plot_type: "bar" or "line".
    step: period length in years, given as an int (5), a digit string ("5")
          or a year-start alias such as the default "5YS". Only the leading
          digits are used.

    The figure is displayed with fig.show(); nothing is returned.

    Raises:

    ValueError: if plot_type is neither "bar" nor "line", or if `step` does
                not start with a positive number.
    """

    if plot_type not in ("bar", "line"):
        raise ValueError(f"plot_type must be 'bar' or 'line', got {plot_type!r}")

    # The default "5YS" used to be formatted into the invalid frequency
    # "5YSYS"; read just the leading number of years instead.
    step_text = str(step).strip()
    digits = step_text[: len(step_text) - len(step_text.lstrip("0123456789"))]
    if not digits or int(digits) == 0:
        raise ValueError(f"step must start with a positive number of years, got {step!r}")
    step_years = int(digits)
    start_year = 1920

    value_columns = ['asian_screentime', 'black_screentime', 'indian_screentime',
                     'latino_hispanic_screentime', 'middle_eastern_screentime',
                     'white_screentime', 'women_of_color']

    # Assign every movie to the period that contains its release year, on a
    # grid anchored at 1920. The previous left merge onto the grid dates kept
    # only movies released exactly on a grid year and dropped the rest.
    release_year = df["year"].dt.year
    period_year = start_year + (release_year - start_year) // step_years * step_years
    period_start = pd.to_datetime(
        {"year": period_year.to_numpy(), "month": 1, "day": 1})

    # Average only the plotted columns, so text columns such as "title"
    # never reach mean().
    df_grouped = (
        df[value_columns]
        .groupby(period_start.to_numpy())
        .mean()
        .rename_axis("year")
        .dropna()
    )

    labels = {"value": "Screentime [%]", "year": ""}

    if plot_type == "bar":
        fig = px.bar(df_grouped, x=df_grouped.index,
                     y=value_columns,
                     barmode="overlay", labels=labels)
    else:
        fig = px.line(df_grouped, x=df_grouped.index,
                      y=value_columns,
                      labels=labels)

    fig.show()

In [346]:
# Build the combined statistics table; get_evolution_data reads one CSV per
# movie from the bucket, so this cell needs GCP access and takes a while.
df_evo = get_evolution_data()

100%|█████████████████████████████████████████| 116/116 [00:17<00:00,  6.60it/s]


In [394]:
# Show the gender screentime timeline as a line chart.
plot_gender_timeline(df_evo,plot_type="line")

In [357]:
# Derive the overview table from df_evo. The previous version read
# `df_stats_list`, which exists only inside get_evolution_data(), so the cell
# failed with NameError after Restart & Run All.
df_stats_total = (
    df_evo
    .drop(columns="index", errors="ignore")  # helper column added by reset_index()
    .sort_values("year")
    .reset_index(drop=True)
)
df_stats_total

Unnamed: 0,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,white_screentime,women_of_color,title,year
0,94.101509,5.898491,91.476710,5.946482,4.183813,0.411523,2.057613,2.263374,4.595336,86.488340,0.198216,The_Hunchback_of_Notre_Dame_(1923),1923-01-01
1,93.626943,6.373057,90.096618,6.843800,3.056995,1.036269,2.020725,4.455959,8.290155,81.139896,0.241546,Frankenstein_(1931),1931-01-01
2,90.578735,9.421265,87.899942,8.260617,2.781516,2.198295,3.095559,4.665769,5.204127,82.054733,0.349040,Dr._Jekyll_and_Mr._Hyde_(1931),1931-01-01
3,82.366864,17.633136,79.891304,13.179348,1.775148,2.366864,2.958580,5.325444,1.183432,86.390533,0.000000,White_Zombie_(1932),1932-01-01
4,90.473888,9.526112,86.661912,5.991441,4.013540,0.918762,1.160542,6.721470,3.384913,83.800774,0.499287,The_Wolf_Man_(1941),1941-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,83.677686,16.322314,81.455399,18.075117,19.421488,5.922865,1.859504,4.338843,3.719008,64.738292,3.442879,Ghost_in_the_Shell_(2017),2017-01-01
93,96.786632,3.213368,95.523724,4.297225,6.041131,8.226221,2.313625,5.526992,14.267352,63.624679,0.984781,War_for_the_Planet_of_the_Apes_(2017),2017-01-01
94,80.009255,19.990745,72.528202,21.101526,18.695049,4.488663,2.591393,15.085609,3.146691,55.992596,10.749834,Bumblebee_(2018),2018-01-01
95,91.747573,8.252427,91.101322,7.841410,12.378641,3.478964,4.692557,14.563107,3.074434,61.812298,2.995595,A_Quiet_Place_(2018),2018-01-01


## Visualize

In [388]:
# Keep the first row that has no missing values as a one-movie sample frame.
one_movie = df_stats_total.dropna().head(1)
one_movie.columns

Index(['man_screentime', 'woman_screentime', 'only_men', 'only_women',
       'asian_screentime', 'black_screentime', 'indian_screentime',
       'latino_hispanic_screentime', 'middle_eastern_screentime',
       'white_screentime', 'women_of_color', 'title', 'year'],
      dtype='object')

In [375]:
# Two gender columns of the sample movie, transposed so each becomes a row.
one_movie_gender = one_movie[["man_screentime", "woman_screentime"]].T

In [376]:
# Display the transposed gender shares of the sample movie.
one_movie_gender

Unnamed: 0,0
man_screentime,94.101509
woman_screentime,5.898491


In [469]:
def man_woman_screentime_bar(df):

    """
    Bar chart of the men / women screentime split of one movie.

    Input: Original movie overview dataframe; only its first row is plotted.

    Returns the plotly express figure (it is not shown here).
    """

    # Take the first row by position. The previous `.T[0]` lookup needed the
    # row to carry index label 0 and raised KeyError for any other slice.
    shares = (
        df[["man_screentime", "woman_screentime"]]
        .iloc[0]
        .rename_axis("category")
        .reset_index(name="screentime")
    )

    fig = px.bar(shares,
                 x="category",
                 y="screentime",
                 labels={"category": "", "screentime": "Screentime [%]"},
                 color="category")
    return fig

In [531]:
import plotly.graph_objects as go

def race_screentime_bar(df):

    """
    Bar chart (plotly graph_objects) of the screentime share per race
    category of one movie, with bars coloured by their value.

    Input: Original movie overview dataframe; only its first row is plotted.

    Returns the figure (it is not shown here).

    NOTE: a later cell of this notebook defines another race_screentime_bar
    based on plotly express; once that cell has run it replaces this one.
    """

    race_columns = ['asian_screentime', 'black_screentime', 'indian_screentime',
                    'latino_hispanic_screentime', 'middle_eastern_screentime',
                    'white_screentime']

    # Take the first row by position. The previous `.T[0]` lookup needed the
    # row to carry index label 0 and raised KeyError for any other slice.
    shares = df[race_columns].iloc[0]

    fig = go.Figure()
    fig.add_trace(go.Bar(x=shares.index,
                         y=shares,
                         marker=dict(color=shares)))

    fig.update_layout(legend_title_text = "Race")
    fig.update_xaxes(title_text="")
    fig.update_yaxes(title_text="Screentime [%]")

    # The title used to read "... - Gender" although the chart shows race.
    fig.update_layout(
        title_text="Screentime Percentage Distribution - Race")

    return fig

In [532]:
def race_screentime_bar(df):

    """
    Bar chart (plotly express) of the screentime share per race category of
    one movie, one coloured bar per category.

    Input: Original movie overview dataframe; only its first row is plotted.

    Returns the figure (it is not shown here).
    """

    race_columns = ['asian_screentime', 'black_screentime', 'indian_screentime',
                    'latino_hispanic_screentime', 'middle_eastern_screentime',
                    'white_screentime']

    # Take the first row by position. The previous `.T[0]` lookup needed the
    # row to carry index label 0 and raised KeyError for any other slice.
    shares = (
        df[race_columns]
        .iloc[0]
        .rename_axis("category")
        .reset_index(name="screentime")
    )

    fig = px.bar(shares,
                 x="category",
                 y="screentime",
                 labels={"category": "", "screentime": "Screentime [%]"},
                 color="category")

    fig.update_layout(
        title_text="Screentime Percentage Distribution - Race")

    return fig

In [533]:
# Render the race chart for the sample movie (the returned figure is the cell output).
race_screentime_bar(one_movie)

## Distribution over run time

In [395]:
def gcp_file_names(bucket_name, subfolders):

    """
    Function to grab the CSV file names from a GCP bucket directory

    Parameters:

    bucket_name: Name of GCP bucket
    subfolders: complete subfolder path as a string where file names should
                be retrieved from in the format folder_1/folder_2/.../folder_n

    Returns:

    list of str, one entry per listed blob, in listing order. Each entry is
    the text between "<subfolders>/" and the first ".csv" of the blob name,
    stripped of surrounding whitespace, with ".csv" appended again. A blob
    named exactly "<subfolders>/" (a folder placeholder) therefore yields
    ".csv".

    """

    client = storage.Client()
    marker = f"{subfolders}/"

    # Read the blob's `name` attribute rather than parsing str(blob): the
    # textual repr also carries the bucket name and generation.
    return [
        blob.name.split(marker)[1].split(".csv")[0].strip() + ".csv"
        for blob in client.list_blobs(bucket_name, prefix=subfolders)
    ]

In [407]:
# List the CSV files stored under output/ and drop summary.csv
# (presumably an aggregate rather than a single movie - confirm).
# list.remove raises ValueError if summary.csv is not in the listing.
file_names = gcp_file_names(BUCKET_NAME, "output")
file_names.remove("summary.csv")


# Pick one file to explore: position 30 of the listing (no reason for this
# particular index is given in the notebook).
file = file_names[30]
file

'Diary of a Wimpy Kid: Dog Days (2012).csv'

In [400]:
# Load the chosen movie's CSV from the bucket (one row per detected face,
# judging by the columns displayed below).
df = pd.read_csv(
    f"gs://{BUCKET_NAME}/output/{file}", index_col=None,)

In [401]:
# Inspect the loaded frame.
df

Unnamed: 0.1,Unnamed: 0,frame_number,face_id,gender,race
0,0,13,0.0,Man,white
1,1,13,1.0,Man,white
2,2,13,2.0,Man,white
3,3,13,3.0,Man,white
4,4,13,4.0,Man,white
...,...,...,...,...,...
5498,5,10486,5498.0,Man,white
5499,0,10489,5499.0,Man,white
5500,0,10492,5500.0,Man,latino hispanic
5501,0,10507,5501.0,Man,white


In [402]:
# add seconds column -> 1 frame = 0.5 seconds
# NOTE(review): this assumes frame_number counts frames sampled twice per
# second - confirm against the extraction step.
df["seconds"] = df["frame_number"] / 2

In [415]:
# add minutes, rounded to 4 decimals (round() on a Series rounds element-wise)
df["minutes"] = round((df["seconds"] / 60), 4)

In [416]:
# Inspect the frame with the new seconds / minutes columns.
df

Unnamed: 0.1,Unnamed: 0,frame_number,face_id,gender,race,seconds,minutes
0,0,13,0.0,Man,white,6.5,0.1083
1,1,13,1.0,Man,white,6.5,0.1083
2,2,13,2.0,Man,white,6.5,0.1083
3,3,13,3.0,Man,white,6.5,0.1083
4,4,13,4.0,Man,white,6.5,0.1083
...,...,...,...,...,...,...,...
5498,5,10486,5498.0,Man,white,5243.0,87.3833
5499,0,10489,5499.0,Man,white,5244.5,87.4083
5500,0,10492,5500.0,Man,latino hispanic,5246.0,87.4333
5501,0,10507,5501.0,Man,white,5253.5,87.5583


In [417]:
# Count rows per (minutes, gender) pair; every remaining column then holds
# the number of non-null values in that group.
df_grouped = df.groupby(["minutes", "gender"], as_index=False).count()
df_grouped.head()

Unnamed: 0.1,minutes,gender,Unnamed: 0,frame_number,face_id,race,seconds
0,0.1083,Man,6,6,6,6,6
1,0.1333,Man,6,6,6,6,6
2,0.1583,Man,1,1,1,1,1
3,0.1583,Woman,1,1,1,1,1
4,0.1833,Man,1,1,1,1,1


In [460]:
def run_time(movie_title, by="gender"):
    """
    Scatter plot of the number of detected faces per minute of a movie.

    Parameters:

    movie_title: movie name; underscores are turned into spaces and a
                 ".csv" suffix is removed if present and appended again,
                 giving the file name looked up under output/ in BUCKET_NAME.
    by: column used to split and colour the counts (default "gender").

    Returns the plotly express figure (it is not shown here).
    """

    file_name = movie_title.replace("_", " ").replace(".csv", "") + ".csv"

    detections = pd.read_csv(
        f"gs://{BUCKET_NAME}/output/{file_name}", index_col=None)

    # 1 frame = 0.5 seconds; minutes are rounded to whole numbers
    seconds = detections["frame_number"] / 2
    detections = detections.assign(seconds=seconds,
                                   minutes=(seconds / 60).round())

    # rows per (minute, category); face_id then holds its non-null count
    faces_per_minute = detections.groupby(["minutes", by], as_index=False).count()

    axis_labels = {"face_id": "Number of detected faces",
                   "minutes": "Film length [minutes]"}

    return px.scatter(faces_per_minute,
                      x="minutes",
                      y="face_id",
                      size="face_id",
                      color=by,
                      size_max=60,
                      labels=axis_labels,
                      title=f"Distribution of {by.capitalize()} Over Film Run-time")

In [461]:
# Faces per minute, coloured by gender, for one example movie.
run_time("2012 (2009)", by="gender")

## Dashboard

In [543]:
import plotly.graph_objects as go
import plotly.subplots as sp

def dashboard_gender(movie_title):
    """
    Combine three charts of one movie into a single dashboard figure.

    Layout (2 rows x 2 columns, the top chart spans both columns):

    row 1:          faces per minute by gender      (run_time)
    row 2, col 1:   screentime per race category    (race_screentime_bar)
    row 2, col 2:   men / women screentime split    (man_woman_screentime_bar)

    Parameters:

    movie_title: movie name; spaces are replaced by underscores to build the
                 statistics file location in the Streamlit bucket.

    Returns the combined plotly figure (it is not shown here).
    """

    # grab csvs
    movie_name = movie_title.replace(" ", "_")
    file_name = f'https://storage.googleapis.com/{BUCKET_NAME_STREAMLIT}/CSVs/{movie_name}/statistics'
    df = pd.read_csv(file_name)

    # Build the individual figures, each paired with its subplot cell.
    panels = [
        (run_time(movie_name, by="gender"), 1, 1),
        (race_screentime_bar(df), 2, 1),
        (man_woman_screentime_bar(df), 2, 2),
    ]

    this_figure = sp.make_subplots(rows=2, cols=2, specs=[[{'colspan': 2}, None],
                                               [{}, {}]])

    # Copy every trace of each figure into its cell. add_trace with row/col
    # replaces the deprecated append_trace.
    for figure, row, col in panels:
        for trace in figure.data:
            this_figure.add_trace(trace, row=row, col=col)

    this_figure.update_layout(height=900, width=1100)
    this_figure.update_layout(uniformtext_minsize=15)

    this_figure.update_layout(
        title_text=f"{movie_name} Gender Statistics")

    return this_figure

In [544]:
# Render the dashboard for one example movie (reads its files from the buckets).
dashboard_gender("102 Dalmatians (2000)")

## Horizontal bar plot of the movies with the most women on screen

In [551]:
from sklearn.preprocessing import OneHotEncoder


In [552]:
def output_preproc(df):
    '''One Hot Encodes gender and race data from output dataframe

    For every distinct value of the "gender" column and then of the "race"
    column, a float column named after that value is added, holding 1.0 on
    the rows with that value and 0.0 elsewhere. Columns are added in sorted
    order of the values; an existing column of the same name is overwritten.
    Missing values do not get a column of their own.

    The input frame is left untouched; an extended copy is returned.
    '''
    encoded = df.copy()

    # pandas.get_dummies gives the same indicator columns without depending on
    # OneHotEncoder's `sparse` argument, which newer scikit-learn no longer accepts.
    for source_column in ('gender', 'race'):
        indicators = pd.get_dummies(df[source_column], dtype=float)

        for cat in indicators.columns:
            encoded[cat] = indicators[cat]

    return encoded


def woman_of_color(x):
    '''Return 1 if the label `x` contains 'Woman' and does not contain 'white', else 0'''
    mentions_woman = 'Woman' in x
    mentions_white = 'white' in x
    return int(mentions_woman and not mentions_white)

In [571]:
def baseline_stats(df):
    '''Creates a dataframe of engineered/composite features from preprocessed output

    `df` needs the columns frame_number, gender and race (one row per
    detected face). The result is a single-row frame with:

    total_frames / total_seconds: number of distinct frame_number values and
        that number divided by 2
    total_faces, total_men, total_women, total_women_of_color: face counts
    only_men_count / only_women_count: frames without any woman / any man
    total_<category>: face count per gender and race category

    Categories that never occur in the movie count as 0.
    '''
    categories = ["Man",
                  "Woman",
                  "asian",
                  "black",
                  "indian",
                  "latino hispanic",
                  "middle eastern",
                  "white"]

    encoded = output_preproc(df)

    # Flag women of colour per face; assign() keeps `encoded` unchanged.
    flagged = encoded.assign(
        women_of_color=(encoded['gender'] + ' ' + encoded['race']).apply(woman_of_color))

    # Only the numeric indicator columns are needed per frame.
    per_frame = flagged.groupby('frame_number').sum(numeric_only=True)

    # A movie without any face of some category has no indicator column for
    # it. Add zero columns up front so the 'Man' / 'Woman' lookups below
    # cannot raise KeyError.
    for cat in categories:
        if cat not in per_frame.columns:
            per_frame[cat] = 0.0

    per_frame['face_count'] = per_frame['Man'] + per_frame['Woman']

    only_men = len(per_frame[per_frame['Woman'] == 0])
    only_women = len(per_frame[per_frame['Man'] == 0])

    dict_stats = {
        'total_frames': [len(per_frame)],
        'total_seconds': [len(per_frame) / 2],
        'total_faces': [per_frame['face_count'].sum()],
        'total_men': [per_frame['Man'].sum()],
        'total_women': [per_frame['Woman'].sum()],
        'total_women_of_color': [per_frame['women_of_color'].sum()],
        'only_men_count': [only_men],
        'only_women_count': [only_women]
    }

    for cat in categories:
        dict_stats["total" + "_" + cat.replace(" ", "_")] = [per_frame[cat].sum()]

    df_stats = pd.DataFrame.from_dict(dict_stats)

    return df_stats


def final_stats(df):
    '''Creates final statistical dataframe for use in dashboard

    Every percentage column is a count from baseline_stats(df) divided by a
    total and multiplied by 100; the baseline columns are appended after the
    percentage columns.
    '''
    totals = baseline_stats(df)

    # (output column, numerator column, denominator column)
    # NOTE(review): women_of_color divides a face count by total_frames while
    # the other face counts divide by total_faces - confirm this is intended.
    ratio_spec = [
        ('man_screentime', 'total_men', 'total_faces'),
        ('woman_screentime', 'total_women', 'total_faces'),
        ('only_men', 'only_men_count', 'total_frames'),
        ('only_women', 'only_women_count', 'total_frames'),
        ('asian_screentime', 'total_asian', 'total_faces'),
        ('black_screentime', 'total_black', 'total_faces'),
        ('indian_screentime', 'total_indian', 'total_faces'),
        ('latino_hispanic_screentime', 'total_latino_hispanic', 'total_faces'),
        ('middle_eastern_screentime', 'total_middle_eastern', 'total_faces'),
        ('white_screentime', 'total_white', 'total_faces'),
        ('women_of_color', 'total_women_of_color', 'total_frames'),
    ]

    percentages = {}
    for target, numerator, denominator in ratio_spec:
        percentages[target] = totals[numerator] / totals[denominator] * 100

    return pd.concat([pd.DataFrame.from_dict(percentages), totals], axis=1)

In [572]:
# Build the file name the same way run_time does (underscores -> spaces,
# exactly one ".csv" suffix) and load that movie's CSV from the bucket.
# Note that this rebinds the notebook-level `df`.
movie_title = "102 Dalmatians (2000)".replace("_", " ").replace(".csv", "") + ".csv"

df = pd.read_csv(
f"gs://{BUCKET_NAME}/output/{movie_title}", index_col=None,)

In [573]:
# Compute the dashboard statistics for the loaded movie (single-row frame).
final_stats(df)

<function final_stats at 0x13d07e8b0>


Unnamed: 0,man_screentime,woman_screentime,only_men,only_women,asian_screentime,black_screentime,indian_screentime,latino_hispanic_screentime,middle_eastern_screentime,white_screentime,...,only_men_count,only_women_count,total_Man,total_Woman,total_asian,total_black,total_indian,total_latino_hispanic,total_middle_eastern,total_white
0,85.389134,14.610866,79.496592,12.742528,3.04699,1.321586,0.734214,4.038179,2.53304,88.325991,...,1516,243,2326.0,398.0,83.0,36.0,20.0,110.0,69.0,2406.0
