In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from scipy import stats
from scipy.stats import f_oneway
import statsmodels.formula.api as smf
import ast
from datetime import datetime

import dash
from dash import dcc, html
from dash.dependencies import Input, Output



# Folder holding the input files, relative to the notebook's working directory.
data_path = './data/'

# Character table that every later cell builds on.
cmu_character = pd.read_csv(data_path + 'cmu_character.csv')

# Recode `actor_ethnicity` in two passes: first through the 'Ethnicities' column of
# ethnicities_data.tsv, then through the 'Ethnic Group' column of ethnicity_group_data.tsv.
# Each file is read with its first column as the lookup key; values without an entry in a
# mapping become NaN. (Presumably raw ids -> ethnicity names -> coarse groups; verify
# against the TSV files.)
for tsv_name, value_column in (
    ('ethnicities_data.tsv', 'Ethnicities'),
    ('ethnicity_group_data.tsv', 'Ethnic Group'),
):
    ethnicity_map = pd.read_table(data_path + tsv_name, sep='\t', index_col=0)[value_column].to_dict()
    cmu_character['actor_ethnicity'] = cmu_character['actor_ethnicity'].map(ethnicity_map)


First, we discretize the actor's age at the time of release into four brackets (0-40, 41-60, 61-80 and 81-100 years). We then assign a group number to each combination of age bracket, ethnicity, and gender, so that every character belongs to exactly one actor group.

In [3]:
# Keep only the characters for which every attribute used in the grouping and in the
# persona analysis is known; .copy() so the new columns are not written into a view.
cplt_character = cmu_character.dropna(
    subset=['actor_age_at_release', 'actor_ethnicity', 'actor_gender', 'persona']
).copy()

# Age bracket edges (in years) and the label of each bracket.
bins = [0, 40, 60, 80, 100]
Ages = ['0-40', '41-60', '61-80', '81-100']

# include_lowest=True closes the first bracket on the left, so an age of exactly 0 lands
# in '0-40' as the label promises (pd.cut's default interval would be (0, 40]).
# Ages outside 0-100 still fall in no bracket and get NaN.
cplt_character['actor_age_gr_at_release'] = pd.cut(
    cplt_character['actor_age_at_release'],
    bins=bins,
    labels=Ages,
    include_lowest=True,
)

# One number per (age bracket, ethnicity, gender) combination.
cplt_character['group'] = cplt_character.groupby(
    by=['actor_age_gr_at_release', 'actor_ethnicity', 'actor_gender']
).ngroup()
cplt_character

Unnamed: 0,wikiID,freebase_movID,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age_at_release,freebase_char_actor_map,freebase_charID,freebase_actorID,release_date,is_principal,persona,fameScore_before,fameScore,fameScore_diff,actor_age_gr_at_release,group
66,156558,/m/014k4y,Yvette,1970-09-11,F,1.650,"Black, Caribbean or African",Taraji P. Henson,30.0,/m/0jtx5t,/m/03jnxj_,/m/0blbxk,2001-06-27,0.0,46.0,1.296952,2.978778,1.681826,0-40,2.0
67,156558,/m/014k4y,Jody,1978-12-30,M,1.797,"Black, Caribbean or African",Tyrese Gibson,22.0,/m/0jtx5h,/m/03jnxf4,/m/01l1b90,2001-06-27,1.0,4.0,0.000000,9.629749,9.629749,0-40,3.0
85,156558,/m/014k4y,Rodney,1971-10-20,M,1.918,"Black, Caribbean or African",Snoop Dogg,29.0,/m/0jtx5n,/m/03jnx_2,/m/01vw8mh,2001-06-27,0.0,41.0,2.778827,3.303080,0.524254,0-40,3.0
98,156558,/m/014k4y,Melvin,1959-05-12,M,1.830,"Black, Caribbean or African",Ving Rhames,42.0,/m/03jnxct,/m/03jnxcx,/m/032zg9,2001-06-27,0.0,35.0,17.236874,16.815890,-0.420985,41-60,11.0
102,156558,/m/014k4y,Sweetpea,1976-10-19,M,1.829,"Black, Caribbean or African",Omar Gooding,24.0,/m/03jnxn4,/m/03jnxn7,/m/05cgxx,2001-06-27,1.0,40.0,0.907537,5.268581,4.361043,0-40,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331070,6456053,/m/0g605h,Paolo Maltese,1927-10-18,M,1.850,White,George C. Scott,37.0,/m/04p4q_p,/m/0hnw0l3,/m/0l786,1964-12-31,0.0,40.0,37.677609,32.431867,-5.245742,0-40,7.0
331076,6456053,/m/0g605h,Joey Friedlander,1918-11-04,M,1.770,White,Art Carney,46.0,/m/0cg660g,/m/0hnw0k2,/m/015qt5,1964-12-31,0.0,40.0,0.000000,6.228965,6.228965,41-60,15.0
331083,22330502,/m/05sync6,Yvonne Valbret,1905-09-18,F,1.702,White,Greta Garbo,25.0,/m/05tg4vb,/m/0h2ppfw,/m/0h14h,1931-01-31,0.0,49.0,8.177769,7.936300,-0.241469,0-40,6.0
331097,24997872,/m/09g6klx,Raja,1951-09-07,M,1.780,"Asian, Middle East and Tribes",Mammootty,58.0,/m/0gvytkx,/m/0h27wx_,/m/02hkv5,2010-05-07,1.0,40.0,0.804361,0.812329,0.007967,41-60,9.0


The following function is defined to determine the group of an actor based on their characteristics:

In [5]:
def find_group(gender, ethni, age):
    """Return the group number shared by characters with the given attributes.

    Parameters
    ----------
    gender : value compared against ``cplt_character['actor_gender']``.
    ethni : value compared against ``cplt_character['actor_ethnicity']``.
    age : value compared against ``cplt_character['actor_age_gr_at_release']``
        (an age-bracket label).

    Returns
    -------
    The first distinct value of the ``group`` column among the matching rows.

    Raises
    ------
    IndexError
        If no row of ``cplt_character`` matches all three attributes.
    """
    matches = (
        (cplt_character['actor_gender'] == gender)
        & (cplt_character['actor_age_gr_at_release'] == age)
        & (cplt_character['actor_ethnicity'] == ethni)
    )
    groups = cplt_character.loc[matches, 'group'].unique()

    # Same exception type as indexing the empty array would raise, but with a message
    # that names the selection instead of "index 0 is out of bounds".
    if len(groups) == 0:
        raise IndexError(
            f'no characters found for gender={gender!r}, ethnicity={ethni!r}, age group={age!r}'
        )
    return groups[0]

A function is then written to plot, for a specified group of actors, the mean and standard deviation of the fame score differential (fameScore_diff) of each persona. Only personas with more than 10 observations in the group are shown.

In [99]:
def plotall(cluster, min_count=10):
    """Plot the mean fame-score differential of each persona within one actor group.

    Parameters
    ----------
    cluster : value of the ``group`` column of ``cplt_character`` selecting the actor group.
    min_count : int, default 10
        A persona is plotted only if it has more than ``min_count`` non-missing
        ``fameScore_diff`` values in the group.

    Returns
    -------
    plotly figure
        Scatter plot with one point per persona: x is the persona id, y the mean of
        ``fameScore_diff``, and the error bar its standard deviation.
    """
    in_cluster = cplt_character['group'] == cluster
    actor_group = (
        cplt_character.loc[in_cluster, ['persona', 'fameScore_diff']]
        .dropna(subset=['persona'])
        .copy()
    )
    actor_group['persona'] = actor_group['persona'].astype(int)

    # 'count' ignores missing fameScore_diff values, so the threshold applies to the
    # observations that actually enter the mean and standard deviation.
    n_observations = actor_group.groupby('persona')['fameScore_diff'].transform('count')
    actor_group = actor_group[n_observations > min_count]

    # persona is kept as an ordinary column and passed to plotly by name, so the x axis
    # does not depend on what ends up in the aggregated frame's index. The previous
    # version plotted the index of a groupby(..., as_index=False).agg([...]) result;
    # whether that index holds the persona ids appears to differ between pandas
    # releases -- NOTE(review): confirm against the installed pandas version.
    persona_stats = (
        actor_group.groupby('persona')['fameScore_diff']
        .agg(['mean', 'std'])
        .reset_index()
    )

    # The labels keys must be the plotted column names for plotly to apply them.
    fig = px.scatter(
        persona_stats,
        x='persona',
        y='mean',
        error_y='std',
        title='Mean fame score differential per persona (error bars: standard deviation)',
        labels={'mean': 'Mean fame score differential', 'persona': 'Persona'},
    )

    return fig


Lastly, an application is developed, allowing users to select a group of actors and assess whether a persona is more likely to yield a positive fame score differential or not.

In [101]:
# Values offered by the gender and ethnicity dropdowns (distinct values found in the data).
Genders = cplt_character['actor_gender'].unique()
Ethnis = cplt_character['actor_ethnicity'].unique()


def _selection_dropdown(component_id, choices):
    """Build one inline dropdown listing ``choices``, preselecting the first entry."""
    return html.Div(
        [
            dcc.Dropdown(
                id=component_id,
                options=[{'label': str(choice), 'value': choice} for choice in choices],
                value=choices[0],  # Set default value
                style={'width': '300px'},
            )
        ],
        style={'display': 'inline-block', 'margin-right': '20px'},
    )


# Dash app
app = dash.Dash(__name__)

app.layout = html.Div([
    # The figure shows the mean fameScore_diff of each persona for the selected actor
    # group (see plotall), not regression coefficients, so the heading says so.
    html.H1("Fame score differential per persona"),

    # The three selectors that together identify an actor group
    _selection_dropdown('gender-selection', Genders),
    _selection_dropdown('ethni-selection', Ethnis),
    _selection_dropdown('age-selection', Ages),

    # Graph filled in by the callback; the component ids are referenced there,
    # so they are kept unchanged.
    dcc.Graph(id='coefficients-graph')
])

# Callback to update the graph whenever one of the three selections changes
@app.callback(
    Output('coefficients-graph', 'figure'),
    [Input('gender-selection', 'value'),
     Input('ethni-selection', 'value'),
     Input('age-selection', 'value')]
)
def update_graph(gender, ethni, age):
    """Return the persona figure for the actor group chosen in the dropdowns.

    Parameters
    ----------
    gender, ethni, age : current values of the gender, ethnicity and age-bracket
        dropdowns; a cleared dropdown sends ``None``.

    Returns
    -------
    The figure built by ``plotall`` for the matching group; ``dash.no_update`` while a
    dropdown is cleared; an empty figure with an explanatory title when no character
    has the selected combination.
    """
    # Nothing sensible to draw until all three selections are set: keep the current figure.
    if gender is None or ethni is None or age is None:
        return dash.no_update

    try:
        cluster = find_group(gender, ethni, age)
    except IndexError:
        # find_group found no row for this combination; show that instead of
        # letting the callback fail.
        empty_figure = go.Figure()
        empty_figure.update_layout(
            title='No characters match the selected gender, ethnicity and age group'
        )
        return empty_figure

    return plotall(cluster)

if __name__ == '__main__':
    app.run_server(debug=True, port=8051)


In the presented analysis, it's evident that none of the personas exhibits a discernible impact on the fameScore_diff. To investigate further into the potential correlation between fameScore_diff and persona within a particular group of actors, we opt to conduct an ANOVA test. This statistical test will help us determine whether there are significant differences in the fameScore_diff among the various personas within the chosen actor group.

In [109]:
# Actor group to test (a value of the 'group' column).
n = 5

# Rows of that group with both a persona and a fame-score differential. A missing
# differential is dropped here because it would make f_oneway return NaN.
group_rows = cplt_character.loc[
    cplt_character['group'] == n, ['persona', 'fameScore_diff']
].dropna()

# Observations per persona. The column names are set explicitly so the cell does not
# depend on how the installed pandas names the output of value_counts().
persona_counts = (
    group_rows['persona']
    .value_counts()
    .rename_axis('persona')
    .reset_index(name='count')
)
# Keep only personas with more than 10 observations in this group.
persona_counts = persona_counts[persona_counts['count'] > 10]

select_lines = group_rows['persona'].isin(persona_counts['persona'])
persona = group_rows.loc[select_lines, 'persona']
fameScore_diff = group_rows.loc[select_lines, 'fameScore_diff']

# One sample of fameScore_diff values per retained persona.
samples = [fameScore_diff[persona == category] for category in persona.unique()]
if len(samples) < 2:
    raise ValueError(
        f'group {n} has fewer than two personas with more than 10 observations; '
        'a one-way ANOVA needs at least two samples'
    )

# One-way ANOVA; null hypothesis: all personas share the same mean fameScore_diff.
f_statistic, p_value = f_oneway(*samples)

print(f'f_statistic : {f_statistic}')
print(f'p_value : {p_value}')

f_statistic : 1.3112106652304976
p_value : 0.16261988000336472


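We notice that, for numerous actor groups (including the one above, with p ≈ 0.16), the null hypothesis of equal mean fameScore_diff across personas cannot be rejected at conventional significance levels (e.g. 5%). We therefore find no statistically significant evidence that the selection of a persona influences whether fameScore increases or not. Note that failing to reject the null hypothesis does not prove that no effect exists.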

In [None]:

    # # actor_group = actor_group[actor_group['fameScore_diff']>0]
    # print(actor_group.shape)

    # formula = 'fameScore_diff ~ C(persona)'

    # mod = smf.ols(formula=formula, data=actor_group)
    # res = mod.fit()

    # # Create a vertical bar plot using plotly.graph_objs
    # fig = go.Figure()
    # fig.add_trace(go.Bar(
    #     x=res.params.index[1:],
    #     y=res.params.values[1:]+res.params.values[0],
    #     marker_color=['cornflowerblue' if p < 0.1 else 'tomato' for p in res.pvalues],
    #     marker=dict(line=dict(width=2)),  # Adjust the width of the bar lines
    #     name=''
    # ))

    # fig.update_layout(
    #     title='Regression coefficients',
    #     xaxis=dict(title='Features'),
    #     yaxis=dict(title='Coefficients'),
    #     legend=dict(orientation='v', yanchor='middle', xanchor='right', traceorder='reversed', itemsizing='constant'),
    #     showlegend=True,
    #     height=500  # Adjust the height of the entire plot
    # )

    # # Add legend
    # fig.add_trace(go.Bar(x=[None], y=[None], marker_color='cornflowerblue', name='p < 0.05 (significant)'))
    # fig.add_trace(go.Bar(x=[None], y=[None], marker_color='tomato', name='p >= 0.05 (not significant)'))