In [39]:
import ast
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
import statsmodels.formula.api as smf

import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Directory holding the input CSV files, relative to the notebook.
data_path = './data/'
cmu_character = pd.read_csv(data_path+'cmu_character.csv')
cmu_movies = pd.read_csv(data_path+'cmu_movies.csv')

# 'genre' is stored as the text of a Python literal (presumably a list - confirm
# against the CSV). ast.literal_eval parses literals only, so unlike eval() it
# cannot execute arbitrary code embedded in the data file.
cmu_movies['genre'] = cmu_movies['genre'].apply(ast.literal_eval)

# Is there a better country to act in?

The primary goal is to compare fame scores across the major film-producing nations. We focus on the five countries with the largest number of character entries in the provided database.

In [40]:
def _first_country(raw):
    """Return the first element of a stringified country sequence, or None if it is empty.

    The text is parsed once with ast.literal_eval (literals only, no code
    execution) instead of being passed through eval() twice per row.
    """
    countries = ast.literal_eval(raw)
    return countries[0] if countries else None


# Attach each movie's country to its character rows; 'wikiID' is the only
# column the two selections share, so it is named explicitly as the join key.
actor_country = (
    cmu_character[['wikiID','actor_name','freebase_actorID','fameScore','release_date','actor_ethnicity']]
    .merge(cmu_movies[['wikiID','country']], on='wikiID')
)
# Keep only the first listed country per movie (None when the list is empty).
actor_country['country'] = actor_country['country'].apply(_first_country)

In [41]:
# Count non-null wikiID values per country, then keep the five largest counts.
country_counts = (
    actor_country[['country', 'wikiID']]
    .groupby('country', as_index=False)
    .count()
)
main_countries = country_counts.sort_values('wikiID', ascending=False).head(5)

# Inner join on 'country' keeps only rows belonging to those five countries.
actor_country = actor_country.merge(main_countries['country'], on='country')
main_countries

Unnamed: 0,country,wikiID
104,United States of America,199881
41,India,30834
103,United Kingdom,19579
33,France,16222
49,Japan,6635


In [42]:
# Median fame score per country.
median_country = actor_country[['country','fameScore']].groupby(by=['country'],as_index=False).agg('median')

# NOTE(review): this cell overwrites 'release_date' with numeric bin labels, so
# running it a second time would feed those numbers back into pd.to_datetime -
# re-create actor_country before re-running.
release_year = pd.to_datetime(actor_country['release_date']).dt.year

# Width, in years, of each release-date bin.
BIN_WIDTH = 5

# Cast to int: the year series is float when it contains missing dates, and
# range() rejects floats.
first_year = int(release_year.min())
last_year = int(release_year.max())
bins = range(first_year, last_year + BIN_WIDTH + 1, BIN_WIDTH)

# right=False makes every bin [start, start + BIN_WIDTH) and labels it with its
# start year. With the default right-closed bins the earliest year falls outside
# the first bin (and becomes NaN), and a year equal to a bin edge is labelled
# with the previous bin's start.
actor_country['release_date'] = pd.cut(release_year, bins=bins, labels=bins[:-1], right=False)
# Convert the categorical labels back to plain numbers for filtering/plotting.
actor_country['release_date'] = pd.to_numeric(actor_country['release_date'], errors='coerce')

In [43]:
# One bar per country; bar height is that country's median fame score.
fig = px.bar(
    data_frame=median_country,
    x='country',
    y='fameScore',
    title='Median Fame Score by Country',
    labels={'fameScore': 'Median Fame Score'},
)

# Fixed canvas size, uniform 50px margins, explicit axis titles.
layout_options = {
    'width': 800,
    'height': 500,
    'margin': {'l': 50, 'r': 50, 'b': 50, 't': 50},
    'xaxis_title': 'Country',
    'yaxis_title': 'Median Fame Score',
}
fig.update_layout(**layout_options)

fig.show()

It is evident that the median fame score is consistently higher in the epicenter of the film industry ('USA'), whereas it is notably low in India.

We believe that actor ethnicity is strongly associated with the country of the movie. A chi-square test of independence on the country-by-ethnicity contingency table can provide evidence for this hypothesis.

In [44]:
# Contingency table of counts: rows are countries, columns are actor ethnicities.
contingency_table = pd.crosstab(actor_country['country'], actor_country['actor_ethnicity'])

# Chi-squared test of independence between country and ethnicity.
chi2, p_value, _, _ = chi2_contingency(contingency_table)

print(f'chi2 : {chi2}')
print(f'p_value : {p_value}')

# Largest cell count: used for the colour scale and the annotation contrast
# threshold, so it is computed once rather than for every cell.
max_count = contingency_table.to_numpy().max()

# Heatmap of the raw counts (x = ethnicity columns, y = country rows).
fig = go.Figure(go.Heatmap(
    z=contingency_table.values,
    x=contingency_table.columns,
    y=contingency_table.index,
    colorscale='Blues',
    colorbar=dict(title='Count'),
    zmin=0,
    zmax=max_count,
))

# Write each count inside its cell; use white text on the darker half of the scale.
for country, counts_by_ethnicity in contingency_table.iterrows():
    for ethnicity, count in counts_by_ethnicity.items():
        fig.add_annotation(
            x=ethnicity,
            y=country,
            text=str(count),
            showarrow=False,
            font=dict(color='white' if count > max_count / 2 else 'black')
        )

# Axis titles follow the table orientation: columns (x) are ethnicities and
# rows (y) are countries. The figure shows counts, not correlations.
fig.update_layout(
    title='Contingency Table: Country vs. Actor Ethnicity',
    xaxis=dict(title='actor_ethnicity'),
    yaxis=dict(title='country'),
)

fig.show()

chi2 : 39509.66568106885
p_value : 0.0


Therefore, our observation suggests that only Asian individuals are likely to be cast in Indian movies, whereas the diversity in the USA is much broader. Now, we aim to understand whether the low fame score is attributed to the Indian ethnicity of the actor or if it is primarily influenced by the country of origin of the movie.

For this reason, we are plotting the median fame score by country specifically for individuals of Asian ethnicity.

In [45]:
# Ethnicity group this plot is restricted to.
ASIAN_ETHNICITY = 'Asian, Middle East and Tribes'

# Median fame score per (ethnicity, country), then keep only the selected group.
median_country = actor_country[['country','fameScore','actor_ethnicity']].groupby(by=['actor_ethnicity','country'],as_index=False).agg('median')
median_country = median_country[median_country['actor_ethnicity']==ASIAN_ETHNICITY]

# Bar chart: one bar per country for the selected ethnicity group.
fig = px.bar(
    median_country,
    x='country',
    y='fameScore',
    color='actor_ethnicity',
    labels={'fameScore': 'Median Fame Score'},
    title='Median Fame Score by Country (Asian, Middle East and Tribes)',
)

# The x axis carries countries, not dates, so it is titled accordingly.
fig.update_layout(
    width=800,
    height=500,
    margin=dict(l=50, r=50, b=50, t=50),
    xaxis_title='Country',
    yaxis_title='Median Fame Score',
)

fig.show()

It appears that even for individuals of Asian ethnicity, the advantages in terms of fame score are more pronounced when they are cast in movies produced in the USA.

This observation could be open to discussion based on the origin of the data and the methodology used to compute the fame score. Given that the IMDb dataset was created in the USA, it is plausible to assume that a significant portion of the ratings comes from American or European audiences. Consequently, a tentative conclusion is that, for optimal fame scores among American audiences, an actor may find it beneficial to participate in productions based in the USA or Europe. However, further analysis and considerations may be needed to validate and refine this conclusion.

### Are these conclusions constant over time?

In [47]:
# Median fame score for every (release_date, country) pair.
# NOTE(review): 'release_date' is expected to already hold the numeric period
# labels computed earlier in the notebook - confirm cell order.
median_country = (
    actor_country[['country','fameScore','release_date']]
    .groupby(['release_date','country'], as_index=False)
    .median()
)

# One line per country, release period on the x axis.
fig = px.line(
    data_frame=median_country,
    x='release_date',
    y='fameScore',
    color='country',
    title='Fame Score Over Time by Country',
    labels={'fameScore': 'Fame Score'},
)

# Fixed canvas size, uniform 50px margins, explicit axis titles.
time_layout = {
    'width': 800,
    'height': 500,
    'margin': {'l': 50, 'r': 50, 'b': 50, 't': 50},
    'xaxis_title': 'Release Date',
    'yaxis_title': 'Median Fame Score',
}
fig.update_layout(**time_layout)

fig.show()

It's interesting to note that for older dates, data limitations may affect the analysis due to insufficient information. On the other hand, when examining more recent data, there doesn't seem to be any discernible change over time. This stability in patterns could suggest a consistent trend in the relationship between actor location and fame scores, emphasizing the enduring influence of factors like movie origin on audience recognition.

In [12]:
# Only periods strictly after this year are kept.
limit_year = 1980

# Apply the same cut-off to the row-level data and to the aggregated medians.
is_recent_row = actor_country['release_date'] > limit_year
actor_country = actor_country[is_recent_row]
is_recent_median = median_country['release_date'] > limit_year
median_country = median_country[is_recent_median]

# One line per country, restricted to the recent periods.
fig = px.line(
    data_frame=median_country,
    x='release_date',
    y='fameScore',
    color='country',
    title='Fame Score Over Time by Country',
    labels={'fameScore': 'Fame Score'},
)

# Fixed canvas size, uniform 50px margins, explicit axis titles.
recent_layout = {
    'width': 800,
    'height': 500,
    'margin': {'l': 50, 'r': 50, 'b': 50, 't': 50},
    'xaxis_title': 'Release Date',
    'yaxis_title': 'Fame Score',
}
fig.update_layout(**recent_layout)

fig.show()