In [10]:
# Modules to import
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json
from importlib import reload
import ast
from sklearn.metrics import silhouette_score
from sklearn.metrics import r2_score  
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [11]:
# Make the project's source folders importable from this notebook.
# A folder is appended only when it is not already on sys.path, so re-running
# this cell does not accumulate duplicate entries.
paths = ['src/data','src/models','src/scripts','src/utils']
for path in paths:
    if path not in sys.path:
        sys.path.append(path)

### IV.3 Theme Study for Cultural Influence Analysis

### US Cultural Influence on foreign countries

In this section, our goal is to analyze the cultural influence of the United States in the film productions of other countries. To do this, we begin by defining a cultural influence score of the U.S. on films. We will then analyze this influence from a geographical perspective and attempt to cluster the countries of the world using this influence score.

In [12]:
# Project-local helper modules (expected to be found through the src/* folders on sys.path)
import scriptculture
import generalUtils
# Reload so that edits made to scriptculture.py are picked up without restarting the kernel.
# The trailing semicolon suppresses the module repr, which would otherwise write the
# absolute local path of the file into the saved notebook output.
reload(scriptculture);

<module 'scriptculture' from 'c:\\Users\\arnau\\OneDrive\\Documents\\Scolarité EPFL\\ADA\\ada-2024-project-teamcsx24\\src/scripts\\scriptculture.py'>

In [13]:
# process_data_us_influence builds the per-country dataset used throughout this study
from scriptculture import process_data_us_influence

df_us_influence_final = process_data_us_influence()

# Preview the first five rows
df_us_influence_final.head(n=5)

Unnamed: 0,Country,US_Term_Count,Number of movies,Naïve_Influence_score,log Number of movies,World_region,NLP US Influence Score
0,France,531,2302,0.230669,7.741534,Europe,0.131262
1,Russia,49,484,0.10124,6.182085,Eastern Europe and Central Asia,0.099043
2,India,262,4754,0.055111,8.466742,South Asia,0.079816
3,Australia,160,748,0.213904,6.617403,Oceania,0.157515
4,New Zealand,34,256,0.132812,5.545177,Oceania,0.195417


For this analysis, we first identified a list of typically American words. We then searched for occurrences of these words in the movie summaries and grouped the results by country. This was the first method we used for this analysis, but its results were not relevant.

We have implemented a U.S. influence score using Transformers (NLP model). The code and explanations for this model are detailed in the "nlp_code" notebook within the "models" directory.

The processed dataset that we'll use for this analysis is completed with the following features:

- US_Term_Count: Number of U.S. terms found in the movies from each country
- Number of movies: Number of movies produced in each country
- Naïve_Influence_score: Ratio of US_Term_Count to the Number of movies
- World_region: Geographical cluster that we identified
- NLP US Influence Score: The U.S. influence score of the country, calculated using the NLP model. It is more precise than the Naïve_Influence_score and will be the focus of this analysis.


From now on, the only influence score we will consider is the U.S. cultural influence score obtained through the NLP method.

Let’s start by observing the regression of this influence score with the log of the number of movies per country:

In [14]:

# Goodness of fit (R^2) of a straight line relating the U.S. influence score
# to the log of the number of movies produced per country
x = df_us_influence_final['log Number of movies']
y = df_us_influence_final['NLP US Influence Score']

# Degree-1 least-squares fit: z holds the fitted coefficients, p evaluates the fitted line
z = np.polyfit(x, y, deg=1)
p = np.poly1d(z)
y_pred = p(x)

# Coefficient of determination between the observed and the fitted scores
r_squared = r2_score(y_true=y, y_pred=y_pred)

print(f"R^2 : {r_squared:.4f}")

R^2 : 0.2279


In [15]:
# Straight-line fit of the influence score against log(number of movies)
log_movies = df_us_influence_final['log Number of movies']
influence = df_us_influence_final['NLP US Influence Score']
z = np.polyfit(log_movies, influence, 1)
p = np.poly1d(z)

# One marker per country; hovering shows the country name and both coordinates
country_points = go.Scatter(
    x=log_movies,
    y=influence,
    mode='markers',
    marker={'color': 'blue', 'size': 6, 'opacity': 0.6},
    text=df_us_influence_final['Country'],
    hovertemplate=(
        "<b>Country:</b> %{text}<br>"
        "<b>log(Number of Movies):</b> %{x}<br>"
        "<b>Influence Score:</b> %{y}<extra></extra>"
    ),
    name='Countries'
)

# Fitted line evaluated on 100 evenly spaced points spanning the observed x range
x_vals = np.linspace(log_movies.min(), log_movies.max(), 100)
y_vals = p(x_vals)
regression_line = go.Scatter(
    x=x_vals,
    y=y_vals,
    mode='lines',
    line={'color': 'red', 'dash': 'dash'},
    name='Regression Line'
)

# Interactive Plotly figure: countries first, regression line on top
fig = go.Figure(data=[country_points, regression_line])

# R² box anchored at (smallest x, largest y) of the data
fig.add_annotation(
    x=log_movies.min(),
    y=influence.max(),
    text=f"R² = {r_squared:.2f}",
    showarrow=False,
    font={'size': 12, 'color': "black"},
    align="left",
    bgcolor="white",
    bordercolor="black",
    borderwidth=1
)

# Axis titles, theme, size and a centered title
fig.update_layout(
    xaxis_title='log Number of Movies',
    yaxis_title='US Influence Score',
    template='plotly_white',
    hovermode='closest',
    width=500,
    height=400,
    title=dict(
        text="US Influence Score vs log(Number of Movies)",
        x=0.5,
        xanchor='center',
        yanchor='top'
    )
)

fig.show()

We calculate the p-value of the coefficient in the regression between log Number of Movies and US Influence Score to test if the coefficient of log Number of Movies is statistically different from 0.

In [16]:
# Ordinary least squares: influence score explained by log(number of movies).
# Q("...") lets the formula reference column names that contain spaces.
mod = smf.ols(
    formula='Q("NLP US Influence Score") ~ Q("log Number of movies")',
    data=df_us_influence_final,
)
res = mod.fit()

# p-value attached to the coefficient of log(number of movies)
slope_term = 'Q("log Number of movies")'
p_value = res.pvalues[slope_term]
print(f'P-value of the regression: {p_value}')

P-value of the regression: 5.838089938611666e-07


Since the p-value is below 0.01, the coefficient of the log Number of Movies variable is statistically significant at the 1% level (99% confidence).
There is therefore a significant relationship between log Number of Movies and US Influence Score: the coefficient is likely different from zero, and the two variables are associated in our model.

Now, we determine which regions of the world have movies most influenced by American culture. To do this, we create a boxplot of the influence scores for countries grouped by world region to observe the distribution of US influence score in each region.

In [17]:
# Keep only countries assigned to a named region ('Other' was not assigned to a specific region)
has_named_region = df_us_influence_final['World_region'].ne('Other')
df_us_influence_final_world_region = df_us_influence_final[has_named_region]

# Regions ordered by increasing median 'NLP US Influence Score'
region_medians = df_us_influence_final_world_region.groupby('World_region')['NLP US Influence Score'].median()
median_order = region_medians.sort_values().index

# One fixed color per region, so a region keeps the same color in every plot
region_color_mapping = dict([
    ("Oceania", "#1f77b4"),                          # Blue
    ("Africa", "#ff7f0e"),                           # Orange
    ("Middle East", "#2ca02c"),                      # Green
    ("South Asia", "#d62728"),                       # Red
    ("East and Southeast Asia", "#9467bd"),          # Violet
    ("South America", "#8c564b"),                    # Brown
    ("Europe", "#e377c2"),                           # Rose
    ("North America", "#7f7f7f"),                    # Grey
    ("Eastern Europe and Central Asia", "#bcbd22"),  # Olive Green
])

# Interactive Plotly boxplot: one box per region, ordered by median
box_title = 'Distribution of US Influence Score by world region'
fig = px.box(
    df_us_influence_final_world_region,
    x='World_region',
    y='NLP US Influence Score',
    category_orders={'World_region': median_order},
    color='World_region',
    color_discrete_map=region_color_mapping,
    title=box_title,
    labels={'World_region': 'World Region', 'NLP US Influence Score': 'Influence Score'}
)

# Quartile computation method, axis formatting, size and centered title
fig.update_traces(quartilemethod="inclusive")
fig.update_layout(
    xaxis_title=None,
    yaxis_title='US Influence Score',
    xaxis_tickangle=45,
    height=475,
    width=600,
    title=dict(
        text="Distribution of US Influence Score by world region",
        x=0.5,
        xanchor='center',
        yanchor='top'
    )
)

fig.show()

In [18]:
!pip install nbformat>=4.2.0

### Clustering of countries

We will cluster the countries based on their U.S. influence score and the log of the number of movies produced.
First, we start by standardizing the two features to ensure consistent clustering.

In [19]:
# The two clustering features
feature_columns = ['log Number of movies', 'NLP US Influence Score']
df_selected = df_us_influence_final[feature_columns]

# StandardScaler standardizes each feature; the scaled array is wrapped back
# into a dataframe carrying the original column names
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_selected), columns=df_selected.columns)

Once the features are standardized, we want to determine the optimal number of clusters to choose for this study, knowing that we will perform clustering using K-Means.
We therefore calculate the silhouette score for different values of K and display them in a line plot.

In [20]:
# Silhouette score of K-Means for K = 2..11 on the standardized features.
# pandas, plotly.express, KMeans and silhouette_score are already imported in the
# notebook's first cell, so they are not imported again here.

X = df_scaled[['log Number of movies', 'NLP US Influence Score']].values

# One record per candidate K; the fixed random_state keeps the curve reproducible
silhouette_records = []
for k in range(2, 12):
    labels = KMeans(n_clusters=k, random_state=10).fit_predict(X)
    score = silhouette_score(X, labels)
    silhouette_records.append({"k": k, "score": score})

# Dataframe form expected by Plotly Express
silhouettes = pd.DataFrame(silhouette_records)

# Interactive line plot of the score against K
fig = px.line(
    silhouettes,
    x="k",
    y="score",
    title="Silhouette Score vs number of clusters K",
    labels={"k": "Number of clusters (K)", "score": "Silhouette Score"},
    markers=True
)

# Customization (axis titles in English, consistent with the rest of the figure)
fig.update_layout(
    xaxis=dict(dtick=1),  # one tick per integer K
    yaxis_title="Silhouette Score",
    xaxis_title="Number of clusters (K)",
    height=400,
    width=500,
    title={
        'text': "Silhouette Score vs number of clusters K",
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

fig.show()

The optimal number of clusters is K=3. We now draw a scatter plot of the countries, with the standardized log of the number of movies on the x-axis and the standardized influence score on the y-axis, to observe how the countries are distributed across the different clusters.

In [21]:
# Fit K-Means with K = 3 on the standardized features
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

# Store each country's cluster id in the main dataframe
df_us_influence_final['cluster'] = labels

country_names = df_us_influence_final['Country']

# Fixed color per cluster id
color_map = dict([
    (2, 'rgb(0, 0, 139)'),
    (1, 'rgba(245, 222, 179, 1)'),
    (0, 'rgb(139, 0, 0)'),
])

fig = go.Figure()

# One scatter trace per cluster so that each gets its own color and legend entry
for cluster_id in range(3):
    in_cluster = labels == cluster_id
    members = X[in_cluster]
    fig.add_trace(go.Scatter(
        x=members[:, 0],  # log Number of movies
        y=members[:, 1],  # Influence score
        mode='markers',
        marker={'color': color_map[cluster_id], 'size': 6, 'opacity': 0.6},
        text=country_names[in_cluster],  # country names shown on hover
        hovertemplate="<b>%{text}</b><br>log Number of movies: %{x}<br>Influence score: %{y}<extra></extra>",
        name=f'Cluster {cluster_id}'
    ))

# Cluster centers drawn as black crosses
centroid_trace = go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    marker={'color': 'black', 'size': 12, 'symbol': 'x'},
    hovertemplate="Centroid<br>log Number of movies: %{x}<br>Influence score: %{y}<extra></extra>",
    name='Centroids'
)
fig.add_trace(centroid_trace)

# Centered title, axis titles and figure size
fig.update_layout(
    title=dict(
        text="KMeans Clustering (K = 3)",
        x=0.5,
        xanchor='center',
        yanchor='top'
    ),
    xaxis_title="Standardized log Number of Movies",
    yaxis_title="Standardized US Influence score",
    height=400,
    width=500,
)

fig.show()

Here, we want to study the distribution of the clusters that we identified across the different world regions we analyzed earlier.
To do this, we will plot a normalized stacked bar chart for each world region to clearly show the proportion of clusters within the regions.

In [22]:
# Fixed color per cluster id
color_map = dict([
    (2, 'rgb(0, 0, 139)'),
    (1, 'rgba(245, 222, 179, 1)'),
    (0, 'rgb(139, 0, 0)'),
])

# Drop the 'Other' region, which was not assigned to a specific region
is_named_region = df_us_influence_final['World_region'] != 'Other'
df_us_influence_final_world_region = df_us_influence_final[is_named_region]

# Number of countries per (region, cluster), one row per region and one column per cluster
region_cluster_counts = (
    df_us_influence_final_world_region
    .groupby(['World_region', 'cluster'])['Country']
    .count()
    .unstack(fill_value=0)
)

# Divide each row by its total so the cluster shares of a region sum to 1
region_totals = region_cluster_counts.sum(axis=1)
region_cluster_proportions = region_cluster_counts.div(region_totals, axis=0)

# Order the regions by increasing share of countries in cluster 2
region_cluster_proportions = region_cluster_proportions.sort_values(by=2, ascending=True)


fig = go.Figure()

# One bar series per cluster; the series are stacked within each region
for cluster in region_cluster_proportions.columns:
    cluster_bar = go.Bar(
        x=region_cluster_proportions.index,
        y=region_cluster_proportions[cluster],  # share of the region's countries in this cluster
        name=f"Cluster {cluster}",
        marker={'color': color_map[cluster]}
    )
    fig.add_trace(cluster_bar)

# Stacked layout, axis formatting, legend, size and centered title
fig.update_layout(
    xaxis_title=None,
    barmode='stack',
    xaxis={'tickangle': 45},
    yaxis={'title': "Proportion of Countries"},
    legend={'title': "Cluster"},
    height=400,
    width=500,
    title=dict(
        text="Distribution of Cluster by World Region",
        x=0.5,
        xanchor='center',
        yanchor='top'
    )
)

fig.show()

Our goal is to compare the clusters we identified with an official globalization index. Using plotly, we plot a world map with countries colored by their cluster. We will then do the same with coloring based on the KOF Globalization Index.

In [23]:
# Choropleth map of the K-Means clusters.
# Cluster ids are labels, not magnitudes: they are converted to strings so Plotly Express
# colors them as discrete categories (with a legend) instead of placing them on a
# continuous color bar. The colors match those used for the clusters in the scatter
# and stacked-bar plots.
cluster_colors = {
    '0': 'rgb(139, 0, 0)',
    '1': 'rgba(245, 222, 179, 1)',
    '2': 'rgb(0, 0, 139)',
}
map_title = "Countries cluster based on US Influence Score and Number of Movies"

# assign() returns a new dataframe, so the integer 'cluster' column of
# df_us_influence_final is left untouched for the other cells
df_cluster_map = df_us_influence_final.assign(
    cluster=df_us_influence_final['cluster'].astype(str)
)

fig = px.choropleth(df_cluster_map,
                    locations='Country',
                    locationmode='country names',
                    color='cluster',  # Color based on the (categorical) cluster column
                    color_discrete_map=cluster_colors,
                    category_orders={'cluster': sorted(cluster_colors)},  # legend order 0, 1, 2
                    labels={'cluster': 'Cluster ID'},
                    title=map_title)

# Figure size and centered title
fig.update_layout(
    height=400,
    width=600,
    title={
        'text': map_title,
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top',
        'font': {'size': 14}
    }
)

fig.show()

In [24]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


And here is the map of countries with their KOF Globalization Index. The data comes from the KOF online website, and we downloaded this data in Excel format.

In [25]:
# KOF globalization index, read from the Excel file stored with the project's additional data
globalization_index_path = 'data/additionalData/globalization_index.xlsx'
df_globalization = pd.read_excel(globalization_index_path)

In [26]:
# Choropleth map of the globalization index, one value per country
fig = px.choropleth(df_globalization,
                    locations='Country',
                    locationmode='country names',
                    color='Globalization',     # Color based on the globalization index
                    color_continuous_scale='RdYlBu',
                    title="Globalization Index by Country",
                    # The key must match the column name exactly (capital "G"),
                    # otherwise the display label is silently ignored
                    labels={'Globalization': 'Globalization Index'}
                    )

# Figure size and centered title
fig.update_layout(
    height=400,
    width=600,
    title={
        'text': "Globalization Index by Country",
        'x': 0.5,  # Center the title
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

fig.show()