In [79]:
# Modules to import
import ast
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json
from importlib import reload
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [31]:
# Make the project's source folders importable from this notebook.
# The entries are relative, so they resolve against the kernel's working directory.
paths = ['src/data', 'src/models', 'src/scripts', 'src/utils']
for path in paths:
    # Skip entries that are already registered so re-running this cell
    # does not keep growing sys.path with duplicates.
    if path not in sys.path:
        sys.path.append(path)

### IV.4 Character Study for Cultural Influence Analysis

### Cultural influence through characters analysis

The goal of this section is to study the cultural influence of countries on each other through the appearance of characters from a certain origin in film productions.

Quantitatively, we will assign influence points to each character corresponding to the number of countries that have produced films featuring the character.

The first step is to assign an origin country to each character.

Initially, we will use a naive method, assigning the country of production of the first film in which the character appears.

We will then consider a more precise method based on an NLP model to assign the origin country to each character.

In [101]:
# Helper from the project's scriptculture module that processes the data for this character study
from scriptculture import process_data_character

df_character_influence = process_data_character()

# Preview the first five rows
df_character_influence.head(n=5)

Unnamed: 0,Character,character_actor_freebase_id,actor_name,first_movie_name,first_apperance_date,origin_country,all_countries,number_countries_score
0,'Baby' Louise,/m/0c0lv89,Diane Pace,Gypsy,1962-01-01,[United States of America],[United States of America],1
1,ACP Jai Dixit,/m/0220_c2,Abhishek Bachchan,Dhoom,2004-08-27,[India],[India],1
2,Abigail Chase,/m/0k14v6,Diane Kruger,National Treasure,2004-11-08,[United States of America],[United States of America],1
3,Abraham Lincoln,/m/0k89pf,Henry Fonda,Young Mr. Lincoln,1939-01-01,[United States of America],[United States of America],1
4,Abraham Van Helsing,/m/0jxjsg,Edward Van Sloan,Dracula,1931-02-12,[United States of America],"[United Kingdom, United States of America, Fra...",4


For this analysis, we grouped the characters that appeared in different countries (from the character_cluster database). 

The processed dataset that we'll use for this analysis is completed with the following features:
 - first_movie_name : The first movie in which the character appeared
 - first_apperance_date : The date of release of the first movie 
 - origin_country : The country of the first movie
 - all_countries : The countries of origin of all the movies in which the character has appeared 
 - number_countries_score : The number of countries that have produced films featuring the character


In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns of the character table
df_character_influence.describe()

Unnamed: 0,number_countries
count,970.0
mean,1.797938
std,1.161116
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,8.0


970 characters appear in multiple films; on average, each of them is featured in productions from 1.8 different countries, and at most from 8 different countries. Let's look at the distribution of the number of countries.

In [96]:
import plotly.express as px

# Histogram of the per-character country count.
# NOTE(review): the head() output shown earlier in this notebook lists the score column as
# `number_countries_score`, while this cell reads `number_countries` — confirm the name on a fresh run.
fig = px.histogram(
    df_character_influence,
    x='number_countries',
    nbins=20,
    title='Distribution of the number of countries that have produced films featuring the character',
    labels={'number_countries': 'Number of Countries'},
    template='plotly_dark',  # any other plotly theme works too (e.g. 'plotly_white')
)

# Figure size, axis titles, and a horizontally centred title
fig.update_layout(
    title=dict(x=0.5, xanchor='center'),
    xaxis_title='Number of countries that have produced films featuring the character',
    yaxis_title='Frequency',
    width=900,
    height=600,
)

# Render the figure
fig.show()

In [116]:
# Per-column non-null counts for the characters featured in productions from more than one country
print(df_character_influence[df_character_influence['number_countries'] > 1].count())

# Total influence points to hand out: the sum of all country counts minus one per character.
# The number of characters is read from the data instead of being hard-coded (it was 970),
# so the figure stays correct if the underlying dataset changes.
n_characters = df_character_influence['number_countries'].count()
print("number of influence point to be given :", df_character_influence['number_countries'].sum() - n_characters)

Character                      428
character_actor_freebase_id    428
actor_name                     428
first_movie_name               428
first_apperance_date           428
origin_country                 428
all_countries                  428
number_countries               428
dtype: int64
number of influence point to be given : 774


Of the 970 characters, 428 appear in movies produced across different countries.

For each country, beyond the first, that has produced films featuring the character, we will assign 1 character influence point to the character's origin country (the total is the sum of the country counts minus one per character). This results in a total of 774 points being assigned.

A major concern that remains is whether the method of taking the first movie appearance as the origin country of a character is reliable.

Let's test this with the character that appears in movies produced in the most countries.

In [None]:
# Show the character(s) featured in productions from the largest number of countries,
# together with the origin countries assigned by the naive "first movie" method.
# The maximum is taken from the data rather than hard-coded (it was 8 at the time of writing).
max_countries = df_character_influence['number_countries'].max()

df_character_influence[df_character_influence['number_countries'] == max_countries].head()

Unnamed: 0,Character,character_actor_freebase_id,actor_name,first_movie_name,first_apperance_date,origin_country,all_countries,number_countries
120,Cardinal Richelieu,/m/02vbtwv,Charlton Heston,The Three Musketeers,1973-01-01,"[United States of America, Spain, United Kingdom]","[Australia, Spain, Germany, Austria, France, P...",8


The character appearing in productions from the largest number of different countries is Cardinal Richelieu (of French origin). However, with our initial method, the influence points would have been attributed to the United Kingdom, the United States and Spain rather than to France.

To overcome these inconsistencies, we decided to reuse the zero-shot classification method, already employed to analyze the cultural influence of the United States through movie summaries.

In [108]:
# Re-execute the scriptculture module in the running kernel so that names imported from it
# afterwards come from the current version of the file (reload is importlib.reload).
import scriptculture
reload(scriptculture)

<module 'scriptculture' from 'c:\\Users\\Oscar\\Project ADA\\ada-2024-project-teamcsx24\\src/scripts\\scriptculture.py'>

In [None]:
# Helper that processes the data for this study using the NLP-based origin attribution
from scriptculture import process_character_nlp

df_influence_character = process_character_nlp()

# Preview the first five rows
df_influence_character.head(n=5)

Unnamed: 0,original_title,Character,Best_Country,number_countries_score,release_year,countries,new_countries
0,Gypsy,'Baby' Louise,United States of America,1,1962.0,[United States of America],0
1,Gypsy,'Baby' Louise,United States of America,1,1993.0,[United States of America],0
3,Dhoom,ACP Jai Dixit,India,1,2004.0,[India],0
2,Dhoom 2,ACP Jai Dixit,India,1,2006.0,[India],0
5,National Treasure,Abigail Chase,United States of America,1,2004.0,[United States of America],0


For this analysis, we submitted the name of each character to a zero-shot classification NLP model, which assigns a score to several labels representing the different countries in which the character has appeared. The country with the highest score then becomes the character's origin country.

The detailed code for obtaining the score of each country per character can be found in the 'nlp_code' notebook within the 'models' directory.

The processed dataset that we'll use for this analysis is completed with the following features:
 - original_title : Original title of the movie of interest
 - Best_Country : The country with the highest score for the character becomes the assigned origin country.
 - number_countries_score : The number of countries that have produced films featuring the character
 - release_year : The year of release of the movie
 - countries : The production countries of the movie
 - new_countries : Is equal to the number of new countries that this film introduces in the character's number_countries score.


From here, we want to create a horizontal bar chart that shows the temporal evolution of countries with the most influential characters, meaning the countries with the highest cumulative 'number_countries' score.

To do this, we will create a temporal cumulative dataframe, which for each year will give the number of countries in which a character has appeared up until that year. We will then group this number by country for each year to obtain a ranking of the countries with the most influential characters each year. We will plot only the top 10 countries for each year

In [None]:
from scriptculture import create_cumulative_df_2

# The preview and column description of df_influence_character in this notebook name the score
# column 'number_countries_score', whereas this cell used to index 'number_countries'.
# Resolve the name against the frame so the filter works with either version of the processing code.
score_col = (
    'number_countries_score'
    if 'number_countries_score' in df_influence_character.columns
    else 'number_countries'
)

# Keep only the characters featured in more than one country, then build the temporal cumulative
# DataFrame of newly reached countries; it is the input of the influence-over-time plot.
cumulative_df_2 = create_cumulative_df_2(df_influence_character[df_influence_character[score_col] > 1])

In [38]:
# Rank the countries within each year: chronological order, highest cumulative total first
cumulative_df_2 = cumulative_df_2.sort_values(by=['release_year', 'tot'], ascending=[True, False])

# Keep the 10 best-ranked countries of every year, then re-sort each year by ascending total
# (the order in which the rows are handed to the bar chart)
top_10_df_2 = (
    cumulative_df_2
    .groupby('release_year')
    .head(10)
    .sort_values(by=['release_year', 'tot'], ascending=[True, True])
)



In [106]:
# Animated horizontal bar chart: one frame per release year, one bar per origin country.
fig = px.bar(
    top_10_df_2,
    x='tot',
    y='Best_Country',
    animation_frame='release_year',
    orientation='h',  # Horizontal bars
    # Pin the x-axis to the overall maximum; otherwise the range is not kept consistent
    # across frames and the growing cumulative bars can leave the visible area.
    range_x=[0, top_10_df_2['tot'].max()],
    title="Top 10 countries with the most influential characters",
    labels={
        "tot": "Cumulated character influence score",
        "Best_Country": "Countries",
        # The label key must match the column name; the previous key "year" matched no column,
        # so the animation control fell back to the raw column name.
        "release_year": "Year",
    },
    color='Best_Country',  # One colour per country
)


fig.update_layout(
    yaxis={'categoryorder': 'total ascending'},
    height=800
)
# Render the figure
fig.show()

### Bonus : The most influential characters in cinema

As a small bonus, we determine which characters are the most influential and what their origin is according to our NLP method.

In [109]:
# Helper that processes the data for this bonus study of the most influential characters
from scriptculture import process_top_characters

top_character_df = process_top_characters()

# Preview the first five rows
top_character_df.head(n=5)

Unnamed: 0,Character,first_movie_name,Best_Country,number_countries_score,all_countries,first_apperance_date
0,'Baby' Louise,Gypsy,United States of America,1,[United States of America],1962-01-01
1,ACP Jai Dixit,Dhoom,India,1,[India],2004-08-27
2,Abigail Chase,National Treasure,United States of America,1,[United States of America],2004-11-08
3,Abraham Lincoln,Young Mr. Lincoln,United States of America,1,[United States of America],1939-01-01
4,Abraham Van Helsing,Dracula,United Kingdom,4,"[United Kingdom, United States of America, Fra...",1931-02-12


In [111]:
# Top characters: those featured in productions from more than 6 countries (first five matches)
top_character_df.loc[top_character_df['number_countries_score'] > 6].head()

Unnamed: 0,Character,first_movie_name,Best_Country,number_countries_score,all_countries,first_apperance_date
120,Cardinal Richelieu,The Three Musketeers,France,8,"[United States of America, France, Spain, Aust...",1973-01-01
175,Count Dracula,Dracula,Romania,7,"[United States of America, Romania, France, Sp...",1931-02-12
609,Mina Harker,Dracula,United Kingdom,7,"[United States of America, France, Spain, West...",1958-01-01
674,Oliver Twist,Oliver Twist,United Kingdom,7,"[United States of America, Canada, France, Cze...",1922-10-30
