In [230]:
import pandas as pd
import numpy as np
import ast

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [211]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "v1_movies_cleaned.csv")

for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except:
        pass
    print(col, type(movies[col][0]))

title <class 'str'>
languages <class 'list'>
countries <class 'list'>
genres <class 'list'>
keywords <class 'float'>
release_date <class 'str'>
plot_summary <class 'str'>
year_release_date <class 'numpy.int64'>
cold_war_side <class 'str'>
character_western_bloc_representation <class 'list'>
character_eastern_bloc_representation <class 'list'>
western_bloc_values <class 'list'>
eastern_bloc_values <class 'list'>
theme <class 'list'>


In [212]:
# helpers 
VOC_MAPPING = {
    'anti-': 'anti',
    'anti ': 'anti',
    'anti hero': 'antihero',
    'archetypes': 'archetype',
    'archetype:': 'archetype',
}

# Function to split the row list into character name and character representation
def extract_character_name_and_attributes(row):
    if isinstance(row, list) and len(row) > 0 and row[0] is not None:
        character_name = row[0]
        character_representation = row[1:]
        return pd.Series([character_name, character_representation])
    else:
        return pd.Series([None, None])

# Function to apply the mapping with substring matching
def apply_mapping_with_substrings(string_list, mapping):
    updated = []
    for s in string_list:
        new_s = s
        for key, value in mapping.items():
            if key in s:
                new_s = s.replace(key, value)
        updated.append(new_s.lower())
    return updated

In [213]:
def create_character_df(original_df, character_column, character_side):
    # create the new dataframe
    new_df = original_df[[character_column, 'cold_war_side', 'title', 'year_release_date']]
    new_df = new_df.rename(columns={'cold_war_side': 'movie_side'})
    new_df[['character_name', 'character_representation']] = \
        new_df[character_column].apply(extract_character_name_and_attributes)
    new_df = new_df.drop(columns=[character_column])
    new_df['character_side'] = character_side

    # clean the obtained dataframe
    new_df = new_df.drop_duplicates(subset=['character_name'])
    new_df = new_df[new_df['character_representation'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
    new_df = new_df[new_df['character_name'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0) <= 5]
    new_df['character_representation'] = new_df['character_representation'].apply(
        lambda x: apply_mapping_with_substrings(x, VOC_MAPPING)
    )
    return new_df

In [214]:
character_eastern = create_character_df(movies, 'character_eastern_bloc_representation', 'Eastern')
character_western = create_character_df(movies, 'character_western_bloc_representation', 'Western')

In [215]:
character_eastern.sample(5)

Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side
21823,Eastern,Spodelena lyubov,1980,Petko,"[values camaraderie and resilience, archetype ...",Eastern
11868,Eastern,Jan Zizka,1956,Jan Zizka,"[values of unity and resistance, archetype of ...",Eastern
5338,Eastern,Counter-Attack,1945,Colonel Semenov,"[leadership, strategy, military archetype]",Eastern
16913,Eastern,Novenkaya,1969,Olga Kremneva,"[nostalgia, mentorship, tragic hero]",Eastern
26819,Western,The Naked Spur,1953,Ben Vandergroat,"[values of manipulation and survival, archetyp...",Eastern


In [216]:
character_western.sample(5)

Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side
31763,Western,Witness to Murder,1954,Lt. Lawrence Mathews,"[solidarity, justice, protector, hero archetype]",Western
15142,Eastern,Men in Soldier's Overcoats,1968,Estonian Shooting Corps,"[loyalty, sacrifice, soldier archetype]",Western
12601,Western,King Creole,1958,Danny Fisher,"[determined, resilient, hero]",Western
19085,Western,Rad,1986,Cru Jones,"[ambition, individuality, hero]",Western
6213,Western,Deep West,1971,Hallelujah,"[individualism, anti-hero]",Western


In [218]:
character_df = pd.concat([character_eastern, character_western], axis=0)
character_df.sample(5)

Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side
26120,Eastern,The Left Hand of God,1955,General Yang,"[authoritarian leadership, antagonist]",Western
19085,Western,Rad,1986,Duke Best,"[manipulation, greed, antagonist]",Eastern
10358,Eastern,Hibiscus Town,1986,Hu Yuyin,"[qin shutian, values love, personal sacrifice,...",Eastern
15601,,Moon 44,1990,Felix Stone,"[undercover agent, bravery, hero]",Western
10364,Eastern,Hidden River,1948,Regino Sandoval,"[oppressive landlord, villain archetype]",Eastern


In [106]:
# Preprocess the text
def preprocess(string_list, nlp, words_to_remove):
    docs = [nlp(s) for s in string_list]
    tokens = []
    for doc in docs:
        tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_ not in words_to_remove])
    return tokens

def topic_detection(df, nb_topics, nb_passes):
    # Create a dictionary and corpus for Gensim
    dictionary = corpora.Dictionary(df)
    corpus = [dictionary.doc2bow(text) for text in df]
    
    # Train LDA model
    lda_model = LdaModel(corpus, num_topics=nb_topics, id2word=dictionary, passes=nb_passes)
    
    # Print the topics
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic: {idx}\nWords: {topic}\n')
    
    return lda_model, corpus, dictionary

# Assign topics to characters
def get_dominant_topic(lda_model, corpus):
    topics = []
    for row in corpus:
        row = lda_model[row]
        max_topic, max_val = 0, 0
        for (topic_num, prop_topic) in row:
            if prop_topic > max_val:
                max_val = prop_topic
                max_topic = topic_num
        topics.append(max_topic)
    return topics

In [107]:
nlp = spacy.load('en_core_web_sm')
words_to_remove = {'archetype', 'hero', 'value', 'values', 'None', 'represent', 'figure', 'desire', 'seek', 'seeker'}

In [108]:
character_eastern['processed_repres'] = character_eastern['character_representation'].apply(preprocess, args=(nlp, words_to_remove))
eastern_char_topic = topic_detection(character_eastern['processed_repres'], 4, 15)
character_eastern['dominant_topic'] = get_dominant_topic(eastern_char_topic[0], eastern_char_topic[1])
print(character_eastern['dominant_topic'].value_counts())

Topic: 0
Words: 0.049*"sacrifice" + 0.033*"love" + 0.027*"struggle" + 0.025*"freedom" + 0.024*"romantic" + 0.022*"tragic" + 0.020*"heroism" + 0.018*"duty" + 0.017*"soldier" + 0.014*"individualism"

Topic: 1
Words: 0.086*"antagonist" + 0.081*"villain" + 0.037*"antihero" + 0.034*"authority" + 0.031*"oppression" + 0.025*"power" + 0.024*"manipulation" + 0.023*"greed" + 0.021*"control" + 0.016*"ambition"

Topic: 2
Words: 0.026*"justice" + 0.019*"traditional" + 0.019*"revolutionary" + 0.018*"collectivism" + 0.017*"innocent" + 0.017*"innocence" + 0.015*"perseverance" + 0.014*"social" + 0.013*"moral" + 0.013*"law"

Topic: 3
Words: 0.065*"loyalty" + 0.026*"resistance" + 0.025*"resilience" + 0.022*"betrayal" + 0.019*"collective" + 0.017*"mentor" + 0.016*"rebel" + 0.016*"bravery" + 0.014*"protector" + 0.013*"courage"

dominant_topic
1    1328
3    1113
0    1026
2     832
Name: count, dtype: int64


In [109]:
character_western['processed_repres'] = character_western['character_representation'].apply(preprocess, args=(nlp, words_to_remove))
western_char_topic = topic_detection(character_western['processed_repres'], 4, 15)
character_western['dominant_topic'] = get_dominant_topic(western_char_topic[0], western_char_topic[1])
print(character_western['dominant_topic'].value_counts())

Topic: 0
Words: 0.045*"antihero" + 0.041*"everyman" + 0.032*"moral" + 0.028*"individualism" + 0.018*"american" + 0.018*"resilience" + 0.017*"ambition" + 0.012*"community" + 0.012*"revolutionary" + 0.010*"determination"

Topic: 1
Words: 0.031*"courage" + 0.028*"honor" + 0.027*"family" + 0.022*"heroic" + 0.022*"mentor" + 0.020*"loyalty" + 0.018*"teamwork" + 0.016*"resourceful" + 0.015*"anti" + 0.015*"idealism"

Topic: 2
Words: 0.052*"bravery" + 0.051*"heroism" + 0.045*"individualism" + 0.042*"loyalty" + 0.036*"freedom" + 0.024*"romantic" + 0.023*"sacrifice" + 0.023*"love" + 0.017*"duty" + 0.016*"camaraderie"

Topic: 3
Words: 0.150*"justice" + 0.033*"law" + 0.032*"integrity" + 0.029*"determination" + 0.027*"truth" + 0.025*"protector" + 0.025*"bravery" + 0.024*"american" + 0.024*"enforcement" + 0.024*"leadership"

dominant_topic
2    1145
3    1045
1     881
0     873
Name: count, dtype: int64


In [110]:
character_eastern.sample(5)

Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side,processed_repres,dominant_topic
13061,,La Muerte de Mikel,1984,Mikel,"[begoña, doña maria luisa, martín, values of b...",Eastern,"[begoña, doña, maria, luisa, martín, basque, n...",3
26964,Western,The Odessa File,1974,Eduard Roschmann,"[former ss officer, power-driven, villain]",Eastern,"[ss, officer, power, drive, villain]",1
23315,Eastern,The Apple Game,1977,Czech society,"[satire, critical commentary]",Eastern,"[satire, critical, commentary]",1
12522,Eastern,Kievlyanka,1958,Gali,"[resilience, orphan archetype]",Eastern,"[resilience, orphan]",3
28989,,The World of Suzie Wong,1960,Suzie Wong,"[young woman forced into prostitution, surviva...",Eastern,"[young, woman, force, prostitution, survivalis...",2


In [111]:
eastern_movie_eastern_char = character_eastern[character_eastern["movie_side"] == 'Eastern']
western_movie_eastern_char = character_eastern[character_eastern["movie_side"] == 'Western']
eastern_movie_western_char = character_western[character_western["movie_side"] == 'Eastern']
western_movie_western_char = character_western[character_western["movie_side"] == 'Western']

# count dominant topic for each df above
print("Eastern movie Eastern characters")
print(eastern_movie_eastern_char['dominant_topic'].value_counts())

print("Western movie Eastern characters")
print(western_movie_eastern_char['dominant_topic'].value_counts())

print("Eastern movie Western characters")
print(eastern_movie_western_char['dominant_topic'].value_counts())

print("Western movie Western characters")
print(western_movie_western_char['dominant_topic'].value_counts())

Eastern movie Eastern characters
dominant_topic
0    726
3    659
2    555
1    424
Name: count, dtype: int64
Western movie Eastern characters
dominant_topic
1    752
3    396
0    240
2    223
Name: count, dtype: int64
Eastern movie Western characters
dominant_topic
2    308
0    248
1    246
3    104
Name: count, dtype: int64
Western movie Western characters
dominant_topic
3    865
2    781
1    568
0    560
Name: count, dtype: int64


In [72]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models

In [73]:
pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))

In [74]:
pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

In [243]:
from empath import Empath
lexicon = Empath()

# Select lexical categories of interest
CAT = ['politics', 'leader', 'military', 'heroic', 'law', 'affection', 'help',
        'pride', 'family', 'love', 'power', 'deception', 'friends', 'sadness', 'disappointment']

# Define groups of similar topics
TOPIC_GROUPS = {
    'love&affection': ['love', 'affection', 'family', 'friends'],
    'politics&power': ['politics', 'leader', 'military', 'power'],
    'deception&sadness': ['deception', 'sadness', 'disappointment']
}

def semantical_analysis(text, categories):
    """
    Perform a semantical analysis of a text using the Empath lexicon

    Parameters
    text : str, the text to analyze
    categories : list[str], the lexical categories to consider

    Returns
    dict[str, float], the analysis results (only the categories with non-zero values)
    """
    analysis = lexicon.analyze(text, categories=categories, normalize=True)
    return None if max(analysis.values()) == 0 else {k: v for k, v in analysis.items() if v != 0}

def map_topic_to_group(topic, groups):
    """
    Map a topic to a group of similar topics
    """
    for group, topics in groups.items():
        if topic in topics:
            return group
    return topic  

In [244]:
for cat in list(lexicon.cats.keys()):
    print(cat)

help
office
dance
money
wedding
domestic_work
sleep
medical_emergency
cold
hate
cheerfulness
aggression
occupation
envy
anticipation
family
vacation
crime
attractive
masculine
prison
health
pride
dispute
nervousness
government
weakness
horror
swearing_terms
leisure
suffering
royalty
wealthy
tourism
furniture
school
magic
beach
journalism
morning
banking
social_media
exercise
night
kill
blue_collar_job
art
ridicule
play
computer
college
optimism
stealing
real_estate
home
divine
sexual
fear
irritability
superhero
business
driving
pet
childish
cooking
exasperation
religion
hipster
internet
surprise
reading
worship
leader
independence
movement
body
noise
eating
medieval
zest
confusion
water
sports
death
healing
legend
heroic
celebration
restaurant
violence
programming
dominant_heirarchical
military
neglect
swimming
exotic
love
hiking
communication
hearing
order
sympathy
hygiene
weather
anonymity
trust
ancient
deception
fabric
air_travel
fight
dominant_personality
music
vehicle
politeness
t

In [245]:
# Apply the analysis to each row
character_df['empath_analysis'] = character_df['character_representation'].apply(lambda x: semantical_analysis(' '.join(x), CAT))
character_empath_df = character_df.dropna(subset=['empath_analysis']).copy()

print('Total number of characters :', len(character_df))
print('Characters with empath analysis not None :', len(character_empath_df))

print('\nNumber of characters per category :')
for cat in CAT:
    cat_count = character_empath_df['empath_analysis'].apply(lambda x: cat in x.keys()).sum()
    print(f'{cat}: {cat_count}')

character_empath_df.sample(5)

Total number of characters : 8243
Characters with empath analysis not None : 6588

Number of characters per category :
politics: 814
leader: 1191
military: 724
heroic: 3896
law: 948
affection: 711
help: 512
pride: 1915
family: 269
love: 554
power: 1097
deception: 604
friends: 359
sadness: 626
disappointment: 215


Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side,empath_analysis
23296,Western,The Angry Red Planet,1959,Dr. Iris Ryan,"[colonel tom o'bannion, bravery, scientific in...",Western,"{'military': 0.125, 'heroic': 0.25}"
14399,Western,Macao,1952,Julie Benson,"[cynical, sultry night club singer, femme fata...",Eastern,{'heroic': 0.125}
2186,Western,Ashanti,1979,David Linderby,"[values freedom, fight against injustice, arch...",Western,"{'heroic': 0.14285714285714285, 'power': 0.142..."
22470,,Suspect,1960,Kathleen Riley,"[public defender, justice, moral integrity, hero]",Western,"{'military': 0.16666666666666666, 'heroic': 0...."
24552,Western,The Divided Heart,1954,Natural Mother,"[loss, survival, family, victim]",Eastern,"{'family': 0.25, 'sadness': 0.25}"


In [251]:
def get_empath_evolution(df, empath_column, year_column='year_release_date'):
    """
    Create a DataFrame to track the evolution of empath topics across years
    """
    empath_topics = set()
    for entry in df[empath_column]:
        empath_topics.update(entry.keys())

    # Create a new DataFrame to track topic proportions across years
    empath_evolution = {topic: [] for topic in empath_topics}
    empath_evolution['year'] = []

    for year in sorted(df[year_column].unique()):
        year_data = df[df['year_release_date'] == year][empath_column]
        aggregated = {topic: 0 for topic in empath_topics}
        count = len(year_data)

        for row in year_data:
            for topic, value in row.items():
                aggregated[topic] += value

        if count > 0:
            for topic in aggregated:
                aggregated[topic] /= count

        for topic, value in aggregated.items():
            empath_evolution[topic].append(value)

        empath_evolution['year'].append(year)

    return pd.DataFrame(empath_evolution)

empath_evolution_df = get_empath_evolution(character_empath_df, 'empath_analysis')
melted_empath_df = empath_evolution_df.melt(id_vars=['year'], var_name='topic', value_name='proportion')

# group similar lexicon categories
melted_empath_df['topic'] = melted_empath_df['topic'].apply(map_topic_to_group, groups=TOPIC_GROUPS)
melted_empath_df = melted_empath_df.groupby(['year', 'topic'], as_index=False).agg({'proportion': 'sum'})

In [262]:
fig = go.Figure()

# set topic color palette
unique_topics = melted_empath_df['topic'].unique()
color_palette = px.colors.qualitative.Plotly
topic_colors = {topic: color_palette[i] for i, topic in enumerate(unique_topics)}

# Map colors directly in the DataFrame
melted_empath_df['color'] = melted_empath_df['topic'].map(topic_colors)

fig = px.line(
    melted_empath_df,
    x='year',
    y='proportion',
    color='topic',
    title='Evolution of Grouped Empath Topics Across Years',
    color_discrete_map=topic_colors
)

fig.update_layout(
    width = 1100,
    height = 600,
    xaxis_title="Movie Release Year",
    yaxis_title="Frequency of a Category",
    legend_title="Categories",
    title={
        "text": "Changing Themes in Character Representation Across Years",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins
)

fig.show()

In [263]:
fig = go.Figure()

unique_years = sorted(melted_empath_df['year'].unique())
max_y = melted_empath_df['proportion'].max()

for year in unique_years:
    year_data = melted_empath_df[melted_empath_df['year'] == year]
    grouped_data = year_data.groupby('topic', as_index=False).agg({'proportion': 'sum'})
    
    fig.add_trace(go.Bar(
        x=grouped_data['topic'],
        y=grouped_data['proportion'],
        name=f"Year {year}",
        visible=(year == unique_years[0]),  # Only show the first year's data initially
        marker=dict(color=[topic_colors[topic] for topic in grouped_data['topic']])
    ))

# Define steps for the slider
steps = []
for i, year in enumerate(unique_years):
    step = dict(
        method="update",
        args=[
            {"visible": [j == i for j in range(len(unique_years))]},  # Toggle visibility for the selected year
            {"title": f"Empath Topic Proportions for Year: {year}"}
        ],
        label=str(year)
    )
    steps.append(step)

# Add the slider to the layout
fig.update_layout(
    sliders=[{
        "active": 0,
        "currentvalue": {"prefix": "Year: "},
        "pad": {"t": 50},
        "steps": steps
    }],
    title={
        "text": "Characters' Empath Features Distribution by Year",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    xaxis_title="Empath Topics",
    yaxis_title="Proportion",
    yaxis=dict(range=[0, max_y]),  # Set the fixed y-axis range
    legend_title="Year",
    barmode="group"  # This ensures bars are grouped by topics
)

fig.show()
