In [1]:
import pandas as pd
import numpy as np
import ast

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
DATA_FOLDER = "data/preprocessed/"

movies = pd.read_csv(DATA_FOLDER + "preprocessed_movies.csv")

for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except:
        pass
    print(col, type(movies[col][0]))

wikipedia_id <class 'numpy.float64'>
freebase_id <class 'str'>
title <class 'str'>
languages <class 'float'>
countries <class 'list'>
genres <class 'list'>
keywords <class 'float'>
release_date <class 'numpy.int64'>
runtime <class 'numpy.float64'>
plot_summary <class 'str'>
cold_war_side <class 'str'>
character_western_bloc_representation <class 'list'>
character_eastern_bloc_representation <class 'list'>
western_bloc_values <class 'list'>
eastern_bloc_values <class 'list'>
theme <class 'list'>


In [3]:
def create_theme(df):
    new_df = df[["theme", 'cold_war_side', 'title', 'release_date']]
    # get only Western and Eastern movies
    new_df = new_df[(new_df["cold_war_side"] == "Western") | (new_df["cold_war_side"] == "Eastern")]
    new_df = new_df.rename(columns={'cold_war_side': 'movie_side'})
    new_df = new_df.drop_duplicates(subset=["theme"])
    # be sure theme is a list
    new_df["theme"] = new_df["theme"].apply(lambda x: x if isinstance(x, list) else [x])
    # be sure all elements of the list are strings
    new_df["theme"] = new_df["theme"].apply(lambda x: [str(i) for i in x])
    # use lower case for theme for each element of the list
    new_df["theme"] = new_df["theme"].apply(lambda x: [i.lower() for i in x])
    return new_df

theme_df = create_theme(movies)

In [4]:
theme_df.sample(5)

Unnamed: 0,theme,movie_side,title,release_date
3530,"[comedy, television culture, quiz shows, knowl...",Western,Champagne for Caesar,1950
20751,"[addiction, redemption, personal struggle, mor...",Western,The Man with the Golden Arm,1955
24678,"[struggle, collectivism, family loyalty, tradi...",Western,West Side Story,1961
966,"[espionage, identity, cold war, intelligence, ...",Western,Agent 8 3/4,1964
10499,"[romance, deception, personal transformation, ...",Western,Les tentations de Marianne,1973


In [5]:
# Preprocess the text
def preprocess(string_list, nlp, words_to_remove):
    docs = [nlp(s) for s in string_list]
    tokens = []
    for doc in docs:
        tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_ not in words_to_remove])
    return tokens

def topic_detection(df, nb_topics, nb_passes):
    # Create a dictionary and corpus for Gensim
    dictionary = corpora.Dictionary(df)
    corpus = [dictionary.doc2bow(text) for text in df]
    
    # Train LDA model
    lda_model = LdaModel(corpus, num_topics=nb_topics, id2word=dictionary, passes=nb_passes)
    
    # Print the topics
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic: {idx}\nWords: {topic}\n')
    
    return lda_model, corpus, dictionary

# Assign topics to characters
def get_dominant_topic(lda_model, corpus):
    topics = []
    for row in corpus:
        row = lda_model[row]
        max_topic, max_val = 0, 0
        for (topic_num, prop_topic) in row:
            if prop_topic > max_val:
                max_val = prop_topic
                max_topic = topic_num
        topics.append(max_topic)
    return topics

In [6]:
nlp = spacy.load('en_core_web_sm')
words_to_remove = {'theme'}

In [7]:
theme_df['processed_repres'] = theme_df['theme'].apply(preprocess, args=(nlp, words_to_remove))
theme_topic = topic_detection(theme_df['processed_repres'], 4, 15)
theme_df['dominant_topic'] = get_dominant_topic(theme_topic[0], theme_topic[1])
print(theme_df['dominant_topic'].value_counts())

Topic: 0
Words: 0.075*"war" + 0.049*"sacrifice" + 0.028*"heroism" + 0.027*"resistance" + 0.021*"conflict" + 0.020*"struggle" + 0.017*"identity" + 0.017*"military" + 0.017*"revolution" + 0.016*"oppression"

Topic: 1
Words: 0.050*"personal" + 0.044*"love" + 0.042*"struggle" + 0.026*"social" + 0.026*"conflict" + 0.025*"family" + 0.022*"life" + 0.019*"identity" + 0.019*"sacrifice" + 0.016*"societal"

Topic: 2
Words: 0.056*"justice" + 0.042*"crime" + 0.040*"adventure" + 0.030*"moral" + 0.024*"corruption" + 0.021*"morality" + 0.020*"revenge" + 0.018*"betrayal" + 0.016*"friendship" + 0.013*"heroism"

Topic: 3
Words: 0.054*"war" + 0.053*"espionage" + 0.050*"betrayal" + 0.036*"cold" + 0.024*"tension" + 0.022*"conflict" + 0.021*"deception" + 0.018*"crime" + 0.016*"loyalty" + 0.016*"moral"

dominant_topic
0    1637
1    1617
3    1265
2    1251
Name: count, dtype: int64


In [8]:
eastern_theme = theme_df[theme_df["movie_side"] == 'Eastern']
western_theme = theme_df[theme_df["movie_side"] == 'Western']

# count dominant topic for each df above
print("Eastern themes")
print(eastern_theme['dominant_topic'].value_counts())

print("Western themes")
print(western_theme['dominant_topic'].value_counts())

Eastern themes
dominant_topic
1    1065
0    1059
3     372
2     334
Name: count, dtype: int64
Western themes
dominant_topic
2    917
3    893
0    578
1    552
Name: count, dtype: int64


In [9]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models

In [None]:
# pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))
pyLDAvis.display(gensim_models.prepare(theme_topic[0], theme_topic[1], theme_topic[2]))

# save to png


In [11]:
pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

NameError: name 'eastern_char_topic' is not defined

In [243]:
from empath import Empath
lexicon = Empath()

# Select lexical categories of interest
CAT = ['politics', 'leader', 'military', 'heroic', 'law', 'affection', 'help',
        'pride', 'family', 'love', 'power', 'deception', 'friends', 'sadness', 'disappointment']

# Define groups of similar topics
TOPIC_GROUPS = {
    'love&affection': ['love', 'affection', 'family', 'friends'],
    'politics&power': ['politics', 'leader', 'military', 'power'],
    'deception&sadness': ['deception', 'sadness', 'disappointment']
}

def semantical_analysis(text, categories):
    """
    Perform a semantical analysis of a text using the Empath lexicon

    Parameters
    text : str, the text to analyze
    categories : list[str], the lexical categories to consider

    Returns
    dict[str, float], the analysis results (only the categories with non-zero values)
    """
    analysis = lexicon.analyze(text, categories=categories, normalize=True)
    return None if max(analysis.values()) == 0 else {k: v for k, v in analysis.items() if v != 0}

def map_topic_to_group(topic, groups):
    """
    Map a topic to a group of similar topics
    """
    for group, topics in groups.items():
        if topic in topics:
            return group
    return topic  

In [244]:
for cat in list(lexicon.cats.keys()):
    print(cat)

help
office
dance
money
wedding
domestic_work
sleep
medical_emergency
cold
hate
cheerfulness
aggression
occupation
envy
anticipation
family
vacation
crime
attractive
masculine
prison
health
pride
dispute
nervousness
government
weakness
horror
swearing_terms
leisure
suffering
royalty
wealthy
tourism
furniture
school
magic
beach
journalism
morning
banking
social_media
exercise
night
kill
blue_collar_job
art
ridicule
play
computer
college
optimism
stealing
real_estate
home
divine
sexual
fear
irritability
superhero
business
driving
pet
childish
cooking
exasperation
religion
hipster
internet
surprise
reading
worship
leader
independence
movement
body
noise
eating
medieval
zest
confusion
water
sports
death
healing
legend
heroic
celebration
restaurant
violence
programming
dominant_heirarchical
military
neglect
swimming
exotic
love
hiking
communication
hearing
order
sympathy
hygiene
weather
anonymity
trust
ancient
deception
fabric
air_travel
fight
dominant_personality
music
vehicle
politeness
t

In [245]:
# Apply the analysis to each row
character_df['empath_analysis'] = character_df['character_representation'].apply(lambda x: semantical_analysis(' '.join(x), CAT))
character_empath_df = character_df.dropna(subset=['empath_analysis']).copy()

print('Total number of characters :', len(character_df))
print('Characters with empath analysis not None :', len(character_empath_df))

print('\nNumber of characters per category :')
for cat in CAT:
    cat_count = character_empath_df['empath_analysis'].apply(lambda x: cat in x.keys()).sum()
    print(f'{cat}: {cat_count}')

character_empath_df.sample(5)

Total number of characters : 8243
Characters with empath analysis not None : 6588

Number of characters per category :
politics: 814
leader: 1191
military: 724
heroic: 3896
law: 948
affection: 711
help: 512
pride: 1915
family: 269
love: 554
power: 1097
deception: 604
friends: 359
sadness: 626
disappointment: 215


Unnamed: 0,movie_side,title,year_release_date,character_name,character_representation,character_side,empath_analysis
23296,Western,The Angry Red Planet,1959,Dr. Iris Ryan,"[colonel tom o'bannion, bravery, scientific in...",Western,"{'military': 0.125, 'heroic': 0.25}"
14399,Western,Macao,1952,Julie Benson,"[cynical, sultry night club singer, femme fata...",Eastern,{'heroic': 0.125}
2186,Western,Ashanti,1979,David Linderby,"[values freedom, fight against injustice, arch...",Western,"{'heroic': 0.14285714285714285, 'power': 0.142..."
22470,,Suspect,1960,Kathleen Riley,"[public defender, justice, moral integrity, hero]",Western,"{'military': 0.16666666666666666, 'heroic': 0...."
24552,Western,The Divided Heart,1954,Natural Mother,"[loss, survival, family, victim]",Eastern,"{'family': 0.25, 'sadness': 0.25}"


In [251]:
def get_empath_evolution(df, empath_column, year_column='year_release_date'):
    """
    Create a DataFrame to track the evolution of empath topics across years
    """
    empath_topics = set()
    for entry in df[empath_column]:
        empath_topics.update(entry.keys())

    # Create a new DataFrame to track topic proportions across years
    empath_evolution = {topic: [] for topic in empath_topics}
    empath_evolution['year'] = []

    for year in sorted(df[year_column].unique()):
        year_data = df[df['year_release_date'] == year][empath_column]
        aggregated = {topic: 0 for topic in empath_topics}
        count = len(year_data)

        for row in year_data:
            for topic, value in row.items():
                aggregated[topic] += value

        if count > 0:
            for topic in aggregated:
                aggregated[topic] /= count

        for topic, value in aggregated.items():
            empath_evolution[topic].append(value)

        empath_evolution['year'].append(year)

    return pd.DataFrame(empath_evolution)

empath_evolution_df = get_empath_evolution(character_empath_df, 'empath_analysis')
melted_empath_df = empath_evolution_df.melt(id_vars=['year'], var_name='topic', value_name='proportion')

# group similar lexicon categories
melted_empath_df['topic'] = melted_empath_df['topic'].apply(map_topic_to_group, groups=TOPIC_GROUPS)
melted_empath_df = melted_empath_df.groupby(['year', 'topic'], as_index=False).agg({'proportion': 'sum'})

In [262]:
fig = go.Figure()

# set topic color palette
unique_topics = melted_empath_df['topic'].unique()
color_palette = px.colors.qualitative.Plotly
topic_colors = {topic: color_palette[i] for i, topic in enumerate(unique_topics)}

# Map colors directly in the DataFrame
melted_empath_df['color'] = melted_empath_df['topic'].map(topic_colors)

fig = px.line(
    melted_empath_df,
    x='year',
    y='proportion',
    color='topic',
    title='Evolution of Grouped Empath Topics Across Years',
    color_discrete_map=topic_colors
)

fig.update_layout(
    width = 1100,
    height = 600,
    xaxis_title="Movie Release Year",
    yaxis_title="Frequency of a Category",
    legend_title="Categories",
    title={
        "text": "Changing Themes in Character Representation Across Years",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins
)

fig.show()

In [263]:
fig = go.Figure()

unique_years = sorted(melted_empath_df['year'].unique())
max_y = melted_empath_df['proportion'].max()

for year in unique_years:
    year_data = melted_empath_df[melted_empath_df['year'] == year]
    grouped_data = year_data.groupby('topic', as_index=False).agg({'proportion': 'sum'})
    
    fig.add_trace(go.Bar(
        x=grouped_data['topic'],
        y=grouped_data['proportion'],
        name=f"Year {year}",
        visible=(year == unique_years[0]),  # Only show the first year's data initially
        marker=dict(color=[topic_colors[topic] for topic in grouped_data['topic']])
    ))

# Define steps for the slider
steps = []
for i, year in enumerate(unique_years):
    step = dict(
        method="update",
        args=[
            {"visible": [j == i for j in range(len(unique_years))]},  # Toggle visibility for the selected year
            {"title": f"Empath Topic Proportions for Year: {year}"}
        ],
        label=str(year)
    )
    steps.append(step)

# Add the slider to the layout
fig.update_layout(
    sliders=[{
        "active": 0,
        "currentvalue": {"prefix": "Year: "},
        "pad": {"t": 50},
        "steps": steps
    }],
    title={
        "text": "Characters' Empath Features Distribution by Year",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    xaxis_title="Empath Topics",
    yaxis_title="Proportion",
    yaxis=dict(range=[0, max_y]),  # Set the fixed y-axis range
    legend_title="Year",
    barmode="group"  # This ensures bars are grouped by topics
)

fig.show()
