In [1]:
import pandas as pd
import numpy as np
import ast

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "preprocessed_movies.csv")

for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except:
        pass
    print(col, type(movies[col][0]))

#movies.head()

wikipedia_id <class 'numpy.float64'>
freebase_id <class 'str'>
title <class 'str'>
languages <class 'float'>
countries <class 'list'>
genres <class 'list'>
keywords <class 'float'>
release_date <class 'numpy.int64'>
runtime <class 'numpy.float64'>
plot_summary <class 'str'>
cold_war_side <class 'str'>
character_western_bloc_representation <class 'list'>
character_eastern_bloc_representation <class 'list'>
western_bloc_values <class 'list'>
eastern_bloc_values <class 'list'>
theme <class 'list'>


In [3]:
# helpers 
VOC_MAPPING = {
    'anti-': 'anti',
    'anti ': 'anti',
    'anti hero': 'antihero',
    'archetypes': 'archetype',
    'archetype:': 'archetype',
    'archetypal': 'archetype',
}

# Function to split the row list into character name and character representation
def extract_character_name_and_attributes(row):
    if isinstance(row, list) and len(row) > 0 and row[0] is not None:
        character_name = row[0]
        character_representation = row[1:]
        return pd.Series([character_name, character_representation])
    else:
        return pd.Series([None, None])

# Function to apply the mapping with substring matching
def apply_mapping_with_substrings(string_list, mapping):
    updated = []
    for s in string_list:
        new_s = s
        for key, value in mapping.items():
            if key in s:
                new_s = s.replace(key, value)
        updated.append(new_s.lower())
    return updated

In [4]:
def create_character_df(original_df, character_column, character_side):
    # create the new dataframe
    new_df = original_df[[character_column, 'cold_war_side', 'title', 'release_date']]
    new_df = new_df.rename(columns={'cold_war_side': 'movie_side'})
    new_df[['character_name', 'character_representation']] = \
        new_df[character_column].apply(extract_character_name_and_attributes)
    new_df = new_df.drop(columns=[character_column])
    new_df['character_side'] = character_side

    # clean the obtained dataframe
    new_df = new_df.drop_duplicates(subset=['character_name'])
    new_df = new_df[new_df['character_representation'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
    new_df = new_df[new_df['character_name'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0) <= 5]
    new_df['character_representation'] = new_df['character_representation'].apply(
        lambda x: apply_mapping_with_substrings(x, VOC_MAPPING)
    )
    return new_df

In [5]:
character_eastern = create_character_df(movies, 'character_eastern_bloc_representation', 'Eastern')
character_western = create_character_df(movies, 'character_western_bloc_representation', 'Western')

In [6]:
character_eastern.sample(5)

Unnamed: 0,movie_side,title,release_date,character_name,character_representation,character_side
25365,Eastern,Yuwaku,1948,Saburi,"[shin, emotional restraint, hidden desires, co...",Eastern
19806,Eastern,The Harms Case,1987,Danil Harms,"[avant-garde, persecuted artist]",Eastern
22899,Western,The Wrong Arm of the Law,1963,Jack Coombes,"[values corruption, cunning, archetype antagon...",Eastern
13710,Eastern,Parvarish,1977,DSP Shamsher Singh,"[promise, honor, protector, archetype of the g...",Eastern
20400,Western,The League of Gentlemen,1959,Major Peter Race,"[betrayal, moral compromise, antihero]",Eastern


In [7]:
character_western.sample(5)

Unnamed: 0,movie_side,title,release_date,character_name,character_representation,character_side
2786,Eastern,Bolje je umeti,1960,Mane Karakas,"[optimism, unity, hero archetype]",Western
18478,,The Bullfighters,1945,Laurel and Hardy,"[humor, heroic duo]",Western
17655,Eastern,Teens in the Universe,1975,Soviet pioneers,"[collectivism, heroism, archetype of the hero]",Western
11864,Western,Miss Sadie Thompson,1953,Sadie Thompson,"[freedom, self-expression, archetype of the re...",Western
13574,Western,Oxford Blues,1984,Nick Di Angelo,"[values love, friendship, american ambition, p...",Western


In [8]:
character_df = pd.concat([character_eastern, character_western], axis=0)
character_df.sample(5)

Unnamed: 0,movie_side,title,release_date,character_name,character_representation,character_side
14153,Eastern,Polonez Oginskogo,1971,Vasily,"[orphaned violinist, resilience, innocence]",Eastern
2459,Western,Bitter Victory,1957,Major Brand,"[professionalism, duty, heroism, tragic hero]",Western
3432,Western,Cast a Giant Shadow,1966,"Colonel David ""Mickey"" Marcus","[jewish-american values, heroism, leadership, ...",Western
9823,Eastern,Kleinhoff Hotel,1977,Pascale,"[bourgeois, passive, observer]",Eastern
3938,Eastern,Colonel Wolodyjowski,1969,Colonel Michael Wolodyjowski,"[bravery, honor, hero archetype]",Eastern


In [9]:
# Preprocess the text

def preprocess(string_list, nlp, words_to_remove):
    # personalize the stop words
    stop_words = set(stopwords.words('english')).union(words_to_remove)

    docs = [nlp(s) for s in string_list]
    tokens = []
    for doc in docs:
        tokens.extend([token.lemma_ for token in doc if not token.is_punct and token.lemma_ not in stop_words])
    return tokens

def topic_detection(df, nb_topics, nb_passes):
    # Create a dictionary and corpus for Gensim
    dictionary = corpora.Dictionary(df)
    corpus = [dictionary.doc2bow(text) for text in df]
    
    # Train LDA model
    lda_model = LdaModel(corpus, num_topics=nb_topics, id2word=dictionary, passes=nb_passes)
    
    # Print the topics
    for idx, topic in lda_model.print_topics(-1, 20):
        print(f'Topic: {idx}\nWords: {topic}\n')
    
    return lda_model, corpus, dictionary

# Assign topics to characters
def get_dominant_topic(lda_model, corpus):
    topics = []
    for row in corpus:
        row = lda_model[row]
        max_topic, max_val = 0, 0
        for (topic_num, prop_topic) in row:
            if prop_topic > max_val:
                max_val = prop_topic
                max_topic = topic_num
        topics.append(max_topic)
    return topics

In [10]:
nlp = spacy.load('en_core_web_sm')
words_to_remove = {'archetype', 'hero', 'heroism', 'value', 'values', 'none', 'represent', 'figure', 'desire', 'seek', 'seeker'}

In [30]:
character_eastern['processed_repres'] = character_eastern['character_representation'].apply(preprocess, args=(nlp, words_to_remove))
eastern_char_topic = topic_detection(character_eastern['processed_repres'], 3, 15)
character_eastern['dominant_topic'] = get_dominant_topic(eastern_char_topic[0], eastern_char_topic[1])
print(character_eastern['dominant_topic'].value_counts())

Topic: 0
Words: 0.039*"loyalty" + 0.024*"sacrifice" + 0.020*"love" + 0.018*"collectivism" + 0.013*"revolutionary" + 0.013*"justice" + 0.012*"collective" + 0.012*"rebel" + 0.012*"resistance" + 0.012*"duty" + 0.011*"tragic" + 0.011*"struggle" + 0.011*"bravery" + 0.010*"community" + 0.010*"mentor" + 0.009*"conflict" + 0.009*"martyr" + 0.009*"leader" + 0.009*"romantic" + 0.009*"family"

Topic: 1
Words: 0.052*"resilience" + 0.021*"victim" + 0.019*"freedom" + 0.015*"survival" + 0.014*"innocence" + 0.010*"spirit" + 0.010*"rebellion" + 0.009*"antihero" + 0.009*"tragic" + 0.008*"struggle" + 0.008*"mentor" + 0.008*"innocent" + 0.007*"compassion" + 0.007*"trickster" + 0.007*"tyranny" + 0.007*"everyman" + 0.007*"american" + 0.006*"hope" + 0.006*"determination" + 0.006*"independence"

Topic: 2
Words: 0.067*"antagonist" + 0.056*"villain" + 0.029*"authority" + 0.020*"antihero" + 0.019*"manipulation" + 0.018*"ambition" + 0.017*"power" + 0.017*"oppression" + 0.016*"greed" + 0.015*"control" + 0.013*"cor

In [31]:
character_western['processed_repres'] = character_western['character_representation'].apply(preprocess, args=(nlp, words_to_remove))
western_char_topic = topic_detection(character_western['processed_repres'], 3, 15)
character_western['dominant_topic'] = get_dominant_topic(western_char_topic[0], western_char_topic[1])
print(character_western['dominant_topic'].value_counts())

Topic: 0
Words: 0.026*"duty" + 0.024*"courage" + 0.023*"american" + 0.023*"honor" + 0.018*"truth" + 0.018*"antihero" + 0.017*"loyalty" + 0.016*"redemption" + 0.015*"soldier" + 0.014*"trickster" + 0.013*"resourceful" + 0.012*"protagonist" + 0.011*"heroic" + 0.011*"resilience" + 0.010*"patriotism" + 0.010*"british" + 0.009*"determine" + 0.009*"leader" + 0.008*"self" + 0.008*"cunne"

Topic: 1
Words: 0.062*"individualism" + 0.062*"bravery" + 0.036*"loyalty" + 0.034*"freedom" + 0.022*"everyman" + 0.021*"leadership" + 0.021*"tragic" + 0.019*"sacrifice" + 0.019*"antihero" + 0.015*"rebel" + 0.015*"moral" + 0.014*"family" + 0.012*"determination" + 0.012*"love" + 0.012*"heroic" + 0.011*"resourcefulness" + 0.011*"american" + 0.010*"camaraderie" + 0.010*"idealism" + 0.007*"mentor"

Topic: 2
Words: 0.114*"justice" + 0.029*"law" + 0.023*"protector" + 0.021*"resilience" + 0.021*"ambition" + 0.019*"enforcement" + 0.018*"determination" + 0.018*"romantic" + 0.012*"integrity" + 0.012*"antihero" + 0.012*"

In [32]:
WEST_ARCHETYPES_MAP = {
    0: 'The brave loyal soldier',
    1: 'The individualistic rebel',
    2: 'The moral enforcer'
}

EAST_ARCHETYPES_MAP = {
    0: 'The Tragic Devoted Hero',
    1: 'The Resilient Revolutionary',
    2: 'The power-hungry manipulative villain'
}

character_western['dominant_topic_'] = character_western['dominant_topic'].map(WEST_ARCHETYPES_MAP)
character_eastern['dominant_topic_'] = character_eastern['dominant_topic'].map(EAST_ARCHETYPES_MAP)

In [36]:
eastern_movie_eastern_char = character_eastern[character_eastern["movie_side"] == 'Eastern']
western_movie_eastern_char = character_eastern[character_eastern["movie_side"] == 'Western']
eastern_movie_western_char = character_western[character_western["movie_side"] == 'Eastern']
western_movie_western_char = character_western[character_western["movie_side"] == 'Western']

# Define color scales
# Define color palettes for Eastern and Western characters
eastern_colors = {
    'The Tragic Devoted Hero': '#EB8B84',  # Light red
    'The Resilient Revolutionary': '#DD3C32',    # Medium red
    'The power-hungry manipulative villain': '#7B1B14'  # Dark red
}

western_colors = {
    'The brave loyal soldier': '#78C0F7',  # Light blue
    'The moral enforcer': '#0F89E6', # Medium blue
    'The individualistic rebel': '#074473'   # Dark blue
}

# Combine all counts into a single DataFrame
eastern_char_dominant_topics_df = pd.DataFrame({
    'Eastern Movie': eastern_movie_eastern_char['dominant_topic_'].value_counts(),
    'Western Movie': western_movie_eastern_char['dominant_topic_'].value_counts()
}).T

western_char_dominant_topics_df = pd.DataFrame({
    'Eastern Movie': eastern_movie_western_char['dominant_topic_'].value_counts(),
    'Western Movie': western_movie_western_char['dominant_topic_'].value_counts()
}).T


# Create a subplot with 1 row and 2 columns
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Eastern Characters", "Western Characters"]
)

# Add bars for each topic
for topic in eastern_char_dominant_topics_df.columns:
    fig.add_trace(go.Bar(
        x=eastern_char_dominant_topics_df.index,
        y=eastern_char_dominant_topics_df[topic],
        name=f"{topic}",
        marker_color=eastern_colors[topic],
        customdata=western_char_dominant_topics_df.index.map(lambda x: topic),
        hovertemplate='Archetype: <b>%{customdata}</b><br>Representatives count: <b>%{y}</b><extra></extra>'
    ), row=1, col=1)

# Add bars for each topic
for topic in western_char_dominant_topics_df.columns:
    fig.add_trace(go.Bar(
        x=western_char_dominant_topics_df.index,
        y=western_char_dominant_topics_df[topic],
        name=f"{topic}",
        marker_color=western_colors[topic],
        customdata=western_char_dominant_topics_df.index.map(lambda x: topic),
        hovertemplate='Archetype: <b>%{customdata}</b><br>Representatives count: <b>%{y}</b><extra></extra>'
    ), row=1, col=2)

# Update layout for both subplots
fig.update_layout(
    barmode='stack',
    title_text='Distribution of Eastern and Western Characters Archetypes',
    xaxis_title='Movie Side',
    yaxis_title='Number of Characters',
    legend_title='Dominant Topics',
    height=500, width=1200  # Adjust figure size
)

fig.update_xaxes(title_text="Movie Side", row=1, col=1)
fig.update_yaxes(title_text="Number of Characters", row=1, col=1)

fig.update_xaxes(title_text="Movie Side", row=1, col=2)
fig.update_yaxes(title_text="Number of Characters", row=1, col=2)

fig.show()


In [51]:
def compute_overall_term_freq(corpus, dictionary):
    """
    Computes the overall term frequency for all words in the corpus.

    Parameters:
    - corpus: BoW representation of the corpus (list of lists of (word_id, count)).
    - dictionary: Gensim Dictionary object mapping word IDs to words.

    Returns:
    - A Counter object mapping each word to its overall frequency in the corpus.
    """
    term_freq = Counter()
    for doc in corpus:
        term_freq.update(dictionary[term_id] for term_id, _ in doc if term_id in dictionary)
    return term_freq

def compute_estimated_term_freq(lda_model, corpus, dictionary, top_n=15):
    """
    Computes the estimated term frequency for the top N words in each topic.
    
    Parameters:
    - lda_model: Trained Gensim LDA model.
    - corpus: BoW representation of the corpus (list of lists of (word_id, count)).
    - dictionary: Gensim Dictionary object mapping word IDs to words.
    - top_n: Number of top words per topic to compute the frequencies for (default: 15).
    
    Returns:
    - A dictionary where each topic ID maps to another dictionary of top words and their estimated frequencies.
    """
    doc_topic_distrib = [dict(lda_model.get_document_topics(doc, minimum_probability=0.0)) for doc in corpus]
    topic_term_frequencies = {}
    
    for topic_id in range(lda_model.num_topics):
        top_words = lda_model.get_topic_terms(topic_id, topn=top_n)
        word_frequencies = {}

        for word_id, _ in top_words:
            word = dictionary[word_id]
            estimated_frequency = 0.0

            # Accumulate contributions from all doc for this word
            for doc_id, doc in enumerate(corpus):
                word_count = dict(doc).get(word_id, 0) # Get word count in the document
                topic_contribution = doc_topic_distrib[doc_id].get(topic_id, 0) # Topic proportion for this doc
                estimated_frequency += topic_contribution * word_count

            word_frequencies[word] = estimated_frequency

        topic_term_frequencies[topic_id] = word_frequencies

    return topic_term_frequencies

def plotly_term_frequencies(etf_by_topic, otf_by_topic, colors, topic_dict):
    for topic_id, word_frequencies in etf_by_topic.items():
        words = list(word_frequencies.keys())
        etf_values = np.sqrt(list(word_frequencies.values()))
        otf_values = [np.sqrt(otf_by_topic[word]) for word in words]

        sorted_indices = np.argsort(etf_values)
        words = [words[i] for i in sorted_indices]
        etf_values = [etf_values[i] for i in sorted_indices]
        otf_values = [otf_values[i] for i in sorted_indices]

        fig = go.Figure()

        fig.add_trace(go.Bar(
            y=words,
            x=otf_values,
            orientation='h',
            name='Overall Term Frequency (OTF)',
            marker=dict(color=colors[1], opacity=0.7),
            hovertemplate='Word: <b>%{y}</b><br>sqrt(OTF) = <b>%{x}</b><extra></extra>',
            width=0.55
        ))

        fig.add_trace(go.Bar(
            y=words,
            x=etf_values,
            orientation='h',
            name='Estimated Term Frequency within the topic (ETF)',
            marker=dict(color=colors[0]),
            hovertemplate='Word: <b>%{y}</b><br>sqrt(ETF) = <b>%{x}</b><extra></extra>',
            width=0.55
        ))

        fig.update_layout(
            title=f'Term Frequencies for archetype of {topic_dict[topic_id].lower()}',
            xaxis_title='Frequency (sqrt scale)',
            yaxis_title='Words',
            barmode='overlay',
            legend=dict(
                x=0.95, 
                y=0.2,  
                xanchor='right',
                yanchor='top',
                ),
            height=500,  
            width=1000,  
            margin=dict(l=30, r=30, t=50, b=50) 
        )
        fig.show()

In [54]:
lda_model, corpus, dictionary = eastern_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
plotly_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#DD3C32','#F3BDBA'], EAST_ARCHETYPES_MAP)

In [55]:
lda_model, corpus, dictionary = western_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
plotly_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#0F89E6', '#9FD2F9'], WEST_ARCHETYPES_MAP)

In [18]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models

In [19]:
pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))

In [20]:
pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

In [56]:
from empath import Empath
lexicon = Empath()

# Select lexical categories of interest
CAT = ['politics', 'leader', 'military', 'heroic', 'law', 'affection', 'help',
        'pride', 'family', 'love', 'power', 'deception', 'friends', 'sadness', 
        'disappointment', 'shame']

# Define groups of similar topics
TOPIC_GROUPS = {
    'love & affection': ['love', 'affection', 'family'],
    'Support & solidarity': ['help', 'friends'],
    'Politics & power': ['politics', 'leader', 'military', 'power'],
    'Deception': ['deception', 'sadness', 'disappointment', 'shame']
}

def semantical_analysis(text, categories):
    """
    Perform a semantical analysis of a text using the Empath lexicon

    Parameters
    text : str, the text to analyze
    categories : list[str], the lexical categories to consider

    Returns
    dict[str, float], the analysis results (only the categories with non-zero values)
    """
    analysis = lexicon.analyze(text, categories=categories, normalize=True)
    none_condition = not text or max(analysis.values()) == 0
    return None if none_condition else {k: v for k, v in analysis.items() if v != 0}

def map_topic_to_group(topic, groups):
    """
    Map a topic to a group of similar topics
    """
    for group, topics in groups.items():
        if topic in topics:
            return group
    return topic  

In [57]:
# Apply the analysis to each row
character_df['empath_analysis'] = character_df['character_representation'].apply(lambda x: semantical_analysis(' '.join(x), CAT))
character_empath_df = character_df.dropna(subset=['empath_analysis']).copy()

print('Total number of characters :', len(character_df))
print('Characters with empath analysis not None :', len(character_empath_df))

print('\nNumber of characters per category :')
for cat in CAT:
    cat_count = character_empath_df['empath_analysis'].apply(lambda x: cat in x.keys()).sum()
    print(f'{cat}: {cat_count}')

character_empath_df.sample(5)

Total number of characters : 7052
Characters with empath analysis not None : 5652

Number of characters per category :
politics: 660
leader: 1057
military: 639
heroic: 3341
law: 830
affection: 638
help: 428
pride: 1695
family: 214
love: 484
power: 890
deception: 541
friends: 315
sadness: 523
disappointment: 165
shame: 218


Unnamed: 0,movie_side,title,release_date,character_name,character_representation,character_side,empath_analysis
8030,Eastern,Hey Babu Riba,1985,Four friends,"[friendship, love, sacrifice, loyalty, camarad...",Eastern,"{'heroic': 0.1111111111111111, 'affection': 0...."
16073,Western,Sharpe's Eagle,1993,Sir Henry Simmerson,"[incompetence, arrogance, antagonist]",Eastern,{'shame': 0.3333333333333333}
7943,Eastern,Hello Moscow!,1945,Kolya Leonov,"[patriotism, community, hero]",Western,"{'politics': 0.3333333333333333, 'heroic': 0.3..."
18741,Eastern,The Conspiracy of Ambassadors,1966,Ambassadors,"[loyalty to the soviet regime, archetype of th...",Eastern,"{'politics': 0.1, 'affection': 0.1, 'help': 0.1}"
13521,Western,Our Man Flint,1966,Derek Flint,"[resourceful spy, individualistic, hero archet...",Western,{'heroic': 0.2}


In [58]:
def get_empath_evolution(df, empath_column, year_column='release_date'):
    """
    Create a DataFrame to track the evolution of empath topics across years
    """
    empath_topics = set()
    for entry in df[empath_column]:
        empath_topics.update(entry.keys())

    # Create a new DataFrame to track topic proportions across years
    empath_evolution = {topic: [] for topic in empath_topics}
    empath_evolution['year'] = []

    # iterate only over years until end of cold war i.e. 1991
    years = [y for y in sorted(df[year_column].unique()) if y <= 1991]

    for year in years:
        year_data = df[df['release_date'] == year][empath_column]
        aggregated = {topic: 0 for topic in empath_topics}
        count = len(year_data)

        for row in year_data:
            for topic, value in row.items():
                aggregated[topic] += value

        if count > 0:
            for topic in aggregated:
                aggregated[topic] /= count

        for topic, value in aggregated.items():
            empath_evolution[topic].append(value)

        empath_evolution['year'].append(year)

    return pd.DataFrame(empath_evolution)

empath_evolution_df = get_empath_evolution(character_empath_df, 'empath_analysis')
melted_empath_df = empath_evolution_df.melt(id_vars=['year'], var_name='topic', value_name='proportion')

# group similar lexicon categories
melted_empath_df['topic'] = melted_empath_df['topic'].apply(map_topic_to_group, groups=TOPIC_GROUPS)
melted_empath_df = melted_empath_df.groupby(['year', 'topic'], as_index=False).agg({'proportion': 'sum'})
# Apply square root transformation to the 'proportion' column
melted_empath_df['proportion_sqrt'] = np.sqrt(melted_empath_df['proportion'])

In [59]:
fig = go.Figure()

# set topic color palette
unique_topics = melted_empath_df['topic'].unique()
color_palette = px.colors.qualitative.Plotly
topic_colors = {topic: color_palette[i] for i, topic in enumerate(unique_topics)}

# Map colors directly in the DataFrame
melted_empath_df['color'] = melted_empath_df['topic'].map(topic_colors)

fig = px.line(
    melted_empath_df,
    x='year',
    y='proportion_sqrt',
    color='topic',
    title='Evolution of Grouped Empath Topics Across Years',
    color_discrete_map=topic_colors
)

fig.update_layout(
    width = 1100,
    height = 600,
    xaxis_title="Movie Release Year",
    yaxis_title="Frequency of a Category",
    legend_title="Categories",
    title={
        "text": "Changing Themes in Character Representation Across Years",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    margin=dict(l=50, r=50, t=50, b=50),  # Adjust margins
    #yaxis=dict(type='log')
)

fig.show()

In [60]:
fig = go.Figure()

unique_years = sorted(melted_empath_df['year'].unique())
max_y = melted_empath_df['proportion_sqrt'].max()

for year in unique_years:
    year_data = melted_empath_df[melted_empath_df['year'] == year]
    grouped_data = year_data.groupby('topic', as_index=False).agg({'proportion_sqrt': 'sum'})
    
    fig.add_trace(go.Bar(
        x=grouped_data['topic'],
        y=grouped_data['proportion_sqrt'],
        name=f"Year {year}",
        visible=(year == unique_years[0]),  # Only show the first year's data initially
        marker=dict(color=[topic_colors[topic] for topic in grouped_data['topic']])
    ))

# Define steps for the slider
steps = []
for i, year in enumerate(unique_years):
    step = dict(
        method="update",
        args=[
            {"visible": [j == i for j in range(len(unique_years))]},  # Toggle visibility for the selected year
            {"title": f"Empath Topic Proportions for Year: {year}"}
        ],
        label=str(year)
    )
    steps.append(step)

# Add the slider to the layout
fig.update_layout(
    sliders=[{
        "active": 0,
        "currentvalue": {"prefix": "Year: "},
        "pad": {"t": 50},
        "steps": steps
    }],
    title={
        "text": "Characters' Empath Features Distribution by Year",
        "x": 0.5,
        "xanchor": "center",
        "font": {"size": 20, "family": "Arial", "weight": "bold"}
    },
    xaxis_title="Empath Topics",
    yaxis_title="Proportion",
    yaxis=dict(range=[0, max_y]),  # Set the fixed y-axis range
    legend_title="Year",
    barmode="group"  # This ensures bars are grouped by topics
)

fig.show()