In [None]:
import pandas as pd
import numpy as np
import ast

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel

import warnings

from src.utils.constants import *
from src.utils.helpers import *

import pickle

warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matteo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
movies = convert_csv(pd.read_csv(PREPROCESSED_MOVIES))
movies.head()

Unnamed: 0,title,languages,countries,genres,release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,,[Russia],"[Comedy, Drama, Crime]",1971,Western,"[Joe Collins, American bank security consultan...","[Dawn Divine, hooker with a heart of gold, cun...",[None],"[Resourcefulness, cleverness, individualism, h...",[None]
1,"$1,000 on the Black","[Italian, German]","[Germany, Italy]",[Western],1966,Eastern,[None],"[Sartana, villainous, oppressive, cruel, arche...","[Johnny Liston, justice, determination, resili...","[Justice, revenge, oppressed vs. oppressor, re...","[Terror, betrayal, familial conflict, crime, r..."
2,"$10,000 Blood Money",,[Russia],"[Drama, Western]",1967,,[None],[None],[None],[None],"[crime, betrayal, revenge, bounty hunter, heis..."
3,"$100,000 for Ringo",[Italian],[Italy],"[Drama, Western]",1965,,[None],[None],[None],[None],"[Western, Civil War, mistaken identity, treasu..."
4,'Anna' i wampir,,[Russia],[Crime],1982,,[None],[None],[None],[None],"[murder mystery, horror, fog, Poland, 1960s]"


In [3]:
def create_theme(df):
    new_df = df[["theme", 'cold_war_side', 'title', 'release_date']]
    # get only Western and Eastern movies
    new_df = new_df[(new_df["cold_war_side"] == "Western") | (new_df["cold_war_side"] == "Eastern")]
    new_df = new_df.rename(columns={'cold_war_side': 'movie_side'})
    new_df = new_df.drop_duplicates(subset=["theme"])
    # be sure theme is a list
    new_df["theme"] = new_df["theme"].apply(lambda x: x if isinstance(x, list) else [x])
    # be sure all elements of the list are strings
    new_df["theme"] = new_df["theme"].apply(lambda x: [str(i) for i in x])
    # use lower case for theme for each element of the list
    new_df["theme"] = new_df["theme"].apply(lambda x: [i.lower() for i in x])
    return new_df

theme_df = create_theme(movies)

In [4]:
theme_df.sample(5)

Unnamed: 0,theme,movie_side,title,release_date
954,"[hussite wars, rebellion, national identity, h...",Eastern,Against All,1957
18697,"[militarism, heroism, sacrifice, national prid...",Eastern,The Combat Unit of a Fighter Plane,1953
11275,"[war, identity, struggle, survival, loyalty ]",Eastern,"Mama, I'm Alive",1977
16594,"[war, sacrifice, love, peace, stalingrad, sold...",Eastern,Soldaty,1957
3022,"[comedy, friendship, post-war american life, a...",Western,Buck Privates Come Home,1947


In [5]:
# Preprocess the text
def preprocess(string_list, nlp, words_to_remove):
    docs = [nlp(s) for s in string_list]
    tokens = []
    for doc in docs:
        tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_ not in words_to_remove])
    return tokens

def topic_detection(df, nb_topics, nb_passes):
    # Create a dictionary and corpus for Gensim
    dictionary = corpora.Dictionary(df)
    corpus = [dictionary.doc2bow(text) for text in df]
    
    # Train LDA model
    lda_model = LdaModel(corpus, num_topics=nb_topics, id2word=dictionary, passes=nb_passes)
    
    # Print the topics
    for idx, topic in lda_model.print_topics(-1):
        print(f'Topic: {idx}\nWords: {topic}\n')
    
    return lda_model, corpus, dictionary

# Assign topics to characters
def get_dominant_topic(lda_model, corpus):
    topics = []
    for row in corpus:
        row = lda_model[row]
        max_topic, max_val = 0, 0
        for (topic_num, prop_topic) in row:
            if prop_topic > max_val:
                max_val = prop_topic
                max_topic = topic_num
        topics.append(max_topic)
    return topics

In [7]:
nlp = spacy.load('en_core_web_sm')
words_to_remove = {'theme', 'soviet', 'ii', 'vs', 'vs.', ' '}

In [8]:
# theme_df['processed_repres'] = theme_df['theme'].apply(preprocess, args=(nlp, words_to_remove))
# theme_topic = topic_detection(theme_df['processed_repres'], 5, 15)
# pickle.dump(theme_topic, open(DATA_FOLDER + "theme_topic3.pkl", "wb",))


Topic: 0
Words: 0.057*"struggle" + 0.047*"personal" + 0.040*"love" + 0.030*"sacrifice" + 0.026*"conflict" + 0.024*"oppression" + 0.022*"social" + 0.019*"family" + 0.018*"identity" + 0.017*"life"

Topic: 1
Words: 0.078*"war" + 0.065*"espionage" + 0.044*"cold" + 0.043*"loyalty" + 0.035*"betrayal" + 0.030*"tension" + 0.027*"military" + 0.027*"sacrifice" + 0.023*"conflict" + 0.019*"identity"

Topic: 2
Words: 0.102*"war" + 0.054*"heroism" + 0.050*"sacrifice" + 0.035*"survival" + 0.026*"resistance" + 0.023*"post" + 0.023*"conflict" + 0.019*"law" + 0.015*"resilience" + 0.013*"enforcement"

Topic: 3
Words: 0.068*"crime" + 0.053*"moral" + 0.049*"justice" + 0.044*"betrayal" + 0.034*"corruption" + 0.028*"morality" + 0.021*"dilemma" + 0.018*"revenge" + 0.018*"investigation" + 0.017*"redemption"

Topic: 4
Words: 0.044*"adventure" + 0.031*"friendship" + 0.027*"cultural" + 0.021*"exploration" + 0.019*"evil" + 0.018*"good" + 0.015*"age" + 0.015*"comedy" + 0.015*"science" + 0.014*"art"



In [10]:
theme_topic = pickle.load(open("src/analysis/themes/theme_topic.pkl", "rb"))

for idx, topic in theme_topic[0].print_topics(-1):
        print(f'Topic: {idx}\nWords: {topic}\n')

theme_df['dominant_topic'] = get_dominant_topic(theme_topic[0], theme_topic[1])

print(theme_df['dominant_topic'].value_counts())

Topic: 0
Words: 0.094*"war" + 0.045*"espionage" + 0.038*"heroism" + 0.034*"sacrifice" + 0.031*"cold" + 0.030*"betrayal" + 0.024*"survival" + 0.021*"tension" + 0.020*"loyalty" + 0.020*"adventure"

Topic: 1
Words: 0.051*"justice" + 0.043*"crime" + 0.041*"moral" + 0.029*"betrayal" + 0.028*"morality" + 0.026*"oppression" + 0.025*"conflict" + 0.024*"corruption" + 0.018*"struggle" + 0.017*"redemption"

Topic: 2
Words: 0.039*"personal" + 0.038*"love" + 0.038*"sacrifice" + 0.031*"struggle" + 0.027*"identity" + 0.023*"war" + 0.021*"family" + 0.018*"cultural" + 0.017*"conflict" + 0.016*"life"

Topic: 3
Words: 0.039*"political" + 0.026*"revolution" + 0.025*"historical" + 0.017*"struggle" + 0.016*"mystery" + 0.015*"satire" + 0.014*"humor" + 0.014*"investigation" + 0.012*"comedy" + 0.012*"deception"

dominant_topic
2    1877
0    1635
1    1475
3     783
Name: count, dtype: int64


In [11]:
eastern_theme = theme_df[theme_df["movie_side"] == 'Eastern']
western_theme = theme_df[theme_df["movie_side"] == 'Western']

# count dominant topic for each df above
print("Eastern themes")
print(eastern_theme['dominant_topic'].value_counts())

print("Western themes")
print(western_theme['dominant_topic'].value_counts())

Eastern themes
dominant_topic
2    1214
0     599
1     559
3     458
Name: count, dtype: int64
Western themes
dominant_topic
0    1036
1     916
2     663
3     325
Name: count, dtype: int64


In [12]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models

In [14]:
# pyLDAvis.save_html(gensim_models.prepare(theme_topic[0], theme_topic[1], theme_topic[2]), "plots/theme_topic_raw.html")

pyLDAvis.display(gensim_models.prepare(theme_topic[0], theme_topic[1], theme_topic[2]))


In [16]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def to_topic_list(series):
    # Given a Series from value_counts() indexed by topics, return a list 
    # of counts for topics [0,1,2,3] in that exact order.
    return [series.get(i, 0) for i in [0, 1, 2, 3]]

begin_cold_war = (1947,1953)
crisis = (1954,1962)
detent = (1963,1974)
second_cold_war = (1975,1984)
end = (1985,1991)

periods = ["Begin (47-53)", "Crisis (54-62)", "Detent (63-74)", "Second CW (75-84)", "End (85-91)"]

topics = {0: "Romance & Social Dramas", 1: "War & Spy Dramas", 2: "Crime & Moral Thrillers", 3: "Political Dramas"}

eastern_theme_begin = eastern_theme[eastern_theme["release_date"].apply(lambda x: x >= begin_cold_war[0] and x <= begin_cold_war[1])]
eastern_theme_crisis = eastern_theme[eastern_theme["release_date"].apply(lambda x: x >= crisis[0] and x <= crisis[1])]
eastern_theme_detent = eastern_theme[eastern_theme["release_date"].apply(lambda x: x >= detent[0] and x <= detent[1])]
eastern_theme_second = eastern_theme[eastern_theme["release_date"].apply(lambda x: x >= second_cold_war[0] and x <= second_cold_war[1])]
eastern_theme_end = eastern_theme[eastern_theme["release_date"].apply(lambda x: x >= end[0] and x <= end[1])]
                                  
western_theme_begin = western_theme[western_theme["release_date"].apply(lambda x: x >= begin_cold_war[0] and x <= begin_cold_war[1])]
western_theme_crisis = western_theme[western_theme["release_date"].apply(lambda x: x >= crisis[0] and x <= crisis[1])]
western_theme_detent = western_theme[western_theme["release_date"].apply(lambda x: x >= detent[0] and x <= detent[1])]
western_theme_second = western_theme[western_theme["release_date"].apply(lambda x: x >= second_cold_war[0] and x <= second_cold_war[1])]
western_theme_end = western_theme[western_theme["release_date"].apply(lambda x: x >= end[0] and x <= end[1])]

eastern_counts = {
    "Begin (47-53)": to_topic_list(eastern_theme_begin['dominant_topic'].value_counts()),
    "Crisis (54-62)": to_topic_list(eastern_theme_crisis['dominant_topic'].value_counts()),
    "Detent (63-74)": to_topic_list(eastern_theme_detent['dominant_topic'].value_counts()),
    "Second CW (75-84)": to_topic_list(eastern_theme_second['dominant_topic'].value_counts()),
    "End (85-91)": to_topic_list(eastern_theme_end['dominant_topic'].value_counts())
}

western_counts = {
    "Begin (47-53)": to_topic_list(western_theme_begin['dominant_topic'].value_counts()),
    "Crisis (54-62)": to_topic_list(western_theme_crisis['dominant_topic'].value_counts()),
    "Detent (63-74)": to_topic_list(western_theme_detent['dominant_topic'].value_counts()),
    "Second CW (75-84)": to_topic_list(western_theme_second['dominant_topic'].value_counts()),
    "End (85-91)": to_topic_list(western_theme_end['dominant_topic'].value_counts())
}


eastern_df = pd.DataFrame(eastern_counts, index=[topic for topic in topics.values()]).T
western_df = pd.DataFrame(western_counts, index=[topic for topic in topics.values()]).T

print("Eastern themes", eastern_df)

print("Western themes", western_df)

topic_colors = {
    topics[0]: "#1F6B5D",   
    topics[1]: "#06DD95",       
    topics[2]: "#FFE989",     
    topics[3]: "#F58634"  
}

Eastern themes                    Romance & Social Dramas  War & Spy Dramas  \
Begin (47-53)                           46                25   
Crisis (54-62)                         134               110   
Detent (63-74)                         217               178   
Second CW (75-84)                      130               139   
End (85-91)                             53                93   

                   Crime & Moral Thrillers  Political Dramas  
Begin (47-53)                           99                32  
Crisis (54-62)                         290               102  
Detent (63-74)                         335               149  
Second CW (75-84)                      301                95  
End (85-91)                            155                68  
Western themes                    Romance & Social Dramas  War & Spy Dramas  \
Begin (47-53)                          138               153   
Crisis (54-62)                         199               205   
Detent (63-74) 

In [17]:
# Create subplots: one for Eastern and one for Western
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Eastern Theme Topics Over Periods", "Western Theme Topics Over Periods"]
)

# Add stacked bars for Eastern (with legend)
for topic in eastern_df.columns:
    fig.add_trace(
        go.Bar(
            x=periods,
            y=eastern_df[topic],
            name=topic,
            marker_color=topic_colors[topic],
            hovertemplate='Period: %{x}<br>Topic: '+topic+'<br>Count: %{y}<extra></extra>'
        ),
        row=1, col=1
    )

# Add stacked bars for Western (no legend)
for topic in western_df.columns:
    fig.add_trace(
        go.Bar(
            x=periods,
            y=western_df[topic],
            name=topic,
            marker_color=topic_colors[topic],
            hovertemplate='Period: %{x}<br>Topic: '+topic+'<br>Count: %{y}<extra></extra>',
            showlegend=False  # Disable legend here
        ),
        row=1, col=2
    )

fig.update_layout(
    barmode='stack',
    title_text='Distribution of Topics in Eastern vs Western Themed Movies Over Cold War Periods',
    # center the title
    # title_x=0.5,
    height=600, width=1200,
    legend_title_text='Topics'
)

fig.update_xaxes(title_text="Cold War Periods", row=1, col=1)
fig.update_yaxes(title_text="Number of Movies", row=1, col=1)

fig.update_xaxes(title_text="Cold War Periods", row=1, col=2)
fig.update_yaxes(title_text="Number of Movies", row=1, col=2)

fig.show()

In [22]:
eastern_percent = eastern_df.div(eastern_df.sum(axis=1), axis=0) * 100
western_percent = western_df.div(western_df.sum(axis=1), axis=0) * 100

print("Eastern themes", eastern_percent.astype(int))
print("Western themes", western_percent.astype(int))

# change background color

# Create subplots: one for Eastern and one for Western
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=["Eastern Theme Topics (Percentage)", "Western Theme Topics (Percentage)"]
    
)

# Add stacked bars for Eastern (with legend)
for topic in eastern_percent.columns:
    fig.add_trace(
        go.Bar(
            x=periods,
            y=eastern_percent[topic],
            name=topic,
            marker_color=topic_colors[topic],
            hovertemplate='Period: %{x}<br>Topic: '+topic+'<br>Percentage: %{y:.1f}%<extra></extra>'
        ),
        row=1, col=1
    )

# Add stacked bars for Western (no legend to avoid duplication)
for topic in western_percent.columns:
    fig.add_trace(
        go.Bar(
            x=periods,
            y=western_percent[topic],
            name=topic,
            marker_color=topic_colors[topic],
            hovertemplate='Period: %{x}<br>Topic: '+topic+'<br>Percentage: %{y:.1f}%<extra></extra>',
            showlegend=False
        ),
        row=1, col=2
    )

fig.update_layout(
    barmode='stack',
    title_text='Percentage Distribution of Topics in Eastern vs Western Themed Movies Over Cold War Periods',
    height=600, width=1200,
    legend_title_text='Topics',
    font_family="Helvetica",
    plot_bgcolor='#F2F2F2',
)

fig.update_xaxes(title_text="Cold War Periods", row=1, col=1)
fig.update_yaxes(title_text="Percentage of Movies", ticksuffix='%', range=[0,100], row=1, col=1)

fig.update_xaxes(title_text="Cold War Periods", row=1, col=2)
fig.update_yaxes(title_text="Percentage of Movies", ticksuffix='%', range=[0,100], row=1, col=2)

# save as html
# fig.write_html("plots/theme_topics.html")

fig.show()

Eastern themes                    Romance & Social Dramas  War & Spy Dramas  \
Begin (47-53)                           22                12   
Crisis (54-62)                          21                17   
Detent (63-74)                          24                20   
Second CW (75-84)                       19                20   
End (85-91)                             14                25   

                   Crime & Moral Thrillers  Political Dramas  
Begin (47-53)                           49                15  
Crisis (54-62)                          45                16  
Detent (63-74)                          38                16  
Second CW (75-84)                       45                14  
End (85-91)                             42                18  
Western themes                    Romance & Social Dramas  War & Spy Dramas  \
Begin (47-53)                           31                34   
Crisis (54-62)                          32                33   
Detent (63-74) 

In [None]:
lda_model, corpus, dictionary = eastern_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
plotly_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#DD3C32','#F3BDBA'], EAST_ARCHETYPES_MAP)

In [None]:
lda_model, corpus, dictionary = western_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
plotly_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#0F89E6', '#9FD2F9'], WEST_ARCHETYPES_MAP)

In [None]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models

In [None]:
pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))