In [305]:
import pandas as pd
import numpy as np
import ast

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [306]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "v1_movies_cleaned.csv")

for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except:
        pass
    print(col, type(movies[col][0]))

title <class 'str'>
languages <class 'list'>
countries <class 'list'>
genres <class 'list'>
keywords <class 'float'>
release_date <class 'str'>
plot_summary <class 'str'>
year_release_date <class 'numpy.int64'>
cold_war_side <class 'str'>
character_western_bloc_representation <class 'list'>
character_eastern_bloc_representation <class 'list'>
western_bloc_values <class 'list'>
eastern_bloc_values <class 'list'>
theme <class 'list'>


In [307]:
movies.head(2)

Unnamed: 0,title,languages,countries,genres,keywords,release_date,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,[English],[United States of America],"[Drama, Comedy, Action, Thriller, Heist, Crime...",,1971-12-17,"Set in Hamburg, West Germany, several criminal...",1971,Western,"[Joe Collins, American bank security consultan...","[Sarge, corrupt U.S. Army sergeant, values rut...","[Cunning, heroism, cleverness, survival, Antih...","[Ruthlessness, violence, greed, betrayal, Anti...","[Heist, crime, betrayal, survival, tension]"
1,"$1,000 on the Black","[Deutsch, Italiano]","[Germany, Italy]",[Western],,1966-12-18,Johnny Liston has just been released from pris...,1966,Western,"[Johnny Liston, justice, redemption, hero]","[Sartana, tyranny, betrayal, antagonist]","[Justice, redemption, individualism, personal ...","[Tyranny, fear, betrayal, oppression]","[Revenge, self-discovery, moral conflict, hero..."


In [308]:
# helpers 
VOC_MAPPING = {
    'anti-': 'anti',
    'anti hero': 'antihero',
    'archetypes': 'archetype',
    'archetype:': 'archetype',
}

# Function to split the row list into character name and character representation
def extract_character_name_and_attributes(row):
    if isinstance(row, list) and len(row) > 0 and row[0] is not None:
        character_name = row[0]
        character_representation = row[1:]
        return pd.Series([character_name, character_representation])
    else:
        return pd.Series([None, None])

# Function to apply the mapping with substring matching
def apply_mapping_with_substrings(string_list, mapping):
    updated = []
    for s in string_list:
        new_s = s
        for key, value in mapping.items():
            if key in s:
                new_s = s.replace(key, value)
        updated.append(new_s.lower())
    return updated

In [309]:
def create_character_df(original_df, character_column, character_side):
    # create the new dataframe
    new_df = original_df[[character_column, 'cold_war_side', 'title']]
    new_df = new_df.rename(columns={'cold_war_side': 'movie_side'})
    new_df[['character_name', 'character_representation']] = \
        new_df[character_column].apply(extract_character_name_and_attributes)
    new_df = new_df.drop(columns=[character_column])
    new_df['character_side'] = character_side

    # clean the obtained dataframe
    new_df = new_df.drop_duplicates(subset=['character_name'])
    new_df = new_df[new_df['character_representation'].apply(lambda x: len(x) > 0 if isinstance(x, list) else False)]
    new_df = new_df[new_df['character_name'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0) <= 5]
    new_df['character_representation'] = new_df['character_representation'].apply(
        lambda x: apply_mapping_with_substrings(x, VOC_MAPPING)
    )
    return new_df

In [310]:
character_eastern = create_character_df(movies, 'character_eastern_bloc_representation', 'Eastern')
character_western = create_character_df(movies, 'character_western_bloc_representation', 'Western')

In [311]:
character_eastern.sample(5)

Unnamed: 0,movie_side,title,character_name,character_representation,character_side
29109,Eastern,There Were Seven Simeons,family,"[hope, despair, escape, archetype of the oppre...",Eastern
26274,Western,The Long Day's Dying,German captive,"[conflict, survival, prisoner archetype]",Eastern
12715,Eastern,Klyuchi ot neba,Lieutenant Kirillov,"[disciplined, loyal, authority figure]",Eastern
23017,Eastern,Tetri karavani,Gela,"[ambitious, urban-centric values, tragic hero ...",Eastern
2277,Eastern,Atlari yaharlayin,middle-aged man,"[justice, rebellion, hero]",Eastern


In [312]:
character_western.sample(5)

Unnamed: 0,movie_side,title,character_name,character_representation,character_side
24814,Western,The Farmer's Daughter,Katie Holstrom,"[down-to-earth, common sense, heroine]",Western
16903,Western,Nothing Underneath,Yellowstone park ranger,"[heroism, justice, protector]",Western
29382,Western,Thunderbirds Are GO,Scott Tracy,"[bravery, teamwork, archetypal hero]",Western
37,Western,008: Operation Exterminate,006 (Alberto Lupo),"[british intelligence, resourceful, hero arche...",Western
30554,Western,V.I. Warshawski,"Victoria ""V.I."" Warshawski","[independence, determination, detective archet...",Western


In [313]:
character_df = pd.concat([character_eastern, character_western], axis=0)
character_df.sample(5)

Unnamed: 0,movie_side,title,character_name,character_representation,character_side
11753,Western,It's a Wonderful Life,Mr. Potter,"[values greed, corruption, villain archetype]",Eastern
27020,Western,The Osterman Weekend,Maxwell Danforth,"[cia director, ruthless and manipulative, arch...",Eastern
6516,Eastern,Diamonds for the Dictatorship of the Proletariat,Cheka,"[law and order, detective archetype]",Eastern
20277,Eastern,Sasek a královna,Bolek Polivka,"[czech jester, satire against foreign occupati...",Western
12711,Eastern,Kleinhoff Hotel,Karl,"[revolutionary, idealism, tragic hero]",Western


In [314]:
nlp = spacy.load('en_core_web_sm')
words_to_remove = {'archetype', 'hero', 'value', 'values', 'None'}

# Preprocess the text
def preprocess(string_list):
    docs = [nlp(s) for s in string_list]
    tokens = []
    for doc in docs:
        tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_ not in words_to_remove])
    return tokens

# Apply preprocessing
character_df['processed_repres'] = character_df['character_representation'].apply(preprocess)

In [321]:
# Create a dictionary and corpus for Gensim
dictionary = corpora.Dictionary(character_df['processed_repres'])
corpus = [dictionary.doc2bow(text) for text in character_df['processed_repres']]

# Train LDA model
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx}\nWords: {topic}\n')

# Assign topics to characters
def get_dominant_topic(lda_model, corpus, texts):
    topics = []
    for i, row in enumerate(corpus):
        row = lda_model[row]
        max_topic, max_val = 0, 0
        for j, (topic_num, prop_topic) in enumerate(row):
            if prop_topic > max_val:
                max_val = prop_topic
                max_topic = topic_num
        topics.append(max_topic)
    return topics

character_df['dominant_topic'] = get_dominant_topic(lda_model, corpus, character_df['processed_repres'])

# Display the dataframe with topics
character_df.sample(10)

Topic: 0
Words: 0.039*"courage" + 0.031*"revolutionary" + 0.030*"leader" + 0.020*"patriotism" + 0.020*"collective" + 0.019*"martyr" + 0.019*"leadership" + 0.017*"order" + 0.015*"military" + 0.014*"communist"

Topic: 1
Words: 0.136*"justice" + 0.037*"law" + 0.027*"integrity" + 0.027*"truth" + 0.026*"enforcement" + 0.024*"anti" + 0.022*"protector" + 0.021*"ambition" + 0.021*"moral" + 0.019*"leadership"

Topic: 2
Words: 0.050*"antagonist" + 0.048*"antihero" + 0.047*"villain" + 0.027*"authority" + 0.023*"tragic" + 0.018*"traditional" + 0.017*"power" + 0.017*"figure" + 0.016*"oppression" + 0.015*"greed"

Topic: 3
Words: 0.077*"heroism" + 0.072*"loyalty" + 0.061*"bravery" + 0.042*"sacrifice" + 0.027*"duty" + 0.026*"honor" + 0.022*"soldier" + 0.020*"camaraderie" + 0.018*"antihero" + 0.016*"resourcefulness"

Topic: 4
Words: 0.049*"individualism" + 0.047*"resilience" + 0.038*"freedom" + 0.031*"determination" + 0.029*"american" + 0.025*"everyman" + 0.025*"romantic" + 0.022*"protagonist" + 0.020*

Unnamed: 0,movie_side,title,character_name,character_representation,character_side,processed_repres,dominant_topic
4954,Western,Clash by Night,Earl Pfeiffer,"[bitter and dissatisfied, value of cynicism an...",Eastern,"[bitter, dissatisfied, cynicism, disillusionme...",2
22242,Eastern,Struggle in the Desert,People's Liberation Army,"[collectivism, heroism, revolutionary archetype]",Western,"[collectivism, heroism, revolutionary]",0
29410,Eastern,Tiempo de morir,Juan,"[seeking redemption, justice, victim of societ...",Eastern,"[seek, redemption, justice, victim, societal, ...",4
4257,Western,Captain Blood,Captain Blood,"[freedom, bravery, heroism, archetypal hero]",Western,"[freedom, bravery, heroism, archetypal]",3
3894,Western,Brubaker,Larry Lee Bullen,"[reform, loyalty, ally]",Eastern,"[reform, loyalty, ally]",3
6221,,Deewar,Vijay Verma,"[smuggler, representing the struggle against s...",Eastern,"[smuggler, represent, struggle, societal, inju...",4
1660,Western,American Ninja 2: The Confrontation,corruption,"[betrayal, archetype antagonist]",Eastern,"[betrayal, antagonist]",2
27568,Eastern,The Road I Have Found,Young Korean worker,"[struggle against oppressors, heroism]",Eastern,"[struggle, oppressor, heroism]",2
24379,Western,The Day of the Jackal,The Jackal,"[values secrecy, violence, archetype assassin]",Eastern,"[secrecy, violence, assassin]",2
5808,Eastern,Dao,Iron Head,"[vengeance, aggressive protector]",Eastern,"[vengeance, aggressive, protector]",3


In [322]:
# plot topics
import pyLDAvis
from pyLDAvis import gensim_models
data =  gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(data)

In [324]:
print(character_df['dominant_topic'].value_counts())

dominant_topic
4    2094
3    1933
2    1751
1    1371
0    1094
Name: count, dtype: int64


In [325]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [328]:
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(character_df['processed_repres'].astype(str))

lda = LatentDirichletAllocation(n_components=5, random_state=42) 
lda.fit(X)

# Display the top 10 words for each topic
words = vectorizer.get_feature_names_out()
topics = []
for topic_idx, topic in enumerate(lda.components_):
    top_words = [words[i] for i in topic.argsort()[-10:]]  
    topics.append(f"Topic {topic_idx+1}: {' '.join(top_words)}")

for topic in topics:
    print(topic)

# Assign each character to a topic
character_topics = lda.transform(X) 
character_df['topic'] = np.argmax(character_topics, axis=1)  

character_df.sample(10)

Topic 1: traditional represent truth leadership heroic justice oppression family duty everyman
Topic 2: determination redemption enforcement antihero power integrity authority law moral justice
Topic 3: victim protagonist rebel courage revolutionary sacrifice freedom bravery resilience heroism
Topic 4: self conflict honor soldier sacrifice mentor romantic american love tragic
Topic 5: cunne control greed resourcefulness manipulation individualism antihero villain antagonist loyalty


Unnamed: 0,movie_side,title,character_name,character_representation,character_side,processed_repres,dominant_topic,topic
18358,Eastern,Pokrovsky Gates,Margarita Pavlovna,"[controlling nature, traditional values, antag...",Eastern,"[control, nature, traditional, antagonist]",2,4
17024,Eastern,Obochina,Pyotr Demyanovich Volovich,"[corruption, antihero]",Eastern,"[corruption, antihero]",2,4
4448,Western,Cat Ballou,Clay Boone,"[reluctant hero, loyal friend]",Eastern,"[reluctant, loyal, friend]",3,0
32277,Eastern,Zhivyot takoy paren,Altai truck driver,"[kind, outgoing, hero archetype]",Eastern,"[kind, outgoing]",4,0
1971,Western,Another Country,Tommy Judd,"[marxist, outsider, challenging the system, none]",Eastern,"[marxist, outsider, challenge, system]",4,1
18490,Eastern,Posledniy ugon,Red Cavalry,"[good vs. bad, heroism]",Eastern,"[good, vs., bad, heroism]",1,1
19763,Eastern,Rok,Soviet rock musicians,"[artistic expression, collectivism]",Eastern,"[artistic, expression, collectivism]",4,3
10052,Eastern,Hatey Bazarey,Lacchmanlal,"[malicious, entitled, rogue]",Eastern,"[malicious, entitle, rogue]",1,4
21529,Western,Society,Bill Whitney,"[wealth, privilege, alienated youth, archetype...",Western,"[wealth, privilege, alienated, youth, seek, tr...",1,0
23842,Western,The Bridges at Toko-Ri,Harry Brubaker,"[values of duty, courage, and resilience, arch...",Western,"[duty, courage, resilience, reluctant]",3,2


In [329]:
topic_counts = character_df['topic'].value_counts().sort_index()
topic_counts

topic
0    1357
1    1332
2    2134
3    1501
4    1919
Name: count, dtype: int64