In [165]:
import pandas as pd
import numpy as np
from src.utils.helpers import clean_column_values
import warnings
warnings.filterwarnings('ignore')

In [166]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "v1_movies.csv")
movies.head(2)

Unnamed: 0,wikipedia_id,freebase_id,title,languages,countries,genres,keywords,release_date,runtime,plot_summary,year_release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc values,theme
0,4213160.0,/m/0bq8q8,$,['English'],['United States of America'],"['Drama', 'Comedy', 'Action', 'Thriller', 'Hei...",,1971-12-17,119.0,"Set in Hamburg, West Germany, several criminal...",1971,Western,"['Joe Collins', 'American bank security consul...","['Sarge', 'corrupt U.S. Army sergeant', 'value...","['Cunning', 'heroism', 'cleverness', 'survival...","['Ruthlessness', 'violence', 'greed', 'betraya...","['Heist', 'crime', 'betrayal', 'survival', 'te..."
1,,,"$1,000 on the Black","['Deutsch', 'Italiano']","['Italy', 'Germany']",['Western'],,1966-12-18,104.0,Johnny Liston has just been released from pris...,1966,Western,"['Johnny Liston', 'justice', 'redemption', 'he...","['Sartana', 'tyranny', 'betrayal', 'antagonist']","['Justice', 'redemption', 'individualism', 'pe...","['Tyranny', 'fear', 'betrayal', 'oppression']","['Revenge', 'self-discovery', 'moral conflict'..."


In [167]:
movies['character_eastern_bloc_representation'] = \
    movies['character_eastern_bloc_representation'].apply(clean_column_values)

filtered_data = \
    [line for line in movies['character_eastern_bloc_representation'] if isinstance(line, list) and 'None' not in line]

character_names = []
character_representations = []

for line in filtered_data:
    character_names.append(line[0])
    character_representations.append(line[1:])

character_df = pd.DataFrame({'character': character_names, 'representation': character_representations})
character_df = character_df.drop_duplicates(subset=['character'])
character_df = character_df[character_df['representation'].apply(lambda x: len(x) > 0)]
character_df.head()

Unnamed: 0,character,representation
0,Sarge,"[corrupt U.S. Army sergeant, values ruthlessne..."
1,Sartana,"[tyranny, betrayal, antagonist]"
2,Mr. Lacey,"[corporate greed, villain, antagonist archetype]"
3,border guard,"[authority, duty, conformity]"
4,KGB authorities,"[authority, state control, antagonist]"


In [168]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [169]:
# Get the list of stopwords
stop_words = set(stopwords.words('english'))

# Function to replace specific phrases
def replace_phrases(text):
    replacements = {
        'anti hero': 'antihero',
        'anti-hero': 'antihero'
        # Add more replacements as needed
    }
    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        phrase = words[i]
        if i + 1 < len(words):
            phrase += ' ' + words[i + 1]
        if phrase.lower() in replacements:
            new_words.append(replacements[phrase.lower()])
            i += 2  # Skip the next word since it's part of the phrase
        else:
            new_words.append(words[i])
            i += 1
    return ' '.join(new_words)

def clean_group_of_words(text, words_to_rmv=[]):
    text = replace_phrases(text)
    words = text.split()
    # Filter out words_to_rmv and english stop words
    filtered_words = [word for word in words if word.lower() not in stop_words and word.lower() not in words_to_rmv]
    # Join the list back into a string
    return ' '.join(filtered_words).lower()

# Function to remove 'archetype' and stop words
def clean_text(text, words_to_rmv=[]):
    
    if isinstance(text, list):
        for i in range(len(text)):
            text[i] = clean_group_of_words(text[i], words_to_rmv)
        return text
    
    if isinstance(text, str):
        return clean_group_of_words(text, words_to_rmv)

to_remove = ['values', 'value', 'archetype', 'archetype:', 'archetypes']
character_df['representation'] = character_df['representation'].apply(clean_text, words_to_rmv=to_remove)
character_df.head(10)

Unnamed: 0,character,representation
0,Sarge,"[corrupt u.s. army sergeant, ruthlessness, vio..."
1,Sartana,"[tyranny, betrayal, antagonist]"
2,Mr. Lacey,"[corporate greed, villain, antagonist]"
3,border guard,"[authority, duty, conformity]"
4,KGB authorities,"[authority, state control, antagonist]"
5,Paradowski and Borovin,"[brave cosmonauts, hero]"
6,008 (Ingrid Schoeller),"[british intelligence, brave, female hero]"
7,Pioneers of the March independence movement,"[struggle independence, resilience, heroism]"
8,step brother,"[seeking attention, rebellious, jealous, youth]"
9,Ostap Bender,"[cunning, adaptability, trickster]"


In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation

In [171]:
# Step 2: Feature Extraction
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(character_df['representation'].astype(str))

# Step 3: Clustering
kmeans = KMeans(n_clusters=8, random_state=42)
character_df['cluster'] = kmeans.fit_predict(X)

# Step 4: Topic Detection (Optional)
lda = LatentDirichletAllocation(n_components=8, random_state=42)
lda.fit(X)

# Display the topics
for index, topic in enumerate(lda.components_):
    print(f'Topic {index}:')
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])

character_df.head(20)

Topic 0:
['oppressive', 'authoritarian', 'power', 'tyranny', 'authoritarianism', 'authority', 'control', 'antagonist', 'oppression', 'villain']
Topic 1:
['cultural', 'nationalism', 'moral', 'integrity', 'strength', 'leader', 'martyr', 'trickster', 'patriotism', 'unity']
Topic 2:
['independence', 'ideals', 'communism', 'loyalty', 'nurturing', 'revolutionary', 'hero', 'resilience', 'romantic', 'love']
Topic 3:
['power', 'cunning', 'antihero', 'betrayal', 'ambition', 'manipulation', 'corruption', 'greed', 'villain', 'antagonist']
Topic 4:
['freedom', 'rebel', 'femme', 'fatale', 'hero', 'victim', 'resilience', 'innocence', 'resistance', 'determination']
Topic 5:
['personal', 'compassion', 'antihero', 'freedom', 'ally', 'ruthlessness', 'conflict', 'hero', 'rebellion', 'individualism']
Topic 6:
['family', 'revenge', 'totalitarianism', 'order', 'truth', 'enforcement', 'authority', 'justice', 'hero', 'law']
Topic 7:
['soldier', 'honor', 'tragic', 'duty', 'collectivism', 'bravery', 'heroism', '

Unnamed: 0,character,representation,cluster
0,Sarge,"[corrupt u.s. army sergeant, ruthlessness, vio...",3
1,Sartana,"[tyranny, betrayal, antagonist]",1
2,Mr. Lacey,"[corporate greed, villain, antagonist]",1
3,border guard,"[authority, duty, conformity]",3
4,KGB authorities,"[authority, state control, antagonist]",1
5,Paradowski and Borovin,"[brave cosmonauts, hero]",7
6,008 (Ingrid Schoeller),"[british intelligence, brave, female hero]",7
7,Pioneers of the March independence movement,"[struggle independence, resilience, heroism]",2
8,step brother,"[seeking attention, rebellious, jealous, youth]",3
9,Ostap Bender,"[cunning, adaptability, trickster]",4


In [172]:
# count the number of characters in each cluster
cluster_counts = character_df['cluster'].value_counts().sort_index()
cluster_counts

cluster
0      88
1     435
2     211
3    2436
4      73
5     366
6     154
7     507
Name: count, dtype: int64

In [174]:
from sklearn.feature_extraction.text import CountVectorizer
# Combine list of representation into single string for each character
character_df['representation_text'] = character_df['representation'].apply(lambda x: ' '.join(x))

# Use CountVectorizer to convert text data into a bag-of-words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(character_df['representation_text'])

# Apply Latent Dirichlet Allocation (LDA) for topic detection
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # 3 topics
lda.fit(X)

# Display the top words for each topic
words = vectorizer.get_feature_names_out()
topics = []
for topic_idx, topic in enumerate(lda.components_):
    top_words = [words[i] for i in topic.argsort()[-10:]]  # Top 10 words per topic
    topics.append(f"Topic {topic_idx+1}: {' '.join(top_words)}")

# Output topics
for topic in topics:
    print(topic)

# Assign each character to a topic
character_topics = lda.transform(X)  # Get topic distribution per character
character_df['topic'] = np.argmax(character_topics, axis=1)  # Assign the topic with the highest weight

# Print dataframe with topics
character_df[['character', 'topic']]

Topic 1: tragic resilience duty mentor bravery collectivism heroism sacrifice hero loyalty
Topic 2: loyalty anti hero rebellion resistance greed freedom corruption villain antagonist
Topic 3: authoritarian anti authoritarianism ambition manipulation control power oppression antagonist villain
Topic 4: struggle identity integrity traditional social family hero everyman resilience justice
Topic 5: individualism figure innocence survival romantic love antihero tragic authority hero


Unnamed: 0,character,topic
0,Sarge,2
1,Sartana,2
2,Mr. Lacey,1
3,border guard,0
4,KGB authorities,2
...,...,...
5080,The three sons,3
5081,Berci,4
5082,Estonian fishermen,1
5085,Dragon,2


In [175]:
topic_counts = character_df['topic'].value_counts().sort_index()
topic_counts

topic
0    1110
1     536
2     950
3     656
4    1018
Name: count, dtype: int64