In [5]:
import pandas as pd
import numpy as np
import spacy
import ast

import pyLDAvis
from pyLDAvis import gensim_models

from characters_topic_detection import *
from characters_plots import *
from characters_semantical_analysis import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Character Analysis

In [6]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "preprocessed_movies.csv")


def _parse_literal(value):
    """Evaluate a string cell as a Python literal; return non-strings unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# Parse string cells that hold Python literals back into Python objects
# (presumably list/dict columns serialised as text in the CSV -- confirm
# against the preprocessing step).
# The conversion is all-or-nothing per column: if any cell of a column is not
# a valid literal, the whole column is left exactly as loaded.
for col in movies.columns:
    try:
        movies[col] = movies[col].apply(_parse_literal)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval is documented to raise on
        # malformed input. Catching only these keeps the best-effort parsing
        # while no longer hiding KeyboardInterrupt or unrelated bugs, which
        # the previous bare `except:` silently swallowed.
        continue

In [7]:
# One character frame per bloc, built from the matching representation column.
character_eastern = create_character_df(
    movies, 'character_eastern_bloc_representation', 'Eastern'
)
character_western = create_character_df(
    movies, 'character_western_bloc_representation', 'Western'
)

# Row-wise stack of both blocs; the original row indices are kept.
character_df = pd.concat((character_eastern, character_western))

In [4]:
# Row counts, one per line: combined frame, then Eastern, then Western.
print(*map(len, (character_df, character_eastern, character_western)), sep='\n')

7052
3726
3326


## Topic Detection

In [5]:
# spaCy pipeline handed to the archetype helper (also reused for the Western run).
nlp = spacy.load('en_core_web_sm')

# Words passed to the helper as `words_to_remove`; presumably filtered out
# before topic modelling -- confirm in characters_topic_detection.
words_to_remove = {
    'archetype', 'hero', 'heroine', 'heroism',
    'value', 'values', 'none', 'represent',
    'figure', 'desire', 'seek', 'seeker',
}

# NOTE(review): 3 and 15 are positional tuning values of the helper. The
# printed output lists three topics, so 3 is presumably the topic count --
# confirm both against the helper's signature.
archetypes_eastern, eastern_char_topic = get_main_character_archetypes(
    character_eastern, 3, 15, nlp, words_to_remove
)

Topic: 0
Words: 0.034*"loyalty" + 0.029*"sacrifice" + 0.026*"resilience" + 0.021*"love" + 0.020*"struggle" + 0.020*"tragic" + 0.016*"revolutionary" + 0.015*"freedom" + 0.013*"romantic" + 0.013*"bravery"

Topic: 1
Words: 0.022*"loyalty" + 0.022*"mentor" + 0.019*"justice" + 0.013*"individualism" + 0.012*"moral" + 0.012*"resilience" + 0.011*"tragic" + 0.011*"protagonist" + 0.011*"community" + 0.011*"authority"

Topic: 2
Words: 0.071*"antagonist" + 0.060*"villain" + 0.028*"antihero" + 0.022*"collectivism" + 0.022*"authority" + 0.020*"authoritarian" + 0.020*"manipulation" + 0.018*"power" + 0.018*"ambition" + 0.018*"oppression"



In [6]:
# eastern_char_topic holds exactly three items (model, corpus, dictionary);
# forward them positionally to pyLDAvis.
pyLDAvis.display(gensim_models.prepare(*eastern_char_topic))

In [7]:
# Same settings as the Eastern run, applied to the Western characters.
archetypes_western, western_char_topic = get_main_character_archetypes(
    character_western,
    3,
    15,
    nlp,
    words_to_remove,
)

Topic: 0
Words: 0.074*"bravery" + 0.074*"individualism" + 0.034*"determination" + 0.025*"leadership" + 0.024*"courage" + 0.022*"honor" + 0.021*"loyalty" + 0.014*"soldier" + 0.013*"resourcefulness" + 0.013*"sacrifice"

Topic: 1
Words: 0.041*"loyalty" + 0.034*"resilience" + 0.027*"duty" + 0.026*"protector" + 0.022*"heroic" + 0.019*"everyman" + 0.019*"tragic" + 0.019*"family" + 0.017*"camaraderie" + 0.017*"leader"

Topic: 2
Words: 0.095*"justice" + 0.040*"antihero" + 0.033*"american" + 0.031*"freedom" + 0.024*"law" + 0.021*"integrity" + 0.017*"ambition" + 0.017*"moral" + 0.016*"romantic" + 0.015*"enforcement"



In [9]:
# western_char_topic holds exactly three items (model, corpus, dictionary);
# forward them positionally to pyLDAvis.
pyLDAvis.display(gensim_models.prepare(*western_char_topic))

In [8]:
# (Eastern, Western) row counts of the archetype frames.
tuple(map(len, (archetypes_eastern, archetypes_western)))

(1858, 1055)

In [10]:
# Topic id -> archetype label. The labels are currently just the topic ids as
# strings; edit the values to give the topics readable names.
EAST_ARCHETYPES_MAP = {0: '0', 1: '1', 2: '2'}
WEST_ARCHETYPES_MAP = {0: '0', 1: '1', 2: '2'}


def _label_archetypes(frame, labels):
    """Write an 'archetype' column on `frame`, looked up from each row's 'topic'.

    The first element of every 'topic' entry is used as the key into `labels`;
    a key missing from `labels` raises KeyError.
    """
    frame['archetype'] = frame['topic'].map(lambda topic: labels[topic[0]])


_label_archetypes(archetypes_western, WEST_ARCHETYPES_MAP)
_label_archetypes(archetypes_eastern, EAST_ARCHETYPES_MAP)

In [12]:
#archetypes_eastern[archetypes_eastern['archetype'] ==  '0'].sample(10)

In [13]:
# Define color palettes for Eastern and Western characters
eastern_colors = {
    '0': '#EB8B84',  # Light red
    '1': '#DD3C32',    # Medium red
    '2': '#7B1B14'  # Dark red
}

western_colors = {
    '0': '#78C0F7',  # Light blue
    '1': '#0F89E6', # Medium blue
    '2': '#074473'   # Dark blue
}

plot_eastern_western_archetypes_distrib(archetypes_eastern, archetypes_western, eastern_colors, western_colors)

In [14]:
# Term frequencies for the Eastern topic model: overall counts first, then
# the per-topic estimates, plotted with the red palette.
(lda_model, corpus, dictionary) = eastern_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model, corpus, dictionary, 20
)
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    ['#DD3C32', '#F3BDBA'],
    EAST_ARCHETYPES_MAP,
)

In [16]:
# Term frequencies for the Western topic model: overall counts first, then
# the per-topic estimates, plotted with the blue palette.
# Note: this rebinds lda_model / corpus / dictionary from the Eastern cell.
(lda_model, corpus, dictionary) = western_char_topic
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model, corpus, dictionary, 20
)
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    ['#0F89E6', '#9FD2F9'],
    WEST_ARCHETYPES_MAP,
)

## Semantic analysis

In [8]:
def _empath_features(representation):
    """Join the representation tokens with spaces and run the empath analysis."""
    return semantical_analysis(' '.join(representation), CAT)


# Adds the 'empath_analysis' column to character_df in place.
character_df['empath_analysis'] = (
    character_df['character_representation'].apply(_empath_features)
)

# Independent copy holding only the rows whose analysis is not missing.
character_empath_df = character_df.dropna(subset=['empath_analysis']).copy()

print('Total number of characters :', len(character_df))
print('Characters with empath features not None :', len(character_empath_df))

Total number of characters : 7052
Characters with empath features not None : 5652


In [9]:
# Input is the full character_df (all rows), not the empath-filtered character_empath_df.
plot_nb_characters_per_side_per_period(character_df)

In [12]:
# No period argument is passed here, unlike the per-period call below.
# global_evolution_df and topic_colors are reused by the by-year plot.
global_evolution_df, topic_colors = prepare_empath_data(
    character_empath_df,
    'empath_analysis',
)
plot_empath_evolution(
    global_evolution_df,
    topic_colors,
)

#p1_evolution_df, p1_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Blocs Establishment'])
#plot_empath_evolution(p1_evolution_df, p1_topic_colors, 'During Blocs Establishment')

#p2_evolution_df, p2_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Major tensions and crises'])
#plot_empath_evolution(p2_evolution_df, p2_topic_colors, 'During Major Crises')

#p3_evolution_df, p3_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Détente'])
#plot_empath_evolution(p3_evolution_df, p3_topic_colors, 'During Détente')

#p4_evolution_df, p4_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Second Cold War'])
#plot_empath_evolution(p4_evolution_df, p4_topic_colors, 'During 2nd Cold War')
#
# Same pipeline, with the 'End of the Cold War' entry of COLD_WAR_PERIODS
# passed as the third argument.
p5_evolution_df, p5_topic_colors = prepare_empath_data(
    character_empath_df,
    'empath_analysis',
    COLD_WAR_PERIODS['End of the Cold War'],
)
plot_empath_evolution(
    p5_evolution_df,
    p5_topic_colors,
    'During End of Cold War',
)


In [11]:
# Distribution plot for one year; 1947 is the year argument.
plot_empath_distrib_by_year(
    global_evolution_df,
    topic_colors,
    1947,
)
