In [1]:
import pandas as pd
import spacy
import ast

import pyLDAvis
from pyLDAvis import gensim_models

from characters_topic_detection import *
from characters_plots import *
from characters_semantical_analysis import *

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fannl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "preprocessed_movies.csv")

# CSV round-trips store Python literals (lists, dicts, ...) as their string
# repr. Try to parse every column back with ast.literal_eval; the conversion is
# all-or-nothing per column: if any string in the column is not a valid
# literal, the assignment never happens and the column is left untouched.
for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed input, i.e. the
        # column holds plain text. Anything else (KeyboardInterrupt, a typo in
        # this cell, ...) must surface instead of being silently swallowed.
        continue

In [3]:
# One character table per bloc, built by create_character_df from the matching
# representation column of `movies`; the Eastern table is built first.
_bloc_frames = {
    bloc: create_character_df(movies, column, bloc)
    for bloc, column in (
        ('Eastern', 'character_eastern_bloc_representation'),
        ('Western', 'character_western_bloc_representation'),
    )
}
character_eastern = _bloc_frames['Eastern']
character_western = _bloc_frames['Western']

# Stack both blocs row-wise (Eastern rows first); original indices are kept.
character_df = pd.concat([character_eastern, character_western], axis=0)

In [None]:
# spaCy English pipeline, shared by the Eastern and Western archetype cells.
nlp = spacy.load('en_core_web_sm')

# Words handed to get_main_character_archetypes as its last argument
# (presumably filtered out before topic modelling — confirm in the helper).
words_to_remove = {
    'archetype', 'hero', 'heroine', 'heroism',
    'value', 'values', 'none', 'represent',
    'figure', 'desire', 'seek', 'seeker',
}

# The helper returns the character frame (with a 'topic' column) and a second
# object that is later unpacked as (lda_model, corpus, dictionary).
# NOTE(review): 3 and 15 look like "number of topics" and "words shown per
# topic" (the output lists 3 topics of 15 words) — confirm against the helper.
eastern_result = get_main_character_archetypes(character_eastern, 3, 15, nlp, words_to_remove)
character_eastern, eastern_char_topic = eastern_result

# Number of Eastern characters assigned to each topic id.
eastern_topic_counts = character_eastern['topic'].value_counts()
print(eastern_topic_counts)

Topic: 0
Words: 0.057*"loyalty" + 0.039*"resilience" + 0.032*"sacrifice" + 0.028*"tragic" + 0.021*"love" + 0.014*"victim" + 0.014*"bravery" + 0.013*"romantic" + 0.012*"determination" + 0.012*"struggle" + 0.011*"survival" + 0.010*"soldier" + 0.010*"conflict" + 0.009*"family" + 0.009*"protagonist"

Topic: 1
Words: 0.059*"antagonist" + 0.037*"villain" + 0.023*"authority" + 0.016*"manipulation" + 0.016*"ambition" + 0.015*"revolutionary" + 0.015*"power" + 0.015*"freedom" + 0.015*"oppression" + 0.014*"collective" + 0.014*"greed" + 0.014*"antihero" + 0.013*"control" + 0.012*"resistance" + 0.012*"corruption"

Topic: 2
Words: 0.025*"authoritarian" + 0.024*"mentor" + 0.022*"duty" + 0.019*"villain" + 0.019*"moral" + 0.012*"innocent" + 0.010*"honor" + 0.009*"nurture" + 0.009*"integrity" + 0.009*"oppressive" + 0.008*"cultural" + 0.008*"perseverance" + 0.008*"community" + 0.007*"deception" + 0.007*"trickster"

topic
1    1531
0    1302
2     893
Name: count, dtype: int64


In [5]:
# Same archetype extraction as for the Eastern bloc, reusing `nlp` and
# `words_to_remove` defined in the Eastern cell. The second returned object is
# later unpacked as (lda_model, corpus, dictionary).
western_result = get_main_character_archetypes(character_western, 3, 15, nlp, words_to_remove)
character_western, western_char_topic = western_result

# Number of Western characters assigned to each topic id.
western_topic_counts = character_western['topic'].value_counts()
print(western_topic_counts)

Topic: 0
Words: 0.047*"loyalty" + 0.025*"duty" + 0.017*"family" + 0.017*"tragic" + 0.016*"mentor" + 0.014*"love" + 0.013*"everyman" + 0.012*"honor" + 0.012*"leader" + 0.011*"idealism" + 0.011*"romantic" + 0.010*"patriotism" + 0.010*"dedication" + 0.009*"authority" + 0.009*"perseverance"

Topic: 1
Words: 0.111*"justice" + 0.029*"resilience" + 0.028*"law" + 0.021*"protector" + 0.020*"determination" + 0.020*"moral" + 0.019*"heroic" + 0.019*"integrity" + 0.018*"enforcement" + 0.017*"truth" + 0.016*"everyman" + 0.015*"detective" + 0.015*"american" + 0.013*"camaraderie" + 0.011*"loyalty"

Topic: 2
Words: 0.067*"individualism" + 0.067*"bravery" + 0.036*"antihero" + 0.036*"freedom" + 0.024*"american" + 0.023*"leadership" + 0.021*"courage" + 0.021*"sacrifice" + 0.017*"ambition" + 0.016*"rebel" + 0.012*"trickster" + 0.012*"resourcefulness" + 0.012*"self" + 0.011*"redemption" + 0.011*"protagonist"

topic
2    1271
1    1094
0     961
Name: count, dtype: int64


In [6]:
# Human-readable archetype label for each topic id found in the 'topic' column.
# The labels are aligned with the top keywords printed by the two
# get_main_character_archetypes cells of this notebook (listed next to each
# entry). The previous mapping attached labels to the wrong topic ids
# (e.g. the "justice / law / enforcement" topic was named the rebel).
# NOTE(review): topic ids come from a fitted topic model; if the fit is not
# seeded the ids can change between runs, so re-check this mapping against the
# printed keywords after every re-run.
EAST_ARCHETYPES_MAP = {
    0: 'The Tragic Hero',            # loyalty, resilience, sacrifice, tragic, love, victim
    1: 'The Authoritarian Villain',  # antagonist, villain, authority, manipulation, power, control
    # NOTE(review): assigned by elimination; topic 1 also contains
    # "revolutionary" — confirm the 1 vs 2 split with the analysis authors.
    2: 'The Revolutionary Leader',   # mentor, duty, moral, honor, community
}

WEST_ARCHETYPES_MAP = {
    0: 'The Loyal soldier',          # loyalty, duty, family, honor, patriotism
    1: 'The moral enforcer',         # justice, law, protector, moral, enforcement
    2: 'The individualistic rebel',  # individualism, antihero, freedom, rebel
}

# Translate each frame's numeric 'topic' id into its archetype label, stored in
# a new 'archetype' column (Western frame first, then Eastern). Topic ids
# missing from a map become NaN, as with any Series.map on a dict.
for _frame, _label_by_topic in (
    (character_western, WEST_ARCHETYPES_MAP),
    (character_eastern, EAST_ARCHETYPES_MAP),
):
    _frame['archetype'] = _frame['topic'].map(_label_by_topic)

In [7]:
# Hex colour for every archetype label, one palette per bloc: shades of red
# for Eastern characters, shades of blue for Western ones. Keys must match the
# label strings used in the archetype maps exactly.
eastern_colors = dict([
    ('The Tragic Hero', '#EB8B84'),            # light red
    ('The Revolutionary Leader', '#DD3C32'),   # medium red
    ('The Authoritarian Villain', '#7B1B14'),  # dark red
])

western_colors = dict([
    ('The Loyal soldier', '#78C0F7'),          # light blue
    ('The moral enforcer', '#0F89E6'),         # medium blue
    ('The individualistic rebel', '#074473'),  # dark blue
])

# Project plotting helper (star-imported). It receives both character frames,
# which carry the 'archetype' column at this point, plus the per-archetype
# colour dictionaries. By its name it draws the archetype distribution of each
# bloc — see the helper's definition for the exact figure.
plot_eastern_western_archetypes_distrib(character_eastern, character_western, eastern_colors, western_colors)

In [8]:
# Unpack the second value returned by get_main_character_archetypes for the
# Eastern bloc into its three components.
lda_model, corpus, dictionary = eastern_char_topic
# Term frequency over the whole corpus, used by the plot alongside the per-topic values.
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
# NOTE(review): 20 is presumably the number of top terms kept per topic — confirm in the helper.
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
# Two red shades are passed for the Eastern bloc; EAST_ARCHETYPES_MAP supplies the label of each topic id.
plot_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#DD3C32','#F3BDBA'], EAST_ARCHETYPES_MAP)

In [9]:
# Unpack the second value returned by get_main_character_archetypes for the
# Western bloc. This rebinds lda_model / corpus / dictionary and the two
# frequency variables that the Eastern cell also assigns.
lda_model, corpus, dictionary = western_char_topic
# Term frequency over the whole corpus, used by the plot alongside the per-topic values.
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
# NOTE(review): 20 is presumably the number of top terms kept per topic — confirm in the helper.
est_term_freq_by_topic = compute_estimated_term_freq(lda_model, corpus, dictionary, 20)
# Two blue shades are passed for the Western bloc; WEST_ARCHETYPES_MAP supplies the label of each topic id.
plot_term_frequencies(est_term_freq_by_topic, overall_term_freq, ['#0F89E6', '#9FD2F9'], WEST_ARCHETYPES_MAP)

In [10]:
#pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))
#pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

## Semantic analysis

In [11]:
def _empath_scores(representation):
    """Join one character's representation items with spaces and analyse the text.

    The joined string is passed to `semantical_analysis` together with `CAT`;
    whatever that helper returns (possibly None) is stored for the character.
    """
    return semantical_analysis(' '.join(representation), CAT)


character_df['empath_analysis'] = character_df['character_representation'].apply(_empath_scores)

# Keep only the characters for which the analysis produced a value; work on a
# copy so later edits do not touch character_df.
_has_empath = character_df['empath_analysis'].notna()
character_empath_df = character_df[_has_empath].copy()

print('Total number of characters :', len(character_df))
print('Characters with empath features not None :', len(character_empath_df))

Total number of characters : 7052
Characters with empath features not None : 5652


In [12]:
# Project plotting helper (star-imported), called on the full character table
# (both blocs, including rows without Empath features). By its name it shows
# the number of characters per side and per period — see the helper for details.
plot_nb_characters_per_side_per_period(character_df)

In [13]:
# Two-step recipe used throughout this cell: prepare_empath_data builds an
# evolution table plus a colour mapping from the 'empath_analysis' column, and
# plot_empath_evolution draws it.
# First call: no period argument is passed (presumably the whole time span is
# covered — confirm in prepare_empath_data). global_evolution_df and
# topic_colors are reused by the per-year distribution cell.
global_evolution_df, topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis')
plot_empath_evolution(global_evolution_df, topic_colors)

# Per-period variants: same recipe with one entry of COLD_WAR_PERIODS passed as
# third argument. Only 'Détente' is currently enabled; uncomment a pair of
# lines below to plot another period.
#p1_evolution_df, p1_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Blocs Establishment'])
#plot_empath_evolution(p1_evolution_df, p1_topic_colors)

#p2_evolution_df, p2_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Major tensions and crises'])
#plot_empath_evolution(p2_evolution_df, p2_topic_colors)

p3_evolution_df, p3_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Détente'])
plot_empath_evolution(p3_evolution_df, p3_topic_colors)
#
#p4_evolution_df, p4_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['Second Cold War'])
#plot_empath_evolution(p4_evolution_df, p4_topic_colors)
#
#p5_evolution_df, p5_topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis', COLD_WAR_PERIODS['End of the Cold War'])
#plot_empath_evolution(p5_evolution_df, p5_topic_colors)


In [14]:
# Project plotting helper (star-imported), fed with the global evolution table
# and colour mapping produced by prepare_empath_data.
# NOTE(review): 1947 is presumably the year whose distribution is shown —
# confirm in the helper and consider naming the constant.
plot_empath_distrib_by_year(global_evolution_df, topic_colors, 1947)
