In [46]:
import pandas as pd
import numpy as np
import pickle
import spacy
import ast

import pyLDAvis
from pyLDAvis import gensim_models

from characters_topic_detection import *
from characters_plots import *
from characters_semantical_analysis import *

import warnings
# NOTE(review): no category/module filter is given, so this hides *every* warning raised
# for the rest of the session (deprecations included) — narrow it if only specific
# library warnings are the nuisance.
warnings.filterwarnings('ignore')

# Characters Analysis

In [47]:
# Root folder for every data file read by this notebook.
DATA_FOLDER = "data/"

movies = pd.read_csv(DATA_FOLDER + "preprocessed_movies.csv")

# Best-effort parsing: string cells that hold a Python literal are converted back into
# the object they spell out. The conversion is all-or-nothing per column: if any string
# in a column is not a valid literal, the assignment never happens and that column is
# kept exactly as read.
for col in movies.columns:
    try:
        movies[col] = movies[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval is documented to raise on malformed input are
        # swallowed; KeyboardInterrupt and genuine bugs now propagate instead of being hidden.
        continue

In [3]:
# One character-level frame per bloc, built from the matching representation column ...
character_eastern = create_character_df(
    movies, 'character_eastern_bloc_representation', 'Eastern'
)
character_western = create_character_df(
    movies, 'character_western_bloc_representation', 'Western'
)

# ... then stacked row-wise (original row labels are kept, not renumbered).
character_df = pd.concat((character_eastern, character_western))

In [4]:
# Row counts, one per line: combined frame, then Eastern, then Western.
print(len(character_df), len(character_eastern), len(character_western), sep='\n')

7052
3726
3326


## Topic Detection

In [8]:
# spaCy pipeline handed to get_main_character_archetypes in the generation cells below.
nlp = spacy.load('en_core_web_sm')

# Words handed to the same helper for removal (presumably dropped before topic
# modelling — confirm in characters_topic_detection).
words_to_remove = {
    'archetype', 'hero', 'heroine', 'heroism', 'antagonist',
    'value', 'values', 'none', 'represent', 'figure',
    'desire', 'seek', 'seeker',
}

In [None]:
# Commented-out generation step: when enabled it calls get_main_character_archetypes on the
# Eastern characters and writes the two pickles (archetypes_eastern.pkl, eastern_char_topic.pkl)
# that are read back further down.
# NOTE(review): presumably disabled because refitting is slow and/or could renumber the topics
# that EAST_ARCHETYPES_MAP is keyed on — confirm before re-enabling.
# NOTE(review): the 3 presumably is the number of topics (three are printed below); the meaning
# of the 10 is not visible from this notebook.
#archetypes_eastern, eastern_char_topic = get_main_character_archetypes(character_eastern, 3, 10, nlp, words_to_remove)
#pickle.dump(archetypes_eastern, open(DATA_FOLDER + "archetypes_eastern.pkl", "wb"))
#pickle.dump(eastern_char_topic, open(DATA_FOLDER + "eastern_char_topic.pkl", "wb"))

Topic: 0
Words: 0.062*"villain" + 0.028*"authority" + 0.027*"antihero" + 0.022*"resilience" + 0.020*"manipulation" + 0.019*"ambition" + 0.019*"power" + 0.017*"greed" + 0.016*"control" + 0.016*"authoritarian"

Topic: 1
Words: 0.022*"love" + 0.021*"justice" + 0.020*"resilience" + 0.017*"collective" + 0.014*"struggle" + 0.013*"community" + 0.013*"conflict" + 0.013*"family" + 0.012*"collectivism" + 0.011*"individualism"

Topic: 2
Words: 0.058*"loyalty" + 0.033*"sacrifice" + 0.025*"tragic" + 0.019*"oppression" + 0.014*"bravery" + 0.013*"rebellion" + 0.013*"determination" + 0.012*"duty" + 0.012*"freedom" + 0.011*"mentor"



In [56]:
# Load the cached Eastern results instead of recomputing them.
# `eastern_char_topic` is later unpacked as (lda_model, corpus, dictionary).
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
# `with` closes each file handle as soon as the object has been read.
with open(DATA_FOLDER + "archetypes_eastern.pkl", "rb") as pkl_file:
    archetypes_eastern = pickle.load(pkl_file)
with open(DATA_FOLDER + "eastern_char_topic.pkl", "rb") as pkl_file:
    eastern_char_topic = pickle.load(pkl_file)
#pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

In [27]:
# Commented-out generation step for the Western characters: same call as the Eastern one, but
# with two extra removed words ('american', 'anti') and 20 instead of 10 as the third argument.
# When enabled it writes archetypes_western.pkl and western_char_topic.pkl, which are read back
# further down.
# NOTE(review): the reason for 20 vs 10 is not visible from this notebook — document it here.
#west_words_to_remove = words_to_remove.union({'american', 'anti'})
#archetypes_western, western_char_topic = get_main_character_archetypes(character_western, 3, 20, nlp, west_words_to_remove)
#pickle.dump(archetypes_western, open(DATA_FOLDER + "archetypes_western.pkl", "wb"))
#pickle.dump(western_char_topic, open(DATA_FOLDER + "western_char_topic.pkl", "wb"))

Topic: 0
Words: 0.114*"justice" + 0.029*"law" + 0.027*"antihero" + 0.025*"integrity" + 0.023*"protector" + 0.023*"heroic" + 0.020*"moral" + 0.018*"enforcement" + 0.017*"loyalty" + 0.016*"detective"

Topic: 1
Words: 0.048*"bravery" + 0.024*"leadership" + 0.024*"duty" + 0.023*"sacrifice" + 0.023*"courage" + 0.021*"honor" + 0.020*"ambition" + 0.013*"soldier" + 0.013*"trickster" + 0.012*"friendship"

Topic: 2
Words: 0.072*"individualism" + 0.038*"freedom" + 0.034*"loyalty" + 0.033*"determination" + 0.031*"resilience" + 0.023*"everyman" + 0.022*"bravery" + 0.019*"antihero" + 0.018*"rebel" + 0.017*"family"



In [57]:
# Load the cached Western results instead of recomputing them.
# `western_char_topic` is later unpacked as (lda_model, corpus, dictionary).
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this project.
# `with` closes each file handle as soon as the object has been read.
with open(DATA_FOLDER + "archetypes_western.pkl", "rb") as pkl_file:
    archetypes_western = pickle.load(pkl_file)
with open(DATA_FOLDER + "western_char_topic.pkl", "rb") as pkl_file:
    western_char_topic = pickle.load(pkl_file)
#pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))

In [30]:
# (Eastern rows, Western rows) in the loaded archetype frames.
tuple(map(len, (archetypes_eastern, archetypes_western)))

(1577, 1052)

In [49]:
# Human-readable label for each topic id, per bloc.
# The position in each tuple IS the topic id (0, 1, 2), so the order matters.
EAST_ARCHETYPES_MAP = dict(enumerate((
    'The Authoritarian Villain',
    'The Community Defender',
    'The Tragic Hero',
)))

WEST_ARCHETYPES_MAP = dict(enumerate((
    'The Moral Enforcer',
    'The Heroic Leader',
    'The Individualistic Rebel',
)))

# Each 'topic' entry is indexable; its first element is used as the topic id and is
# replaced by the bloc-specific label. An id missing from the map raises KeyError.
archetypes_eastern['archetype'] = archetypes_eastern['topic'].map(
    lambda topic: EAST_ARCHETYPES_MAP[topic[0]]
)
archetypes_western['archetype'] = archetypes_western['topic'].map(
    lambda topic: WEST_ARCHETYPES_MAP[topic[0]]
)

In [50]:
# Leftover inspection snippet (disabled).
# NOTE(review): it filters on archetype == '0', but the 'archetype' column is filled with the
# label strings from EAST_ARCHETYPES_MAP, so as written the filter would match no rows —
# use a label such as 'The Tragic Hero' if this is re-enabled, or delete the cell.
#archetypes_eastern[archetypes_eastern['archetype'] ==  '0'].sample(10)

In [51]:
# Per-bloc colour palettes, keyed by archetype label: reds for Eastern, blues for Western.
eastern_colors = dict([
    ('The Community Defender',    '#EB8B84'),  # light red
    ('The Tragic Hero',           '#DD3C32'),  # medium red
    ('The Authoritarian Villain', '#7B1B14'),  # dark red
])

western_colors = dict([
    ('The Moral Enforcer',        '#78C0F7'),  # light blue
    ('The Heroic Leader',         '#0F89E6'),  # medium blue
    ('The Individualistic Rebel', '#074473'),  # dark blue
])

# Plot both blocs' archetype frames (with the 'archetype' label column added above),
# passing each bloc's label -> colour palette.
plot_eastern_western_archetypes_distrib(archetypes_eastern, archetypes_western, eastern_colors, western_colors)

In [52]:
# Unpack the cached Eastern bundle: (LDA model, corpus, dictionary).
lda_model, corpus, dictionary = eastern_char_topic

# Corpus-wide term frequencies vs. per-topic estimated frequencies.
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model,
    corpus,
    dictionary,
    15,  # NOTE(review): presumably the number of terms kept per topic — confirm
)

# Two reds for the plot, topics labelled with the Eastern archetype names.
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    ['#DD3C32', '#F3BDBA'],
    EAST_ARCHETYPES_MAP,
)

In [53]:
# Unpack the cached Western bundle: (LDA model, corpus, dictionary).
# This rebinds the lda_model / corpus / dictionary / *_term_freq names set by the Eastern cell.
lda_model, corpus, dictionary = western_char_topic

# Corpus-wide term frequencies vs. per-topic estimated frequencies.
overall_term_freq = compute_overall_term_freq(corpus, dictionary)
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model,
    corpus,
    dictionary,
    15,  # NOTE(review): presumably the number of terms kept per topic — confirm
)

# Two blues for the plot, topics labelled with the Western archetype names.
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    ['#0F89E6', '#9FD2F9'],
    WEST_ARCHETYPES_MAP,
)

## Semantic Analysis

In [8]:
# Join each character's representation items into one space-separated string and score
# it with semantical_analysis against the CAT categories.
character_df['empath_analysis'] = character_df['character_representation'].apply(
    lambda tokens: semantical_analysis(' '.join(tokens), CAT)
)

# Keep only the characters for which the analysis produced a value (independent copy).
character_empath_df = character_df.loc[character_df['empath_analysis'].notna()].copy()

print('Total number of characters :', len(character_df))
print('Characters with empath features not None :', len(character_empath_df))

Total number of characters : 7052
Characters with empath features not None : 5652


In [9]:
# Uses the full character_df (every character), not the filtered character_empath_df.
# NOTE(review): judging by its name the helper plots character counts per side and per
# period — confirm in characters_plots.
plot_nb_characters_per_side_per_period(character_df)

In [14]:
# Whole-corpus view first: no period argument and no title argument.
global_evolution_df, topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis')
plot_empath_evolution(global_evolution_df, topic_colors)


def _prepare_and_plot_period(period_key, plot_title):
    """Prepare the Empath data for one COLD_WAR_PERIODS entry, plot it, and return it.

    Args:
        period_key: key into COLD_WAR_PERIODS selecting the period to analyse.
        plot_title: value forwarded as the third argument of plot_empath_evolution.

    Returns:
        The (evolution_df, topic_colors) pair produced by prepare_empath_data.
    """
    evolution_df, colors = prepare_empath_data(
        character_empath_df, 'empath_analysis', COLD_WAR_PERIODS[period_key]
    )
    plot_empath_evolution(evolution_df, colors, plot_title)
    return evolution_df, colors


# Same prepare-then-plot step for every period, in order; the results keep their
# p1..p5 names so they stay available to other cells.
(
    (p1_evolution_df, p1_topic_colors),
    (p2_evolution_df, p2_topic_colors),
    (p3_evolution_df, p3_topic_colors),
    (p4_evolution_df, p4_topic_colors),
    (p5_evolution_df, p5_topic_colors),
) = [
    _prepare_and_plot_period(period_key, plot_title)
    for period_key, plot_title in [
        ('Blocs Establishment', 'During Blocs Establishment'),
        ('Major tensions and crises', 'During Major Crises'),
        ('Détente', 'During Détente'),
        ('Second Cold War', 'During 2nd Cold War'),
        ('End of the Cold War', 'During End of Cold War'),
    ]
]


In [11]:
# Reuses the whole-corpus evolution frame and colours from the previous cell.
# NOTE(review): 1947 is presumably the year whose distribution is shown — confirm, and
# consider naming the constant so the choice of year is explained.
plot_empath_distrib_by_year(global_evolution_df, topic_colors, 1947)
