In [1]:
import pandas as pd
import numpy as np
import pickle
import spacy
import ast

import pyLDAvis
from pyLDAvis import gensim_models

from characters_topic_detection import *
from characters_plots import *
from characters_semantical_analysis import *
from src.utils.constants import *
from src.utils.helpers import convert_csv

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chbou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Characters Analysis

In [2]:
# Read the preprocessed movie table, then run the project's CSV converter on it.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which suggests the
# CSV was written with its index — consider dropping it at write or read time.
movies_csv_path = DATA_FOLDER_PREPROCESSED + "preprocessed_movies.csv"
movies = pd.read_csv(movies_csv_path)
# Left as the last expression so the notebook renders whatever convert_csv returns.
convert_csv(movies)

Unnamed: 0,title,languages,countries,genres,release_date,cold_war_side,character_western_bloc_representation,character_eastern_bloc_representation,western_bloc_values,eastern_bloc_values,theme
0,$,,[Russia],"[Comedy, Drama, Crime]",1971,Western,"[Joe Collins, American bank security consultan...","[Dawn Divine, hooker with a heart of gold, cun...",[None],"[Resourcefulness, cleverness, individualism, h...",[None]
1,"$1,000 on the Black","[Italian, German]","[Germany, Italy]",[Western],1966,Eastern,[None],"[Sartana, villainous, oppressive, cruel, arche...","[Johnny Liston, justice, determination, resili...","[Justice, revenge, oppressed vs. oppressor, re...","[Terror, betrayal, familial conflict, crime, r..."
2,"$10,000 Blood Money",,[Russia],"[Drama, Western]",1967,,[None],[None],[None],[None],"[crime, betrayal, revenge, bounty hunter, heis..."
3,"$100,000 for Ringo",[Italian],[Italy],"[Drama, Western]",1965,,[None],[None],[None],[None],"[Western, Civil War, mistaken identity, treasu..."
4,'Anna' i wampir,,[Russia],[Crime],1982,,[None],[None],[None],[None],"[murder mystery, horror, fog, Poland, 1960s]"
...,...,...,...,...,...,...,...,...,...,...,...
25616,Şaban Oğlu Şaban,[Turkish],[Turkey],[Comedy],1977,,[None],[None],[None],[None],[None ]
25617,Šíleně smutná princezna,[Czech],[Czech Republic],"[Comedy, Family, Music]",1968,,[None],[None],[None],[None],[None ]
25618,Убить дракона,[Russian],"[Germany, Russia]","[Drama, Fantasy]",1988,Eastern,[None],"[Lancelot, heroism, freedom, knight archetype]",[None],"[Fear, oppression, totalitarianism, resistance...","[Themes of rebellion, freedom vs. oppression, ..."
25619,’Round Midnight,"[English, French, German]","[France, United States of America]",[Drama],1986,,[None],[None],[None],[None],"[Jazz, Friendship, Paris, Music, Creativity]"


In [3]:
# Build one character frame per bloc from the matching representation column ...
character_eastern = create_character_df(
    movies,
    'character_eastern_bloc_representation',
    'Eastern',
)
character_western = create_character_df(
    movies,
    'character_western_bloc_representation',
    'Western',
)

# ... and stack them row-wise (Eastern rows first). Original index labels are kept.
character_df = pd.concat(
    [character_eastern, character_western],
    axis=0,
)

In [4]:
# Row counts, one per line: combined frame, then Eastern, then Western.
print(
    len(character_df),
    len(character_eastern),
    len(character_western),
    sep='\n',
)

7052
3726
3326


## Topic Detection

In [5]:
# Extra words passed to the archetype extraction as terms to discard.
words_to_remove = {
    'archetype',
    'hero',
    'heroine',
    'heroism',
    'antagonist',
    'value',
    'values',
    'none',
    'represent',
    'figure',
    'desire',
    'seek',
    'seeker',
}

# Small English spaCy pipeline, shared by the topic-detection calls.
nlp = spacy.load('en_core_web_sm')

In [6]:
# Disabled one-off step: when enabled it calls get_main_character_archetypes on
# character_eastern and pickles its two results under DATA_FOLDER.
# NOTE(review): the loading cell reads these pickles from "src/analysis/characters/",
# not from DATA_FOLDER — confirm the two locations agree before re-running this.
#archetypes_eastern, eastern_char_topic = get_main_character_archetypes(character_eastern, 3, 10, nlp, words_to_remove)
#pickle.dump(archetypes_eastern, open(DATA_FOLDER + "archetypes_eastern.pkl", "wb"))
#pickle.dump(eastern_char_topic, open(DATA_FOLDER + "eastern_char_topic.pkl", "wb"))

In [7]:
# Load the cached Eastern archetype results instead of recomputing them.
# Context managers close the file handles deterministically; the previous
# pickle.load(open(...)) form left them open until garbage collection.
# SECURITY: pickle.load can execute arbitrary code — only load files produced by this project.
with open("src/analysis/characters/archetypes_eastern.pkl", "rb") as archetypes_file:
    archetypes_eastern = pickle.load(archetypes_file)

with open("src/analysis/characters/eastern_char_topic.pkl", "rb") as topic_file:
    eastern_char_topic = pickle.load(topic_file)
#pyLDAvis.display(gensim_models.prepare(eastern_char_topic[0], eastern_char_topic[1], eastern_char_topic[2]))

In [8]:
# Disabled one-off step: when enabled it extends words_to_remove with 'american' and
# 'anti', calls get_main_character_archetypes on character_western and pickles its two
# results under DATA_FOLDER.
# NOTE(review): the loading cell reads these pickles from "src/analysis/characters/",
# not from DATA_FOLDER — confirm the two locations agree before re-running this.
#west_words_to_remove = words_to_remove.union({'american', 'anti'})
#archetypes_western, western_char_topic = get_main_character_archetypes(character_western, 3, 20, nlp, west_words_to_remove)
#pickle.dump(archetypes_western, open(DATA_FOLDER + "archetypes_western.pkl", "wb"))
#pickle.dump(western_char_topic, open(DATA_FOLDER + "western_char_topic.pkl", "wb"))

In [9]:
# Load the cached Western archetype results instead of recomputing them.
# Context managers close the file handles deterministically; the previous
# pickle.load(open(...)) form left them open until garbage collection.
# SECURITY: pickle.load can execute arbitrary code — only load files produced by this project.
with open("src/analysis/characters/archetypes_western.pkl", "rb") as archetypes_file:
    archetypes_western = pickle.load(archetypes_file)

with open("src/analysis/characters/western_char_topic.pkl", "rb") as topic_file:
    western_char_topic = pickle.load(topic_file)
#pyLDAvis.display(gensim_models.prepare(western_char_topic[0], western_char_topic[1], western_char_topic[2]))

In [10]:
# Sizes of the loaded archetype objects, as an (eastern, western) pair.
tuple(len(loaded) for loaded in (archetypes_eastern, archetypes_western))

(1577, 1052)

In [11]:
# Human-readable archetype label for each topic id, one map per bloc.
EAST_ARCHETYPES_MAP = {
    0: 'The Authoritarian Villain',
    1: 'The Community Defender',
    2: 'The Tragic Hero',
}
WEST_ARCHETYPES_MAP = {
    0: 'The Moral Enforcer',
    1: 'The Heroic Leader',
    2: 'The Individualistic Rebel',
}

# Each 'topic' entry is indexed with [0] to obtain the topic id, which is then
# translated into its label and stored in a new 'archetype' column.
archetypes_western['archetype'] = [
    WEST_ARCHETYPES_MAP[topic[0]] for topic in archetypes_western['topic']
]
archetypes_eastern['archetype'] = [
    EAST_ARCHETYPES_MAP[topic[0]] for topic in archetypes_eastern['topic']
]

In [12]:
# Disabled inspection snippet.
# NOTE(review): 'archetype' is filled with label strings taken from EAST_ARCHETYPES_MAP,
# so filtering on '0' would match nothing — adjust the filter before re-enabling.
#archetypes_eastern[archetypes_eastern['archetype'] ==  '0'].sample(10)

In [13]:
# Define color palettes for Eastern and Western characters
# Colour per archetype label for each bloc. Shade names are the original
# annotations of the COLOR_SCALE entries; insertion order is kept as-is.
eastern_colors = dict([
    ('The Community Defender', COLOR_SCALE[2]),     # Light red
    ('The Tragic Hero', COLOR_SCALE[1]),            # Medium red
    ('The Authoritarian Villain', COLOR_SCALE[0]),  # Dark red
])
western_colors = dict([
    ('The Moral Enforcer', COLOR_SCALE[4]),         # Light blue
    ('The Heroic Leader', COLOR_SCALE[5]),          # Medium blue
    ('The Individualistic Rebel', COLOR_SCALE[6]),  # Dark blue
])

# Draw the archetype distribution for both blocs and export it for the website.
fig = plot_eastern_western_archetypes_distrib(
    archetypes_eastern,
    archetypes_western,
    eastern_colors,
    western_colors,
)
archetypes_distrib_html = WEB_EXPORT_FOLDER + "western_archetypes_distrib.html"
fig.write_html(archetypes_distrib_html)

In [14]:
# Term-frequency plot for the Eastern topic model.
# eastern_char_topic unpacks into three parts: model, corpus, dictionary.
(lda_model, corpus, dictionary) = eastern_char_topic

overall_term_freq = compute_overall_term_freq(corpus, dictionary)
# 15 is forwarded unchanged — presumably the number of terms kept per topic; TODO confirm.
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model, corpus, dictionary, 15
)

# NOTE(review): two colours are supplied while the label map has three entries —
# confirm what plot_term_frequencies expects.
eastern_term_colors = [COLOR_SCALE[1], COLOR_SCALE[2]]
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    eastern_term_colors,
    EAST_ARCHETYPES_MAP,
)

In [15]:
# Term-frequency plot for the Western topic model.
# western_char_topic unpacks into three parts: model, corpus, dictionary.
# These names are reused from the Eastern cell and are overwritten here.
(lda_model, corpus, dictionary) = western_char_topic

overall_term_freq = compute_overall_term_freq(corpus, dictionary)
# 15 is forwarded unchanged — presumably the number of terms kept per topic; TODO confirm.
est_term_freq_by_topic = compute_estimated_term_freq(
    lda_model, corpus, dictionary, 15
)

# NOTE(review): two colours are supplied while the label map has three entries —
# confirm what plot_term_frequencies expects.
western_term_colors = [COLOR_SCALE[5], COLOR_SCALE[4]]
plot_term_frequencies(
    est_term_freq_by_topic,
    overall_term_freq,
    western_term_colors,
    WEST_ARCHETYPES_MAP,
)

## Semantic Analysis

In [16]:
# Join each character's representation entries into one space-separated string
# and run semantical_analysis on it with the CAT categories.
character_df['empath_analysis'] = character_df['character_representation'].apply(
    lambda traits: semantical_analysis(' '.join(traits), CAT)
)

# Keep an independent copy of only the rows whose analysis is not missing.
character_empath_df = character_df.dropna(subset=['empath_analysis']).copy()

print(f'Total number of characters : {len(character_df)}')
print(f'Characters with empath features not None : {len(character_empath_df)}')

Total number of characters : 7052
Characters with empath features not None : 5652


In [17]:
# Character counts per side and per period. This is fed the full character_df,
# i.e. it also includes the rows that were dropped from character_empath_df.
plot_nb_characters_per_side_per_period(character_df)

In [18]:
def _export_period_evolution(period_key, title_suffix, html_name):
    """Prepare, plot and export the empath evolution for one Cold War period.

    Replaces five copy-pasted variants of the same three statements.

    Args:
        period_key: key into COLD_WAR_PERIODS selecting the period.
        title_suffix: extra text forwarded to plot_empath_evolution.
        html_name: file name (inside WEB_EXPORT_FOLDER) of the exported HTML.

    Returns:
        (evolution_df, colors, figure) as produced by prepare_empath_data and
        plot_empath_evolution, so callers can keep them for later cells.
    """
    evolution_df, colors = prepare_empath_data(
        character_empath_df, 'empath_analysis', COLD_WAR_PERIODS[period_key]
    )
    figure = plot_empath_evolution(evolution_df, colors, title_suffix)
    figure.write_html(WEB_EXPORT_FOLDER + html_name)
    return evolution_df, colors, figure


# Whole-timeline view: kept separate because it relies on the callee defaults
# (no period restriction, no title suffix).
global_evolution_df, topic_colors = prepare_empath_data(character_empath_df, 'empath_analysis')
fig = plot_empath_evolution(global_evolution_df, topic_colors)
fig.write_html(WEB_EXPORT_FOLDER + "theme_char_evol.html")

# One export per Cold War period; every original variable name is preserved and
# `fig` ends up bound to the last figure, as before.
p1_evolution_df, p1_topic_colors, fig = _export_period_evolution(
    'Blocs Establishment', 'During Blocs Establishment', "theme_char_blocs_estab.html"
)
p2_evolution_df, p2_topic_colors, fig = _export_period_evolution(
    'Major tensions and crises', 'During Major Crises', "theme_char_major_crises.html"
)
p3_evolution_df, p3_topic_colors, fig = _export_period_evolution(
    'Détente', 'During Détente', "theme_char_detente.html"
)
p4_evolution_df, p4_topic_colors, fig = _export_period_evolution(
    'Second Cold War', 'During 2nd Cold War', "theme_char_2nd_cold_war.html"
)
p5_evolution_df, p5_topic_colors, fig = _export_period_evolution(
    'End of the Cold War', 'During End of Cold War', "theme_char_end_war.html"
)

In [19]:
# Per-year empath distribution built from the whole-timeline data, then exported.
# 1947 is forwarded unchanged — presumably the year shown initially; TODO confirm.
fig = plot_empath_distrib_by_year(
    global_evolution_df,
    topic_colors,
    1947,
)
empath_distrib_html = WEB_EXPORT_FOLDER + "empath_distrib_by_year.html"
fig.write_html(empath_distrib_html)