# Setup


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local modules.nlp helper) are picked up without
# restarting the kernel.
%load_ext autoreload 
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from gensim.models import KeyedVectors
from modules.nlp import NLP

# Data files are read from / written to ./data under the current working directory
DATA_DIR = Path.cwd() / "data"

# Hex colour constants for plots
RUSSIA_HEX = '#1C3578'
CHINA_HEX = '#ee1c25'

# Shared helper object used below for tokenising text, generating weighted
# embeddings and calculating grouped similarities
nlp = NLP()

# Load the article section data
articles_df = pd.read_feather(DATA_DIR / "wikiarticles_seg_data.feather")

# Drop non-substantive article sections.
# The explicit .copy() makes the filtered frame independent of the one read
# from disk, so the column assignments below operate on a real DataFrame
# rather than a slice (avoids pandas' SettingWithCopyWarning).
section_titles_to_drop = ['Early life', 'Education',
                          'External links', 'Further reading', 'References', 'See also']
articles_df = articles_df[
    ~articles_df['title'].isin(section_titles_to_drop)].copy()

# Create a combined year and month column.
# 'month' is overwritten with its zero-padded string form (e.g. 3 -> '03'),
# then joined with 'year' into 'YYYY-MM' and parsed with an explicit format
# so the result does not depend on pandas' date-format inference.
articles_df['month'] = articles_df['month'].astype(str).str.zfill(2)
articles_df['year_month'] = pd.to_datetime(
    articles_df['year'].astype(str) + '-' + articles_df['month'],
    format='%Y-%m')

# Embeddings


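**Pretrained embeddings:** Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation. In *Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)*, pages 1532–1543.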


In [None]:
# Read the pretrained GloVe vectors from their plain-text file.
# no_header=True tells gensim the file has no leading word2vec-style
# "<vocab size> <dimensions>" line.
word_vectors = KeyedVectors.load_word2vec_format(
    fname=DATA_DIR / "glove.840B.300d.txt",
    no_header=True,
    binary=False,
)

**Generating word embeddings**


In [None]:
# Remove sections without text.
# Rebinding (instead of dropna(inplace=True)) avoids mutating a previously
# filtered frame in place, and resetting the index restores the default
# RangeIndex that DataFrame.to_feather requires on pandas versions that do
# not serialise custom indexes.
articles_df = articles_df.dropna(subset=['text']).reset_index(drop=True)

# Tokenize text and remove stop words
tqdm.pandas(desc="Tokenizing article text")
articles_df['tokens'] = articles_df['text'].progress_apply(nlp.tokenize_text)

# Generate TF-IDF weighted embeddings for each section from its tokens
articles_df['weighted_embeddings'] = nlp.generate_tfidf_weighted_embeddings(
    articles_df['tokens'], word_vectors)

# Save embeddings so the similarity step can reload them without recomputing
articles_df.to_feather(DATA_DIR / "wikiarticles_embeddings.feather")

**Calculating cosine similarities for different types of section**


In [None]:
articles_df = pd.read_feather(DATA_DIR / "wikiarticles_embeddings.feather")


def _monthly_similarities(sections_df, desc, category=None):
    """Apply nlp.calculate_grouped_similarities to every 'year_month' group.

    Parameters
    ----------
    sections_df : pandas.DataFrame
        Frame with 'year_month', 'article_name' and 'weighted_embeddings'
        columns (plus 'category' when ``category`` is given).
    desc : str
        Label shown on the tqdm progress bar for this run.
    category : str, optional
        When given, passed through as ``category`` together with
        ``category_column='category'``. When omitted, neither keyword is
        passed, matching the whole-article calculation.

    Returns
    -------
    The result of ``groupby('year_month').progress_apply(...)``, indexed by
    'year_month'.
    """
    category_kwargs = {}
    if category is not None:
        category_kwargs = {'category_column': 'category', 'category': category}

    # Register progress_apply on pandas objects with a label for this run
    tqdm.pandas(desc=desc)
    return sections_df.groupby('year_month').progress_apply(
        nlp.calculate_grouped_similarities,
        figure_column='article_name',
        figure_1='Vladimir Putin',
        figure_2='Xi Jinping',
        embeddings_column='weighted_embeddings',
        **category_kwargs)


# Calculate full article monthly similarities
article_monthly_similarities = _monthly_similarities(
    articles_df, desc="Calculating monthly similarities (full article)")

# Calculate policy category monthly similarities
# NOTE(review): 'policy' is lower-case while 'Recognition' below is
# capitalised - confirm both match the values stored in the 'category' column.
policy_category_monthly_similarities = _monthly_similarities(
    articles_df,
    desc="Calculating monthly similarities (policy)",
    category='policy')

# Calculate public image category monthly similarities
# (the category value used for these sections is 'Recognition')
recognition_category_monthly_similarities = _monthly_similarities(
    articles_df,
    desc="Calculating monthly similarities (recognition)",
    category='Recognition')

**Plotting cosine similarities**


In [None]:
# Number of consecutive monthly observations averaged by the rolling mean
WINDOW = 6

# (series, title) pairs; one figure is drawn per pair, in this order
similarity_plots = [
    (article_monthly_similarities,
     "Monthly Average Cosine Similarity Between Putin and Xi's Wikipedia Articles"),
    (policy_category_monthly_similarities,
     "Monthly Average Cosine Similarity Between Putin and Xi's Policy Sections"),
    (recognition_category_monthly_similarities,
     "Monthly Average Cosine Similarity Between Putin and Xi's Public Image Sections"),
]

for monthly_series, plot_title in similarity_plots:
    # Smooth the monthly values with a rolling mean before plotting
    smoothed = monthly_series.rolling(WINDOW).mean()

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(monthly_series.index, smoothed, color=CHINA_HEX)
    ax.set_title(plot_title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Cosine Similarity")
    plt.show()