In [1]:
# System libraries:
%load_ext autoreload

# Standard library
import ast
import json
import logging
import os
import pickle
import re
import string
import sys
from collections import Counter
from datetime import datetime

# Third-party
import gensim
import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from wordcloud import WordCloud

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# The lemma columns are parsed back into Python objects while reading the CSV.
# ast.literal_eval only accepts Python literals (lists, strings, numbers, ...),
# whereas the eval used previously would execute any expression stored in the file.
# NOTE(review): assumes every cell in these columns is a plain literal
# (presumably a list of lemma strings) - confirm against the files.
lemma_converters = {'lemmas_content': ast.literal_eval, 'lemmas_headline': ast.literal_eval}
data_federal = pd.read_csv('data/processed/integrum_federal_lemmas.csv', converters=lemma_converters)
data_local = pd.read_csv('data/processed/integrum_local_lemmas.csv', converters=lemma_converters)

In [2]:
def create_models_for_multiple_years(corpus, window_size=5, **word2vec_kwargs):
    """Train one Word2Vec model per sliding window of years, plus one on the whole corpus.

    The distinct values of ``corpus.year`` are sorted, and a window of
    ``window_size`` consecutive *distinct* years is slid over them. Years that
    are absent from the corpus are simply not part of the sequence, so a window
    may span more than ``window_size`` calendar years. Only complete windows are
    used; each model is stored under the first year of its window.

    Args:
        corpus: DataFrame-like object with a ``year`` column and a
            ``lemmas_content`` column; the latter is handed to Word2Vec as the
            sentences.
        window_size: Number of distinct years per window. Defaults to 5, the
            value that used to be hard-coded.
        **word2vec_kwargs: Extra keyword arguments forwarded to
            ``gensim.models.Word2Vec``. ``min_count`` defaults to 1 (the
            previous hard-coded value) unless given explicitly.

    Returns:
        dict mapping each window's first year to its model, plus the key
        ``'all'`` for the model trained on every row of ``corpus``.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    # Keep the historical default while still letting callers override it.
    word2vec_kwargs.setdefault('min_count', 1)

    years = sorted(corpus.year.unique())
    models = {}

    # Clamp at zero: with fewer distinct years than window_size a negative
    # slice bound would otherwise select partial windows.
    n_windows = max(0, len(years) - window_size + 1)
    for i, year in enumerate(years[:n_windows]):
        print("Building model for " + str(year))
        window_years = years[i:i + window_size]
        sentences = corpus[corpus.year.isin(window_years)].lemmas_content
        models[year] = gensim.models.Word2Vec(sentences, **word2vec_kwargs)

    # One additional model trained on the full corpus.
    models['all'] = gensim.models.Word2Vec(corpus.lemmas_content, **word2vec_kwargs)

    return models
    

def get_most_similar(models, words, n):
    """Ask every model for the ``n`` entries most similar to ``words``.

    Args:
        models: Mapping of label -> model exposing ``.wv.most_similar``.
        words: Passed to ``most_similar`` as ``positive``.
        n: Passed to ``most_similar`` as ``topn``.

    Returns:
        dict with the same keys as ``models``; each value is whatever that
        model's ``most_similar`` call returned.
    """
    return {
        label: model.wv.most_similar(positive=words, topn=n)
        for label, model in models.items()
    }

def compare_words(models, word1, word2):
    """Compute the similarity of ``word1`` and ``word2`` in every model.

    Args:
        models: Mapping of label -> model exposing ``.wv.similarity``.
        word1: First argument handed to ``similarity``.
        word2: Second argument handed to ``similarity``.

    Returns:
        dict with the same keys as ``models``; each value is the result of
        that model's ``wv.similarity(word1, word2)``.
    """
    return {
        label: model.wv.similarity(word1, word2)
        for label, model in models.items()
    }

def word_vector_math(models, positive_terms, negative_terms, n):
    """Run the same positive/negative ``most_similar`` query against every model.

    Args:
        models: Mapping of label -> model exposing ``.wv.most_similar``.
        positive_terms: Passed to ``most_similar`` as ``positive``.
        negative_terms: Passed to ``most_similar`` as ``negative``.
        n: Passed to ``most_similar`` as ``topn``.

    Returns:
        dict with the same keys as ``models``; each value is whatever that
        model's ``most_similar`` call returned.
    """
    return {
        label: model.wv.most_similar(
            positive=positive_terms,
            negative=negative_terms,
            topn=n,
        )
        for label, model in models.items()
    }

In [6]:
# Train the per-window models for each corpus. The headings are printed first
# so the per-year progress lines emitted during training appear under them.
# Note: models_federal / models_local are assigned again from pickles in a
# later cell of this notebook.
print("Building Federal models:")
models_federal = create_models_for_multiple_years(corpus=data_federal)

print("Building Local models:")
models_local = create_models_for_multiple_years(corpus=data_local)

Building Federal models:
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013
Building Local models:
Building model for 1993
Building model for 1995
Building model for 1996
Building model for 1997
Building model for 1998
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013


In [3]:
# Relative paths of the pickle files holding the federal and local model
# dictionaries; used by the pickle dump/load cells of this notebook.
pickle_path_federal = 'pickles/integrum_federal_word_models.pickle'
pickle_path_local = 'pickles/integrum_local_word_models.pickle'

In [9]:
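# Uncomment to write the models currently in memory to the pickle paths.
# Opening with "wb" overwrites any existing file at those paths.
#pickle.dump(models_federal, open(pickle_path_federal, "wb"))
#pickle.dump(models_local, open(pickle_path_local, "wb"))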

In [4]:
# Load the model dictionaries from the pickle files (presumably the ones
# written by the commented-out pickle.dump cell - confirm).
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load pickles that were produced locally or come from a trusted source.
# The context managers make sure the file handles are closed after loading.
with open(pickle_path_federal, "rb") as federal_file:
    models_federal = pickle.load(federal_file)
with open(pickle_path_local, "rb") as local_file:
    models_local = pickle.load(local_file)

In [6]:
# Get words most similar to a topic
# `words` is the query handed to get_most_similar (used there as the
# `positive` terms) and `number_results` is how many entries to request
# from each model.
words = ['сосед']
number_results = 10

# The bare get_most_similar(...) expressions are not the last statement of the
# cell; they are displayed because the imports cell sets
# InteractiveShell.ast_node_interactivity = "all".
print("Results for Federal:")
get_most_similar(models_federal, words, number_results)
print("Results for local:")
get_most_similar(models_local, words, number_results)

Results for Federal:


{1999: [('убежище', 0.9270979762077332),
  ('демократия', 0.9269813299179077),
  ('геополитический', 0.9249426126480103),
  ('флаг', 0.9236153960227966),
  ('служить', 0.9219054579734802),
  ('относиться', 0.9201438426971436),
  ('обстановка', 0.9191399812698364),
  ('справляться', 0.9164368510246277),
  ('решительный', 0.9163156747817993),
  ('опасение', 0.9155329465866089)],
 2000: [('предел', 0.8895347118377686),
  ('воспринимать', 0.8841016292572021),
  ('по-прежнему', 0.882548987865448),
  ('иностранец', 0.8766172528266907),
  ('именно', 0.874261200428009),
  ('подлость', 0.8733522295951843),
  ('север', 0.8706205487251282),
  ('ездить', 0.868992805480957),
  ('различие', 0.8689764142036438),
  ('запад', 0.8668187856674194)],
 2001: [('запад', 0.8707575798034668),
  ('народ', 0.8463801145553589),
  ('интересовать', 0.8458619117736816),
  ('иностранец', 0.8375285267829895),
  ('мешать', 0.8364661931991577),
  ('отдельно', 0.8305209875106812),
  ('далеко', 0.8261426687240601),
  ('к

Results for local:


{1993: [('соотечественник', 0.9744364619255066),
  ('питерец', 0.964188814163208),
  ('позвать', 0.9617142677307129),
  ('народ', 0.9617129564285278),
  ('западноевропейский', 0.9578330516815186),
  ('нен', 0.9575434923171997),
  ('репутация', 0.9564357995986938),
  ('интеграция', 0.955284595489502),
  ('давний', 0.95173579454422),
  ('практика', 0.9501572847366333)],
 1995: [('соотечественник', 0.871849536895752),
  ('малобюджетный', 0.8357176780700684),
  ('усыновитель', 0.8281017541885376),
  ('держава', 0.81739342212677),
  ('выходец', 0.8153512477874756),
  ('здравница', 0.814006507396698),
  ('этотдень', 0.8098456859588623),
  ('питерец', 0.8053678274154663),
  ('запад', 0.8039109706878662),
  ('популярность', 0.8033404350280762)],
 1996: [('соотечественник', 0.7999035120010376),
  ('коптильня', 0.7749836444854736),
  ('классицистский', 0.7622711658477783),
  ('делимый', 0.747541069984436),
  ('климат', 0.7398977279663086),
  ('народ', 0.7320960760116577),
  ('край', 0.7257795929

In [None]:
# Get the distance between words over years, bigger is closer:
# (each value is that model's wv.similarity(word1, word2), via compare_words)
# NOTE(review): these two query words are Finnish, while the other query cells
# use Russian lemmas, and this cell has no execution count or output. If the
# models' vocabularies do not contain these words the lookup will presumably
# fail - confirm the intended words.
word1 = 'naapuri'
word2 = 'hyvä'

# Both results are displayed because the imports cell sets
# InteractiveShell.ast_node_interactivity = "all".
print("Results for Federal:")
compare_words(models_federal, word1, word2)
print("Results for local:")
compare_words(models_local, word1, word2)

In [12]:
# Calculate the vector math: россия + сосед - финляндия, i.e.
# "which word X is to Russia as 'сосед' (neighbour) is to Finland".
# NOTE(review): this comment previously used "Sauna" as the example, which did
# not match the terms below.
positive_terms = ['россия', 'сосед']
negative_terms = ['финляндия']

# Number of entries requested from each model (passed on as topn).
number_results = 3

# Both results are displayed because the imports cell sets
# InteractiveShell.ast_node_interactivity = "all".
print("Results for Federal:")
word_vector_math(models_federal, positive_terms, negative_terms, number_results)
print("Results for local:")
word_vector_math(models_local, positive_terms, negative_terms, number_results)

Results for Federal:


{1999: [('касаться', 0.762572169303894),
  ('амбулаторный', 0.7320612072944641),
  ('артемова', 0.7293155193328857)],
 2000: [('касаться', 0.7189063429832458),
  ('селянина', 0.7115254402160645),
  ('соглашаться', 0.6888520121574402)],
 2001: [('воля', 0.6261519193649292),
  ('хотеть', 0.6115468740463257),
  ('чарпентьер', 0.6106925010681152)],
 2002: [('воля', 0.5929020643234253),
  ('судьба', 0.5748488903045654),
  ('надежда', 0.5593622922897339)],
 2003: [('подфракция', 0.5478630661964417),
  ('украина', 0.4916243851184845),
  ('воля', 0.49137839674949646)],
 2004: [('украина', 0.5359034538269043),
  ('воля', 0.5157945156097412),
  ('надежда', 0.5016772747039795)],
 2005: [('украина', 0.5036588907241821),
  ('воля', 0.4871886074542999),
  ('чуткость', 0.47855740785598755)],
 2006: [('украина', 0.5138838291168213),
  ('этически', 0.4631175398826599),
  ('антинатовский', 0.4601776599884033)],
 2007: [('оптимизм', 0.4881232678890228),
  ('семиречье', 0.47774821519851685),
  ('украина',

Results for local:


{1993: [('порочить', 0.899169921875),
  ('образно', 0.8811245560646057),
  ('кивать', 0.8764347434043884)],
 1995: [('дерготня', 0.6555582284927368),
  ('интерес', 0.6522762775421143),
  ('безграничный', 0.6418750286102295)],
 1996: [('сейсмоопасность', 0.639826774597168),
  ('коптильня', 0.6158420443534851),
  ('благовоспитанный', 0.5784878730773926)],
 1997: [('благовоспитанный', 0.5293777585029602),
  ('рантанен', 0.5043588280677795),
  ('соуперь', 0.493741899728775)],
 1998: [('нераскаянность', 0.5926346778869629),
  ('вояжировать', 0.5245988368988037),
  ('Vada', 0.5227819085121155)],
 1999: [('потогонный', 0.5566210746765137),
  ('энергосырьевой', 0.5246824622154236),
  ('подныривать', 0.5190762281417847)],
 2000: [('электроэнергетик', 0.5198485851287842),
  ('перепродавец', 0.501958429813385),
  ('страна', 0.4920766055583954)],
 2001: [('капанный', 0.5484961271286011),
  ('цивилизация', 0.4933053255081177),
  ('страна', 0.4919850826263428)],
 2002: [('волго-вятский', 0.551221728