In [2]:
# System libraries:
%load_ext autoreload
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
import pandas as pd
import json
import nltk
import numpy as np
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
from pymystem3 import Mystem
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pickle
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Load the pre-lemmatised federal and local corpora from CSV.
# The `lemmas_content` and `lemmas_headline` columns are stored as text and are
# converted back into Python objects while reading.
# NOTE(review): `eval` executes whatever expression is in each CSV cell, so this is
# only acceptable for trusted, locally generated files. If the cells are plain list
# literals, `ast.literal_eval` would be a safer converter — confirm before switching.
data_federal = pd.read_csv('data/processed/integrum_federal_lemmas.csv', converters = {'lemmas_content' : eval, 'lemmas_headline':eval})
data_local = pd.read_csv('data/processed/integrum_local_lemmas.csv', converters = {'lemmas_content' : eval, 'lemmas_headline':eval})

In [8]:
def create_models_for_multiple_years(corpus, window_years=5, min_count=1, **word2vec_kwargs):
    """Train one Word2Vec model per sliding window of years, plus one on all data.

    The distinct values of ``corpus.year`` are sorted and a window of
    ``window_years`` consecutive entries of that sorted list is slid over them.
    Note that a window spans ``window_years`` *distinct years present in the
    corpus*, not calendar years: if a year is missing from the data, the window
    simply reaches further forward.

    Args:
        corpus: DataFrame-like object with a ``year`` column and a
            ``lemmas_content`` column holding one token sequence per row.
        window_years: Number of distinct years per window (default 5, the
            value that used to be hard-coded).
        min_count: Passed to ``gensim.models.Word2Vec`` (default 1).
        **word2vec_kwargs: Any further keyword arguments for
            ``gensim.models.Word2Vec`` (e.g. ``seed`` and ``workers`` for
            reproducible training).

    Returns:
        dict mapping the first year of each window to its trained model, plus
        the key ``'all'`` for a model trained on the whole corpus. If the corpus
        has fewer distinct years than ``window_years`` only ``'all'`` is present.

    Raises:
        ValueError: If ``window_years`` is smaller than 1.
    """
    if window_years < 1:
        raise ValueError("window_years must be at least 1")

    years = sorted(corpus.year.unique())
    models = {}

    # range() is empty when there are fewer distinct years than window_years,
    # so no partial windows are ever trained.
    for start in range(len(years) - window_years + 1):
        year = years[start]
        print("Building model for " + str(year))
        window = years[start:start + window_years]
        sentences = corpus[corpus.year.isin(window)].lemmas_content
        models[year] = gensim.models.Word2Vec(sentences, min_count=min_count, **word2vec_kwargs)

    # Reference model trained on every document regardless of year.
    models['all'] = gensim.models.Word2Vec(corpus.lemmas_content, min_count=min_count, **word2vec_kwargs)

    return models
    

def get_most_similar(models, words, n):
    """Query every model for the terms closest to ``words``.

    Args:
        models: Mapping of period label -> trained model exposing ``.wv``.
        words: Terms passed as ``positive`` to ``wv.most_similar``.
        n: Number of results requested per model (``topn``).

    Returns:
        dict with the same keys as ``models``; each value is whatever
        ``wv.most_similar`` returned for that model.
    """
    return {
        period: model.wv.most_similar(positive=words, topn=n)
        for period, model in models.items()
    }

def compare_words(models, word1, word2):
    """Score the similarity of two words in every model.

    Args:
        models: Mapping of period label -> trained model exposing ``.wv``.
        word1: First word handed to ``wv.similarity``.
        word2: Second word handed to ``wv.similarity``.

    Returns:
        dict with the same keys as ``models``; each value is the result of
        ``wv.similarity(word1, word2)`` for that model.
    """
    return {
        period: model.wv.similarity(word1, word2)
        for period, model in models.items()
    }

def word_vector_math(models, positive_terms, negative_terms, n):
    """Run the same positive/negative term query against every model.

    Args:
        models: Mapping of period label -> trained model exposing ``.wv``.
        positive_terms: Terms passed as ``positive`` to ``wv.most_similar``.
        negative_terms: Terms passed as ``negative`` to ``wv.most_similar``.
        n: Number of results requested per model (``topn``).

    Returns:
        dict with the same keys as ``models``; each value is whatever
        ``wv.most_similar`` returned for that model.
    """
    return {
        period: model.wv.most_similar(
            positive=positive_terms,
            negative=negative_terms,
            topn=n,
        )
        for period, model in models.items()
    }

In [6]:
# Train the per-window and 'all' Word2Vec models for each corpus.
# Every call retrains from scratch and prints one progress line per window.
print("Building Federal models:")
models_federal = create_models_for_multiple_years(data_federal)
print("Building Local models:")
models_local = create_models_for_multiple_years(data_local)

Building Federal models:
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013
Building Local models:
Building model for 1993
Building model for 1995
Building model for 1996
Building model for 1997
Building model for 1998
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013


In [9]:
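# Optional: persist the trained models so they do not have to be rebuilt on every run.
# Disabled for now; uncomment to write the pickles (the `pickles/` directory must already exist).
#pickle_path_federal = 'pickles/integrum_federal_word_models.pickle'
#pickle.dump(models_federal, open(pickle_path_federal, "wb"))
#pickle_path_local = 'pickles/integrum_local_word_models.pickle'
#pickle.dump(models_local, open(pickle_path_local, "wb"))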

In [13]:
# For every model (each year window plus 'all'), list the terms most similar to the query word(s).
# 'финляндия' is the Russian lemma for "Finland".
words = ['финляндия']
number_results = 10  # how many neighbours to return per model

# The bare calls below are displayed because ast_node_interactivity is set to "all".
print("Results for Federal:")
get_most_similar(models_federal, words, number_results)
print("Results for local:")
get_most_similar(models_local, words, number_results)

Results for Federal:


{1999: [('швеция', 0.8072932958602905),
  ('эстония', 0.734123945236206),
  ('норвегия', 0.732284665107727),
  ('китай', 0.7147859334945679),
  ('германия', 0.6998868584632874),
  ('дания', 0.6907250881195068),
  ('даниельссон', 0.680587887763977),
  ('австрия', 0.67793869972229),
  ('корея', 0.6757954955101013),
  ('казахстан', 0.6756657361984253)],
 2000: [('швеция', 0.7282971739768982),
  ('эстония', 0.6979897618293762),
  ('норвегия', 0.6734362840652466),
  ('дания', 0.653537392616272),
  ('нидерланды', 0.6425493955612183),
  ('германия', 0.6374134421348572),
  ('абреу', 0.6300808191299438),
  ('египет', 0.6300433278083801),
  ('остерсунда', 0.619949460029602),
  ('бьоркман', 0.6158144474029541)],
 2001: [('швеция', 0.6635425090789795),
  ('норвегия', 0.6547548770904541),
  ('эстония', 0.6310954093933105),
  ('девятимиллионный', 0.6308116912841797),
  ('дания', 0.6220912933349609),
  ('германия', 0.5987108945846558),
  ('австрия', 0.5965783596038818),
  ('хельстрем', 0.589979767799

Results for local:


{1993: [('швеция', 0.9557653069496155),
  ('германия', 0.9294544458389282),
  ('эстония', 0.9221528768539429),
  ('дания', 0.916588306427002),
  ('норвегия', 0.9119482040405273),
  ('латвия', 0.9110863208770752),
  ('белоруссия', 0.8989177942276001),
  ('сборная', 0.8869310021400452),
  ('европа', 0.8865114450454712),
  ('польша', 0.8791298270225525)],
 1995: [('эстония', 0.8018003702163696),
  ('латвия', 0.7884517908096313),
  ('суоми', 0.7821050882339478),
  ('швеция', 0.7812479734420776),
  ('белоруссия', 0.7515753507614136),
  ('норвегия', 0.7511245012283325),
  ('германия', 0.7438417673110962),
  ('бомбардировать', 0.7294999957084656),
  ('страна', 0.7263015508651733),
  ('дания', 0.7260202169418335)],
 1996: [('суоми', 0.7376048564910889),
  ('эстония', 0.7027368545532227),
  ('норвегия', 0.6798259615898132),
  ('швеция', 0.6675082445144653),
  ('норшьо', 0.6664227843284607),
  ('германия', 0.6638166904449463),
  ('латвия', 0.6593422889709473),
  ('дания', 0.6561703085899353),
  

In [None]:
# Similarity between two words in every model (each year window plus 'all').
# The value is a similarity score, not a distance: higher means the words are closer.
# The models are trained on Russian lemmas, so the query words must be Russian lemmas too;
# a word missing from a model's vocabulary makes the lookup fail with KeyError.
word1 = 'сосед'    # "neighbour"
word2 = 'хороший'  # "good"

print("Results for Federal:")
compare_words(models_federal, word1, word2)
print("Results for local:")
compare_words(models_local, word1, word2)

In [12]:
# Vector arithmetic: 'россия' + 'сосед' - 'финляндия', i.e.
# "what is X to Russia as 'сосед' (neighbour) is to Finland?"
positive_terms = ['россия', 'сосед']
negative_terms = ['финляндия']

number_results = 3  # how many candidate terms to return per model

# The bare calls below are displayed because ast_node_interactivity is set to "all".
print("Results for Federal:")
word_vector_math(models_federal, positive_terms, negative_terms, number_results)
print("Results for local:")
word_vector_math(models_local, positive_terms, negative_terms, number_results)

Results for Federal:


{1999: [('касаться', 0.762572169303894),
  ('амбулаторный', 0.7320612072944641),
  ('артемова', 0.7293155193328857)],
 2000: [('касаться', 0.7189063429832458),
  ('селянина', 0.7115254402160645),
  ('соглашаться', 0.6888520121574402)],
 2001: [('воля', 0.6261519193649292),
  ('хотеть', 0.6115468740463257),
  ('чарпентьер', 0.6106925010681152)],
 2002: [('воля', 0.5929020643234253),
  ('судьба', 0.5748488903045654),
  ('надежда', 0.5593622922897339)],
 2003: [('подфракция', 0.5478630661964417),
  ('украина', 0.4916243851184845),
  ('воля', 0.49137839674949646)],
 2004: [('украина', 0.5359034538269043),
  ('воля', 0.5157945156097412),
  ('надежда', 0.5016772747039795)],
 2005: [('украина', 0.5036588907241821),
  ('воля', 0.4871886074542999),
  ('чуткость', 0.47855740785598755)],
 2006: [('украина', 0.5138838291168213),
  ('этически', 0.4631175398826599),
  ('антинатовский', 0.4601776599884033)],
 2007: [('оптимизм', 0.4881232678890228),
  ('семиречье', 0.47774821519851685),
  ('украина',

Results for local:


{1993: [('порочить', 0.899169921875),
  ('образно', 0.8811245560646057),
  ('кивать', 0.8764347434043884)],
 1995: [('дерготня', 0.6555582284927368),
  ('интерес', 0.6522762775421143),
  ('безграничный', 0.6418750286102295)],
 1996: [('сейсмоопасность', 0.639826774597168),
  ('коптильня', 0.6158420443534851),
  ('благовоспитанный', 0.5784878730773926)],
 1997: [('благовоспитанный', 0.5293777585029602),
  ('рантанен', 0.5043588280677795),
  ('соуперь', 0.493741899728775)],
 1998: [('нераскаянность', 0.5926346778869629),
  ('вояжировать', 0.5245988368988037),
  ('Vada', 0.5227819085121155)],
 1999: [('потогонный', 0.5566210746765137),
  ('энергосырьевой', 0.5246824622154236),
  ('подныривать', 0.5190762281417847)],
 2000: [('электроэнергетик', 0.5198485851287842),
  ('перепродавец', 0.501958429813385),
  ('страна', 0.4920766055583954)],
 2001: [('капанный', 0.5484961271286011),
  ('цивилизация', 0.4933053255081177),
  ('страна', 0.4919850826263428)],
 2002: [('волго-вятский', 0.551221728