In [1]:
# System libraries:
%load_ext autoreload
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
import pandas as pd
import json
import nltk
import numpy as np
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
#from pymystem3 import Mystem
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pickle
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Both lemma columns are parsed back into Python objects by evaluating the cell text.
# NOTE(review): `eval` executes whatever expression the CSV contains; this is only
# acceptable for files produced by this project. If the columns hold plain literals,
# `ast.literal_eval` would be a safer converter — confirm before switching.
lemma_converters = {'lemmas_content': eval, 'lemmas_headline': eval}

data_federal = pd.read_csv(
    'data/processed/integrum_federal_lemmas.csv',
    converters=lemma_converters,
)
data_local = pd.read_csv(
    'data/processed/integrum_local_lemmas.csv',
    converters=lemma_converters,
)

In [2]:
def create_models_for_multiple_years(corpus, window=5, min_count=1, **word2vec_kwargs):
    """Train one Word2Vec model per sliding window of years, plus one on everything.

    The windows slide over the sorted list of years that actually occur in
    ``corpus.year``. Each window therefore contains ``window`` distinct years;
    if a calendar year is missing from the data, the window spans a longer
    calendar range than ``window`` years.

    Args:
        corpus: DataFrame with a ``year`` column and a ``lemmas_content``
            column whose values are passed to Word2Vec as the sentences.
        window: Number of distinct years per model. Defaults to 5.
        min_count: Forwarded to ``Word2Vec``. Defaults to 1.
        **word2vec_kwargs: Any further ``Word2Vec`` keyword arguments
            (for example ``seed`` or ``workers``), applied to every model.

    Returns:
        dict: first year of each window -> model, followed by the key
        ``'all'`` -> model trained on the whole corpus. When the corpus
        has fewer than ``window`` distinct years only ``'all'`` is present.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    years = sorted(corpus.year.unique())
    models = {}

    # range() is empty when there are fewer years than `window`, so no
    # partial windows are ever trained.
    for start in range(len(years) - window + 1):
        first_year = years[start]
        print("Building model for " + str(first_year))
        in_window = corpus.year.isin(years[start:start + window])
        sentences = corpus.loc[in_window, 'lemmas_content']
        models[first_year] = gensim.models.Word2Vec(
            sentences, min_count=min_count, **word2vec_kwargs)

    models['all'] = gensim.models.Word2Vec(
        corpus.lemmas_content, min_count=min_count, **word2vec_kwargs)

    return models
    

def get_most_similar(models, words, n):
    """Query every model for the terms closest to ``words``.

    Args:
        models: Mapping from a label (a window's first year or ``'all'`` in
            this notebook) to either a trained model exposing ``.wv`` or a
            bare vectors object that provides ``most_similar`` itself.
        words: Terms passed as the ``positive`` argument of ``most_similar``.
        n: Number of neighbours requested from each model (``topn``).

    Returns:
        dict: label -> whatever ``most_similar`` returned for that model.
    """
    results = {}
    for label, model in models.items():
        # Full models keep their vectors under `.wv`; dictionaries that were
        # already reduced to the vectors themselves are queried directly.
        vectors = getattr(model, 'wv', model)
        results[label] = vectors.most_similar(positive=words, topn=n)

    return results

def compare_words(models, word1, word2):
    """Score how similar two words are in every model.

    Args:
        models: Mapping from a label to either a trained model exposing
            ``.wv`` or a bare vectors object that provides ``similarity``.
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        dict: label -> value returned by ``similarity(word1, word2)`` for
        that model (a larger value means the words are closer).
    """
    results = {}
    for label, model in models.items():
        # Full models keep their vectors under `.wv`; dictionaries that were
        # already reduced to the vectors themselves are queried directly.
        vectors = getattr(model, 'wv', model)
        results[label] = vectors.similarity(word1, word2)

    return results

def word_vector_math(models, positive_terms, negative_terms, n):
    """Run the same analogy-style query against every model.

    Args:
        models: Mapping from a label to either a trained model exposing
            ``.wv`` or a bare vectors object that provides ``most_similar``.
        positive_terms: Words passed as ``positive`` to ``most_similar``.
        negative_terms: Words passed as ``negative`` to ``most_similar``.
        n: Number of results requested from each model (``topn``).

    Returns:
        dict: label -> whatever ``most_similar`` returned for that model.
    """
    results = {}
    for label, model in models.items():
        # Full models keep their vectors under `.wv`; dictionaries that were
        # already reduced to the vectors themselves are queried directly.
        vectors = getattr(model, 'wv', model)
        results[label] = vectors.most_similar(positive=positive_terms,
                                              negative=negative_terms, topn=n)

    return results

In [6]:
# Train the sliding-window models and the whole-corpus model for each corpus.
# create_models_for_multiple_years prints one progress line per window.
print("Building Federal models:")
models_federal = create_models_for_multiple_years(data_federal)
print("Building Local models:")
models_local = create_models_for_multiple_years(data_local)

Building Federal models:
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013
Building Local models:
Building model for 1993
Building model for 1995
Building model for 1996
Building model for 1997
Building model for 1998
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013


In [3]:
# Pickle files holding the dictionaries of trained models, one file per corpus.
pickle_path_federal = 'pickles/integrum_federal_word_models.pickle'
pickle_path_local = 'pickles/integrum_local_word_models.pickle'

In [9]:
#pickle.dump(models_federal, open(pickle_path_federal, "wb"))
#pickle.dump(models_local, open(pickle_path_local, "wb"))

In [4]:
# Reload the previously trained model dictionaries instead of retraining them.
# NOTE: unpickling runs code embedded in the file, so only load pickles that
# were written by this project.
# `with` closes each file handle as soon as the load has finished.
with open(pickle_path_federal, "rb") as federal_file:
    models_federal = pickle.load(federal_file)
with open(pickle_path_local, "rb") as local_file:
    models_local = pickle.load(local_file)

## Update model dicts to use strings as keys and drop superfluous data

See the gensim Word2Vec documentation (https://radimrehurek.com/gensim/models/word2vec.html): "If you’re finished training a model (i.e. no more updates, only querying), you can switch to the KeyedVectors instance:"

```python
>>> word_vectors = model.wv
>>> del model
```

In [5]:
# Destination files for the reduced, query-only versions of the model dictionaries.
pickle_path_federal_new = 'pickles/integrum_federal_word_models.pkl'
pickle_path_local_new = 'pickles/integrum_local_word_models.pkl'

# Keep only each model's `.wv` object and turn every key into a string, so the
# year keys and 'all' share one type.
models_federal_new = {str(key): model.wv for key, model in models_federal.items()}
models_local_new = {str(key): model.wv for key, model in models_local.items()}

# `with` guarantees the data is flushed and the handles are closed.
with open(pickle_path_federal_new, "wb") as federal_file:
    pickle.dump(models_federal_new, federal_file)
with open(pickle_path_local_new, "wb") as local_file:
    pickle.dump(models_local_new, local_file)

In [7]:
# Get the words most similar to a topic in every model of both corpora.
# The query word 'сосед' is Russian for "neighbour".
words = ['сосед']
number_results = 10

# The bare calls below are displayed because the import cell sets
# InteractiveShell.ast_node_interactivity = "all".
print("Results for Federal:")
get_most_similar(models_federal, words, number_results)
print("Results for local:")
get_most_similar(models_local, words, number_results)

Results for Federal:


{'all': [('соседка', 0.6145724058151245),
  ('европеец', 0.6125373244285583),
  ('согражданин', 0.583803653717041),
  ('союзник', 0.5772225260734558),
  ('соотечественник', 0.5559887886047363),
  ('страна', 0.5499905347824097),
  ('китаец', 0.531200647354126),
  ('народ', 0.528061032295227),
  ('наоборот', 0.527100682258606),
  ('менталитет', 0.5238292813301086)],
 1999: [('убежище', 0.9270979166030884),
  ('демократия', 0.9269813299179077),
  ('геополитический', 0.9249426126480103),
  ('флаг', 0.9236154556274414),
  ('служить', 0.9219054579734802),
  ('относиться', 0.9201438426971436),
  ('обстановка', 0.9191399812698364),
  ('справляться', 0.9164369106292725),
  ('решительный', 0.9163156747817993),
  ('опасение', 0.9155329465866089)],
 2000: [('предел', 0.8895346522331238),
  ('воспринимать', 0.8841016888618469),
  ('по-прежнему', 0.8825489282608032),
  ('иностранец', 0.8766172528266907),
  ('именно', 0.8742612600326538),
  ('подлость', 0.8733521699905396),
  ('север', 0.870620548725

Results for local:


{'all': [('согражданин', 0.6234585642814636),
  ('соотечественник', 0.5946056842803955),
  ('соседка', 0.5944869518280029),
  ('европеец', 0.5748534202575684),
  ('страна', 0.5522646307945251),
  ('коллега', 0.5444473624229431),
  ('широта', 0.542745053768158),
  ('эстонец', 0.5313668251037598),
  ('финн', 0.5210483074188232),
  ('менталитет', 0.5083603858947754)],
 1993: [('соотечественник', 0.9744365215301514),
  ('питерец', 0.964188814163208),
  ('позвать', 0.9617142677307129),
  ('народ', 0.9617129564285278),
  ('западноевропейский', 0.9578330516815186),
  ('нен', 0.9575434923171997),
  ('репутация', 0.9564357995986938),
  ('интеграция', 0.955284595489502),
  ('давний', 0.9517357349395752),
  ('практика', 0.9501572251319885)],
 1995: [('соотечественник', 0.871849536895752),
  ('малобюджетный', 0.8357176184654236),
  ('усыновитель', 0.8281018137931824),
  ('держава', 0.81739342212677),
  ('выходец', 0.8153512477874756),
  ('здравница', 0.814006507396698),
  ('этотдень', 0.8098456263

In [9]:
# Get the similarity between two words in every model (a similarity score, not a
# distance: a bigger value means the words are closer).
# 'финляндия' is "Finland", 'друг' is "friend".
word1 = 'финляндия'
word2 = 'друг'

print("Results for Federal:")
compare_words(models_federal, word1, word2)
print("Results for local:")
compare_words(models_local, word1, word2)

Results for Federal:


{'all': 0.0675205787893711,
 1999: -0.028095664134714884,
 2000: 0.14638802213322513,
 2001: 0.10370791020163118,
 2002: 0.07608741628320462,
 2003: 0.0663007426115786,
 2004: 0.11432791916894702,
 2005: 0.10817789537329106,
 2006: 0.04854370973711865,
 2007: 0.1006418762378608,
 2008: 0.08281267055302804,
 2009: 0.08598788240190922,
 2010: 0.1323981792165512,
 2011: 0.17097890727811738,
 2012: 0.15995434107776169,
 2013: 0.09333479873786099}

Results for local:


{'all': 0.229915804989533,
 1993: 0.2835335309656519,
 1995: 0.27191023825500327,
 1996: 0.22697304005094415,
 1997: 0.22134206585706018,
 1998: 0.25201073185265654,
 1999: 0.17954431084769581,
 2000: 0.24282917699769427,
 2001: 0.19575099345332578,
 2002: 0.20649478668632815,
 2003: 0.117253050300097,
 2004: 0.15590615612307807,
 2005: 0.13752626958960662,
 2006: 0.11479323816872805,
 2007: 0.11086559738233134,
 2008: 0.15941984712773372,
 2009: 0.1599603024427857,
 2010: 0.1431460337121436,
 2011: 0.23958428827025643,
 2012: 0.20462273161273,
 2013: 0.2533699271887402}

In [11]:
# Calculate the vector math россия + маннергейм - финляндия, i.e.
# "what is X to Russia as Mannerheim is to Finland":
positive_terms = ['россия', 'маннергейм']
negative_terms = ['финляндия']

# Only the top three candidates per model are requested.
number_results = 3

print("Results for Federal:")
word_vector_math(models_federal, positive_terms, negative_terms, number_results)
print("Results for local:")
word_vector_math(models_local, positive_terms, negative_terms, number_results)

Results for Federal:


{'all': [('маршал', 0.536050021648407),
  ('сталин', 0.5328856706619263),
  ('ссср', 0.5276433229446411)],
 1999: [('говорухина', 0.7532452940940857),
  ('реформировать', 0.7508476972579956),
  ('самохин', 0.7505269050598145)],
 2000: [('сладостно', 0.706183910369873),
  ('48.55', 0.7047044038772583),
  ('кронштадтский', 0.7040519714355469)],
 2001: [('aegorow', 0.6870042085647583),
  ('внебрачный', 0.6831349730491638),
  ('безруков', 0.6808423399925232)],
 2002: [('сидоров', 0.6262498497962952),
  ('швидлер', 0.5968443155288696),
  ('менделеев', 0.5936644673347473)],
 2003: [('желдормаш', 0.5621464252471924),
  ('князь', 0.552970826625824),
  ('царь', 0.5352111458778381)],
 2004: [('царь', 0.556607723236084),
  ('буслов', 0.5100216269493103),
  ('галицкий', 0.5077356100082397)],
 2005: [('маршал', 0.5214092135429382),
  ('шапошников', 0.5086955428123474),
  ('мордан', 0.5086311101913452)],
 2006: [('адмирал', 0.543361485004425),
  ('евдокимович', 0.5223737359046936),
  ('ермолина', 0.

Results for local:


{'all': [('сталин', 0.5897626876831055),
  ('маршал', 0.5531381368637085),
  ('гитлер', 0.5127024054527283)],
 1993: [('обозреватель', 0.9008882641792297),
  ('экс', 0.9003258347511292),
  ('журналистский', 0.8986137509346008)],
 1995: [('госцирк', 0.7124382257461548),
  ('ДЮСШ', 0.7101243734359741),
  ('приключаться', 0.7005537152290344)],
 1996: [('уймонить', 0.5928032994270325),
  ('анель', 0.5841622948646545),
  ('плевна', 0.5818272829055786)],
 1997: [('самодержавие', 0.6019575595855713),
  ('подвиг', 0.5566482543945312),
  ('добробабин', 0.53236985206604)],
 1998: [('одрин', 0.56961989402771),
  ('катс', 0.5584726333618164),
  ('чтить', 0.552108883857727)],
 1999: [('добробабин', 0.5319408178329468),
  ('командорский', 0.5303478240966797),
  ('ссср', 0.5183157324790955)],
 2000: [('командорский', 0.5882496237754822),
  ('лардот', 0.5383530855178833),
  ('герой', 0.5262404680252075)],
 2001: [('маршал', 0.5852488279342651),
  ('фельетон', 0.5548205375671387),
  ('хеландерин', 0.53

In [13]:
# Show which labels (window start years plus 'all') exist for each corpus.
models_local.keys()
models_federal.keys()

dict_keys(['all', 1993, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013])

dict_keys(['all', 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013])

In [None]:
# NOTE(review): scratch snippet showing the save/load call pattern. `model`, `fname`
# and the bare name `Word2Vec` are not defined in the cells shown here, so running
# this cell as-is would raise NameError — confirm whether it should be kept.
model.save(fname)
model = Word2Vec.load(fname) 

In [16]:
# Pattern used by store_model_to_disk to recognise year-like keys.
# `match` only anchors at the start of the string, so any text that begins
# with four digits matches, even if more characters follow.
num_regex = re.compile('[0-9]{4}')
# Sanity check: returns None because the second character is not a digit.
num_regex.match('1f90')

In [30]:
def store_model_to_disk(models_dict, filename_prefix, path = 'models/', year_offset = 4):
    """Save every model in ``models_dict`` to its own file.

    Each file is named ``<filename_prefix>_<label>`` inside ``path``. For keys
    that are a four-digit year (an integer or a string), the label is the year
    plus ``year_offset``. Presumably this names the file after the last year
    of the five-year window that create_models_for_multiple_years keys by its
    first year — confirm if the window length changes. Any other key (for
    example ``'all'``) is used unchanged.

    Args:
        models_dict: Mapping from key to an object with a ``save(path)`` method.
        filename_prefix: Leading part of every file name.
        path: Directory to write into. Defaults to ``'models/'``.
        year_offset: Amount added to year keys in the file name. Defaults to 4.
    """
    for key, model in models_dict.items():
        label = str(key)
        # fullmatch, so only keys that are exactly four digits count as years;
        # int() makes the arithmetic work for string keys such as '1999' too.
        if re.fullmatch('[0-9]{4}', label):
            label = str(int(label) + year_offset)

        # os.path.join copes with `path` given with or without a trailing slash.
        save_path = os.path.join(path, filename_prefix + '_' + label)
        model.save(save_path)
        

            

In [31]:
# Write every model of both corpora to the default output directory ('models/'),
# using a corpus-specific file-name prefix.
store_model_to_disk(models_local, 'integrum_local')
store_model_to_disk(models_federal, 'integrum_federal')

In [27]:
# Inspect the federal model dictionary (labels and the objects stored under them).
models_federal

{'all': <gensim.models.word2vec.Word2Vec at 0x113dbf668>,
 1999: <gensim.models.word2vec.Word2Vec at 0x12720fcf8>,
 2000: <gensim.models.word2vec.Word2Vec at 0x131c6bb38>,
 2001: <gensim.models.word2vec.Word2Vec at 0x139ba33c8>,
 2002: <gensim.models.word2vec.Word2Vec at 0x1425eeeb8>,
 2003: <gensim.models.word2vec.Word2Vec at 0x14c8f9c18>,
 2004: <gensim.models.word2vec.Word2Vec at 0x155812390>,
 2005: <gensim.models.word2vec.Word2Vec at 0x15f36bfd0>,
 2006: <gensim.models.word2vec.Word2Vec at 0x169915978>,
 2007: <gensim.models.word2vec.Word2Vec at 0x173ef7d30>,
 2008: <gensim.models.word2vec.Word2Vec at 0x17dca0d30>,
 2009: <gensim.models.word2vec.Word2Vec at 0x187354320>,
 2010: <gensim.models.word2vec.Word2Vec at 0x18fe66278>,
 2011: <gensim.models.word2vec.Word2Vec at 0x1982bfb70>,
 2012: <gensim.models.word2vec.Word2Vec at 0x1a0629cc0>,
 2013: <gensim.models.word2vec.Word2Vec at 0x1a85ff5c0>}