In [1]:
# System libraries:
%load_ext autoreload
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARN)
import pandas as pd
import json
import nltk
import numpy as np
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
from pymystem3 import Mystem
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pickle
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the lemmatised corpus that the per-year word2vec models are trained on
# (one model per starting year, so around 15-20 models).
# The lemma columns appear to be stored as stringified Python lists, so they
# are parsed with ast.literal_eval, which only accepts literals. The previous
# eval converter would execute any expression embedded in the CSV.
from ast import literal_eval

data = pd.read_csv(
    'data/processed/yle_fi_lemmas.csv',
    converters={'lemmas_content': literal_eval, 'lemmas_headline': literal_eval},
)
data.head()

Unnamed: 0.1,Unnamed: 0,headline,content,datePublished,lemmas_content,year
0,0,Kaukasian konflikti: Aseissa Georgia ja Venäjä...,Kaukasian konflikti: Aseissa Georgia ja Venäjä...,2008-08-11T11:03:41+0300,"[kaukasia, konflikti, ase, georg, venäjä, tuke...",2008
1,1,Otteita Venäjän-tuntijoiden vastauksista,Otteita Venäjän-tuntijoiden vastauksista\n\nTä...,2007-01-14T12:46:58+0200,"[ote, venäjä, vasta, tiivistelmä, koota, ote, ...",2007
2,2,Etelä-Ossetian ruutitynnyri räjähti viimein,Etelä-Ossetian ruutitynnyri räjähti viimein\n\...,2008-08-12T21:33:32+0300,"[etelä-ossetia, ruuti, räjähtää, viimein, perj...",2008
3,3,Yle Uutiset seuraa Ukrainan kriisiä hetki hetk...,Yle Uutiset seuraa Ukrainan kriisiä hetki hetk...,2014-03-18T09:06:59+0200,"[yle, uutinen, seura, ukraina, kriisi, hetki, ...",2014
4,4,Saksan ulkopolitiikan täyskäännös,Saksan ulkopolitiikan täyskäännös\n\nTuskin mi...,2014-10-02T12:30:30+0300,"[saksa, täyskäännös, tuska, mikään, muu, euroo...",2014


In [3]:
def create_models_for_multiple_years(corpus, window_years=5, **word2vec_kwargs):
    """Train one Word2Vec model per sliding window of years, plus one on everything.

    Args:
        corpus: DataFrame with a ``year`` column and a ``lemmas_content``
            column (presumably one list of lemmas per document).
        window_years: How many consecutive entries of the sorted list of
            distinct years are pooled into each windowed model. Windows are
            taken by position in that list, so a year missing from the corpus
            makes a window span more calendar years. Defaults to 5.
        **word2vec_kwargs: Extra keyword arguments forwarded to
            ``gensim.models.Word2Vec``. They override the default
            ``min_count=1``.

    Returns:
        dict mapping the first year of every complete window to the model
        trained on that window, plus the key ``'all'`` for the model trained
        on the whole corpus. Years too close to the end of the range to start
        a complete window get no entry.

    Raises:
        ValueError: If ``window_years`` is smaller than 1.
    """
    if window_years < 1:
        raise ValueError("window_years must be at least 1")

    params = {'min_count': 1, **word2vec_kwargs}
    years = sorted(corpus.year.unique())
    # Number of start years that still have a full window after them; clamp at
    # zero so a short corpus yields only the 'all' model instead of a negative
    # slice bound.
    n_windows = max(len(years) - window_years + 1, 0)

    models = {}
    for i, year in enumerate(years[:n_windows]):
        print("Building model for " + str(year))
        window = years[i:i + window_years]
        sentences = corpus[corpus.year.isin(window)].lemmas_content
        models[year] = gensim.models.Word2Vec(sentences, **params)

    models['all'] = gensim.models.Word2Vec(corpus.lemmas_content, **params)

    return models
    

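# NOTE: the cells below read `models`; on a fresh kernel it is undefined
# unless the following line is uncommented and run (training prints one line per model).
#models = create_models_for_multiple_years(data)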

Building model for 1997
Building model for 1998
Building model for 1999
Building model for 2000
Building model for 2001
Building model for 2002
Building model for 2003
Building model for 2004
Building model for 2005
Building model for 2006
Building model for 2007
Building model for 2008
Building model for 2009
Building model for 2010
Building model for 2011
Building model for 2012
Building model for 2013


In [7]:
#pickle_path = 'pickles/yle_word_models.pickle'
#pickle.dump(models, open(pickle_path, "wb"))

In [None]:
#serialize models with pickle

In [23]:
def get_most_similar(models, words, n):
    """Look up the ``n`` nearest neighbours of ``words`` in every model.

    Args:
        models: Mapping from a label (e.g. a year) to a trained model that
            exposes ``wv.most_similar``.
        words: Passed to ``most_similar`` as ``positive``.
        n: Passed to ``most_similar`` as ``topn``.

    Returns:
        dict with the same keys as ``models``; each value is whatever
        ``most_similar`` returned for that model.
    """
    return {
        label: model.wv.most_similar(positive=words, topn=n)
        for label, model in models.items()
    }

def compare_words(models, word1, word2):
    """Score how similar ``word1`` and ``word2`` are in every model.

    Args:
        models: Mapping from a label (e.g. a year) to a trained model that
            exposes ``wv.similarity``.
        word1: First word handed to ``similarity``.
        word2: Second word handed to ``similarity``.

    Returns:
        dict with the same keys as ``models``; each value is the result of
        ``wv.similarity(word1, word2)`` for that model.
    """
    return {
        label: model.wv.similarity(word1, word2)
        for label, model in models.items()
    }

def word_vector_math(models, positive_terms, negative_terms, n):
    """Run the same positive/negative word query against every model.

    Args:
        models: Mapping from a label (e.g. a year) to a trained model that
            exposes ``wv.most_similar``.
        positive_terms: Passed to ``most_similar`` as ``positive``.
        negative_terms: Passed to ``most_similar`` as ``negative``.
        n: Passed to ``most_similar`` as ``topn``.

    Returns:
        dict with the same keys as ``models``; each value is whatever
        ``most_similar`` returned for that model.
    """
    query = dict(positive=positive_terms, negative=negative_terms, topn=n)
    return {
        label: model.wv.most_similar(**query)
        for label, model in models.items()
    }

In [18]:
# Nearest neighbours of the topic word(s) in every model.
words = ['naapuri']
number_results = 10

get_most_similar(models, words=words, n=number_results)

{1997: [('kova', 0.9622451066970825),
  ('mä', 0.9425488114356995),
  ('liika', 0.9384481310844421),
  ('pystyä', 0.932053804397583),
  ('parka', 0.9247466921806335),
  ('tosi', 0.9196494221687317),
  ('kyllä', 0.9191937446594238),
  ('tyytyä', 0.9191548824310303),
  ('aivan', 0.9187048673629761),
  ('palko', 0.9166237711906433)],
 1998: [('kova', 0.9399428963661194),
  ('tosi', 0.9090291261672974),
  ('helppo', 0.9037203788757324),
  ('pystyä', 0.9032535552978516),
  ('ihan', 0.8988174200057983),
  ('varsi', 0.8958845138549805),
  ('erittää', 0.8927438259124756),
  ('aivan', 0.8927148580551147),
  ('kyllä', 0.8925030827522278),
  ('varma', 0.886939287185669)],
 1999: [('kova', 0.8522469997406006),
  ('hieno', 0.8297808170318604),
  ('suoria', 0.8257678747177124),
  ('varmasti', 0.8252459764480591),
  ('helppo', 0.8249343633651733),
  ('mukava', 0.8249210715293884),
  ('erittää', 0.8158831596374512),
  ('erinomainen', 0.814317524433136),
  ('varsi', 0.8117330074310303),
  ('näyttää', 0

In [19]:
# Similarity of two words in every model (a higher score means closer).
word1, word2 = 'naapuri', 'hyvä'

compare_words(models, word1=word1, word2=word2)

{1997: 0.4587858354241383,
 1998: 0.2756432962098212,
 1999: 0.24622687787875888,
 2000: 0.1941150963025495,
 2001: 0.10008476600604832,
 2002: 0.0796201266477919,
 2003: 0.14264070700543302,
 2004: 0.049155215792477745,
 2005: 0.10637148449483241,
 2006: 0.11323065067370557,
 2007: 0.18493518621236527,
 2008: 0.18977124305855406,
 2009: 0.17671510184941228,
 2010: 0.20609463508488657,
 2011: 0.2553885168157117,
 2012: 0.2475238195808103,
 2013: 0.29418588017212083,
 'all': 0.20846976412825405}

In [22]:
# Vector arithmetic in every model: venäjä + mannerheim - suomi, i.e.
# "what is to Russia as Mannerheim is to Finland?"
positive_terms = ['venäjä', 'mannerheim']
negative_terms = ['suomi']
number_results = 3

word_vector_math(
    models,
    positive_terms=positive_terms,
    negative_terms=negative_terms,
    n=number_results,
)


{1997: [('karimov', 0.8205392360687256),
  ('obuch', 0.7869634032249451),
  ('tetovo', 0.7852632403373718),
  ('tshetshenia', 0.7793406248092651),
  ('sosnovyi', 0.7782284021377563)],
 1998: [('gusmseroli', 0.7299594879150391),
  ('rops:lla', 0.7259535789489746),
  ('knippschild', 0.6964709758758545),
  ('mbekki', 0.6920838356018066),
  ('berger', 0.6907790303230286)],
 1999: [('hernandez', 0.8091447353363037),
  ('lle', 0.8082089424133301),
  ('gusinsk', 0.7952283620834351),
  ('anissi', 0.7941200137138367),
  ('sandri', 0.7867745161056519)],
 2000: [('eta-järjestö', 0.7905480265617371),
  ('pakistan', 0.7869768142700195),
  ('moo', 0.7781549692153931),
  ('milosevic', 0.77313232421875),
  ('shevardnadze', 0.7716243267059326)],
 2001: [('kutsma', 0.7929550409317017),
  ('jiang', 0.7849896550178528),
  ('hyun', 0.7788676023483276),
  ('trav', 0.7760000824928284),
  ('azimbek', 0.7599808573722839)],
 2002: [('muodollinen', 0.7666659951210022),
  ('beknazarov', 0.7520277500152588),
  ('k