In [4]:
# System libraries:
%load_ext autoreload
import ast
import json
import os
import re
import string
import sys
from collections import Counter
from datetime import datetime

import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from wordcloud import WordCloud
InteractiveShell.ast_node_interactivity = "all"

In [23]:
def get_corpus_tfidf(documents):
    """Build a gensim dictionary and a TF-IDF weighted corpus from token lists.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document. It is iterated twice (once to build
        the dictionary, once to build the bag-of-words corpus), so it must be
        re-iterable (a list or pandas Series, not a one-shot generator).

    Returns
    -------
    tuple
        ``(tfidf[corpus], dictionary)``: the bag-of-words corpus passed through
        a ``TfidfModel`` fitted on that same corpus, and the ``Dictionary``
        that maps token ids back to tokens.
    """
    dictionary = Dictionary(documents)
    # Bag-of-words encoding: one doc2bow result per document.
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = TfidfModel(corpus)
    return tfidf[corpus], dictionary


def get_tf_idf_counts(series):
    """Count how often each word occurs across all word lists in ``series``.

    Parameters
    ----------
    series : iterable
        Each entry is an iterable of words. Float entries (e.g. the NaN that
        pandas uses for a missing value) are skipped.

    Returns
    -------
    collections.Counter
        Word -> number of occurrences over all non-float entries.
    """
    counts = Counter()
    for words in series:
        # The float check has to happen BEFORE the entry is iterated; filtering
        # after the inner loop would already have raised TypeError on a NaN.
        if isinstance(words, float):
            continue
        counts.update(words)
    return counts


def get_top_words(corpus_tfidf, dictionary, top_n=20):
    """Return the highest-weighted words of every document as a pandas Series.

    Parameters
    ----------
    corpus_tfidf : iterable
        One row per document; each row is a sequence of ``(term_id, weight)``
        pairs.
    dictionary : mapping
        Maps a term id to its word (indexed as ``dictionary[term_id]``).
    top_n : int, optional
        Maximum number of words kept per document (default 20).

    Returns
    -------
    pandas.Series
        Object Series named ``'tf_idf_words'`` with a 0..n-1 index in document
        order. Each value is a list of words sorted by ascending weight, so
        the highest-weighted word comes last. A document with no terms yields
        an empty list.
    """
    top_words = []
    for row in corpus_tfidf:
        ranked = sorted(row, key=lambda pair: pair[1])
        # Slice from an explicit start index: ranked[-top_n:] would return the
        # whole list for top_n == 0, and an empty row simply gives [].
        best = ranked[max(len(ranked) - top_n, 0):]
        top_words.append([dictionary[int(term_id)] for term_id, _ in best])

    # Build the Series in one go with an explicit object dtype, so each cell
    # holds a list, instead of assigning lists into a pre-allocated Series.
    return pd.Series(top_words, name='tf_idf_words', dtype=object)


def output_wordcloud(counts, title = None, save_image = False):
    """Render a word cloud from word frequencies and optionally save it as PNG.

    Parameters
    ----------
    counts : mapping
        Word -> frequency, passed to ``WordCloud.generate_from_frequencies``.
    title : str, optional
        Figure title. Also used as the file name when the image is saved.
    save_image : bool, optional
        When true AND a title is given, the figure is written to
        ``generated_images/<title>.png``. Without a title nothing is saved.
    """
    cloud = WordCloud(stopwords=None, background_color='black', width=1200, height=900
                     ).generate_from_frequencies(counts)

    fig = plt.figure(figsize=(20,10))
    if title:
        plt.title(title, fontsize=45)
    plt.imshow(cloud)
    plt.axis('off')
    if save_image and title:
        # savefig does not create missing directories, so make sure the
        # output folder exists before writing.
        os.makedirs('generated_images', exist_ok=True)
        plt.savefig('generated_images/' + title + ".png")
    plt.show()
    # Release the figure explicitly: this function is called once per year in
    # a loop, and open figures otherwise accumulate in memory.
    plt.close(fig)

def print_yearly_wordclouds(tfidf_data, prefix):
    """Draw and save one word cloud per distinct year found in ``tfidf_data``.

    ``tfidf_data`` must provide a ``year`` column and a ``tf_idf_words``
    column. Years are processed in ascending order and each figure is titled
    ``prefix + str(year)``; saving is requested for every figure.
    """
    distinct_years = sorted(tfidf_data['year'].unique())
    for current_year in distinct_years:
        in_year = tfidf_data['year'] == current_year
        words_for_year = tfidf_data[in_year]['tf_idf_words']
        word_counts = get_tf_idf_counts(words_for_year)
        output_wordcloud(word_counts, prefix + str(current_year), True)

In [6]:
# Load the lemmatised YLE articles. The lemma lists are stored as text in the CSV, so
# they are parsed back into Python lists here. ast.literal_eval is used instead of eval:
# it only accepts literals, so a cell value in the file can never execute code.
data = pd.read_csv('data/processed/yle_fi_lemmas.csv', index_col = 0, converters = {'lemmas_content' : ast.literal_eval})
data.iloc[:5]

Unnamed: 0,headline,content,datePublished,lemmas_content,year
0,Kaukasian konflikti: Aseissa Georgia ja Venäjä...,Kaukasian konflikti: Aseissa Georgia ja Venäjä...,2008-08-11T11:03:41+0300,"[Kaukasia, konflikti, :, ase, Georg, ja, venäj...",2008
1,Otteita Venäjän-tuntijoiden vastauksista,Otteita Venäjän-tuntijoiden vastauksista\n\nTä...,2007-01-14T12:46:58+0200,"[ote, venäjä, vasta, tämä, tiivistelmä, olla, ...",2007
2,Etelä-Ossetian ruutitynnyri räjähti viimein,Etelä-Ossetian ruutitynnyri räjähti viimein\n\...,2008-08-12T21:33:32+0300,"[Etelä-Ossetia, ruuti, räjähtää, viimein, perj...",2008
3,Yle Uutiset seuraa Ukrainan kriisiä hetki hetk...,Yle Uutiset seuraa Ukrainan kriisiä hetki hetk...,2014-03-18T09:06:59+0200,"[Yle, uutinen, seura, ukraina, kriisi, hetki, ...",2014
4,Saksan ulkopolitiikan täyskäännös,Saksan ulkopolitiikan täyskäännös\n\nTuskin mi...,2014-10-02T12:30:30+0300,"[saksa, ulko-, täyskäännös, tuska, mikään, muu...",2014


In [63]:

# NOTE(review): `cleanup` is not defined anywhere in the visible notebook, so this scratch
# cell raises NameError on a fresh kernel. Presumably it refers to the `re_cleanup`
# pattern compiled in a later cell — confirm, or delete this cell.
cleanup.match('alude')



In [8]:
# clear lemmas from special characters:
stop_words = stopwords.words('finnish') + list(string.punctuation)
# Same entries as a frozenset, so the per-word membership test does not scan a list.
_stop_word_set = frozenset(stop_words)
# Raw string so the backslashes reach the regex engine unchanged (no invalid
# string-escape warnings). It is applied with re.match, which anchors at the start of
# the word, so un-anchored alternatives such as `nbsp` or `Inc` match as prefixes only.
re_cleanup = re.compile(r'(^quot$|^lt$|^gt$|^II$|^\/.*|^http.*|^ .*|^www|^-.*|.*[0-9].*|\?|\.|^\&.*|.*-$|^lude$|^lisä$|\n|^[A-Z]$|\*|\_|\'|\"|\`|nbsp|^.$|mdash|^Ag$|^Co|^Bank|Inc)', re.IGNORECASE)

def remove_stop_words(word_list):
    """Return the lower-cased words of ``word_list`` that survive filtering.

    A word is dropped when it is a stop word or punctuation mark (compared
    both as given and lower-cased, so a capitalised stop word cannot slip
    through and reappear lower-cased in the output), or when ``re_cleanup``
    matches at its start.

    Parameters
    ----------
    word_list : iterable of str
        Lemmas of one document.

    Returns
    -------
    list of str
        The remaining words, lower-cased, in their original order.
    """
    kept = []
    for w in word_list:
        lowered = w.lower()
        if w in _stop_word_set or lowered in _stop_word_set:
            continue
        if re_cleanup.match(w):
            continue
        kept.append(lowered)
    return kept

# TODO: Replace list - Putti - Putin (presumably: map mis-lemmatised names such as
# "Putti" back to "Putin"; not implemented yet)

In [9]:
# Run the stop-word / noise filter over every article's lemma list.
cleaned_lemmas = data['lemmas_content'].apply(remove_stop_words)

In [14]:
# Swap in the cleaned lemma lists and write the frame back to the same CSV it was
# loaded from (the file on disk is overwritten).
data['lemmas_content'] = cleaned_lemmas
data.to_csv('data/processed/yle_fi_lemmas.csv', index=True)


In [11]:
# Fit the dictionary and TF-IDF model on the cleaned lemma lists.
yle_fi_tfidf, yle_fi_dictionary = get_corpus_tfidf(data['lemmas_content'])

In [15]:
# Keep the highest-weighted words of each article.
top_words_yle_fi = get_top_words(corpus_tfidf=yle_fi_tfidf, dictionary=yle_fi_dictionary)

In [16]:
# get_top_words returns a Series indexed 0..n-1 in document order, while `data` keeps the
# index read from the CSV. pd.concat(axis=1) aligns on index labels, so differing labels
# would pair words with the wrong year or create NaN rows. Re-label the top words with
# data's index first so the two columns line up positionally (a length mismatch raises).
top_words_aligned = pd.Series(top_words_yle_fi.values, index=data.index, name=top_words_yle_fi.name)
yle_fi_tfidf = pd.concat([data['year'], top_words_aligned], axis = 1)


In [17]:
# Remove every row that has no top-word list.
rows_without_words = yle_fi_tfidf['tf_idf_words'].isnull()
yle_fi_tfidf = yle_fi_tfidf.drop(yle_fi_tfidf.loc[rows_without_words].index)


In [18]:
# Remove every row whose year is missing.
rows_without_year = yle_fi_tfidf['year'].isnull()
yle_fi_tfidf = yle_fi_tfidf.drop(yle_fi_tfidf.loc[rows_without_year].index)


In [19]:
# Store the year as an integer column.
yle_fi_tfidf['year'] = yle_fi_tfidf['year'].astype(int)

In [20]:
# Persist the year / top-words table for later use.
yle_fi_tfidf.to_csv('data/processed/yle_fi_tfidf.csv')

In [24]:
# Render and save one word cloud per year.
# NOTE(review): the TypeError recorded as this cell's output ("takes 2 positional
# arguments but 3 were given") does not match this two-argument call; it presumably
# stems from an earlier version of the cell or function — re-run to refresh the output.
print_yearly_wordclouds(yle_fi_tfidf, "YLE Finnish - ")

TypeError: print_yearly_wordclouds() takes 2 positional arguments but 3 were given

In [None]:
# Something broken in 2009
# Repeat the per-year steps of print_yearly_wordclouds for that single year (without
# saving the image) to isolate the failure.
prefix = "YLE Finnish - "
tfidf_data = yle_fi_tfidf
year = 2009
yearly_data = tfidf_data.loc[tfidf_data['year'] == year, 'tf_idf_words']
counts = get_tf_idf_counts(yearly_data)
title = "{}{}".format(prefix, year)
output_wordcloud(counts, title)

In [32]:
# Work on yle_fi_local_lemmas (not defined in the visible cells) under fixed column names.
# `data` is the same object, so the rename also applies to yle_fi_local_lemmas.
col_names = ['headline','lead','content','datePublished','lemmas_content']
data = yle_fi_local_lemmas
data.columns = col_names
# Check the timestamp format on the first row before parsing the whole column.
datestring = data['datePublished'].iloc[0]
datestring
datetime.strptime(datestring[:10], '%Y-%m-%d').year
# The first ten characters hold the date; only its year is kept.
data['year'] = data['datePublished'].apply(lambda stamp: datetime.strptime(stamp[:10], '%Y-%m-%d').year)

'2008-08-11T11:03:41+0300'

2008

In [52]:
def extract_year(s):
    """Return the year of a timestamp that starts with ``YYYY-MM-DD``.

    Parameters
    ----------
    s : str
        Timestamp such as ``'2008-08-11T11:03:41+0300'``; only the first ten
        characters are parsed.

    Returns
    -------
    int or None
        The year, or ``None`` when the value cannot be parsed: either the
        prefix is not a valid date, or ``s`` is not a string at all (for
        example the NaN pandas uses for a missing cell, or ``None``).
    """
    try:
        return datetime.strptime(s[:10], '%Y-%m-%d').year
    except (TypeError, ValueError):
        # ValueError: malformed date text. TypeError: non-string input, which
        # cannot be sliced or parsed.
        return None

In [53]:
# Derive the publication year for every row; extract_year yields None where parsing fails.
data['year'] = data['datePublished'].apply(extract_year)

In [60]:
# Discard the rows for which no year could be extracted.
rows_without_year = data['year'].isna()
data = data.drop(data.loc[rows_without_year].index)

In [65]:
# Drop the 'lead' column. The axis is given by keyword: passing it positionally
# (`drop('lead', 1)`) is deprecated and is rejected by pandas 2.x.
data = data.drop(columns='lead')

In [74]:
# Write the frame (with its header row) to the processed-lemmas CSV.
data.to_csv('data/processed/yle_fi_lemmas.csv', header = True)

In [None]:
# Run tf-idf on yle_fi

