In [3]:
# System libraries:
%load_ext autoreload
import pandas as pd
import json
import nltk
import numpy as np
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
from pymystem3 import Mystem
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
InteractiveShell.ast_node_interactivity = "all"

In [5]:
def import_yle_ru(path="data/yle_russian_articles.json"):
    """Load the Yle Russian-language article dump into a cleaned DataFrame.

    The dump is read as a run of JSON objects separated by a bare newline
    (``}\\n{``) rather than as a JSON array, so commas are inserted between
    the objects and the result is wrapped in ``[...]`` before parsing.

    Column handling, per the original notes on the raw fields:
        alternativeId        drop
        authors              keep (roughly half are null; the rest may be useful)
        dateJsonModified     drop
        dateContentModified  drop
        datePublished        keep
        headline             keep, reduced to its 'full' entry
        id                   keep
        language             drop (original note: "they are all unique" --
                             presumably one value for every row; TODO confirm)
        lead                 drop
        lifespan             drop
        mainMedia            drop (pictures)
        meta                 drop (of the form {'costCenters': ['1423-novosti']})
        properties           drop (of the form [importance:normal, uutisvahti:enabled])
        publisher            drop (only value seen: 'Yle Novosti')
        sources              keep
        subjects             drop for now
        url                  keep
        content              keep, flattened to a single text string

    Any field not listed in the drop list is passed through unchanged.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the raw dump. Defaults to
        ``data/yle_russian_articles.json`` (the previously hard-coded path).

    Returns
    -------
    pandas.DataFrame
        One row per article. ``content`` holds the article's ``'text'``-type
        blocks joined by single spaces (headings and other block types are
        skipped); ``headline`` holds the ``'full'`` headline.

    Raises
    ------
    OSError
        If ``path`` cannot be opened.
    json.JSONDecodeError
        If the stitched file content is not valid JSON.
    KeyError
        If any of the columns to drop, or ``content`` / ``headline``, is
        absent from the parsed data.
    """
    drop_list = ['alternativeId', 'dateJsonModified', 'dateContentModified', 'language', 'lead',
                 'lifespan', 'mainMedia', 'meta', 'properties', 'publisher', 'subjects']

    def _join_text_blocks(blocks):
        # A missing content value (None / NaN) becomes an empty string
        # instead of raising while iterating.
        if not isinstance(blocks, (list, tuple)):
            return ''
        # Filter non-text blocks out entirely; substituting '' for them would
        # leave leading and doubled spaces in the joined text.
        return ' '.join(
            block['text']
            for block in blocks
            if block.get('type') == 'text' and block.get('text')
        )

    def _full_headline(headline):
        # Headlines are dicts; keep only the 'full' variant. Anything else
        # (e.g. a missing value) is passed through untouched.
        return headline['full'] if isinstance(headline, dict) else headline

    with open(path, encoding='utf-8') as json_data:
        # Insert the commas that are missing between consecutive objects.
        yle_ru_raw = json_data.read().replace('}\n{', '},\n{')

    yle_ru_dict = json.loads("[" + yle_ru_raw + "]")

    yle_ru_data = pd.DataFrame(yle_ru_dict)
    yle_ru_data = yle_ru_data.drop(columns=drop_list)

    yle_ru_data['content'] = yle_ru_data['content'].apply(_join_text_blocks)
    yle_ru_data['headline'] = yle_ru_data['headline'].apply(_full_headline)

    return yle_ru_data

data = import_yle_ru()

# Preview the first five rows.
data.iloc[:5]

# Store the output in CSV. The target directory is created first so that a
# checkout without data/processed/ does not make to_csv fail.
os.makedirs('data/processed', exist_ok=True)
data.to_csv('data/processed/yle_ru_processed.csv', index=False)

Unnamed: 0,authors,content,coverage,datePublished,headline,id,sources,url
0,,Министр по вопросам окружающей среды Вилле Ни...,local,2013-04-07T20:12:33+0300,Министр окружающей среды Ниинистё хочет переве...,3-6594601,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
1,,Труд финских работников занимает 8-е по дорог...,local,2013-04-07T20:50:16+0300,"Финский труд дешевле, чем труд в Германии",3-6594603,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
2,,На дорогах Южной Финляндии летние ограничения...,local,2013-04-08T08:38:40+0300,Начинается постепенный переход на летнее огран...,3-6594605,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
3,,В Хельсинкском надворном суде начинается расс...,local,2013-04-08T08:57:17+0300,В надворном суде начинается разбирательство по...,3-6594611,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
4,,Несмотря на все усилия и различные кампании п...,local,2013-04-08T09:18:42+0300,Летних рабочих мест не хватает для всех,3-6594606,,{'full': 'http://yle.fi/uutiset/osasto/novosti...


In [7]:
# Switch: True rebuilds the frame via import_yle_ru(); False reads the CSV
# written by the previous cell.
mod = False
yle_ru_data = None

if not mod:
    print("Loading from CSV")
    data = pd.read_csv('data/processed/yle_ru_processed.csv')
else:
    print("Regenerating data")
    data = import_yle_ru()

# Preview the first five rows.
data.head(5)

Loading from CSV


Unnamed: 0,authors,content,coverage,datePublished,headline,id,sources,url
0,,Министр по вопросам окружающей среды Вилле Ни...,local,2013-04-07T20:12:33+0300,Министр окружающей среды Ниинистё хочет переве...,3-6594601,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
1,,Труд финских работников занимает 8-е по дорог...,local,2013-04-07T20:50:16+0300,"Финский труд дешевле, чем труд в Германии",3-6594603,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
2,,На дорогах Южной Финляндии летние ограничения...,local,2013-04-08T08:38:40+0300,Начинается постепенный переход на летнее огран...,3-6594605,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
3,,В Хельсинкском надворном суде начинается расс...,local,2013-04-08T08:57:17+0300,В надворном суде начинается разбирательство по...,3-6594611,,{'full': 'http://yle.fi/uutiset/osasto/novosti...
4,,Несмотря на все усилия и различные кампании п...,local,2013-04-08T09:18:42+0300,Летних рабочих мест не хватает для всех,3-6594606,,{'full': 'http://yle.fi/uutiset/osasto/novosti...


In [8]:
# Lemmatization functions:
# Tokens that tokenize_text discards: NLTK's Russian stop words followed by
# every character of string.punctuation.
stop_words = [*stopwords.words('russian'), *string.punctuation]
# Mystem instance used by lemmatize_text.
m = Mystem()

def tokenize_text(text):
    """Split ``text`` into tokens and drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (for example the NaN that
        ``pd.read_csv`` yields for an empty cell) and blank strings give an
        empty list instead of raising inside the tokenizer.

    Returns
    -------
    list of str
        Tokens from ``word_tokenize`` in their original order and casing,
        excluding every token whose lower-cased form is in the module-level
        ``stop_words``.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    word_tokens = word_tokenize(text)
    # NLTK's stop-word entries are lower-case, so compare on the lower-cased
    # token; otherwise capitalised stop words (e.g. sentence-initial ones)
    # slip through the filter.
    return [w for w in word_tokens if w.lower() not in stop_words]
    
def lemmatize_text(text):
    """Return, for each token in ``text``, the first item of ``m.lemmatize(token)``.

    ``text`` is an iterable of tokens (as produced by ``tokenize_text``); the
    result is a list with one entry per token, in the same order.
    """
    lemmas = []
    for token in text:
        analysis = m.lemmatize(token)
        # Only the leading element of Mystem's output is kept for each token.
        lemmas.append(analysis[0])
    return lemmas

def tokenize_and_lemmatize(text):
    """Run ``tokenize_text`` on ``text`` and pass its tokens to ``lemmatize_text``.

    Returns the list produced by ``lemmatize_text``.
    """
    return lemmatize_text(tokenize_text(text))

In [9]:
# Lemmatize article bodies; the Series is named after its output column.
lemmas_content = data.content.apply(tokenize_and_lemmatize).rename('lemmas_content')

# Same for the headlines.
lemmas_headline = data.headline.apply(tokenize_and_lemmatize).rename('lemmas_headline')

# Headline lemmas first, content lemmas second, side by side on data's index.
lemmas = pd.concat([lemmas_headline, lemmas_content], axis=1)
lemmas.to_csv('data/processed/integrum_lemmas.csv', header=True, index=False)

In [None]:
# Merge lemmas with master:
# pd.concat takes no `header`/`index` keywords (those belong to to_csv), and
# the lemma columns have to be attached column-wise (axis=1) for
# data['lemmas_content'] to exist afterwards. Any lemma columns already on
# `data` are dropped first so re-running this cell does not duplicate them.
data = pd.concat([data.drop(columns=lemmas.columns, errors='ignore'), lemmas], axis=1)

In [None]:
#Tf idf code
# One whitespace-joined document per article, built from the content lemmas
# of the first 100 rows.
corpus = data.iloc[:100]['lemmas_content'].apply(' '.join)
# min_df=1 keeps every term, the same effect min_df=0 had; newer scikit-learn
# rejects an integer min_df below 1.
tf = TfidfVectorizer(min_df=1, stop_words=None)
tfidf_matrix = tf.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and keep the result a plain list as before.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()
# toarray() yields a regular ndarray (rows = documents, columns = terms);
# todense() would return the discouraged np.matrix type.
dense = tfidf_matrix.toarray()

# for each row in dense argsort
# get feature names
# return matrix with top 20 terms