In [1]:
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
from scipy.sparse import hstack, vstack, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from stop_words import get_stop_words

from itertools import chain
import json
import os
import pickle
import string

# Project root directory. Set the UNN_PARSER_BOT_PATH environment variable to run
# the notebook from another location; the default is the original hard-coded path.
PROJ_PATH = os.environ.get('UNN_PARSER_BOT_PATH', r'F:\tmp\data science\UnnParserBot')
# Full category list, kept for reference:
# CLASSES   = ['економіка', 'кримінал', 'культура', 'міжнародні новини', 'позитив', 'політика', 'спорт', 'суспільство', 'технології']
# Target classes actually used; 'культура', 'технології' and 'позитив' are folded
# into 'суспільство' when the data is loaded.
CLASSES   = ['економіка', 'кримінал', 'міжнародні новини', 'політика', 'спорт', 'суспільство']



## EDA

In [2]:
# Preprocessing the data
# Morphological analyzer configured for Ukrainian ('uk'); stemmed_text() uses it
# to look up the normal form of each token.
morph = MorphAnalyzer(lang='uk')

def strip_punctuation(text):
    """Return `text` with punctuation characters deleted.

    Removes every character of `string.punctuation` plus the typographic
    quotes, en/em dashes and ellipsis listed below. Characters are deleted
    outright (not replaced by a space), so surrounding whitespace is kept as is.
    """
    removable = ''.join(chain(string.punctuation, ['“', '”', '„', '–', '—', '…']))
    # A single translate() pass deletes all listed characters at once.
    return text.translate(str.maketrans('', '', removable))

def stemmed_text(text):
    """Return `text` with every space-separated token replaced by its normal form.

    The text is split on single space characters only. For each token the
    `normal_form` of the first parse returned by the module-level `morph`
    analyzer is kept, and the results are joined back with single spaces.
    """
    normal_forms = []
    for token in text.split(' '):
        first_parse = morph.parse(token)[0]
        normal_forms.append(first_parse.normal_form)
    return ' '.join(normal_forms)

# Stop-word list handed to the word-level TfidfVectorizer.
# The article texts are only stripped of punctuation (they are never lemmatized),
# so a list holding normal forms alone misses the inflected stop-word forms that
# actually occur in the texts. Keep both the raw forms and their normal forms;
# empty strings are dropped and the result is sorted to make the list deterministic.
_raw_stop_words = get_stop_words('ukrainian')
STOP_WORDS = sorted(
    {word for word in _raw_stop_words if word}
    | {word for word in stemmed_text(' '.join(_raw_stop_words)).split(' ') if word}
)

# Categories with only a few samples in the training texts; they are classified
# as 'суспільство' for now.
RARE_CATEGORIES = ('культура', 'технології', 'позитив')

# Parse the whole JSON first so the file handle is released before processing.
with open(os.path.join(PROJ_PATH, 'src', 'categorized_news.json'), encoding='utf-8') as f:
    content = json.load(f)

article_names = []
texts         = []
headings      = []

# Iterate over the records themselves: the keys are not used, and the loop no
# longer binds a notebook-level variable named `id` that shadows the builtin.
for article in content.values():
    article_names.append(strip_punctuation(article['name']))
    texts.append(strip_punctuation(article['text']))
    category = article['category']
    headings.append('суспільство' if category in RARE_CATEGORIES else category)

len(article_names)

31000

In [3]:
# Encoding the target
# The encoder is fitted on the fixed CLASSES list, then the string headings are
# replaced by their integer codes.
# NOTE(review): `headings` is rebound from strings to codes here, so this cell is
# presumably meant to run once per data load — confirm before re-running it alone.
le = LabelEncoder().fit(CLASSES)
headings = le.transform(headings)

# Print the "code: class name" mapping for the classes present in the data.
codes_present = np.unique(headings)
for code, class_name in zip(codes_present, le.inverse_transform(codes_present)):
    print(f'{code}: {class_name}')

0: економіка
1: кримінал
2: міжнародні новини
3: політика
4: спорт
5: суспільство


In [4]:
# Word-level TF-IDF vectorizer
word_tfidf_params = dict(
    analyzer='word',          # features are built from word tokens
    ngram_range=(1, 2),       # unigrams and bigrams
    stop_words=STOP_WORDS,    # list of words to filter out (or None)
    sublinear_tf=True,        # replace tf with 1 + log(tf)
    max_df=0.8,               # ignore terms found in more than 80% of the documents
    max_features=5000,        # keep only the 5000 most frequent terms as columns
    vocabulary=None,          # None: learn the vocabulary from the data
    smooth_idf=True,          # add one to document frequencies when computing idf
    norm='l2',                # scale each row to unit euclidean norm
)
tfidf_w = TfidfVectorizer(**word_tfidf_params)

In [5]:
# Character-level TF-IDF vectorizer
char_tfidf_params = dict(
    analyzer='char',          # features are built from character n-grams
    ngram_range=(2, 6),       # character n-grams of length 2 to 6
    max_df=0.8,               # ignore n-grams found in more than 80% of the documents
    max_features=15000,       # keep only the 15000 most frequent n-grams as columns
    vocabulary=None,          # None: learn the vocabulary from the data
    smooth_idf=True,          # add one to document frequencies when computing idf
    norm='l2',                # scale each row to unit euclidean norm
)
tfidf_ch = TfidfVectorizer(**char_tfidf_params)

In [6]:
# Splitting the texts into train/test folds (stratified 80/20, fixed seed).
# train_test_split indexes a plain list of strings directly and returns lists of
# strings, so the texts do not need to be wrapped into single-element lists and
# unwrapped again afterwards.
texts_train, texts_test, y_train, y_test = train_test_split(texts,
                                                            headings,
                                                            test_size=0.2,
                                                            stratify=headings,
                                                            random_state=42)

# Re-order the labels as "train rows first, then test rows".
# NOTE: this rebinds `headings` while `texts` keeps its original order. Re-running
# this cell without first re-running the loading and encoding cells would pair the
# texts with the wrong labels.
headings    = np.concatenate((y_train, y_test))

In [7]:
%%time
# Apply TfidfVectorizers to the texts and build the combined train/test matrices

def _feature_labels(vectorizer):
    """Return the fitted vectorizer's terms ordered by their column index."""
    vocab = vectorizer.vocabulary_
    return sorted(vocab, key=vocab.get)

# fit_transform learns the vocabulary/idf and vectorizes the training texts in a
# single pass; calling fit() and then transform() analyzes the training corpus twice.
w_train  = tfidf_w.fit_transform(texts_train)
ch_train = tfidf_ch.fit_transform(texts_train)
train_ds = hstack([w_train, ch_train])

# Collecting feature names: word features first, then char features, which is the
# same column order as in the hstack calls.
tfidf_w_labels  = _feature_labels(tfidf_w)
tfidf_ch_labels = _feature_labels(tfidf_ch)
orig_features   = tfidf_w_labels + tfidf_ch_labels

# The test texts are only transformed with the vocabulary/idf learned on train.
w_test  = tfidf_w.transform(texts_test)
ch_test = tfidf_ch.transform(texts_test)
test_ds = hstack([w_test, ch_test])

train_ds.shape, test_ds.shape

  'stop_words.' % sorted(inconsistent))


Wall time: 3min 31s


((24800, 20000), (6200, 20000))

In [8]:
# Merging train and test: train rows are stacked on top of the test rows, the
# same train-then-test order in which `headings` is concatenated after the split.
stacked_parts = (train_ds, test_ds)
all_ds = vstack(stacked_parts)
all_ds.shape

(31000, 20000)

In [None]:
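# Disabled: builds a DataFrame view of the generated features, in case we need to show them.
# NOTE: `data` and `texts_transformed` are not defined in the cells above; define them before uncommenting.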
#data['article_names'] = article_names
#labels_w = [k for k,v in sorted(list(tfidf_w.vocabulary_.items()), key=lambda x: x[1])]
#labels_ch = [k for k,v in sorted(list(tfidf_ch.vocabulary_.items()), key=lambda x: x[1])]
#data.update(dict(zip(labels_w + labels_ch, texts_transformed.T.toarray())))
#data.update({'headings': headings})
#dataset = pd.DataFrame(data)
#dataset.head()

In [9]:
# Saving the all_ds and headings
#save_npz(os.path.join(PROJ_PATH, 'src', 'class_data.npz'), all_ds)
#with open(os.path.join(PROJ_PATH, 'src', 'class_headings.hd'), 'wb') as f:
#    pickle.dump(headings, f)

In [10]:
# Saving the vectorizer objects
#with open(os.path.join(PROJ_PATH, 'src', 'class_tfidf_w.vct'), 'wb') as f:
#    pickle.dump(tfidf_w, f)
#    
#with open(os.path.join(PROJ_PATH, 'src', 'class_tfidf_ch.vct'), 'wb') as f:
#    pickle.dump(tfidf_ch, f)