In [9]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

!pip install pymorphy2
import re
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
from nltk.corpus import stopwords

from multiprocessing import Pool
from tqdm import tqdm

import nltk
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Seed NumPy's and Python's global random generators with the same fixed value.
np.random.seed(42)
random.seed(42)
# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to /home/dmitry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# !pip install nltk
# !pip install pymorphy2

In [11]:
# Load the training data from a CSV file in the working directory.
df = pd.read_csv("train_ml.csv")

In [12]:
# Parse the date column and drop rows with missing values

In [13]:
# Parse 'date' with an explicit day.month.year hour:minute format, drop every
# row containing a missing value, then store 'grades' as 32-bit integers.
df = (
    df.assign(date=pd.to_datetime(df['date'], format='%d.%m.%Y %H:%M'))
      .dropna()
      .astype({'grades': 'int32'})
)

In [17]:
# Shared morphological analyzer; bound below as the default `pymorphy` argument of lemmatize_word.
m = MorphAnalyzer()
# Matches runs of Cyrillic letters. "Ё"/"ё" are listed explicitly because their
# code points lie outside the contiguous А-я range and would otherwise cut
# words such as "ещё" short.
regex = re.compile("[А-Яа-яЁё]+")

def words_only(text, regex=regex):
    """Return the lowercase Cyrillic words found in ``text``.

    Args:
        text: Raw text. Any non-string value (for example ``None`` or a float
            NaN coming from a missing cell) yields an empty list.
        regex: Compiled pattern used to extract tokens; defaults to the
            module-level Cyrillic word pattern.

    Returns:
        list[str]: Matched tokens, lowercased, in order of appearance.
    """
    # Explicit guard instead of a bare ``except`` so that unrelated errors
    # (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(text, str):
        return []
    return regex.findall(text.lower())

@lru_cache(maxsize=128)
def lemmatize_word(token, pymorphy=m):
    """Return the normal form of the first parse that ``pymorphy`` gives for ``token``.

    Results are memoized in an LRU cache of up to 128 entries.
    """
    parses = pymorphy.parse(token)
    return parses[0].normal_form

def lemmatize_text(text):
    """Apply ``lemmatize_word`` to each element of ``text`` and return the results as a list.

    ``text`` is iterated element by element, so it is expected to be an
    iterable of tokens rather than a raw string.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatize_word(token))
    return lemmas


# Russian stop-word list from the NLTK corpus.
mystopwords = stopwords.words('russian')
def remove_stopwords(lemmas, stopwords = mystopwords):
    """Drop stop words and short words from a sequence of lemmas.

    Args:
        lemmas: Iterable of lemma strings.
        stopwords: Collection of words to exclude; defaults to ``mystopwords``.

    Returns:
        list[str]: Lemmas that are not in ``stopwords`` and are longer than
        three characters, in their original order.
    """
    # Testing membership against a list scans it for every lemma; convert to a
    # set once per call unless the caller already passed one.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return [w for w in lemmas if w not in stopwords and len(w) > 3]

def clean_text(text):
    """Tokenize, lemmatize and stop-word-filter ``text``; return the lemmas joined by spaces."""
    kept = remove_stopwords(lemmatize_text(words_only(text)))
    return ' '.join(kept)

In [None]:
# Clean the 'feeds' column in parallel on 4 worker processes.
# chunksize batches several texts into one task: with the default of 1 every
# text is dispatched to a worker individually, which adds avoidable
# inter-process overhead on a long iterable. imap still yields results in
# input order, so `lemmas` stays aligned with the rows of df.
with Pool(4) as p:
    lemmas = list(tqdm(p.imap(clean_text, df['feeds'], chunksize=256), total=len(df)))

# Keep the cleaned text next to the raw text (assigned positionally from the list).
df['lemmas'] = lemmas

In [19]:
# Target as a one-column DataFrame ('grades') with a fresh 0..n-1 index, so
# that it can be addressed by position later on.
y_train = df[['grades']].reset_index(drop=True)

In [22]:
%%time
# L2-regularised logistic regression; warm_start=True makes each fit() call
# start from the coefficients left by the previous call.
clf = LogisticRegression(
    penalty="l2",
    C=100,
    max_iter=3000,
    warm_start=True,
    random_state=42,
)
# TF-IDF features over unigrams and bigrams of the cleaned training texts.
vec = TfidfVectorizer(ngram_range=(1, 2))
bow = vec.fit_transform(lemmas)

CPU times: user 25.7 s, sys: 613 ms, total: 26.4 s
Wall time: 26.3 s


In [None]:
# Small trick: fit the model on one part of the data at a time.
ran = np.arange(y_train.shape[0])
inds = np.array_split(ran, 10)  # split the row positions into 10 parts
# Mix randomly drawn rows into every part.
# The population list is loop-invariant, so it is built once instead of once
# per chunk; the sample size is capped at the number of rows because
# random.sample raises ValueError when k exceeds the population size.
population = list(ran)
n_extra = min(1000, len(population))
inds = [
    np.concatenate(
        (chunk, np.array(random.sample(population, k=n_extra), dtype=ran.dtype)),
        axis=None,
    )
    for chunk in inds
]

for chunk in tqdm(inds):
    clf.fit(bow[chunk, :], y_train.iloc[chunk].values.ravel())

  0%|                                                    | 0/10 [00:00<?, ?it/s]

In [None]:
# Load the test set; the first CSV column is used as the DataFrame index.
test = pd.read_csv('new_test_ml.csv', index_col=0)
# Display the loaded frame as the cell output.
test

In [None]:
# Clean the test 'feeds' column in parallel on 4 worker processes.
# chunksize batches several texts into one task instead of dispatching each
# text individually (the default chunksize is 1); imap keeps input order, so
# lemmas_test stays aligned with the rows of test.
with Pool(4) as p:
    lemmas_test = list(tqdm(p.imap(clean_text, test['feeds'], chunksize=256), total=len(test)))

In [None]:
# Vectorize the cleaned test texts with the already-fitted vectorizer
# (transform only, no refit) and predict a label for each row.
pred = clf.predict(vec.transform(lemmas_test))
pred

In [None]:
# Submission table: the test index goes to 'inds', the predictions to 'grades'.
sol = pd.DataFrame(dict(inds=test.index, grades=pred))
sol

In [None]:
# Write the submission to CSV without the DataFrame's own index column.
sol.to_csv('new_baseline.csv', index=False)

In [None]:
# Count how many times each label was predicted; shown as rows of (label, count).
unique, counts = np.unique(pred, return_counts=True)
np.column_stack((unique, counts))