# Классификация новостей

In [None]:
%%capture
!wget -O lenta-ru-train.csv https://www.dropbox.com/scl/fi/xv7ycyd56omonay46co7p/lenta-ru-train.csv?rlkey=bfgu5bh5r340rddwla4b1hu4l&dl=0
!wget -O lenta-ru-test.csv https://www.dropbox.com/scl/fi/3qd36eoedlfqmzg9hm446/lenta-ru-test.csv?rlkey=8qvtm2o1xdjpg365pdgys9po8&dl=0

### Загрузка данных

In [None]:
import re
import pandas as pd

In [None]:
# Load the train and test splits (the files written by the wget cell) as DataFrames.
data_train, data_test = map(pd.read_csv, ('lenta-ru-train.csv', 'lenta-ru-test.csv'))

In [None]:
# Tokens to keep: runs of Cyrillic / Latin letters plus the punctuation
# characters : = ! ( ) _ % / | ; everything else (digits, commas, ...) acts as a
# separator and is dropped.
# - raw string: avoids invalid-escape warnings for sequences such as "\)" or "\%"
# - "Ёё" is listed explicitly because it lies outside the А-я code point range
# - "A-Za-z" instead of "A-z", which also matched the characters [ \ ] ^ `
regex = re.compile(r"[А-Яа-яЁёA-Za-z:=!()_%/|]+")

def words_only(text, regex=regex):
    """Lower-case ``text`` and keep only the tokens matched by ``regex``.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a NaN from a missing
        DataFrame cell) is treated as an empty document.
    regex : compiled pattern, optional
        Pattern whose matches are kept; defaults to the module-level ``regex``.

    Returns
    -------
    str
        The matched tokens joined by single spaces; ``""`` if there are none.
    """
    # Explicit type check instead of a bare `except:` that would also hide
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(text, str):
        return ""
    return " ".join(regex.findall(text.lower()))

In [None]:
%%time
data_train['clean'] = data_train['text'].apply(words_only)
data_test['clean'] = data_test['text'].apply(words_only)

CPU times: user 7.53 s, sys: 188 ms, total: 7.72 s
Wall time: 7.95 s


### Построим модель *word2vec* на основе слов из новостей


1.   Обучим векторную модель на основе тренировочных данных
2.   Получим агрегированный вектор для любой новости с помощью усреднения векторов слов



In [None]:
import numpy as np
from tqdm.notebook import tqdm
from gensim.models import word2vec # непосредственно методы w2v

In [None]:
# Whitespace-tokenise every cleaned training text into a list of words.
sentences_train = list(map(str.split, data_train['clean'].values))

In [None]:
%%time
vector_size = 16
w2v_model = word2vec.Word2Vec(sentences=sentences_train, workers=4, vector_size=vector_size, min_count=5, window=5)

CPU times: user 3min 41s, sys: 1.25 s, total: 3min 42s
Wall time: 2min 17s


In [None]:
# Sanity check: number of tokenised training texts.
len(sentences_train)

80000

In [None]:
# Look up the learned embedding of a single word as a quick sanity check.
w2v_model.wv.get_vector("огонь")

array([-1.9923542 , -1.1946574 , -0.50818133, -0.2888181 , -0.37654388,
       -0.75507444,  1.7764486 , -0.35918432, -0.04814074, -0.46841207,
       -0.43638903,  0.2612304 , -1.589096  ,  1.0086256 ,  0.47250256,
       -0.9601275 ], dtype=float32)

In [None]:
# Build one feature vector per training news item: the mean of the word2vec
# vectors of those of its words that are in the model vocabulary.
word_vectors = w2v_model.wv

# Fallback for a news item none of whose words are in the vocabulary. It is
# created with the embeddings' own dtype so that every row of X_train has the
# same dtype (a plain np.zeros(...) would be float64 next to the averaged rows).
zero_vector = np.zeros(vector_size, dtype=word_vectors.vectors.dtype)

X_train = []
for sentence in tqdm(sentences_train):
    # Vocabulary membership test instead of raising/catching KeyError per word;
    # words dropped during word2vec training are simply skipped.
    known_words = [word for word in sentence if word in word_vectors.key_to_index]
    if known_words:
        # Indexing with a list of keys returns a (n_words, vector_size) matrix.
        sentence_vector = word_vectors[known_words].mean(axis=0)
    else:
        sentence_vector = zero_vector.copy()
    X_train.append(sentence_vector)

  0%|          | 0/80000 [00:00<?, ?it/s]

### Обучим линейную модель классификации на основе вектора новости

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Target: the topic label of every training news item.
y_train = data_train['topic_label'].values

# With the default iteration budget the solver stopped at its iteration limit on
# this data (ConvergenceWarning), i.e. the fitted model was not converged.
# max_iter is raised so the optimisation can run to convergence.
# NOTE: `multi_class` is deprecated in recent scikit-learn releases (1.5+); it is
# kept here so behaviour stays the same on the version this notebook was run with.
clf_model = LogisticRegression(multi_class="multinomial", max_iter=1000)
clf_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Строим предсказание

In [None]:
# Whitespace-tokenise every cleaned test text into a list of words.
sentences_test = [sentence.split() for sentence in data_test['clean'].values]

# Build one feature vector per test news item: the mean of the word2vec vectors
# of those of its words that are in the model vocabulary (the model itself was
# trained on the training texts only).
word_vectors = w2v_model.wv

# Fallback for a news item none of whose words are in the vocabulary. It is
# created with the embeddings' own dtype so that every row of X_test has the
# same dtype (a plain np.zeros(...) would be float64 next to the averaged rows).
zero_vector = np.zeros(vector_size, dtype=word_vectors.vectors.dtype)

X_test = []
for sentence in tqdm(sentences_test):
    # Vocabulary membership test instead of raising/catching KeyError per word;
    # words unknown to the word2vec model are simply skipped.
    known_words = [word for word in sentence if word in word_vectors.key_to_index]
    if known_words:
        # Indexing with a list of keys returns a (n_words, vector_size) matrix.
        sentence_vector = word_vectors[known_words].mean(axis=0)
    else:
        sentence_vector = zero_vector.copy()
    X_test.append(sentence_vector)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
import numpy as np
from google.colab import files

def submit(model, X_test, filename='submission.csv'):
    """Predict labels for ``X_test``, save them as a Kaggle submission CSV and download it.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Features passed unchanged to ``model.predict``.
    filename : str, optional
        Path of the CSV file to write and download. Defaults to
        ``'submission.csv'``, the name that was previously hard-coded.

    Returns
    -------
    None

    Notes
    -----
    The CSV has two columns: ``id`` (0 .. n-1, in prediction order) and
    ``topic_label`` (the predicted label). The download goes through
    ``files.download`` from ``google.colab``, imported at cell level.
    """
    predictions = model.predict(X_test)

    submission = pd.DataFrame({
        'id': np.arange(len(predictions)),
        'topic_label': predictions,
    })

    submission.to_csv(filename, index=False)
    files.download(filename)

In [None]:
# Predict topic labels for the test features and download the resulting submission file.
submit(clf_model, X_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>