# Домашнее задание  № 5. Матричные разложения/Тематическое моделирование

In [106]:
import gensim
import pandas as pd
import numpy as np
from pymorphy2 import MorphAnalyzer
from collections import Counter
from string import punctuation
from razdel import tokenize as razdel_tokenize
from IPython.display import Image
from IPython.core.display import HTML 
from sklearn.decomposition import TruncatedSVD, NMF, PCA, LatentDirichletAllocation
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
morph = MorphAnalyzer()
warnings.filterwarnings("ignore")
import pyLDAvis.gensim_models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

### Задание № 1 (4 балла)

Попробуйте матричные разложения с 5 классификаторами - SGDClassifier, KNeighborsClassifier,  RandomForest, ExtraTreesClassifier (про него подробнее почитайте в документации, он похож на RF). Используйте и NMF и SVD. Сравните результаты на кросс-валидации и выберите лучшее сочетание.

В итоге у вас должно получиться, как минимум 10 моделей (два разложения на каждый классификатор). Используйте 1 и те же параметры кросс-валидации. Параметры векторизации, параметры K в матричных разложениях, параметры классификаторов могут быть разными между экспериментами.

Можете взять поменьше данных, если все будет обучаться слишком долго (не ставьте параметр K слишком большим в NMF, иначе точно будет слишком долго)

In [107]:
# лемматизация
def normalize(text):
    normalized_text = [word.text.strip(punctuation) for word \
                                                            in razdel_tokenize(text)]
    normalized_text = [word.lower() for word in normalized_text if word and len(word) < 20 ]
    normalized_text = [morph.parse(word)[0].normal_form for word in normalized_text]
    return ' '.join(normalized_text)


#### Работа с данными

In [108]:
data = pd.read_csv('avito_category_classification.csv')

In [109]:
data

Unnamed: 0,category_name,description
0,Автомобили,"отличное состояние,обслужиание в салоне"
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник..."
3,Автомобили,Автомобиль в отличном техническом состоянии. О...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ..."
...,...,...
9893,Товары для детей и игрушки,Чтобы посмотреть весь ассортимент нашего магаз...
9894,Детская одежда и обувь,"Весна,осень.74-80.вопросы можно в вайбер,двухс..."
9895,"Одежда, обувь, аксессуары","Кимоно Green Hill. Состояние отличное, рост ..."
9896,Детская одежда и обувь,Б/у кроссовки на девочку. Носили только в спор...


In [110]:
data['description_norm'] = data['description'].apply(normalize)

In [111]:
data

Unnamed: 0,category_name,description,description_norm
0,Автомобили,"отличное состояние,обслужиание в салоне",отличный состояние обслужиание в салон
1,Детская одежда и обувь,В отличном состоянии. Фирма KIKO. Очень теплый...,в отличный состояние фирма kiko очень тёплый у...
2,Предложение услуг,"Изготовление ограждений, перил,качелей, турник...",изготовление ограждение перила качели турников...
3,Автомобили,Автомобиль в отличном техническом состоянии. О...,автомобиль в отличный технический состояние од...
4,Бытовая техника,"Продается газовая плита ""Гефест"" (Белоруссия) ...",продаваться газовый плита гефест белоруссия б ...
...,...,...,...
9893,Товары для детей и игрушки,Чтобы посмотреть весь ассортимент нашего магаз...,чтобы посмотреть весь ассортимент наш магазин ...
9894,Детская одежда и обувь,"Весна,осень.74-80.вопросы можно в вайбер,двухс...",весна осень 74-80 вопрос можно в вайбер двухст...
9895,"Одежда, обувь, аксессуары","Кимоно Green Hill. Состояние отличное, рост ...",кимоно green hill состояние отличный рост 160-...
9896,Детская одежда и обувь,Б/у кроссовки на девочку. Носили только в спор...,б у кроссовок на девочка носить только в спортзал


#### Функция classification report со встроенной кросс-валидацией

In [112]:
def eval_table(X, y, pipeline, N=6):
    # зафиксируем порядок классов
    labels = list(set(y))
    
    # метрики отдельных фолдов будет хранить в табличке
    fold_metrics = pd.DataFrame(index=labels)
    # дополнительно также соберем таблицу ошибок
    errors = np.zeros((len(labels), len(labels)))
    
    # создаем стратегию кросс-валидации
    # shuffle=True (перемешивание) - часто критично важно указать
    # т.к. данные могут быть упорядочены и модель на этом обучится
    kfold = StratifiedKFold(n_splits=N, shuffle=True, )
    
    for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
        # fit-predict как и раньше, но сразу пайплайном
        pipeline.fit(X[train_index], y[train_index])
        preds = pipeline.predict(X[test_index])
        
        # записываем метрику и индекс фолда
        fold_metrics[f'precision_{i}'] = precision_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'recall_{i}'] = recall_score(y[test_index], preds, labels=labels, average=None)
        fold_metrics[f'f1_{i}'] = f1_score(y[test_index], preds, labels=labels, average=None)
        errors += confusion_matrix(y[test_index], preds, labels=labels, normalize='true')
    
    # таблица для усредненных значений
    # тут мы берем колонки со значениями и усредняем их
    # часто также все метрики сразу суммируют и в конце просто делят на количество фолдов
    # но мы тут помимо среднего также хотим посмотреть на стандартное отклонение
    # чтобы понять как сильно варьируются оценки моделей
    result = pd.DataFrame(index=labels)
    result['precision'] = fold_metrics[[f'precision_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['precision_std'] = fold_metrics[[f'precision_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['recall'] = fold_metrics[[f'recall_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['recall_std'] = fold_metrics[[f'recall_{i}' for i in range(N)]].std(axis=1).round(2)
    
    result['f1'] = fold_metrics[[f'f1_{i}' for i in range(N)]].mean(axis=1).round(2)
    result['f1_std'] = fold_metrics[[f'f1_{i}' for i in range(N)]].std(axis=1).round(2)
    
    # добавим одну колонку со средним по всем классам
    result.loc['mean'] = result.mean().round(2)
    # проценты ошибок просто усредняем
    errors /= N
    
    return result, errors

#### Классификаторы с SVD

In [113]:
# Random forest
pipeline_svd_RF = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

In [114]:
# K-Neighbors
pipeline_svd_KNN = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', KNeighborsClassifier(n_neighbors=6))
])

In [115]:
# Extra Trees
pipeline_svd_X3 = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', ExtraTreesClassifier(random_state=0))
])

In [116]:
# SGD
pipeline_svd_SGD = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('svd', TruncatedSVD(500)),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3))
])

#### Классификаторы с NMF

In [117]:
# Random forest
pipeline_nmf_RF = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('decomposition', NMF(50)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10))
])

In [118]:
# K-Neighbors
pipeline_nmf_KNN = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('decomposition', NMF(50)),
    ('clf', KNeighborsClassifier(n_neighbors=6))
])

In [119]:
# Extra Trees
pipeline_nmf_X3 = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('decomposition', NMF(50)),
    ('clf', ExtraTreesClassifier(random_state=0))
])

In [120]:
# SGD
pipeline_nmf_SGD = Pipeline([
    ('bow', CountVectorizer(tokenizer=lambda x: x.split(), ngram_range=(1,2), min_df=5, max_df=0.4)),
    ('decomposition', NMF(50)),
    ('clf', SGDClassifier(max_iter=1000, tol=1e-3))
])

#### Random forest

In [121]:
metrics_svd_RF, errors_svd_RF = eval_table(data['description_norm'], data['category_name'], pipeline_svd_RF)

In [122]:
metrics_nmf_RF, errors_nmf_RF = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_RF)

In [123]:
metrics_svd_RF

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.47,0.01,0.77,0.01,0.58,0.01
Автомобили,0.85,0.05,0.61,0.06,0.7,0.04
Предложение услуг,0.74,0.06,0.46,0.04,0.57,0.04
Квартиры,0.88,0.02,0.89,0.03,0.88,0.02
Мебель и интерьер,0.88,0.21,0.04,0.02,0.07,0.04
Ремонт и строительство,0.71,0.11,0.09,0.03,0.16,0.04
Товары для детей и игрушки,0.81,0.07,0.18,0.03,0.29,0.04
Бытовая техника,0.89,0.13,0.06,0.03,0.12,0.05
Телефоны,0.94,0.03,0.34,0.04,0.49,0.05
Детская одежда и обувь,0.45,0.01,0.68,0.03,0.54,0.02


In [124]:
metrics_nmf_RF

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.5,0.02,0.74,0.03,0.6,0.02
Автомобили,0.79,0.04,0.78,0.05,0.78,0.03
Предложение услуг,0.67,0.04,0.64,0.02,0.65,0.02
Квартиры,0.94,0.02,0.93,0.03,0.93,0.01
Мебель и интерьер,0.69,0.09,0.24,0.07,0.35,0.08
Ремонт и строительство,0.67,0.08,0.23,0.04,0.34,0.06
Товары для детей и игрушки,0.76,0.03,0.43,0.07,0.54,0.06
Бытовая техника,0.8,0.4,0.04,0.04,0.07,0.07
Телефоны,0.75,0.03,0.64,0.05,0.69,0.04
Детская одежда и обувь,0.56,0.02,0.66,0.01,0.61,0.01


In [125]:
metrics_svd_RF - metrics_nmf_RF

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.03,-0.01,0.03,-0.02,-0.02,-0.01
Автомобили,0.06,0.01,-0.17,0.01,-0.08,0.01
Предложение услуг,0.07,0.02,-0.18,0.02,-0.08,0.02
Квартиры,-0.06,0.0,-0.04,0.0,-0.05,0.01
Мебель и интерьер,0.19,0.12,-0.2,-0.05,-0.28,-0.04
Ремонт и строительство,0.04,0.03,-0.14,-0.01,-0.18,-0.02
Товары для детей и игрушки,0.05,0.04,-0.25,-0.04,-0.25,-0.02
Бытовая техника,0.09,-0.27,0.02,-0.01,0.05,-0.02
Телефоны,0.19,0.0,-0.3,-0.01,-0.2,0.01
Детская одежда и обувь,-0.11,-0.01,0.02,0.02,-0.07,0.01


Среди Random Forest классификаторов наиболее эффективным оказался классификатор **Random Forest с NMF**.

**KNeighborsClassifier**

In [126]:
metrics_svd_KNN, errors_svd_KNN = eval_table(data['description_norm'], data['category_name'], pipeline_svd_KNN)

In [127]:
metrics_nmf_KNN, errors_nmf_KNN = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_KNN)

In [128]:
metrics_svd_KNN

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.46,0.02,0.55,0.03,0.5,0.02
Автомобили,0.57,0.04,0.59,0.03,0.58,0.04
Предложение услуг,0.57,0.04,0.57,0.06,0.57,0.04
Квартиры,0.94,0.01,0.72,0.04,0.82,0.02
Мебель и интерьер,0.35,0.06,0.25,0.05,0.29,0.05
Ремонт и строительство,0.27,0.05,0.15,0.02,0.19,0.02
Товары для детей и игрушки,0.51,0.04,0.23,0.03,0.32,0.03
Бытовая техника,0.31,0.06,0.27,0.1,0.28,0.08
Телефоны,0.7,0.1,0.31,0.04,0.43,0.05
Детская одежда и обувь,0.47,0.01,0.66,0.03,0.55,0.01


In [129]:
metrics_nmf_KNN

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.46,0.02,0.52,0.02,0.49,0.01
Автомобили,0.52,0.03,0.62,0.03,0.57,0.03
Предложение услуг,0.58,0.02,0.6,0.04,0.59,0.02
Квартиры,0.92,0.02,0.83,0.05,0.87,0.02
Мебель и интерьер,0.26,0.04,0.17,0.04,0.2,0.04
Ремонт и строительство,0.38,0.07,0.19,0.04,0.26,0.05
Товары для детей и игрушки,0.54,0.08,0.23,0.05,0.32,0.06
Бытовая техника,0.2,0.06,0.14,0.05,0.16,0.04
Телефоны,0.51,0.08,0.28,0.06,0.36,0.07
Детская одежда и обувь,0.41,0.02,0.59,0.02,0.49,0.02


In [130]:
metrics_svd_KNN - metrics_nmf_KNN

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.0,0.0,0.03,0.01,0.01,0.01
Автомобили,0.05,0.01,-0.03,0.0,0.01,0.01
Предложение услуг,-0.01,0.02,-0.03,0.02,-0.02,0.02
Квартиры,0.02,-0.01,-0.11,-0.01,-0.05,0.0
Мебель и интерьер,0.09,0.02,0.08,0.01,0.09,0.01
Ремонт и строительство,-0.11,-0.02,-0.04,-0.02,-0.07,-0.03
Товары для детей и игрушки,-0.03,-0.04,0.0,-0.02,0.0,-0.03
Бытовая техника,0.11,0.0,0.13,0.05,0.12,0.04
Телефоны,0.19,0.02,0.03,-0.02,0.07,-0.02
Детская одежда и обувь,0.06,-0.01,0.07,0.01,0.06,-0.01


Среди KNN классификаторов более эффективным оказался классификатор **KNN с SVD**.

**ExtraTreesClassifier**

In [131]:
metrics_svd_X3, errors_svd_X3 = eval_table(data['description_norm'], data['category_name'], pipeline_svd_X3)

In [132]:
metrics_nmf_X3, errors_nmf_X3 = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_X3)

In [133]:
metrics_svd_X3

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.46,0.01,0.7,0.02,0.56,0.01
Автомобили,0.8,0.06,0.45,0.01,0.58,0.01
Предложение услуг,0.88,0.09,0.28,0.06,0.42,0.07
Квартиры,0.72,0.04,0.8,0.05,0.76,0.04
Мебель и интерьер,0.75,0.08,0.11,0.03,0.2,0.05
Ремонт и строительство,0.62,0.17,0.09,0.04,0.16,0.06
Товары для детей и игрушки,0.63,0.08,0.15,0.04,0.25,0.05
Бытовая техника,0.59,0.18,0.14,0.06,0.23,0.08
Телефоны,0.86,0.04,0.33,0.06,0.48,0.06
Детская одежда и обувь,0.43,0.01,0.71,0.03,0.54,0.01


In [134]:
metrics_nmf_X3

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.59,0.03,0.65,0.03,0.62,0.02
Автомобили,0.76,0.03,0.83,0.03,0.79,0.01
Предложение услуг,0.68,0.04,0.74,0.06,0.71,0.04
Квартиры,0.93,0.02,0.95,0.02,0.94,0.01
Мебель и интерьер,0.59,0.06,0.33,0.02,0.42,0.03
Ремонт и строительство,0.52,0.05,0.33,0.07,0.4,0.06
Товары для детей и игрушки,0.62,0.04,0.47,0.02,0.53,0.03
Бытовая техника,0.57,0.1,0.2,0.06,0.29,0.07
Телефоны,0.76,0.05,0.68,0.05,0.71,0.05
Детская одежда и обувь,0.58,0.02,0.7,0.03,0.63,0.02


In [135]:
metrics_svd_X3 - metrics_nmf_X3

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.13,-0.02,0.05,-0.01,-0.06,-0.01
Автомобили,0.04,0.03,-0.38,-0.02,-0.21,0.0
Предложение услуг,0.2,0.05,-0.46,0.0,-0.29,0.03
Квартиры,-0.21,0.02,-0.15,0.03,-0.18,0.03
Мебель и интерьер,0.16,0.02,-0.22,0.01,-0.22,0.02
Ремонт и строительство,0.1,0.12,-0.24,-0.03,-0.24,0.0
Товары для детей и игрушки,0.01,0.04,-0.32,0.02,-0.28,0.02
Бытовая техника,0.02,0.08,-0.06,0.0,-0.06,0.01
Телефоны,0.1,-0.01,-0.35,0.01,-0.23,0.01
Детская одежда и обувь,-0.15,-0.01,0.01,0.0,-0.09,-0.01


Среди ExtraTrees классификаторов более эффективным оказался классификатор **ExtraTrees с NMF**.

**SGD**

In [136]:
metrics_svd_SGD, errors_svd_SGD = eval_table(data['description_norm'], data['category_name'], pipeline_svd_SGD)

In [137]:
metrics_nmf_SGD, errors_nmf_SGD = eval_table(data['description_norm'], data['category_name'], pipeline_nmf_SGD)

In [138]:
metrics_svd_SGD

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.7,0.03,0.78,0.03,0.74,0.01
Автомобили,0.87,0.03,0.9,0.03,0.89,0.02
Предложение услуг,0.77,0.03,0.75,0.02,0.76,0.01
Квартиры,0.95,0.02,0.96,0.02,0.96,0.01
Мебель и интерьер,0.74,0.05,0.61,0.04,0.67,0.04
Ремонт и строительство,0.58,0.06,0.5,0.07,0.54,0.06
Товары для детей и игрушки,0.76,0.06,0.63,0.04,0.69,0.03
Бытовая техника,0.64,0.17,0.52,0.05,0.57,0.09
Телефоны,0.83,0.04,0.78,0.08,0.8,0.04
Детская одежда и обувь,0.74,0.03,0.76,0.02,0.75,0.02


In [139]:
metrics_nmf_SGD

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.47,0.09,0.76,0.11,0.57,0.03
Автомобили,0.77,0.11,0.71,0.07,0.73,0.04
Предложение услуг,0.67,0.06,0.55,0.13,0.6,0.08
Квартиры,0.77,0.2,0.95,0.02,0.84,0.13
Мебель и интерьер,0.39,0.15,0.17,0.08,0.22,0.03
Ремонт и строительство,0.65,0.13,0.2,0.11,0.29,0.14
Товары для детей и игрушки,0.67,0.16,0.41,0.06,0.5,0.05
Бытовая техника,0.5,0.21,0.08,0.04,0.13,0.07
Телефоны,0.76,0.17,0.39,0.11,0.49,0.06
Детская одежда и обувь,0.52,0.09,0.4,0.17,0.42,0.07


In [140]:
metrics_svd_SGD - metrics_nmf_SGD

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.23,-0.06,0.02,-0.08,0.17,-0.02
Автомобили,0.1,-0.08,0.19,-0.04,0.16,-0.02
Предложение услуг,0.1,-0.03,0.2,-0.11,0.16,-0.07
Квартиры,0.18,-0.18,0.01,0.0,0.12,-0.12
Мебель и интерьер,0.35,-0.1,0.44,-0.04,0.45,0.01
Ремонт и строительство,-0.07,-0.07,0.3,-0.04,0.25,-0.08
Товары для детей и игрушки,0.09,-0.1,0.22,-0.02,0.19,-0.02
Бытовая техника,0.14,-0.04,0.44,0.01,0.44,0.02
Телефоны,0.07,-0.13,0.39,-0.03,0.31,-0.02
Детская одежда и обувь,0.22,-0.06,0.36,-0.15,0.33,-0.05


Среди SGD классификаторов более эффективным оказался классификатор **SGD с SVD**.

#### Полуфинал

In [141]:
metrics_svd_SGD-metrics_nmf_RF

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",0.2,0.01,0.04,0.0,0.14,-0.01
Автомобили,0.08,-0.01,0.12,-0.02,0.11,-0.01
Предложение услуг,0.1,-0.01,0.11,0.0,0.11,-0.01
Квартиры,0.01,0.0,0.03,-0.01,0.03,0.0
Мебель и интерьер,0.05,-0.04,0.37,-0.03,0.32,-0.04
Ремонт и строительство,-0.09,-0.02,0.27,0.03,0.2,0.0
Товары для детей и игрушки,0.0,0.03,0.2,-0.03,0.15,-0.03
Бытовая техника,-0.16,-0.23,0.48,0.01,0.5,0.02
Телефоны,0.08,0.01,0.14,0.03,0.11,0.0
Детская одежда и обувь,0.18,0.01,0.1,0.01,0.14,0.01


Классификатор **SGD с SVD** более эффективен, чем классификатор Random Forest с NMF.

In [145]:
metrics_svd_KNN-metrics_nmf_X3

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.13,-0.01,-0.1,0.0,-0.12,0.0
Автомобили,-0.19,0.01,-0.24,0.0,-0.21,0.03
Предложение услуг,-0.11,0.0,-0.17,0.0,-0.14,0.0
Квартиры,0.01,-0.01,-0.23,0.02,-0.12,0.01
Мебель и интерьер,-0.24,0.0,-0.08,0.03,-0.13,0.02
Ремонт и строительство,-0.25,0.0,-0.18,-0.05,-0.21,-0.04
Товары для детей и игрушки,-0.11,0.0,-0.24,0.01,-0.21,0.0
Бытовая техника,-0.26,-0.04,0.07,0.04,-0.01,0.01
Телефоны,-0.06,0.05,-0.37,-0.01,-0.28,0.0
Детская одежда и обувь,-0.11,-0.01,-0.04,0.0,-0.08,-0.01


Классификатор KNN с SVD менее эффективен, чем классификатор **Extra Tress с NMF**.

#### Финал

In [147]:
metrics_nmf_X3-metrics_svd_SGD

Unnamed: 0,precision,precision_std,recall,recall_std,f1,f1_std
"Одежда, обувь, аксессуары",-0.11,0.0,-0.13,0.0,-0.12,0.01
Автомобили,-0.11,0.0,-0.07,0.0,-0.1,-0.01
Предложение услуг,-0.09,0.01,-0.01,0.04,-0.05,0.03
Квартиры,-0.02,0.0,-0.01,0.0,-0.02,0.0
Мебель и интерьер,-0.15,0.01,-0.28,-0.02,-0.25,-0.01
Ремонт и строительство,-0.06,-0.01,-0.17,0.0,-0.14,0.0
Товары для детей и игрушки,-0.14,-0.02,-0.16,-0.02,-0.16,0.0
Бытовая техника,-0.07,-0.07,-0.32,0.01,-0.28,-0.02
Телефоны,-0.07,0.01,-0.1,-0.03,-0.09,0.01
Детская одежда и обувь,-0.16,-0.01,-0.06,0.01,-0.12,0.0


По итогам эксперимента самой эффективной моделью оказался классификатор **SGD с SVD**.

### Задание № 2 (6 баллов)

In [152]:
otexts = open('wiki_data.txt','r', encoding = 'utf-8').read().splitlines()[:1000]

In [153]:
otexts[0]

'######Новостройка (Нижегородская область)############Новостро́йка — сельский посёлок в Дивеевском районе Нижегородской области. Входит в состав Сатисского сельсовета.############Посёлок расположен в 12,5 км к югу от села Дивеева и 1 км к западу от города Сарова, на правом берегу реки Вичкинза (правый приток реки Сатис). Окружён смешанными лесами. Соединён асфальтовой дорогой с посёлком Цыгановка (1,5 км) и грунтовыми просёлочными дорогами с посёлком Сатис (3,5 км). Название Новостройка является сугубо официальным, местное население использует исключительно альтернативное название — Хитрый. Употребляется языковой оборот «…на Хитром». Ранее используемые названия — Песчаный, Известковый.############Основан в 1920-х годах переселенцами из соседних сёл Аламасово и Нарышкино (расположенных соответственно в 8 и 14 км к западу в Вознесенском районе).############Традиционно в посёлке жили рабочие совхоза «Вперёд» (центр в посёлке Сатис). Возле посёлка расположен карьер где активно добывали дол

In [154]:
texts = ([normalize(text) for text in otexts])

### Без TF-IDF и n-граммов

In [156]:
dictinary = gensim.corpora.Dictionary((text.split() for text in texts))

In [157]:
dictinary.filter_extremes(no_above=0.1, no_below=10)
dictinary.compactify()

In [158]:
print(dictinary)

Dictionary(2301 unique tokens: ['1,5', '12', '14', '16', '1920-й']...)


In [159]:
corpus = [dictinary.doc2bow(text.split()) for text in texts]

In [160]:
lda = gensim.models.LdaMulticore(corpus, 
                                 100, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary, 
                                 passes=10) 
# если не поддерживается многопоточность
# lda = gensim.models.LdaModel(200, id2word=dictinary, passes=5)

In [310]:
lda.print_topics()[:10]

[(99,
  '0.038*"италия" + 0.034*"рим" + 0.027*"1940" + 0.025*"март" + 0.018*"1941" + 0.018*"тип" + 0.017*"итальянский" + 0.017*"вступить" + 0.017*"исключить" + 0.016*"эсминец"'),
 (97,
  '0.032*"день" + 0.019*"дерево" + 0.019*"департамент" + 0.018*"форма" + 0.017*"участник" + 0.017*"северо-запад" + 0.014*"простой" + 0.014*"восток" + 0.013*"германия" + 0.013*"регион"'),
 (98,
  '0.055*"язык" + 0.043*"семейство" + 0.036*"1965" + 0.029*"остров" + 0.026*"распространение" + 0.023*"часто" + 0.019*"версия" + 0.017*"пара" + 0.017*"встречаться" + 0.017*"австралия"'),
 (96,
  '0.031*"памятник" + 0.018*"сад" + 0.017*"фон" + 0.016*"сопротивление" + 0.016*"германия" + 0.014*"университет" + 0.013*"северо-восточный" + 0.012*"степень" + 0.010*"установить" + 0.009*"мужчина"'),
 (94,
  '0.058*"музей" + 0.036*"здание" + 0.028*"художник" + 0.026*"искусство" + 0.022*"улица" + 0.020*"дом" + 0.018*"век" + 0.014*"произведение" + 0.014*"азербайджан" + 0.013*"среди"'),
 (93,
  '0.023*"южный" + 0.022*"тысяча" + 

In [225]:
print("Перплексия:", "\n", np.exp2(-lda.log_perplexity(corpus[:1000])))

Перплексия: 
 214.93282684557354


In [163]:
topics = []
for topic_id, topic in lda.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics.append(topic)

In [164]:
coherence_model_lda = gensim.models.CoherenceModel(topics=topics, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

In [224]:
print("Когерентность:", "\n", coherence_model_lda.get_coherence())

Когерентность: 
 0.49362914937785335


### C TF-IDF

In [247]:
dictinary1 = gensim.corpora.Dictionary((text.split() for text in texts))
dictinary1.filter_extremes(no_above=0.1, no_below=10)
dictinary1.compactify()

In [280]:
print(dictinary1)

Dictionary(2301 unique tokens: ['1,5', '12', '14', '16', '1920-й']...)


In [301]:
corpus1 = [dictinary1.doc2bow(text.split()) for text in texts]

In [302]:
tfidf = gensim.models.TfidfModel(corpus1, id2word=dictinary1)
corpus1 = tfidf[corpus1]

In [303]:
lda1 = gensim.models.LdaMulticore(corpus1, 
                                 100, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary1, 
                                 passes=10) 

In [304]:
lda1.print_topics()[:10]

[(99,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (97,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (98,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (96,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (94,
  '0.000*"конечный" + 0.000*"круглый" + 0.000*"изобразить" + 0.000*"историк" + 0.000*"каменный" + 0.000*"комиссар" + 0.000*"конгресс" + 0.000*"здание" + 0.000*"красивый" + 0.000*"лишить"'),
 (95,
  '0.000*"коне

In [305]:
print("Перплексия:", "\n", np.exp2(-lda1.log_perplexity(corpus[:1000])))

Перплексия: 
 546.8289615381884


In [306]:
topics1 = []
for topic_id, topic in lda1.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics1.append(topic)
    
coherence_model_lda1 = gensim.models.CoherenceModel(topics=topics1, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary1, coherence='c_v')

In [307]:
print("Когерентность:", "\n", coherence_model_lda1.get_coherence())

Когерентность: 
 0.4211784760246522


### C N-граммами

In [190]:
texts4ngrams = [text.split() for text in texts]
ph = gensim.models.Phrases(texts4ngrams, scoring='npmi', threshold=0.4) # threshold можно подбирать
p = gensim.models.phrases.Phraser(ph)
ngrammed_texts = p[texts4ngrams] 


In [191]:
dictinary2 = gensim.corpora.Dictionary((text for text in ngrammed_texts))
dictinary2.filter_extremes(no_above=0.1, no_below=10)
dictinary2.compactify()

In [192]:
corpus2 = [dictinary2.doc2bow(text) for text in ngrammed_texts]

In [193]:
lda2 = gensim.models.LdaMulticore(corpus2, 
                                 100, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary2, 
                                 passes=10)

In [194]:
lda2.print_topics()[:10]

[(99,
  '0.040*"сезон" + 0.032*"клуб" + 0.021*"занять" + 0.020*"гонка" + 0.015*"чемпионат_мир" + 0.015*"выиграть" + 0.013*"контракт" + 0.012*"завоевать" + 0.011*"мир" + 0.010*"победа"'),
 (96,
  '0.040*"длина" + 0.033*"смотреть" + 0.030*"лист" + 0.027*"мм" + 0.024*"вид" + 0.012*"я" + 0.012*"молодой" + 0.011*"ты" + 0.011*"семейство" + 0.011*"северный_америка"'),
 (98,
  '0.035*"–" + 0.022*"село" + 0.021*"пункт" + 0.020*"солдат" + 0.016*"сельский_поселение" + 0.015*"фильм" + 0.014*"на_территория" + 0.014*"железнодорожный" + 0.013*"поселение" + 0.012*"россия"'),
 (97,
  '0.053*"символ" + 0.021*"значение" + 0.020*"содержать" + 0.020*"s" + 0.018*"тема" + 0.016*"a" + 0.016*"термин" + 0.015*"x" + 0.013*"канада" + 0.012*"серебряный"'),
 (95,
  '0.018*"университет" + 0.015*"книга" + 0.014*"знание" + 0.014*"животное" + 0.010*"себя" + 0.010*"наука" + 0.010*"научный" + 0.008*"деятельность" + 0.008*"общество" + 0.007*"язык"'),
 (92,
  '0.096*"сельский_поселение" + 0.054*"поселение" + 0.050*"№" + 0.

In [227]:
print("Перплексия:", "\n", np.exp2(-lda2.log_perplexity(corpus2[:1000])))

Перплексия: 
 264.52880081056134


In [196]:
topics2 = []
for topic_id, topic in lda2.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics2.append(topic)

In [211]:
coherence_model_lda2 = gensim.models.CoherenceModel(topics=topics2, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

In [222]:
print("Когерентность:", "\n", coherence_model_lda2.get_coherence())

Когерентность: 
 0.4920089611600607


### C TF-IDF и n-граммами

In [200]:
dictinary3 = gensim.corpora.Dictionary((text for text in ngrammed_texts))
dictinary3.filter_extremes(no_above=0.1, no_below=10)
dictinary3.compactify()

In [201]:
corpus3 = [dictinary3.doc2bow(text) for text in ngrammed_texts]

In [202]:
tfidf = gensim.models.TfidfModel(corpus3, id2word=dictinary3)
corpus3 = tfidf[corpus3]

In [203]:
lda3 = gensim.models.LdaMulticore(corpus3, 
                                 100, # колиество тем
                                 alpha='asymmetric',
                                 id2word=dictinary3, 
                                 passes=10)
%time

Wall time: 0 ns


In [204]:
lda3.print_topics()[:10]

[(98,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (99,
  '0.000*"база" + 0.000*"тип" + 0.000*"эсминец" + 0.000*"эксплуатация" + 0.000*"честь" + 0.000*"американский" + 0.000*"1983" + 0.000*"1984" + 0.000*"ввести" + 0.000*"согласно"'),
 (97,
  '0.002*"священник" + 0.001*"убивать" + 0.001*"пол" + 0.001*"местный_житель" + 0.001*"решать" + 0.000*"пытаться" + 0.000*"начинать" + 0.000*"церковь" + 0.000*"вариант" + 0.000*"здание"'),
 (96,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.000*"северный_часть"'),
 (94,
  '0.000*"священник" + 0.000*"символ" + 0.000*"религиозный" + 0.000*"решение" + 0.000*"рим" + 0.000*"рядом" + 0.000*"самостоятельный" + 0.000*"свой_очередь" + 0.000*"расположение" + 0.00

In [228]:
print("Перплексия:", "\n", np.exp2(-lda3.log_perplexity(corpus3[:1000])))

Перплексия: 
 12056.535885954205


In [206]:
topics3 = []
for topic_id, topic in lda3.show_topics(num_topics=100, formatted=False):
    topic = [word for word, _ in topic]
    topics3.append(topic)

In [213]:
coherence_model_lda3 = gensim.models.CoherenceModel(topics=topics3, 
                                                   texts=[text.split() for text in texts], 
                                                   dictionary=dictinary, coherence='c_v')

In [221]:
print("Когерентность:", "\n", coherence_model_lda3.get_coherence())

Когерентность: 
 0.3750193299910641


Как можно видеть, модели с tf-idf показывают очень высокую перплексию - скорее всего в коде ошибка, из-за чего модели и дают такой результат. Тем не менее модель без n-граммов и tf-idf показывает самую низкую перплексию, при этом её результаты когерентности схожи с таковыми у модели с n-граммами; обе модели дают достаточно осмысленные и логичные темы.


### Самые красивые темы

#### Без tf-idf и n-граммов

#### С tf-idf
(здесь выбор был небольшой)

#### С n-граммами

#### С tf-idf и n-граммами
(тоже небогатый выбор)