## Homework 4
### Grigory Arshinov

### Задание 1

На нескольких алгоритмах кластеризации, умеющих работать со sparse-матрицами, проверьте, что работает лучше: CountVectorizer или TfidfVectorizer (попробуйте выжать максимум из каждого — попробуйте n-граммы, символьные n-граммы, разные значения max_features и min_df) (3 балла)

In [1]:
import pandas as pd
import re
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load the training set and keep only the rows that have a description.
all_data = pd.read_csv('data/train.csv').dropna(subset=['description'])

In [3]:
# Keep only the columns used below: `title` is the text that gets vectorized
# and `category_name` is the reference labelling for the clustering metrics.
data = all_data[["title", "category_name"]]

In [4]:
# NOTE(review): this module-level analyzer is not referenced by any code
# visible in this notebook; FasterMorphology below creates its own instance.
morph = MorphAnalyzer()
class FasterMorphology:
    """Wrapper around ``MorphAnalyzer`` that remembers one analysis per word.

    Only the first analysis returned by the analyzer is stored and returned,
    so repeated words are looked up in a dict instead of being re-analysed.
    """

    def __init__(self):
        self.__morph = MorphAnalyzer()
        # Maps a word to the first analysis the analyzer produced for it.
        self.__cache = {}

    def parse(self, word: str):
        """Return the first analysis of *word*, computing it at most once."""
        try:
            return self.__cache[word]
        except KeyError:
            # First time we see this word: analyse it and remember the result.
            first_analysis = self.__morph.parse(word)[0]
            self.__cache[word] = first_analysis
            return first_analysis

# Single shared caching analyzer; lemmatizer() below looks words up through it.
parser = FasterMorphology()

def lemmatizer(text):
    """Split *text* into word tokens and return the normal form of each.

    Tokens are runs of two or more word characters; every token is looked up
    through the shared ``parser`` and replaced by its ``normal_form``.
    """
    tokens = re.findall(r'(?u)\b\w\w+\b', text)
    return [parser.parse(token).normal_form for token in tokens]

In [6]:
from sklearn import cluster
from sklearn import metrics

In [25]:
def evaluate_model(model_cls,
                   model_params=None,
                   vectorizer_cls=None,
                   vectorizer_params=None,
                   decomposer_cls=None,
                   decomposer_params=None):
    """Vectorize the titles, cluster them and print clustering quality metrics.

    Reads the module-level ``data`` frame: ``data.title`` is the text that is
    vectorized and ``data.category_name`` is the reference labelling used by
    the label-based metrics.

    Args:
        model_cls: clustering estimator class. It is instantiated with
            ``model_params``, fitted on the vectors, and its ``labels_``
            attribute is read afterwards.
        model_params: keyword arguments for ``model_cls`` (default: none).
        vectorizer_cls: text vectorizer class exposing ``fit_transform``.
            Required, even though the signature keeps a ``None`` default.
        vectorizer_params: keyword arguments for ``vectorizer_cls``
            (default: none).
        decomposer_cls: optional decomposition class; when given, its
            ``fit_transform`` is applied to the vectorizer output before
            clustering.
        decomposer_params: keyword arguments for ``decomposer_cls``
            (default: none).

    Raises:
        ValueError: if ``vectorizer_cls`` is not provided.
    """
    if vectorizer_cls is None:
        # Fail early with a clear message instead of an obscure TypeError
        # from trying to call None.
        raise ValueError("vectorizer_cls is required to turn titles into vectors")

    model_params = model_params or {}
    vectorizer_params = vectorizer_params or {}

    text_vectors = vectorizer_cls(**vectorizer_params).fit_transform(data.title)
    if decomposer_cls:
        decomposer = decomposer_cls(**(decomposer_params or {}))
        text_vectors = decomposer.fit_transform(text_vectors)

    model = model_cls(**model_params)
    model.fit(text_vectors)
    labels = model.labels_
    true_labels = data.category_name

    # The silhouette is computed on the first 10000 rows only
    # (presumably to keep the pairwise-distance computation affordable).
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(text_vectors[:10000], labels[:10000]))
    # Homogeneity: checks that each cluster contains objects of a single class.
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(true_labels, labels))
    # Completeness: checks that objects of one class end up in a single cluster.
    print("Completeness: %0.3f" % metrics.completeness_score(true_labels, labels))
    # V-measure: combines the two metrics above into one.
    print("V-measure: %0.3f" % metrics.v_measure_score(true_labels, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(true_labels, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(true_labels, labels))

In [16]:
def grid_iterator(parameters):
    """Yield every combination of the values in a ``{name: values}`` grid.

    Each combination is a fresh dict mapping every name to one of its values.
    The first name in *parameters* changes slowest and the last one fastest.
    An empty grid yields a single empty dict.
    """
    if not parameters:
        yield dict()
        return

    head, *tail = parameters
    remaining = {name: parameters[name] for name in tail}
    for value in parameters[head]:
        # Every dict coming from the recursive call is newly built, so it is
        # safe to extend it in place before handing it to the caller.
        for combination in grid_iterator(remaining):
            combination[head] = value
            yield combination

In [22]:
def remove_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it is absent."""
    return text[len(prefix):] if text.startswith(prefix) else text

def cluster_grid_search(model_cls,
                        vectorizer_cls,
                        decomposer_cls,
                        param_grid,
                        fixed_params):
    """Run ``evaluate_model`` for every parameter combination in a grid.

    Parameter names carry a routing prefix: ``vec__`` entries go to the
    vectorizer, ``model__`` entries to the clustering model and ``dec__``
    entries to the decomposer. ``fixed_params`` are added to (and override)
    every combination produced from ``param_grid``. The classes, each
    combination and a separator line are printed around the metrics.
    """
    def routed(settings, prefix):
        # Entries addressed to one component, with the routing prefix removed.
        return {remove_prefix(name, prefix): setting
                for name, setting in settings.items()
                if name.startswith(prefix)}

    print("Model class: %s" % model_cls)
    print("Vectorizer class: %s" % vectorizer_cls)
    for combination in grid_iterator(param_grid):
        combination.update(fixed_params)
        dec_kwargs = routed(combination, "dec__")
        model_kwargs = routed(combination, "model__")
        vec_kwargs = routed(combination, "vec__")
        print("Params: ", combination)
        evaluate_model(model_cls=model_cls,
                       model_params=model_kwargs,
                       vectorizer_cls=vectorizer_cls,
                       vectorizer_params=vec_kwargs,
                       decomposer_cls=decomposer_cls,
                       decomposer_params=dec_kwargs if dec_kwargs else None)
        print("*" * 50)

In [34]:
# The lemmatizing tokenizer is fixed for every run; the grid varies the rest.
# NOTE: scikit-learn applies `tokenizer` only when analyzer == "word", so the
# "char" runs below do not use the lemmatizer.
fixed_params = {"vec__tokenizer": lemmatizer}
params = {
    "model__n_clusters": [2, 4, 6, 8],
    "vec__ngram_range": [(1, 3), (1, 2)],
    "vec__min_df": [.02, .05],
    "vec__analyzer": ["word", "char"]
}
# cluster_grid_search takes decomposer_cls as its third positional argument;
# pass None explicitly to cluster the vectorizer output without decomposition.
cluster_grid_search(cluster.MiniBatchKMeans, TfidfVectorizer, None, params, fixed_params)

Model class: <class 'sklearn.cluster._kmeans.MiniBatchKMeans'>
Vectorizer class: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.540
Homogeneity: 0.082
Completeness: 0.931
V-measure: 0.150
Adjusted Rand Index: 0.028
Adjusted Mutual Information: 0.150
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.054
Homogeneity: 0.102
Completeness: 0.523
V-measure: 0.171
Adjusted Rand Index: 0.080
Adjusted Mutual Information: 0.171
**************************************************
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.05, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <fu

Silhouette Coefficient: 0.971
Homogeneity: 0.125
Completeness: 0.493
V-measure: 0.200
Adjusted Rand Index: 0.034
Adjusted Mutual Information: 0.200
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.05, 'vec__ngram_range': (1, 2), 'model__n_clusters': 6, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.070
Homogeneity: 0.191
Completeness: 0.363
V-measure: 0.250
Adjusted Rand Index: 0.098
Adjusted Mutual Information: 0.250
**************************************************
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 8, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.794
Homogeneity: 0.122
Completeness: 0.326
V-measure: 0.178
Adjusted Rand Index: 0.008
Adjusted Mutual Information: 0.177
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.02, 'vec__ngram_r

In [36]:
# Same grid with raw counts instead of TF-IDF weights. decomposer_cls is the
# third positional argument of cluster_grid_search; None means no decomposition.
cluster_grid_search(cluster.MiniBatchKMeans, CountVectorizer, None, params, fixed_params)

Model class: <class 'sklearn.cluster._kmeans.MiniBatchKMeans'>
Vectorizer class: <class 'sklearn.feature_extraction.text.CountVectorizer'>
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.686
Homogeneity: 0.069
Completeness: 0.917
V-measure: 0.129
Adjusted Rand Index: 0.022
Adjusted Mutual Information: 0.129
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.180
Homogeneity: 0.036
Completeness: 0.175
V-measure: 0.060
Adjusted Rand Index: 0.031
Adjusted Mutual Information: 0.060
**************************************************
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.05, 'vec__ngram_range': (1, 3), 'model__n_clusters': 2, 'vec__tokenizer': <fu

Silhouette Coefficient: 0.967
Homogeneity: 0.115
Completeness: 0.469
V-measure: 0.184
Adjusted Rand Index: 0.026
Adjusted Mutual Information: 0.184
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.05, 'vec__ngram_range': (1, 2), 'model__n_clusters': 6, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.096
Homogeneity: 0.147
Completeness: 0.278
V-measure: 0.193
Adjusted Rand Index: 0.093
Adjusted Mutual Information: 0.192
**************************************************
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__n_clusters': 8, 'vec__tokenizer': <function lemmatizer at 0x7f9e7614be18>}
Silhouette Coefficient: 0.725
Homogeneity: 0.131
Completeness: 0.440
V-measure: 0.202
Adjusted Rand Index: 0.020
Adjusted Mutual Information: 0.202
**************************************************
Params:  {'vec__analyzer': 'char', 'vec__min_df': 0.02, 'vec__ngram_r

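Судя по выводу метрик (V-measure, Adjusted Rand Index, Adjusted Mutual Information), TfidfVectorizer в большинстве конфигураций даёт лучший результат, чем CountVectorizer.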

### Задание 2
На нескольких алгоритмах кластеризации проверьте, какое матричное разложение (TruncatedSVD или NMF) работает лучше для кластеризации. (3 балла)

In [7]:
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.cluster import AgglomerativeClustering, MeanShift, SpectralClustering, DBSCAN

In [8]:
from tqdm.notebook import trange

In [None]:
# Task 2 grid: agglomerative clustering on TF-IDF vectors reduced with NMF.
# Keys are routed by prefix inside cluster_grid_search: "vec__" goes to the
# vectorizer, "model__" to the clustering model, "dec__" to the decomposer.
# NOTE(review): only the first "Params" line was recorded as output for this
# cell, so no metrics are available for this configuration.
fixed_params = {"vec__tokenizer": lemmatizer,
                "model__n_clusters": 1100,
                "dec__n_components": 50}
params = {
    "model__linkage": ["ward", "complete", "average", "single"],
    "vec__ngram_range": [(1, 3), (1, 2)],
    "vec__min_df": [.02, .05],
    "vec__analyzer": ["word", "char"]
}
cluster_grid_search(AgglomerativeClustering, TfidfVectorizer, NMF, params, fixed_params)

Model class: <class 'sklearn.cluster._agglomerative.AgglomerativeClustering'>
Vectorizer class: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Params:  {'vec__analyzer': 'word', 'vec__min_df': 0.02, 'vec__ngram_range': (1, 3), 'model__linkage': 'ward', 'vec__tokenizer': <function lemmatizer at 0x7f815f24cea0>, 'model__n_clusters': 1100, 'dec__n_components': 50}


Честно говорю, не доделал. Оцените, пожалуйста, что есть. Спасибо!