In [None]:
import collections
import itertools
import logging
import os
import shelve

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from multiprocessing import Pool

from gensim import corpora, models
from gensim.models import KeyedVectors
from gensim.models import TfidfModel
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import metrics

from nltools.exceptions import WrongExtensionError
from nltools.streams.io_.basic import read_csv
from nltools.streams.preprocessing import basic as basic_prp
from nltools.streams.preprocessing.preloading import stop_words
from nltools.streams.preprocessing import w2v

# Configure root logging once: timestamped records, INFO level and above.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Предобработка данных

In [None]:
def preprocess(text, stop_words=None, min_len=1):
    """Clean ``text`` and split it into tokens.

    The text is first passed through ``basic_prp.clean_text`` and the result
    is handed to ``basic_prp.tokenize``.

    Args:
        text: Raw text of one message/line.
        stop_words: Forwarded unchanged to ``basic_prp.tokenize``.
        min_len: Forwarded unchanged to ``basic_prp.tokenize``.

    Returns:
        Whatever ``basic_prp.tokenize`` returns for the cleaned text.
    """
    cleaned = basic_prp.clean_text(text)
    tokens = basic_prp.tokenize(cleaned, stop_words, min_len)
    return tokens


# Process-wide analyzer cache; populated lazily by _get_morph().
_MORPH = None


def _get_morph():
    """Return a shared ``pymorphy2.MorphAnalyzer``, creating it on first use."""
    global _MORPH
    if _MORPH is None:
        # Imported here because the notebook's import cell does not import
        # pymorphy2; building the analyzer once avoids reloading its
        # dictionaries for every processed line.
        import pymorphy2
        _MORPH = pymorphy2.MorphAnalyzer()
    return _MORPH


def tm_preprocess(text, stop_words=None, min_len=1):
    """Tokenize ``text`` with ``preprocess`` and lemmatize every token.

    Args:
        text: Raw text of one message/line.
        stop_words: Forwarded to ``preprocess``.
        min_len: Forwarded to ``preprocess``.

    Returns:
        list: ``basic_prp.lemmatize(token, morph)`` for each token produced by
        ``preprocess``, in the same order.
    """
    morph = _get_morph()
    tokens = preprocess(text, stop_words=stop_words, min_len=min_len)
    return [basic_prp.lemmatize(token, morph) for token in tokens]


def read_preprocess(input_file, mode):
    """Read one CSV from ``tg_data/`` and flatten it into a single token list.

    Every row returned by ``read_csv`` contributes its last field, which is
    tokenized and appended to one flat list for the whole file.

    Args:
        input_file: File name relative to ``tg_data/``.
        mode: ``'d2v'`` tokenizes with ``preprocess``; ``'tm'`` tokenizes and
            lemmatizes with ``tm_preprocess``.

    Returns:
        For ``'d2v'``: a ``(tokens, [input_file])`` tuple, i.e. the words and
        the tag list for one document.
        For ``'tm'``: the flat ``tokens`` list only.

    Raises:
        ValueError: if ``mode`` is neither ``'d2v'`` nor ``'tm'``.
    """
    # Validate before touching the file so a typo in `mode` fails fast
    # instead of silently returning None.
    if mode == 'd2v':
        tokenize = preprocess
    elif mode == 'tm':
        tokenize = tm_preprocess
    else:
        raise ValueError(f"unknown mode {mode!r}; expected 'd2v' or 'tm'")

    rows = read_csv(f'tg_data/{input_file}', msg_brd=None)
    # `stop_words` is the module-level name imported at the top of the notebook.
    per_line_tokens = (tokenize(line, stop_words=stop_words, min_len=0) for *_, line in rows)
    tokens = list(itertools.chain.from_iterable(per_line_tokens))

    if mode == 'd2v':
        return tokens, [input_file]
    return tokens


def read_preprocess_multi(input_files,
                          prp_mode,
                          batch_size=5,
                          workers=5,
                          db_name='temp.db',
                          res_var_name='train_corpus'):
    """Preprocess files in parallel batches and append the results to a shelf.

    Files are processed ``batch_size`` at a time by a pool of ``workers``
    processes; after each batch the results are appended to the list stored
    under ``res_var_name`` in the ``shelve`` database ``db_name`` (the list is
    created on the first batch).

    Args:
        input_files: Sequence of file names passed to ``read_preprocess``.
        prp_mode: Mode forwarded to ``read_preprocess`` (``'d2v'`` or ``'tm'``).
        batch_size: Number of files handled between two shelf writes.
        workers: Number of worker processes.
        db_name: Path of the ``shelve`` database.
        res_var_name: Key under which the accumulated list is stored.

    Returns:
        None. Results are only written to the shelf.
    """
    total = len(input_files)
    if total == 0:
        return

    # One pool for the whole run; the context manager releases the workers
    # even when a task raises.
    with Pool(processes=workers) as pool:
        for start in range(0, total, batch_size):
            stop = min(start + batch_size, total)
            print(f'\rPreprocessing files {start}-{stop}.', end='')

            # starmap with the module-level function: the callable has to be
            # picklable to reach the workers, which a lambda is not.
            tasks = [(name, prp_mode) for name in input_files[start:stop]]
            batch = pool.starmap(read_preprocess, tasks)

            # Read-modify-write: a shelf does not track in-place mutation of
            # stored objects, so the extended list is assigned back.
            with shelve.open(db_name) as db:
                db[res_var_name] = db.get(res_var_name, []) + batch

In [None]:
# Preprocess every file found in tg_data/ in 'd2v' mode and accumulate the
# results under 'train_corpus' in objs/temp-d2v.db.
read_preprocess_multi(
    os.listdir('tg_data/'),
    prp_mode='d2v',
    db_name='objs/temp-d2v.db',
    res_var_name='train_corpus',
)

In [None]:
# Preprocess every file found in tg_data/ in 'tm' (lemmatized) mode.
# Stored in its own shelf: 'tm' results are bare token lists, whereas the
# 'd2v' run stores (tokens, tags) pairs, so appending both to the same key of
# objs/temp-d2v.db would mix two incompatible record shapes.
read_preprocess_multi(
    os.listdir('tg_data/'),
    prp_mode='tm',
    db_name='objs/temp-tm.db',
    res_var_name='train_corpus',
)

# Чтение предобработанных данных для D2V

In [None]:
class ShelveReader:
    """Iterate over documents stored in one or more ``shelve`` databases.

    For each database only the value under its first key is read; that value
    is expected to be an iterable of entries.

    Modes:
        ``'td'``   -- every entry must be a ``(words, tags)`` pair and is
                      yielded as ``TaggedDocument(words, tags)``.
        ``'asis'`` -- yields only the words: the first item of a
                      ``(words, tags)`` tuple, or the entry itself when it is
                      not a tuple (the shape ``read_preprocess`` produces in
                      ``'tm'`` mode).

    Raises:
        ValueError: from the constructor when ``mode`` is not supported.
    """

    _MODES = ('td', 'asis')

    def __init__(self, filenames, mode='td'):
        # Reject typos up front; otherwise iteration would silently be empty.
        if mode not in self._MODES:
            raise ValueError(f'unknown mode {mode!r}; expected one of {self._MODES}')
        self.filenames = filenames
        self.mode = mode

    def __iter__(self):
        for fn in self.filenames:
            with shelve.open(fn) as db:
                first_key = list(db.keys())[0]
                for entry in db[first_key]:
                    if self.mode == 'td':
                        doc, label = entry
                        yield TaggedDocument(doc, label)
                    elif self.mode == 'asis':
                        if isinstance(entry, tuple):
                            doc, _ = entry
                            yield doc
                        else:
                            yield entry

In [None]:
# Load the tagged documents from the shelf written by the 'd2v'
# preprocessing cell (objs/temp-d2v.db).
train_corpus = list(ShelveReader(['objs/temp-d2v.db']))

# Doc2Vec
## Создание и обучение модели

In [None]:
%%time
# NOTE(review): gensim documents `dbow_words` as an option of DBOW training
# (dm=0); `dm` is left at its default here -- confirm it has the intended effect.
# NOTE(review): gensim's docs state that a fully reproducible run also needs
# workers=1, so `seed` alone may not make this deterministic -- confirm.
model = Doc2Vec(
    vector_size=100,
    dbow_words=1,
    min_count=2,
    window=10,
    sample=1e-3,
    workers=3,
    seed=42,
)
# Scan the corpus once to build the vocabulary before training.
model.build_vocab(train_corpus)

In [None]:
%%time
# Train for 55 passes over the corpus; the example count is the one recorded
# on the model by build_vocab().
model.train(train_corpus, total_examples=model.corpus_count, epochs=55)

## Проверка на вменяемость для Doc2Vec модели 

In [None]:
def get_ranks(train_corpus, d2v_model=None):
    """Count at which similarity rank each document finds itself.

    For every document a vector is re-inferred from its words, all stored
    document vectors are ranked by similarity to it, and the position of the
    document's own first tag in that ranking is recorded (0 = most similar).

    Args:
        train_corpus: Iterable of objects exposing ``.words`` and ``.tags``.
        d2v_model: Model to evaluate. When omitted, the notebook-global
            ``model`` is used, which keeps existing one-argument calls working.

    Returns:
        collections.Counter: mapping ``rank -> number of documents``.

    Raises:
        ValueError: if a document's first tag is missing from the ranking.
    """
    if d2v_model is None:
        d2v_model = model

    doc_vectors = d2v_model.docvecs
    n_docs = len(doc_vectors)  # loop-invariant: rank against every stored vector

    ranks = collections.Counter()
    for doc in train_corpus:
        inferred_vector = d2v_model.infer_vector(doc.words)
        sims = doc_vectors.most_similar([inferred_vector], topn=n_docs)
        ranked_tags = [tag for tag, _ in sims]
        ranks[ranked_tags.index(doc.tags[0])] += 1

    return ranks

In [None]:
%%time
# Sanity check: distribution of the rank at which each training document's
# own tag appears among the vectors most similar to its re-inferred vector.
get_ranks(train_corpus)

In [None]:
# Save under objs/, which is the path the later Doc2Vec.load('objs/d2v-215')
# cell reads from.
model.save('objs/d2v-215')

In [None]:
# Drop the in-memory model; a later cell loads a model back from disk.
del model

## Проверка результатов векторизации путем проведения кластеризации и отрисовки векторов в сжатом пространстве

In [None]:
true_labels_df = pd.read_csv('mu/ds_true.csv')

# For every cell equal to 1, record (column position - 1), scanning the table
# row by row.
# NOTE(review): presumably column 0 is `dump_id` and the remaining columns are
# one-hot class indicators, giving one label per row -- confirm against the CSV.
true_labels = []
for row_values in true_labels_df.values:
    for col_pos, cell in enumerate(row_values):
        if cell == 1:
            true_labels.append(col_pos - 1)

In [None]:
# Load a previously saved Doc2Vec model from disk.
model = Doc2Vec.load('objs/d2v-215')

In [None]:
# One column vector (shape (dim, 1)) per labelled document, in the row order
# of `true_labels_df`.
# NOTE(review): none of the visible cells reads `doc_vectors` afterwards.
doc_vectors = [
    model.docvecs[fname].reshape(-1, 1)
    for fname in true_labels_df['dump_id']
]

In [None]:
# Cluster the vectors in the row order of `true_labels_df`, looked up by tag,
# so that pred_labels[i] describes the same document as row i of the label
# table. `model.docvecs.vectors_docs` is in training order, which need not
# match the CSV order.
# NOTE(review): assumes `true_labels` holds exactly one label per CSV row.
labelled_vectors = np.array([model.docvecs[dump_id] for dump_id in true_labels_df['dump_id']])

kmeans = KMeans(n_clusters=8, random_state=42)
pred_labels = kmeans.fit_predict(labelled_vectors)

In [None]:
# Fowlkes-Mallows score between the reference labels and the k-means
# assignments.
metrics.fowlkes_mallows_score(true_labels, pred_labels)  

model = Doc2Vec(vector_size=50, min_count=2, sample=1e-5, workers=4) -- 0.14086823960956465

model = Doc2Vec(vector_size=400, min_count=2, window=6, sample=1e-5, workers=4) -- 0.14888268334614022

model = Doc2Vec(vector_size=400, min_count=2, window=6, sample=1e-6, workers=4) -- 0.18628878806361712

model = Doc2Vec(vector_size=400, min_count=2, window=3, sample=1e-6, workers=4) -- 0.177933197245823

model = Doc2Vec(vector_size=400, min_count=2, window=10, sample=1e-6, workers=4) -- 0.21118463712915594

model = Doc2Vec(vector_size=400, min_count=2, window=10, sample=1e-7, workers=4) -- 0.15144894062234954

model = Doc2Vec(vector_size=400, dm=0, min_count=2, window=10, sample=1e-6, workers=4) -- 0.1635102868186538

model = Doc2Vec(vector_size=400, dbow_words=1, min_count=2, window=10, sample=1e-6, workers=4) -- 0.21547558866834945

model = Doc2Vec(vector_size=400, negative=10, dbow_words=1, min_count=1, window=10, sample=1e-6, workers=4) -- 0.13814767369837347

### Отрисовка векторов (PCA)

In [None]:
# Project the vectors in the row order of `true_labels_df`, looked up by tag,
# so that point i, its colour (true_labels[i]) and its annotation refer to the
# same document. `model.docvecs.vectors_docs` is in training order instead.
pca_input = np.array([model.docvecs[dump_id] for dump_id in true_labels_df['dump_id']])

pca = PCA(n_components=2)
X = pca.fit_transform(pca_input)

plt.figure(figsize=(20, 20))
# Colour = reference label, text annotation = predicted cluster.
plt.scatter(X[:, 0], X[:, 1], c=true_labels, cmap='jet')

for label, (x, y) in zip(pred_labels, X):
    plt.annotate(str(label), xy=(x, y))

plt.show()

### Отрисовка векторов (TSNE)

In [None]:
# Embed the vectors in the row order of `true_labels_df`, looked up by tag,
# so that point i, its colour (true_labels[i]) and its annotation refer to the
# same document. `model.docvecs.vectors_docs` is in training order instead.
tsne_input = np.array([model.docvecs[dump_id] for dump_id in true_labels_df['dump_id']])

# t-SNE is stochastic; a fixed random_state keeps the picture stable between runs.
X = TSNE(n_components=2, random_state=42).fit_transform(tsne_input)

plt.figure(figsize=(10, 10))
# Colour = reference label, text annotation = predicted cluster.
plt.scatter(X[:, 0], X[:, 1], c=true_labels, cmap='jet')

for label, (x, y) in zip(pred_labels, X):
    plt.annotate(str(label), xy=(x, y))

plt.show()

In [None]:
# Release the objects of the Doc2Vec section before the topic-modelling
# section starts.
del model, X, train_corpus, true_labels

# Тематическое моделирование

In [None]:
# Load plain token lists (no TaggedDocument wrapping) for topic modelling.
# NOTE(review): the preprocessing cells above write to objs/temp-d2v.db, not
# objs/temp.db -- confirm which shelf this section is meant to read.
train_corpus = list(ShelveReader(['objs/temp.db'], mode='asis'))

In [None]:
# Token <-> id mapping built from the whole corpus.
dct = corpora.Dictionary(train_corpus)
# Bag-of-words representation of every document.
corpus = list(map(dct.doc2bow, train_corpus))
# Fit TF-IDF on the bag-of-words corpus, then apply it to the same corpus.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
# Fit an 8-topic LSI model on the TF-IDF-weighted corpus; `dct` supplies the
# id -> word mapping.
lsi = models.LsiModel(corpus_tfidf, id2word=dct, num_topics=8)

In [None]:
# Show the 8 fitted topics.
lsi.print_topics(8)

In [None]:
# test_cluster = [i for i in range(len(labels)) if labels[i] == 2]

# # for i in test_cluster:
# #     print(f'i:{i} {vld.iloc[i]["IT"]}')
# for i in range(43):
#     print(f'{labels[i]}')
# collections.Counter(labels)