In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import artm
import pymorphy2
from nltk.corpus import stopwords
from vowpalwabbit import pyvw
import gensim
import matplotlib.pyplot as plt

# Shared pymorphy2 analyzer, used by lemmatize().
morph = pymorphy2.MorphAnalyzer()

# NLTK's Russian stop-word list, extended in place with additional words.
stop_words = stopwords.words('russian')
stop_words += [
    'такой', 'который', 'какой', 'свой', 'мой', 'наш', 'этот', 'очень',
    'тысяча', 'сотня', 'всякий', 'любой', 'источник', 'сегодня', 'сейчас',
]

# Load the data: the first 400 rows of the CSV, every column read as str.
# NOTE(review): absolute, machine-specific path - this only runs where the file exists.
ds = 'C:\\Users\\vinov\\OneDrive\\Документы\\Универ\\Диплом\\Код\\archive\\lenta-ru-news.csv'
df = pd.read_csv(ds, nrows=400, dtype=str, low_memory=False)
df.head()  # result is discarded here; it is only shown as a cell's last expression
# One entry per row of the "text" column.
data = df['text'].tolist()

def sent_to_words(sentences, deacc=False):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents. Every item is passed through
            ``str()`` first, so non-string values are tokenized by their
            string form.
        deacc: Forwarded to ``simple_preprocess``. Defaults to ``False``
            because accent stripping removes combining marks, which rewrites
            Cyrillic letters such as "й" -> "и" and "ё" -> "е" and thereby
            breaks stop-word matching and lemmatization of Russian text.
            The flag does not control punctuation removal; the tokenizer
            drops punctuation either way.

    Yields:
        One list of tokens per input document.
    """
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)

# Tokenize every document; data_words holds one token list per document.
data_words = list(sent_to_words(data))

# Train the bigram (phrase) model; a higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# Wrap the trained model in a Phraser; make_bigrams() applies it to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)

def remove_stopwords(texts):
    """Drop stop words from every document.

    Args:
        texts: Iterable of documents. An already tokenized document (list or
            tuple of words) is filtered as is; anything else is converted
            with ``str()`` and tokenized with ``simple_preprocess``.

    Returns:
        A list with one list of kept tokens per document.
    """
    # Build the lookup once: ``stop_words`` is a list, so testing membership
    # against it directly is a linear scan for every single token.
    blocked = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Already tokenized: re-tokenizing ``str(doc)`` would silently
            # discard tokens outside simple_preprocess's length limits,
            # e.g. long lemmas and joined "word_word" bigrams.
            tokens = doc
        else:
            tokens = gensim.utils.simple_preprocess(str(doc))
        cleaned.append([word for word in tokens if word not in blocked])
    return cleaned

def make_bigrams(texts):
    """Apply the trained bigram model to every document.

    Returns a list holding, for each document in ``texts``, the result of
    indexing ``bigram_mod`` with that document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatize(texts, allowed_postags=('NOUN', 'ADJF', 'INFN', 'PRTF', 'GRND', 'ADVB')):
    """Replace tokens by their normal form, keeping only selected parts of speech.

    Args:
        texts: Iterable of tokenized documents (iterables of word strings).
        allowed_postags: Part-of-speech tags to keep. A token is kept when the
            ``tag.POS`` of its first pymorphy2 parse is in this collection.
            The default is an immutable tuple so that it cannot be changed
            accidentally between calls; any container supporting ``in`` works.

    Returns:
        A list with one list of normal forms per input document; tokens whose
        part of speech is not allowed are dropped.
    """
    # The same word recurs across documents, so analyse each distinct token
    # only once. A cached value of None means "filtered out by POS".
    lemma_cache = {}

    texts_out = []
    for sent in texts:
        lemmas = []
        for token in sent:
            if token not in lemma_cache:
                best_parse = morph.parse(token)[0]
                if best_parse.tag.POS in allowed_postags:
                    lemma_cache[token] = best_parse.normal_form
                else:
                    lemma_cache[token] = None
            lemma = lemma_cache[token]
            if lemma is not None:
                lemmas.append(lemma)
        texts_out.append(lemmas)
    return texts_out

# Preprocessing pipeline, in this order: apply the bigram model, lemmatize
# with POS filtering, then remove stop words.
data_words_bigrams = make_bigrams(data_words)
data_lemmatized = lemmatize(data_words_bigrams)
data_nonstop = remove_stopwords(data_lemmatized)

"""
vw = pyvw.vw(quiet = True)
f_line = str(data_nonstop[0])
ex = vw.example(f_line)
for line in data_nonstop:
    ex.push_feature(str(line))
wf = open('docword.vw.txt', 'w+')
wf.writelines(vw)
wf.close()"""

# Build the batches and the vocabulary
# NOTE(review): `features` and `bigrams` are not defined anywhere in the
# visible part of this file - confirm they are set before this point.
# NOTE(review): data_words[j] appears to be a list of string tokens and
# data_nonstop[i] a list of tokens, so list.count() of a list inside a list
# of strings would always give 0 - presumably "occurrences of bigram i in
# document j" was intended; verify.
n_wd_bigr = np.empty((len(data_nonstop), len(data_words)))
for i in range(len(data_nonstop)):
    for j in range(len(data_words)):
        n_wd_bigr[i][j] = data_words[j].count(data_nonstop[i])
        
# NOTE(review): CountVectorizer's default analyzer expects raw strings, while
# data_words holds token lists - TODO confirm this call works as intended.
cv = CountVectorizer(max_features = features)
# Transposed, so tokens are rows and documents are columns.
n_wd = np.array(cv.fit_transform(data_words).todense()).T
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() - confirm against the installed version.
vocabulary = cv.get_feature_names()

# Stack the extra count rows under the CountVectorizer matrix and extend the
# vocabulary - presumably so that rows and vocabulary entries stay aligned.
n_wd = np.concatenate((n_wd, n_wd_bigr))
vocabulary += bigrams

# Basic model
# NOTE(review): topic_names, dictionary and bv are not defined in the visible
# part of this file - confirm they exist before this point.
model_artm = artm.ARTM(
    topic_names=topic_names,
    cache_theta=True,
    scores=[
        # NOTE(review): this score receives `vocabulary`, whereas initialize()
        # below is given `dictionary` - confirm which object is intended.
        artm.PerplexityScore(name='PerplexityScore', dictionary=vocabulary),
        artm.SparsityPhiScore(name='SparsityPhiScore'),
        artm.SparsityThetaScore(name='SparsityThetaScore'),
        artm.TopicKernelScore(name='TopicKernelScore', probability_mass_threshold=0.3),
        artm.TopTokensScore(name='TopTokensScore', num_tokens=8),
    ],
    regularizers=[
        artm.SmoothSparseThetaRegularizer(name='SparseTheta', tau=-0.4),
        artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=2.5e+5),
    ],
)

# Configure, initialize and fit the model offline.
model_artm.num_document_passes = 10
model_artm.initialize(dictionary)
model_artm.fit_offline(batch_vectorizer=bv, num_collection_passes=20)

# Report the model's quality metrics
def print_measures(model_artm):
    """Print the last tracked score values and plot each score's history.

    Reads 'SparsityPhiScore', 'SparsityThetaScore' and 'PerplexityScore'
    from ``model_artm.score_tracker``: prints each ``last_value`` and draws
    each ``value`` series against ``range(model_artm.num_phi_updates)`` on
    its own subplot.
    """
    tracker = model_artm.score_tracker
    print('Sparsity Phi ARTM:{}'.format(tracker['SparsityPhiScore'].last_value))
    print('Sparsity Theta ARTM:{}'.format(tracker['SparsityThetaScore'].last_value))
    print('Perplexity ARTM: {}'.format(tracker['PerplexityScore'].last_value))

    # One row of three panels; the figure handle itself is not needed.
    _, axs = plt.subplots(1, 3, figsize=(30, 5))

    panels = (
        ('PerplexityScore', 'ARTM perplexity'),
        ('SparsityPhiScore', 'ARTM Phi sparsity'),
        ('SparsityThetaScore', 'ARTM Theta sparsity'),
    )
    for ax, (score, y_label) in zip(axs, panels):
        ax.plot(range(model_artm.num_phi_updates), tracker[score].value, 'r--', linewidth=2)
        ax.set_xlabel('Iterations count')
        ax.set_ylabel(y_label)
        ax.grid(True)
        
# Print and plot the tracked scores of the model fitted above.
print_measures(model_artm)

ModuleNotFoundError: ignored