In [86]:
import pandas as pd
import pymorphy2
from collections import Counter
import csv
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from string import punctuation
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [9]:
# Load the gzip-compressed tweet dump into a DataFrame.
df = pd.read_csv(
    './data/covid_tweets.csv.gz',
    compression='gzip',
)

In [7]:
# dataset dimensions: (rows, columns)
df.shape

(1491177, 3)

In [11]:
# Shared pymorphy2 analyzer instance; lemmatize() below reads it as a module-level global.
morph = pymorphy2.MorphAnalyzer()

def lemmatize(text: str, analyzer=None) -> list:
    """Split ``text`` on whitespace and return the normal form of every token.

    Args:
        text: Raw text to lemmatize. Anything that is not a ``str`` (for
            example the float NaN that pandas stores for a missing cell)
            is treated as empty text instead of raising ``AttributeError``.
        analyzer: Object exposing ``parse(word)`` whose first result has a
            ``normal_form`` attribute. Defaults to the module-level ``morph``
            analyzer.

    Returns:
        A list with one lemma per whitespace-separated token, in the
        original order. Empty or non-string input gives an empty list.

    Note:
        Tokenization is whitespace-only, so punctuation adjacent to a word
        stays attached to the token that is passed to the analyzer.
    """
    if not isinstance(text, str):
        return []
    if analyzer is None:
        analyzer = morph
    # parse() returns candidate analyses; the first one is the one used here.
    return [analyzer.parse(word)[0].normal_form for word in text.split()]

In [99]:
# Lemmatize the first 1,000,000 texts: one list of lemmas per text.
lemmatized_data = [lemmatize(text) for text in df['text'].head(1_000_000)]

In [100]:
# Build the stop-word list: every whitespace-separated entry of the file,
# followed by each character of string.punctuation as its own entry.
with open('./data/stopwords-ru.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = stopwords_file.read().split()
stopwords.extend(punctuation)

In [102]:
# Remove stop words (and stand-alone punctuation tokens) from every sentence.
# Each sentence is rebuilt instead of edited in place: calling list.remove()
# while iterating over the same list makes the iterator skip the element that
# follows each removal, so consecutive stop words would be left behind.
# A set gives constant-time membership tests instead of a scan of the list.
stopword_set = set(stopwords)
lemmatized_data = [
    [word for word in sentence if word not in stopword_set]
    for sentence in lemmatized_data
]

In [55]:
# Drop rare words: keep a token only if it occurs at least MIN_WORD_COUNT
# times across the whole corpus.
# NOTE(review): frequency is measured corpus-wide on the assumption that this
# step is meant to prune rare vocabulary; counting inside a single sentence
# would discard almost every token. Confirm the intended scope and threshold.
# Sentences are rebuilt rather than mutated while being iterated, because
# list.remove() inside a for-loop over the same list skips elements.
MIN_WORD_COUNT = 5
corpus_word_counts = Counter(
    word for sentence in lemmatized_data for word in sentence
)
lemmatized_data = [
    [word for word in sentence if corpus_word_counts[word] >= MIN_WORD_COUNT]
    for sentence in lemmatized_data
]

In [103]:
# Inputs for the LDA model: the gensim Dictionary built from the token lists,
# and one doc2bow() vector per document.
dictionary = Dictionary(documents=lemmatized_data)
corpus = list(map(dictionary.doc2bow, lemmatized_data))


# Corpus-wide token frequencies, used for the CSV export.
words_counter = Counter()
for sentence in lemmatized_data:
    words_counter.update(sentence)


In [104]:
# Write the word-frequency table to a CSV file, most frequent words first.
# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate '\n' on write end up with a blank row after
# every record.
with open('word_frequency.csv', 'w', encoding='utf-8', newline='') as csvfile:
    columns = ['word', 'frequency']
    writer = csv.writer(csvfile)
    writer.writerow(columns)
    writer.writerows(words_counter.most_common())

In [110]:
# Train the LDA topic model.
# A fixed random_state seeds the model's random initialisation so reruns start
# from the same state.
# NOTE(review): multi-worker training may still introduce run-to-run
# variation; confirm against the gensim documentation if exact
# reproducibility matters.
NUM_TOPICS = 50
RANDOM_SEED = 42
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_SEED,
)

In [111]:
# Dump every topic of the model to a text file, one "(topic_id, terms)" tuple
# per line. print_topics() returns only 20 topics by default, so the model's
# full topic count is passed explicitly to match the file name.
with open(f'topics_{lda_model.num_topics}.txt', mode='w', encoding='utf-8') as f:
    for topic in lda_model.print_topics(num_topics=lda_model.num_topics):
        f.write(str(topic) + '\n')


In [112]:
# Render the interactive pyLDAvis view of the trained model inside the notebook.
pyLDAvis.enable_notebook()
vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    mds='mmds',
)
pyLDAvis.display(vis_data)