In [1]:
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore
from operator import itemgetter
import re
from pymorphy2 import MorphAnalyzer
import pandas as pd

In [2]:
from nltk.corpus import names
import nltk
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Base Russian stop-word list from NLTK; later cells append to this list.
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sych_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Extend the NLTK stop-word list with the project-specific words stored in
# the 'stopword' column of stopwords.csv (semicolon-separated, UTF-8).
df = pd.read_csv('stopwords.csv', encoding = 'utf-8', sep=";")
# Only append words that are not in the list yet, and skip blank rows, so that
# re-running this cell does not keep growing russian_stopwords with duplicates.
known_stopwords = set(russian_stopwords)
for stopword in df['stopword'].dropna():
    if stopword not in known_stopwords:
        known_stopwords.add(stopword)
        russian_stopwords.append(stopword)

In [4]:
class LdaPredictor():
    """Score a raw text against a previously trained LDA topic model.

    The pipeline is: strip punctuation and digit-only tokens, lower-case,
    drop stop words (module-level ``russian_stopwords``), lemmatize with
    pymorphy2, append detected phrase tokens, convert to bag-of-words with
    the saved dictionary and query the LDA model.
    """

    # Every character listed here is replaced by a single space in ``clean``.
    # ``re.escape`` keeps the character class free of invalid string escapes
    # and accidental ranges; the set of characters is the same as before.
    _DELETED_SYMBOLS = re.compile(
        '[' + re.escape('\\\'[]!"$%&()*+,-./:;<=>?@^_`{|}~«»\n') + ']'
    )

    def __init__(self, lda_path, dict_path, bigram_path, trigram_path):
        """
        Load the dictionary and the LDA model and remember the phrase-model paths.

        lda_path - path to the saved LDA model
        dict_path - path to the saved dictionary
        bigram_path - path to the saved bigram phrase model
        trigram_path - path to the saved trigram phrase model

        param: lda_path str
        param: dict_path str
        param: bigram_path str
        param: trigram_path str
        """
        self.dictionary = corpora.Dictionary.load(dict_path)
        self.lda = LdaMulticore.load(lda_path)
        self.bigram_path = bigram_path
        self.trigram_path = trigram_path
        # Filled lazily on first use so that construction stays as cheap as before.
        self._lemmatizer = None
        self._phrases_cache = {}

    def _get_lemmatizer(self):
        """Return the shared MorphAnalyzer, creating it on first use."""
        lemmatizer = getattr(self, '_lemmatizer', None)
        if lemmatizer is None:
            lemmatizer = self._lemmatizer = MorphAnalyzer()
        return lemmatizer

    def _load_phrases(self, path):
        """Return the Phrases model stored at ``path``, loading it from disk only once."""
        cache = self.__dict__.setdefault('_phrases_cache', {})
        if path not in cache:
            cache[path] = Phrases.load(path)
        return cache[path]

    def to_lemmatize2(self, text):
        """
        Lemmatize a list of tokens.

        param: text list of str - tokens; each token must not contain whitespace,
               because the lookup table is built from the whitespace-split join
        return: tuple (lemmas joined by single spaces, array of unique input tokens)
        """
        all_word_list = " ".join(text).split()
        # dtype=object avoids pandas' default-dtype warning for an empty list.
        all_unique_word = pd.Series(all_word_list, dtype=object).unique()
        lemmatizer = self._get_lemmatizer()
        # Analyse every distinct token once; the first normal form is used.
        lemmatized_word_dict = {
            word: lemmatizer.normal_forms(word)[0] for word in all_unique_word
        }
        lemmatized_text = ' '.join(lemmatized_word_dict[word] for word in text)
        return lemmatized_text, all_unique_word

    def clean(self, text):
        """
        Turn a raw string into a list of lemmatized tokens.

        param: text str
        return: list of str; empty when nothing survives the filtering
        """
        text = self._DELETED_SYMBOLS.sub(' ', text)

        # Drop tokens consisting only of digits (split on single spaces, as before).
        text = ' '.join(elem for elem in text.split(' ') if not elem.isdigit())

        # Set membership instead of scanning the stop-word list for every token.
        stop_words = set(russian_stopwords)
        tokens = [token for token in text.lower().split() if token not in stop_words]
        if not tokens:
            # Previously this path returned [''] because ''.split(' ') == [''].
            return []

        lemmatized_text, _ = self.to_lemmatize2(tokens)
        return lemmatized_text.split()

    def bigram(self, text):
        """
        Append phrase tokens (those containing '_') to each document.

        For every document the bigram model's phrase tokens are appended first,
        then the trigram model is applied to the already extended document and
        its phrase tokens are appended as well.

        param: text list of list of str - tokenized documents (left unmodified)
        return: list of list of str - copies of the documents with phrase tokens appended
        """
        bigram = self._load_phrases(self.bigram_path)
        trigram = self._load_phrases(self.trigram_path)
        # Work on copies so the caller's lists are not mutated.
        text_clean = [list(doc) for doc in text]
        for doc in text_clean:
            # Materialise the matches before extending the document that is being read.
            doc.extend([token for token in bigram[doc] if '_' in token])
            doc.extend([token for token in trigram[doc] if '_' in token])
        return text_clean

    def predict(self, text):
        """
        Return the LDA model's output for ``text``, sorted by score.

        param: text str
        return: the items produced by ``self.lda[bow]``, sorted by their second
                element (the score) in descending order
        """
        clean_text = self.clean(text)
        bigram = self.bigram([clean_text])
        new_review_bow = self.dictionary.doc2bow(bigram[0])
        new_review_lda = self.lda[new_review_bow]
        return sorted(new_review_lda, reverse=True, key=itemgetter(1))

In [5]:
# Locations of the saved model artefacts, relative to the working directory.
lda_path = "./model/best_model.lda"
dict_path = "./model/dictionary.dict"
bigram_path = "./model/bigram.phs"
trigram_path = "./model/trigram.phs"

# Build the predictor; arguments are passed by name to make the pairing explicit.
lda = LdaPredictor(
    lda_path=lda_path,
    dict_path=dict_path,
    bigram_path=bigram_path,
    trigram_path=trigram_path,
)

In [7]:
# Sample message used to try out the predictor.
text = "Добрый день. Ничего не работает, все сломалось!!! Помогите мне."
predict = lda.predict(text)
print(predict)
# predict is sorted by score in descending order, so its first item is the best topic.
best_topic_id = predict[0][0]
lda.lda.print_topic(best_topic_id, topn=50)

[(0, 0.10157022), (5, 0.099726394), (9, 0.09570883), (1, 0.08199987), (4, 0.08175852), (6, 0.078894146), (3, 0.07884336), (7, 0.07805747), (2, 0.07728669), (11, 0.07597409), (10, 0.075552344), (8, 0.07462801)]


'0.020*"база" + 0.015*"висеть" + 0.013*"очень" + 0.012*"программа" + 0.011*"сервер" + 0.010*"перезагрузить" + 0.010*"зайти" + 0.009*"заработать" + 0.009*"долго" + 0.008*"минута" + 0.008*"весь_равно" + 0.008*"зависнуть" + 0.008*"почта" + 0.008*"терминал" + 0.007*"ничто_измениться" + 0.007*"ничто" + 0.007*"очень_долго" + 0.007*"ок" + 0.006*"открываться" + 0.006*"тест" + 0.006*"1с" + 0.006*"хороший" + 0.006*"перестать_работать" + 0.006*"рабочий_место" + 0.005*"dev" + 0.005*"зависать" + 0.005*"очень_медленно" + 0.005*"помочь" + 0.005*"упр" + 0.005*"вроде" + 0.005*"перезагружать" + 0.005*"запускаться" + 0.005*"перезагрузить_компьютер" + 0.005*"снова" + 0.004*"ещё" + 0.004*"очень_сильно" + 0.004*"компьютер" + 0.004*"вылетать" + 0.004*"измениться" + 0.004*"нормальный" + 0.003*"грузить" + 0.003*"стать" + 0.003*"место" + 0.003*"erp" + 0.003*"выкидывать" + 0.003*"чёрный_экран" + 0.003*"большой" + 0.003*"сильно" + 0.003*"перезагрузить_rc" + 0.003*"загружаться"'