In [39]:
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore
from operator import itemgetter
import re
from pymorphy2 import MorphAnalyzer
from pandas import Series

In [40]:
from nltk.corpus import names
import nltk
nltk.download('stopwords')
# NLTK stop words
from nltk.corpus import stopwords

# Start from NLTK's Russian stop-word list and extend it with
# project-specific entries (same entries, same order as before).
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend([
    'привет',
    'спасибо',
    'пожалуйста',
    'пока',
    'добрый',
    'день',
    'nan',
    'end',
    'утро',
    'ок',
    'здравствуйте',
    'мочь',
    'не',
    'работать',
    'сей',
    'пора',
    'очень',
    'проблема',
])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sych_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
class LdaPredictor():
    """Infer LDA topic distributions for raw text.

    Wraps a saved gensim dictionary and LDA model together with the
    preprocessing that runs in front of them: symbol stripping, stop-word
    removal, lemmatization and phrase (bigram/trigram) detection.
    """

    # Characters replaced by a space before tokenisation: backslash, quotes,
    # square brackets, ASCII punctuation, «», and newline. Compiled once and
    # written as a raw string with '-' escaped so the set is explicit.
    _DELETED_SYMBOLS = re.compile(r"""[\\'\[\]!"$%&()*+,\-./:;<=>?@^_`{|}~«»\n]""")

    def __init__(self, lda_path, dict_path, bigram_path, trigram_path):
        """
        lda_path - path to the saved LDA model
        dict_path - path to the saved dictionary
        bigram_path - path to the saved bigram phrase model
        trigram_path - path to the saved trigram phrase model

        param: lda_path str
        param: dict_path str
        param: bigram_path str
        param: trigram_path str
        """
        self.dictionary = corpora.Dictionary.load(dict_path)
        self.lda = LdaMulticore.load(lda_path)
        self.bigram_path = bigram_path
        self.trigram_path = trigram_path
        # Created / loaded lazily on first use and then reused, instead of
        # being rebuilt on every call.
        self._lemmatizer = None
        self._phrases_cache = {}

    def _get_lemmatizer(self):
        """Return the shared MorphAnalyzer, creating it on first use."""
        if self._lemmatizer is None:
            self._lemmatizer = MorphAnalyzer()
        return self._lemmatizer

    def _load_phrases(self, path):
        """Return the Phrases model stored at `path`, loading it from disk only once per path."""
        model = self._phrases_cache.get(path)
        if model is None:
            model = Phrases.load(path)
            self._phrases_cache[path] = model
        return model

    def to_lemmatize2(self, text):
        """
        Lemmatize one tokenised document.

        Each distinct word is lemmatized once (first normal form returned by
        the analyzer) and the result is reused for every occurrence.

        param: text list of str - tokens of one document
        return: tuple (lemmas joined by single spaces, array of the unique input words)
        """
        all_word_list = " ".join(text).split()
        all_unique_word = Series(all_word_list).unique()
        lemmatizer = self._get_lemmatizer()
        lemmatized_word_dict = {
            word: lemmatizer.normal_forms(word)[0] for word in all_unique_word
        }
        text = ' '.join(lemmatized_word_dict[word] for word in text)
        return text, all_unique_word

    def clean(self, text):
        """
        Turn raw text into a list of lemmatized tokens.

        Steps: replace the symbols in `_DELETED_SYMBOLS` with spaces, drop
        purely numeric tokens, lower-case, drop tokens found in the
        module-level `russian_stopwords`, lemmatize.

        param: text str - raw text; any other value is coerced with str() first
        return: list of str - lemmas; empty list when nothing survives cleaning
        """
        # Coerce before the regex runs so non-string input (e.g. NaN) does not
        # raise TypeError inside re.
        text = self._DELETED_SYMBOLS.sub(' ', str(text))

        text = ' '.join(elem for elem in text.split(' ') if not elem.isdigit())

        text = text.lower()
        # NOTE(review): stop words are removed BEFORE lemmatization, so an
        # entry in russian_stopwords only matches a token that already appears
        # in exactly that form. Order kept as-is - presumably the saved models
        # were built with the same preprocessing; confirm before changing.
        tokens = [token for token in text.split() if token not in russian_stopwords]

        text, _ = self.to_lemmatize2(tokens)
        # split() rather than split(' '): an empty document yields [] and not [''].
        return text.split()

    def bigram(self, text):
        """
        Append detected phrase tokens to every document, in place.

        For each document the bigram model and then the trigram model are
        applied; every resulting token containing '_' is appended to the
        document. The trigram model sees the document after the bigram pass.

        NOTE(review): because of that ordering, a '_' token appended by the
        bigram pass that the trigram model passes through is appended a second
        time. Kept unchanged - presumably it mirrors how the training corpus
        was prepared; confirm before changing.

        param: text list of list of str - tokenised documents (mutated)
        return: the same list object that was passed in
        """
        bigram = self._load_phrases(self.bigram_path)
        trigram = self._load_phrases(self.trigram_path)
        for doc in text:
            for phrase_model in (bigram, trigram):
                # Materialise the phrase tokens first; the document is only
                # extended after the model output has been fully consumed.
                phrase_tokens = [token for token in phrase_model[doc] if '_' in token]
                doc.extend(phrase_tokens)
        return text

    def predict(self, text):
        """
        Infer the topic distribution of one raw text.

        param: text str - raw text
        return: list of (topic_id, probability) pairs sorted by descending probability
        """
        clean_text = self.clean(text)
        bigram = self.bigram([clean_text])
        new_review_bow = self.dictionary.doc2bow(bigram[0])
        new_review_lda = self.lda[new_review_bow]
        return sorted(new_review_lda, reverse=True, key=itemgetter(1))

In [90]:
# Locations of the saved model artefacts (relative paths).
lda_path = "./model/best_model.lda"
dict_path = "./model/dictionary.dict"
bigram_path = "./model/bigram.phs"
trigram_path = "./model/trigram.phs"

# Build the predictor; arguments are passed by keyword so the path/role
# pairing is explicit.
lda = LdaPredictor(
    lda_path=lda_path,
    dict_path=dict_path,
    bigram_path=bigram_path,
    trigram_path=trigram_path,
)

In [105]:
# Sample message to classify.
text = "Здравствуйте. Нужна помощь по лкп. На телефон не могу дозвониться. Нужны права на управление мерчендайзерами. В личном кабинете вкладки мерчендайзер нет"
predict = lda.predict(text)
print(predict)
# predict() sorts by descending probability, so the first pair is the top topic.
top_topic_id = predict[0][0]
lda.lda.print_topic(top_topic_id, topn=10)

[(5, 0.27512696), (9, 0.13909876), (4, 0.10990869), (6, 0.10839632), (3, 0.06605234), (2, 0.06023523), (11, 0.047455013), (10, 0.043355893), (1, 0.04008576), (7, 0.038678057), (0, 0.036912337), (8, 0.034694623)]


'0.051*"выдавать_ошибка" + 0.030*"выдавать" + 0.020*"фото_—" + 0.015*"ерп" + 0.015*"сервис_деск" + 0.014*"личный_кабинет" + 0.011*"фото" + 0.010*"ар00" + 0.010*"помочь" + 0.010*"вечер"'