In [118]:
from gensim import corpora, models
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore
from operator import itemgetter
import re
from pymorphy2 import MorphAnalyzer
import pandas as pd

In [126]:
from nltk.corpus import names
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')

# Base Russian stop-word list from NLTK; it is a plain list, so it can be
# extended with additional words afterwards.
russian_stopwords = stopwords.words("russian")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sych_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
# Extend the stop-word list with the words in the 'stopword' column of stopwords.csv.
df = pd.read_csv('stopwords.csv', encoding='utf-8', sep=";")

# Skip blank rows (NaN) and words that are already in the list, so re-running
# this cell does not keep growing `russian_stopwords` with duplicates.
_known_stopwords = set(russian_stopwords)
for _word in df['stopword'].dropna():
    if _word not in _known_stopwords:
        _known_stopwords.add(_word)
        russian_stopwords.append(_word)

In [129]:
class LdaPredictor():
    """Infer LDA topic weights for a raw text.

    Loads a saved gensim dictionary and LDA model, and applies the same
    preprocessing steps before inference: symbol removal, digit and stop-word
    filtering, lemmatization, and bigram/trigram phrase detection.
    """

    # Characters that are replaced by a space before tokenization. Written as a
    # raw string (the previous non-raw literal relied on the invalid escape
    # ``\]``) and compiled once instead of on every call; the character set is
    # unchanged.
    _DELETED_SYMBOLS = re.compile(r"""[\\'\[\]!"$%&()*+,\-./:;<=>?@^_`{|}~«»\n]""")

    def __init__(self, lda_path, dict_path, bigram_path, trigram_path, stopwords=None):
        """
        lda_path - path to the saved LDA model
        dict_path - path to the saved dictionary
        bigram_path - path to the saved bigram Phrases model
        trigram_path - path to the saved trigram Phrases model
        stopwords - optional collection of stop words; when omitted, the
                    notebook-level ``russian_stopwords`` list is used

        param: lda_path str
        param: dict_path str
        param: bigram_path str
        param: trigram_path str
        param: stopwords iterable of str or None
        """
        self.dictionary = corpora.Dictionary.load(dict_path)
        self.lda = LdaMulticore.load(lda_path)
        self.bigram_path = bigram_path
        self.trigram_path = trigram_path
        self.stopwords = stopwords
        # Heavy helpers are created lazily on first use and then reused,
        # instead of being rebuilt / reloaded from disk on every prediction.
        self._bigram_model = None
        self._trigram_model = None
        self._morph = None

    def _get_morph(self):
        """Return the shared MorphAnalyzer, creating it on first use."""
        if self._morph is None:
            self._morph = MorphAnalyzer()
        return self._morph

    def _get_phrase_models(self):
        """Return (bigram, trigram) Phrases models, loading each from disk once."""
        if self._bigram_model is None:
            self._bigram_model = Phrases.load(self.bigram_path)
        if self._trigram_model is None:
            self._trigram_model = Phrases.load(self.trigram_path)
        return self._bigram_model, self._trigram_model

    def to_lemmatize2(self, text):
        """Lemmatize a list of tokens.

        param: text list of str - tokens to lemmatize
        return: tuple (lemmas joined by single spaces, array of unique tokens)
        """
        all_word_list = " ".join(text).split()
        # dtype=object keeps the result a plain object array, including for
        # an empty token list.
        all_unique_word = pd.Series(all_word_list, dtype=object).unique()
        lemmatizer = self._get_morph()
        # Each distinct token is analysed once; the first normal form is used.
        lemmatized_word_dict = {
            word: lemmatizer.normal_forms(word)[0] for word in all_unique_word
        }
        text = ' '.join(lemmatized_word_dict[word] for word in text)
        return text, all_unique_word

    def clean(self, text):
        """Turn a raw text into a list of lemmatized tokens.

        Steps: replace the listed symbols with spaces, drop space-separated
        chunks that consist only of digits, lowercase, remove stop words,
        lemmatize.

        param: text str - raw input text
        return: list of str - lemmatized tokens; an empty list when nothing
                survives the filtering
        """
        text = self._DELETED_SYMBOLS.sub(' ', text)
        text = ' '.join(elem for elem in text.split(' ') if not elem.isdigit())

        # A set gives constant-time membership checks; fall back to the
        # notebook-level list when no stop words were passed to __init__.
        stop_words = frozenset(
            russian_stopwords if self.stopwords is None else self.stopwords
        )
        tokens = [token for token in text.lower().split() if token not in stop_words]

        lemmatized, _ = self.to_lemmatize2(tokens)
        # NOTE(review): split() instead of split(' ') so that an empty result
        # is [] rather than ['']; confirm nothing downstream relied on the
        # empty-string token.
        return lemmatized.split()

    def bigram(self, text):
        """Append detected phrases to each tokenized document.

        param: text list of list of str - tokenized documents
        return: list of list of str - copies of the documents with the
                detected phrase tokens appended; the input lists are not
                modified
        """
        bigram, trigram = self._get_phrase_models()
        # Work on copies so the caller's token lists are left untouched.
        text_clean = [list(doc) for doc in text]
        for doc in text_clean:
            # A token containing '_' is treated as a detected phrase
            # (assumes the Phrases models use '_' as delimiter - TODO confirm).
            doc.extend([token for token in bigram[doc] if '_' in token])
            # As before, the trigram model sees the document with the bigram
            # tokens already appended.
            doc.extend([token for token in trigram[doc] if '_' in token])
        return text_clean

    def predict(self, text):
        """Return the LDA model output for a raw text, highest weight first.

        param: text str - raw input text
        return: list of pairs as produced by the LDA model, sorted by their
                second element in descending order
        """
        clean_text = self.clean(text)
        documents = self.bigram([clean_text])
        new_review_bow = self.dictionary.doc2bow(documents[0])
        new_review_lda = self.lda[new_review_bow]
        return sorted(new_review_lda, key=itemgetter(1), reverse=True)

In [130]:
# Locations of the saved artifacts the predictor is built from.
lda_path = "./model/best_model.lda"
dict_path = "./model/dictionary.dict"
bigram_path = "./model/bigram.phs"
trigram_path = "./model/trigram.phs"

lda = LdaPredictor(
    lda_path=lda_path,
    dict_path=dict_path,
    bigram_path=bigram_path,
    trigram_path=trigram_path,
)

In [131]:
# Sample request used to try the predictor out.
text = "Здравствуйте. Нужна помощь по лкп. На телефон не могу дозвониться. Нужны права на управление мерчендайзерами. В личном кабинете вкладки мерчендайзер нет"
predict = lda.predict(text)
print(predict)

# The prediction is sorted by weight, so its first entry is the top topic;
# show that topic's 50 highest-weighted terms.
best_topic_id, _best_topic_weight = predict[0]
lda.lda.print_topic(best_topic_id, topn=50)

[(5, 0.2751035), (9, 0.13903603), (4, 0.11002234), (6, 0.10841717), (3, 0.06604457), (2, 0.06021121), (11, 0.04744645), (10, 0.043350853), (1, 0.040084515), (7, 0.038676728), (0, 0.036911834), (8, 0.034694772)]


'0.051*"выдавать_ошибка" + 0.030*"выдавать" + 0.020*"фото_—" + 0.015*"ерп" + 0.015*"сервис_деск" + 0.014*"личный_кабинет" + 0.011*"фото" + 0.010*"ар00" + 0.010*"помочь" + 0.010*"вечер" + 0.010*"зайти" + 0.009*"ар00_ар00" + 0.007*"кабинет" + 0.006*"личный" + 0.006*"закрыть" + 0.005*"документ" + 0.005*"подключиться_удалёнка" + 0.005*"делать" + 0.005*"принтер" + 0.005*"ок" + 0.005*"просто" + 0.005*"деск" + 0.005*"программа" + 0.005*"сервис" + 0.005*"сэд" + 0.004*"вмс" + 0.004*"большой" + 0.004*"карточка_покупатель" + 0.004*"получиться" + 0.004*"печать" + 0.004*"вкладка" + 0.004*"написать" + 0.004*"самый" + 0.004*"портал" + 0.004*"закрывать" + 0.004*"кор_портал" + 0.004*"окно" + 0.004*"светлый" + 0.004*"1с" + 0.004*"заходить" + 0.004*"сделать" + 0.003*"заработать" + 0.003*"почта" + 0.003*"перезагрузка" + 0.003*"единый_сервис" + 0.003*"открываться" + 0.003*"решить" + 0.003*"нужно" + 0.003*"открыть" + 0.003*"печатать"'