In [105]:
import numpy as np
import pandas as pd

from nltk.corpus import names
import nltk; nltk.download('stopwords')
# NLTK Stop words
from nltk.corpus import stopwords

import re

from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt


n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/artemzraev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [95]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import dill

In [96]:
data = pd.read_csv("Заявки в чатбот.csv", encoding = 'utf-16', sep=";")
data.tail(20)

Unnamed: 0,mesTExt
20136,Добрый день! В базе Д11 по арт 449499 Свинина ...
20137,"Добрый день. Не работает апс Выключаешь его, в..."
20138,Добрый день.В автозаказе поставщик Новозеланск...
20139,Добрый день Не смогу сменить пароль nan но я в...
20140,Доброго утра: Поставщик Ист Лоджистикал. Подтв...
20141,Добрый день Сегодня была смена юр.лица на самб...
20142,Доброе утро. \nПрошу перезагрузить RC-LOG59. С...
20143,"Здравствуйте! ПОдскажите, пожалуйста, как пере..."
20144,"Да Добрый день.\nНе могу свести заказ, не кото..."
20145,Добрый день! Прошу помочь в настройке работы п...


In [98]:
class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
        
    def get_stopwords(self):
        russian_stopwords = stopwords.words("russian")
        df_sw = pd.read_csv('stopwords.csv', encoding = 'utf-8', sep=";")
        for index, row in df_sw.iterrows():
            russian_stopwords.append(row['stopword'])
        return russian_stopwords
        
    def to_lemmatize2(self, df, key):
        all_word_str = " ".join(df[key])
        all_word_list = all_word_str.split()
        all_unique_word = pd.Series(all_word_list).unique()
        lemmatized_word_dict = {}
        lemmatizer = MorphAnalyzer()
        for word in all_unique_word:
            lemmatized_word_dict[word] = lemmatizer.normal_forms(word)[0]
        lemm_func = lambda text: ' '.join([lemmatized_word_dict[word] for word in text.split()])
        df[key] = df[key].apply(lemm_func)
        return df, all_unique_word
    
    def fit(self, X, y=None):
        return self
    def transform(self, X):

        X[self.key] = X[self.key].replace('—','-')
        
        #1. удаляем пунктуацию
        deleted_symbols = r'[\\\\\'[\]!"$%&()*+,-./:;<=>?№@^_`{|}~«»\n]'  
        func = lambda text : re.sub(deleted_symbols, ' ', str(text))
        X[self.key] = X[self.key].apply(func)
        
        #2. удалим смайлики
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
        func = lambda text : re.sub(emoji_pattern, ' ', str(text))
        X[self.key] = X[self.key].apply(func)
        
        #3. удалим отдельно стоящие цифры
        func = lambda text : ' '.join([elem for elem in str(text).split(' ') if elem.isdigit() == False])   
        X[self.key] = X[self.key].apply(func)
        
        #4. приводим к нижнему регистру
        X[self.key] = X[self.key].apply(lambda text : text.lower())
        
        #5. лемматизация (приводим слова к начальной форме)
        X, _ = self.to_lemmatize2(X, self.key)
        
        #6. удаляем стоп слова
        sw = self.get_stopwords()
        func = lambda text : ' '.join([elem for elem in str(text).split(' ') if elem not in sw and not elem in ['nan', np.nan]])   
        X[self.key] = X[self.key].apply(func)
        
        return X 
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        #приведем к виду списка списков, потом этот список списков пойдет в модель LDA
        lst = X[self.key].to_list()
        texts = []
        for i in range(len(lst)):    
            texts.append(lst[i].split(' '))
        return texts

In [103]:
#example
description = Pipeline([
                ('imputer', TextImputer('mesTExt', '')),
                ('selector', ColumnSelector(key='mesTExt'))
            ])

#description.fit(data)
#description.transform(data.iloc[:10])

In [126]:
pipeline = Pipeline([
    ('description', description),
    ('tfidf_vectorizer', TfidfVectorizer(max_df=0.95, min_df=2, analyzer=lambda x: x, 
                                   max_features=n_features,
                                   stop_words='english')),
    ('lda', LatentDirichletAllocation(n_components=n_components, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)),
])

model = pipeline.fit(data.iloc[:])

In [130]:
#сразу pandas dataframe сделаем
test_preds = pd.DataFrame(pipeline.transform(data.iloc[:10]))
test_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05
1,0.02653,0.02653,0.761178,0.02654,0.026525,0.026525,0.026529,0.026528,0.026586,0.02653
2,0.037421,0.037421,0.23478,0.148857,0.037421,0.037421,0.037421,0.03743,0.354405,0.037421
3,0.027139,0.027135,0.336122,0.027137,0.027121,0.027121,0.44682,0.027148,0.027134,0.027123
4,0.05,0.05,0.05,0.05,0.05,0.55,0.05,0.05,0.05,0.05
5,0.05,0.050004,0.050001,0.549987,0.05,0.05,0.050007,0.050001,0.05,0.05
6,0.029035,0.313047,0.276153,0.207598,0.029022,0.029022,0.029028,0.029031,0.029031,0.029033
7,0.027433,0.643955,0.096357,0.027412,0.027394,0.027394,0.027401,0.067842,0.027403,0.027409
8,0.035098,0.68418,0.035088,0.035112,0.035085,0.035085,0.035092,0.035087,0.035087,0.035086
9,0.026665,0.654423,0.02669,0.026664,0.026654,0.026671,0.026658,0.026664,0.026675,0.132237


Посмотреть на топ слова

In [128]:
print("\nTopics in LDA model:")
tf_feature_names = pipeline.steps[1][1].get_feature_names()
print_top_words(pipeline.steps[2][1], tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: отчёт зайти база программа хороший 1с запись посмотреть помочь весь учётный висеть виснуть подвисать получиться erp выкидывать разблокировать плохо сильно
Topic #1: пз подтверждение заказ мм упр реализация вопрос суворов весь документ эбуп долго проверить отгрузить проводиться уп прогружаться ждать открываться зависать
Topic #2: весь зайти пароль удалённый компьютер рабочий доступ подключиться почта войти это — стол удалёнка помочь ошибка просить получиться заявка работа
Topic #3: весь пко ошибка заработать ерп почта чек интернет эник печатать касса хабаровск норма печать благовещенск уйти маркета ру вроде нормальный
Topic #4: this ru whatsapp message system a is to client start with chat which allows mail dv nevada com https gmail
Topic #5:  закрывать pdf сэд задача спс шкотовый карточка покупатель сеть линк заявка артём понедельник денис возражение дв дмитрий половина находиться
Topic #6: касса камчатка марс кабинет выгрузка корма весь ошибка заявка св

In [129]:
data.iloc[1]

mesTExt    сломаться ноготь сломать ноготь сломать ноготь...
Name: 1, dtype: object