In [14]:
import os
import pandas as pd
from sklearn.utils import shuffle
from stop_words import get_stop_words
from string import punctuation


import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

In [15]:
DATA = os.path.join(os.getcwd(),'data.xlsx')

df = pd.read_excel(DATA)
df = shuffle(df)
df = df[df['Краткое описание'].notnull()]

category_encoding = dict(zip(list(df['Категория'].unique()), [0,1,2,3]))

df['label'] = df['Категория']#.apply(lambda x: category_encoding[x])
df['desc'] = df['Краткое описание']
df = df.drop(labels=['Краткое описание', 'Категория'], axis=1)

punctuation += '№'

stopwords = get_stop_words('russian')

In [16]:
def preprocess_data(inp_str):
    inp_str = inp_str.lower()

    # replace redundant_signs
    for item in punctuation:
        inp_str = inp_str.replace(item, ' ')

    # replace digits
    inp_str = re.sub('\d', ' ', inp_str)
    inp_str = ' '.join(list(filter(None, inp_str.split(' '))))
    return inp_str

In [17]:
text_data = df['desc'].apply(preprocess_data)

In [18]:
txt = text_data.str.lower().str.cat(sep=' ')
words = nltk.word_tokenize(txt)
word_dist = nltk.FreqDist(words)
most_common_words = word_dist.most_common(25)
most_common_words

[('для', 4433),
 ('доступа', 4127),
 ('действия', 3983),
 ('срок', 3971),
 ('сотрудника', 3967),
 ('таб', 3525),
 ('в', 2534),
 ('пароль', 2471),
 ('sap', 2435),
 ('пароля', 1391),
 ('к', 1379),
 ('hr', 1250),
 ('доступы', 1222),
 ('закрыты', 1077),
 ('с', 910),
 ('обнулить', 907),
 ('работы', 861),
 ('обнуление', 836),
 ('увольнение', 810),
 ('jrm', 645),
 ('просьба', 578),
 ('ир', 569),
 ('прошу', 454),
 ('доступ', 383),
 ('володимирович', 369)]

In [19]:
from nltk.stem import SnowballStemmer
def tokenize(text):
    text = re.sub("[а-я]", " ", text)
    word_list = nltk.word_tokenize(text)
    stemmer = SnowballStemmer("russian")
    stems = [stemmer.stem(word) for word in word_list]
    return stems

In [20]:
vectorizer = CountVectorizer(tokenizer=tokenize, stop_words = most_common_words)
data_features = vectorizer.fit_transform(text_data)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(
        df['desc'], 
        df['label'],
        test_size=0.2, 
        random_state=42)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('vectorizer', vectorizer), 
                     ('classifier', LogisticRegression())])
model = pipeline.fit(X=X_train, y=y_train)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print('accuracy:', accuracy_score(y_test, y_pred))

                                  precision    recall  f1-score   support

                Обнуление пароля       0.90      0.97      0.93      1013
             Отзыв доступов к ИС       0.97      0.96      0.96      1255
  Предоставление доступов в сеть       0.78      0.29      0.42        24
Сетевые папки - Файловые ресурсы       1.00      0.09      0.16        57

                     avg / total       0.94      0.93      0.92      2349

accuracy: 0.934014474244


In [24]:
import pickle


# save the classifier
with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(model, fid)
    
del model

In [25]:
with open('my_dumped_classifier.pkl', 'rb') as model_bin:
    model = pickle.load(model_bin)

In [26]:
test_sample = ['help me. please!']
test_sample1 = ['Срок действия доступа для сотрудника Бичков Андрій Валерійович  (таб.№ 50137450']

model.predict(test_sample1)

array(['Отзыв доступов к ИС'], dtype=object)