In [14]:
import os
import re

import pandas as pd
import numpy as np
from joblib import dump

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import BallTree
from sklearn.base import BaseEstimator
from sklearn.pipeline import make_pipeline

In [15]:
scout_name_list = [
    'argentum',
    'ckayt',
    'cкaут',
    'cкаут',
    'scout',
    'scout33',
    'scoutуроки',
    'xek',
    '}{ek',
    'вельзeвул',
    'вельзевул',
    'мастер}{ek',
    'мастер}{ек',
    'скаут',
    'скаут|ушел',
    'скаутболеет'
]

friend_black_list = [
    'bot',
]


def filter_name(name):
    name = re.sub(r'[-\[\]_\^\x03\x0f]+[\d]*', r'', name)
    return name


def filter_text(text):
    text = re.sub(r'[\x02\x03\x95\x0f\x16\x1f\x7f]+[,\d]*', '', text)
    if len(set(re.sub(r'\W', r'', text))) < 3:
        return ''
    return text


def softmax(x):
    proba = np.exp(-x)
    return proba / sum(proba)


class NeighborSampler(BaseEstimator):
    def __init__(self, k=5, temperature=1.0):
        self.k = k
        self.temperature = temperature
    def fit(self, X, y):
        self.tree_ = BallTree(X)
        self.y_ = np.array(y)
    def predict(self, X, random_state=None):
        distances, indices = self.tree_.query(X, return_distance=True, k=self.k)
        result = []
        for distance, index in zip(distances, indices):
            result.append(np.random.choice(index, p=softmax(distance * self.temperature)))
        return self.y_[result]

In [16]:
# Преобразуем логи переписок в список диалогов
# Должны остаться только диалоги со Скаутом
# Предусмотреть отчистку от спама

file_dir = 'messages_log'
regex_message = re.compile(r'<(\S+)>\s([\s\S]+)$')

files = os.listdir(file_dir)
dialog_list = []
for file in files:
    prev_name = ''
    prev_text = ''
    scout_name = ''
    friend_name = ''
    question = []
    answer = []
    filepath = os.path.join(file_dir, file)
    if not file.startswith('#'):
        f = open(filepath, 'r', encoding='cp1251', errors='ignore')
        for line in f.readlines():
            line = line.strip().lower()
            search_message = re.search(regex_message, line)
            if search_message:
                name = filter_name(search_message.group(1))
                text = filter_text(search_message.group(2))

                # spam filter
                if name == prev_name and text==prev_text:
                    continue

                if name != prev_name and question != [] and answer != []:
                    dialog_list.append([
                        friend_name,
                        ' '.join(question),
                        scout_name,
                        ' '.join(answer),
                    ])
                    question = []
                    answer = []
                
                if name in friend_black_list:
                    continue
                elif name not in scout_name_list and text:
                    friend_name = name
                    question.append(text)
                elif name in scout_name_list and text:
                    if question == []:
                        continue
                    scout_name = name
                    answer.append(text)
                prev_name = name
                prev_text = text

len(dialog_list)

51962

In [17]:
# Список в датафрейм
df = pd.DataFrame(dialog_list, columns = ['friend', 'question', 'scout', 'answer'])
df.shape

(51962, 4)

In [18]:
# Фильтрация коротких и длинных сообщений
df = df[(df.answer.str.len() < 200) & (df.question.str.len() < 200)]
df = df[(df.answer.str.len() > 2) & (df.question.str.len() > 2)]
df.shape

(51030, 4)

In [19]:
# Векторизация текстов
vectorizer = TfidfVectorizer()
vectorizer.fit(df.question)
matrix_big = vectorizer.transform(df.question)
matrix_big.shape

(51030, 35551)

In [20]:
# Сокращение размерности
svd = TruncatedSVD(n_components=1000)
svd.fit(matrix_big)
matrix_small = svd.transform(matrix_big)
# Процент сжатия
svd.explained_variance_ratio_.sum()

0.5594199232659739

In [21]:
# Поиск ближайших соседей
ns = NeighborSampler()
ns.fit(matrix_small, df.answer)
pipe = make_pipeline(vectorizer, svd, ns)

In [22]:
# Тест модели
pipe.predict(['Привет скаут!'])

array(['привет'], dtype=object)

In [23]:
# Сохраняем модель
dump(pipe, 'pipe.joblib')

['pipe.joblib']