# Курсовой проект «Введение в обработку естественного языка» 

In [32]:
# !pip install pymorphy2
# !pip install stop_words
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[K     |████████████████████████████████| 646 kB 5.0 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.0-cp37-cp37m-linux_x86_64.whl size=391571 sha256=8ee8598d42a436dd8c4852d39bf265c1a9a5c71e23124a3f9aafc22283f8e027
  Stored in directory: /root/.cache/pip/wheels/4f/e8/1e/7cc9ebbfa87a3b9f8ba79408d4d31831d67eea918b679a4c07
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [33]:
import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
from pymorphy2 import MorphAnalyzer
from functools import lru_cache
import pickle
from stop_words import get_stop_words
import string
from gensim.models import FastText  # Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

import annoy
import re
import time

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
PATH_NLP = "/content/drive/MyDrive/Colab Notebooks/nlp/"
PATH_MODEL = "/content/drive/MyDrive/Colab Notebooks/nlp/models/"

## 1.1. Обучение разговорной модели

In [7]:
question = None
written = False
with open(PATH_NLP + "prepared_answers.txt", "w") as fout:
    with open(PATH_NLP + "Otvety.txt", "r") as fin:
        for line in tqdm_notebook(fin):
            if line.startswith("---"):
                written = False
                continue
            if not written and question is not None:
                fout.write(question.replace("\t", " ").strip() + "\t" + line.replace("\t", " "))
                written = True
                question = None
                continue
            if not written:
                question = line.strip()
                continue

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


0it [00:00, ?it/s]

In [11]:
@lru_cache(maxsize=128, typed=False)

def parse_morpher(text):
    return morpher.parse(text)[0].normal_form

In [12]:
def preprocess_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

In [13]:
morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)

### Токенизация текста

In [14]:
import os

In [15]:
%%time
sentences = []
c = 0 

with open(f'{PATH_NLP}Otvety.txt', "r", encoding="utf8") as fin:
    for line in tqdm_notebook(fin):
        spls = preprocess_txt(line)
        sentences.append(spls)   

        c += 1
        if c > 1000000:
            break

sentences = [i for i in sentences if len(i) > 2]    

with open(f'{PATH_MODEL}sentences.pkl', 'wb') as f:
    pickle.dump(sentences, f)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


0it [00:00, ?it/s]

CPU times: user 47min 21s, sys: 20.4 s, total: 47min 41s
Wall time: 49min 2s


In [16]:
with open(f'{PATH_MODEL}sentences.pkl', 'rb') as f:
    sentences = pickle.load(f)

# Обучение разговорных моделей (болталки)

In [17]:
def simple_tokenizer(x):
    return x

In [18]:
%%time
tfidf_vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer, lowercase=False, min_df=2)
tfidf_vectorizer.fit_transform(sentences)

idfs = {v[0]: v[1] for v in zip(tfidf_vectorizer.vocabulary_, tfidf_vectorizer.idf_)}
midf = np.mean(tfidf_vectorizer.idf_)

with open(f'{PATH_MODEL}idfs.pkl', 'wb') as f:
    pickle.dump(idfs, f)
    
with open(f'{PATH_MODEL}midf.pkl', 'wb') as f:
    pickle.dump(midf, f)

CPU times: user 7.97 s, sys: 60.8 ms, total: 8.03 s
Wall time: 8.43 s


In [19]:
with open(f'{PATH_MODEL}idfs.pkl', 'rb') as f:
    idfs = pickle.load(f)

In [20]:
with open(f'{PATH_MODEL}midf.pkl', 'rb') as f:
    midf = pickle.load(f)

### Обучение модели FastText



In [21]:
SIZE_EMB = 200

In [23]:
%%time
modelFT = FastText(sentences=sentences, size=SIZE_EMB, min_count=2, window=5, workers=8, seed=34)
modelFT.save(f'{PATH_MODEL}modelFT')

CPU times: user 12min 54s, sys: 10.2 s, total: 13min 4s
Wall time: 8min 18s


In [24]:
modelFT = FastText.load(f'{PATH_MODEL}modelFT')

### Загружаем ответы в модель приближенного поиска

In [35]:
%%time
t = time.perf_counter()

index_ft = annoy.AnnoyIndex(SIZE_EMB, 'angular')
index_map_ft = {}

counter = 0
c=0

with open(f'{PATH_NLP}prepared_answers.txt', "r", encoding="utf8") as f:
    for line in tqdm_notebook(f):
        n_ft = 0
        spls = line.split("\t")
        index_map_ft[counter] = re.sub(r'\<[^>]*\>', '', spls[1])
        question = preprocess_txt(spls[0])
        vector_ft = np.zeros(SIZE_EMB)
        
        for word in question:
            if word in modelFT:
                vector_ft += modelFT[word]
                n_ft += idfs.get(word, midf)
        if n_ft > 0:
            vector_ft = vector_ft / n_ft
        index_ft.add_item(counter, vector_ft)
        counter += 1

        c += 1
        if c > 1000000:
            break
        
index_ft.build(50)
print('Elapsed, s:', time.perf_counter() - t)

index_ft.save(f'{PATH_MODEL}index_ft')

# Сохраняем индекс вопросов из болталки
with open(f'{PATH_MODEL}index_map_ft.pkl', 'wb') as f:
    pickle.dump(index_map_ft, f)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


0it [00:00, ?it/s]



Elapsed, s: 6075.172832234
CPU times: user 1h 42min 34s, sys: 1min 9s, total: 1h 43min 43s
Wall time: 1h 41min 26s


In [36]:
ft_index = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index.load(f'{PATH_MODEL}index_ft')

True

In [37]:
with open(f'{PATH_MODEL}index_map_ft.pkl', 'rb') as f:
    index_map = pickle.load(f)

# 1.2 Обучение продуктовых моделей

### Поиск похожих товаров

#### Препроцессинг и векторизация продуктовых названий

In [None]:
# def products_preprocess_txt(line):
#     line = re.sub("[\x0b\x0c\s]", ' ', line.lower())
#     line = re.sub("(б\.у)|(б\/у)|(б\\у)", 'бу', line)
#     line = re.sub("[^а-яё \d\w]", ' ', line)

#     spls = "".join(i for i in line.strip() if i not in exclude).split()
#     spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
#     spls = [i for i in spls if i not in sw and i != ""]
#     return spls

In [38]:
%%time
shop_data = pd.read_csv(f'{PATH_NLP}ProductsDataset.csv')

shop_data['text'] = shop_data['title'] + " " + shop_data["descrirption"]
shop_data['text'] = shop_data['text'].apply(lambda x: preprocess_txt(str(x)))
shop_data.head(3)

CPU times: user 2min 45s, sys: 699 ms, total: 2min 45s
Wall time: 2min 46s


In [39]:
shop_data.head(3)

Unnamed: 0,title,descrirption,product_id,category_id,subcategory_id,properties,image_links,text
0,Юбка детская ORBY,"Новая, не носили ни разу. В реале красивей чем...",58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...,"[юбка, детский, orby, новый, носить, реал, кра..."
1,Ботильоны,"Новые,привезены из Чехии ,указан размер 40,но ...",5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...,"[ботильон, новыепривезти, чехия, указать, разм..."
2,Брюки,Размер 40-42. Брюки почти новые - не знаю как ...,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...,"[брюки, размер, 4042, брюки, новый, знать, мер..."


In [40]:
%%time
idxs = set(np.random.randint(0, len(index_map), len(shop_data)))
negative_texts = [" ".join(preprocess_txt(index_map[i])) for i in idxs]
positive_texts = [" ".join(val) for val in shop_data['text'].values]

CPU times: user 7min 56s, sys: 1.36 s, total: 7min 58s
Wall time: 8min


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = CountVectorizer(ngram_range=(1, 2))

In [42]:
dataset = negative_texts + positive_texts
labels = np.zeros(len(dataset))
labels[len(negative_texts):] = np.ones(len(positive_texts))

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, stratify=labels,
                                                    random_state=13)

In [44]:
%%time
x_train_vec = vectorizer.fit_transform(X_train)
x_test_vec = vectorizer.transform(X_test)

CPU times: user 10.7 s, sys: 609 ms, total: 11.3 s
Wall time: 11.4 s


In [45]:
with open(f'{PATH_MODEL}vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [46]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
with open(f'{PATH_MODEL}vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [47]:
lr = LogisticRegression().fit(x_train_vec, y_train)

In [48]:
with open(f'{PATH_MODEL}lr.pkl', 'wb') as f:
    pickle.dump(lr, f)

In [49]:
lr = LogisticRegression()
with open(f'{PATH_MODEL}lr.pkl', 'rb') as f:
    lr = pickle.load(f) 

In [50]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=lr.predict(x_test_vec))

0.9778754786555098

### Обучение модели TF-IDF

In [51]:
%%time
tfidf_vect_prod = TfidfVectorizer(lowercase=False, min_df=2)
tfidf_vect_prod.fit(X_train)

idfs_prod = {v[0]: v[1] for v in zip(tfidf_vect_prod.vocabulary_, tfidf_vect_prod.idf_)}
midf_prod = np.mean(tfidf_vect_prod.idf_)

with open(f'{PATH_MODEL}idfs_prod.pkl', 'wb') as f:
    pickle.dump(idfs_prod, f)
with open(f'{PATH_MODEL}midf_prod.pkl', 'wb') as f:
    pickle.dump(midf_prod, f)

CPU times: user 2.2 s, sys: 34 ms, total: 2.24 s
Wall time: 2.26 s


In [52]:
with open(f'{PATH_MODEL}idfs_prod.pkl', 'rb') as f:
    idfs_prod = pickle.load(f)

In [53]:
with open(f'{PATH_MODEL}midf_prod.pkl', 'rb') as f:
    midf_prod = pickle.load(f)

### Annoy. Алгоритм приблизительного поиска 

In [57]:
%%time
ft_index_shop = annoy.AnnoyIndex(SIZE_EMB ,'angular')
index_map_shop = {}
counter = 0

for i in tqdm_notebook(range(len(shop_data))):
    n_ft = 0
    index_map_shop[counter] = (shop_data.loc[i, "title"], shop_data.loc[i, "image_links"])
    vector_ft = np.zeros(SIZE_EMB)
    for word in shop_data.loc[i, "text"]:
        if word in modelFT:
            vector_ft += modelFT[word] * idfs.get(word, midf_prod)
            n_ft += idfs.get(word, midf_prod)
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    ft_index_shop.add_item(counter, vector_ft)
    counter += 1

ft_index_shop.build(50)
# ft_index_shop.save('shop.ann')
ft_index_shop.save(f'{PATH_MODEL}ft_index_shop')

with open(f'{PATH_MODEL}index_map_shop.pkl', 'wb') as f:
    pickle.dump(index_map_shop, f)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/35548 [00:00<?, ?it/s]

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


CPU times: user 32 s, sys: 1.95 s, total: 33.9 s
Wall time: 30.2 s


In [58]:
ft_index_shop = annoy.AnnoyIndex(SIZE_EMB, 'angular')
ft_index_shop.load(f'{PATH_MODEL}ft_index_shop') 

True

In [59]:
with open(f'{PATH_MODEL}index_map_shop.pkl', 'rb') as f:
    index_map_shop = pickle.load(f)