### In this final assignment your goal is to process a large corpus of unstructured chat data, discover it's structure and build a Question Answering system based on it

In [1]:
import re
import os
import json
import nltk
import scipy
import pymystem3
import numpy as np
import pandas as pd

from nltk.tag import pos_tag
from nltk.stem.snowball import SnowballStemmer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import KeyedVectors
from itertools import chain
from tqdm import tqdm

### 3.1 Unzip and explore the data

In [2]:
# !unzip ./support_logs.zip

In [5]:
faq = open("./support_faq.txt", encoding='utf8').read()

In [6]:
raw_chats = open("./support_chats.txt", encoding='utf8').read()

In [7]:
# print(faq[:995])

In [8]:
def change_med(med_data):
    
    client_reply = med_data['client_reply']
    personal_reply = med_data['personal_reply']

    concat_client = []
    concat_personal = []
    
    summary = []

    if len(client_reply) != 1:
        for n,repl in enumerate(client_reply):
            if n == 0:
                client = repl[0]
            else:
                if repl[2] == client_reply[n-1][2]:
                    client += ' '
                    client += repl[0]
                else:
                    concat_client.append(client)
                    client = repl[0]
        concat_client.append(client)
    else:
        concat_personal = [personal_reply[0][2]]
        
    if len(personal_reply) != 1:
        for n,repl in enumerate(personal_reply):
            if n == 0:
                personal = repl[0]
            else:
                if repl[1] == personal_reply[n-1][1]:
                    personal += ' '
                    personal += repl[0]
                else:
                    concat_personal.append(personal)
                    personal = repl[0]
        concat_personal.append(personal)
    else:
        concat_personal = [personal_reply[0][2]]
        
    if len(concat_personal) > len(concat_client):
        for i in range(len(concat_personal) - len(concat_client)):
            concat_client.append('')
    elif len(concat_personal) < len(concat_client):
        for i in range(len(concat_client) - len(concat_personal)):
            concat_personal.append('')
    
    for v in range(len(concat_personal)):
        summary.append([concat_client[v], concat_personal[v]])
        
    return summary

In [9]:
data = []
for chat in tqdm(raw_chats.split('-- -- -- --')):
    client = 0
    personal = 0
    chat_id = -1
    client_id = -1
    med_data = {'client_reply':[], 'personal_reply':[]}
    for string in chat.split('\n'):
        try:
            if string.find('Chat number') != -1:
                chat_id = string.split('=')[1].strip()
            elif string.find('Client Id') != -1:
                client_id = string.split('=')[1].strip()
            elif string.find('Клиент') != -1:
                med_data['client_reply'].append([string.split('Клиент :')[1].strip(), client, personal])
                client += 1
            elif string.find('Сотрудник') != -1:
                med_data['personal_reply'].append([string.split('Сотрудник :')[1].strip(), client, personal])
                personal += 1
        except:
            pass
    try:
        for d in change_med(med_data):
            data.append([client_id, chat_id, d[0], d[1]])
    except:
        print(med_data, chat)

 22%|██▏       | 6178/28363 [00:00<00:01, 15434.38it/s]

{'client_reply': [], 'personal_reply': []}  Пересылаемое сообщение 
{'client_reply': [], 'personal_reply': []}  Пересылаемое сообщение 


100%|██████████| 28363/28363 [00:01<00:00, 14228.51it/s]

{'client_reply': [], 'personal_reply': []} 







In [10]:
dialogues = pd.DataFrame(data, columns=['client_id', 'chat_id', 'question', 'answer'])

In [14]:
lemmatizer = pymystem3.Mystem()
def tokenize_and_lemmatize(text):
    analysis = lemmatizer.analyze(text.strip())
    tokens = []
    for an in analysis:
        if 'analysis' in an:
            try:
                tokens.append(an['analysis'][0]['lex'])
            except IndexError:
                tokens.append(an['text'])
    return ' '.join(tokens)

In [15]:
dialogues['token_question'] = [tokenize_and_lemmatize(i) for i in tqdm(dialogues.question)]
dialogues['token_answer'] = [tokenize_and_lemmatize(i) for i in tqdm(dialogues.answer)]


  0%|          | 0/171017 [00:00<?, ?it/s][A
  0%|          | 1/171017 [00:00<44:44:53,  1.06it/s][A
  0%|          | 184/171017 [00:01<16:07, 176.60it/s][A
  0%|          | 406/171017 [00:01<08:00, 355.26it/s][A
  0%|          | 593/171017 [00:01<05:57, 477.18it/s][A
  0%|          | 817/171017 [00:01<04:39, 608.07it/s][A
  1%|          | 1002/171017 [00:01<04:05, 693.74it/s][A
  1%|          | 1200/171017 [00:01<03:38, 776.89it/s][A
  1%|          | 1449/171017 [00:01<03:12, 880.34it/s][A
  1%|          | 1645/171017 [00:01<03:00, 940.22it/s][A
  1%|          | 1863/171017 [00:01<02:47, 1006.92it/s][A
  1%|          | 2064/171017 [00:01<02:40, 1054.44it/s][A
  1%|▏         | 2261/171017 [00:02<02:34, 1089.18it/s][A
  1%|▏         | 2484/171017 [00:02<02:27, 1141.56it/s][A
  2%|▏         | 2683/171017 [00:02<02:23, 1174.20it/s][A
  2%|▏         | 2877/171017 [00:02<02:19, 1205.97it/s][A
  2%|▏         | 3071/171017 [00:02<02:17, 1225.32it/s][A
  2%|▏         | 3281/1

 16%|█▌        | 27105/171017 [00:15<01:20, 1780.66it/s][A
 16%|█▌        | 27319/171017 [00:15<01:20, 1783.02it/s][A
 16%|█▌        | 27532/171017 [00:15<01:20, 1785.25it/s][A
 16%|█▌        | 27757/171017 [00:15<01:20, 1788.22it/s][A
 16%|█▋        | 27965/171017 [00:15<01:19, 1788.33it/s][A
 16%|█▋        | 28174/171017 [00:15<01:19, 1790.21it/s][A
 17%|█▋        | 28394/171017 [00:15<01:19, 1792.72it/s][A
 17%|█▋        | 28602/171017 [00:15<01:19, 1793.73it/s][A
 17%|█▋        | 28810/171017 [00:16<01:19, 1795.50it/s][A
 17%|█▋        | 29033/171017 [00:16<01:18, 1798.18it/s][A
 17%|█▋        | 29244/171017 [00:16<01:18, 1797.79it/s][A
 17%|█▋        | 29444/171017 [00:16<01:18, 1798.11it/s][A
 17%|█▋        | 29640/171017 [00:16<01:18, 1798.37it/s][A
 17%|█▋        | 29832/171017 [00:16<01:18, 1797.71it/s][A
 18%|█▊        | 30020/171017 [00:16<01:18, 1798.20it/s][A
 18%|█▊        | 30217/171017 [00:16<01:18, 1799.19it/s][A
 18%|█▊        | 30407/171017 [00:16<01:

 32%|███▏      | 55378/171017 [00:29<01:01, 1884.64it/s][A
 33%|███▎      | 55597/171017 [00:29<01:01, 1885.68it/s][A
 33%|███▎      | 55821/171017 [00:29<01:01, 1886.82it/s][A
 33%|███▎      | 56034/171017 [00:29<01:00, 1886.92it/s][A
 33%|███▎      | 56241/171017 [00:29<01:00, 1886.42it/s][A
 33%|███▎      | 56439/171017 [00:29<01:00, 1886.49it/s][A
 33%|███▎      | 56635/171017 [00:30<01:00, 1886.33it/s][A
 33%|███▎      | 56827/171017 [00:30<01:00, 1886.23it/s][A
 33%|███▎      | 57018/171017 [00:30<01:00, 1885.61it/s][A
 33%|███▎      | 57210/171017 [00:30<01:00, 1885.70it/s][A
 34%|███▎      | 57397/171017 [00:30<01:00, 1884.73it/s][A
 34%|███▎      | 57618/171017 [00:30<01:00, 1885.74it/s][A
 34%|███▍      | 57822/171017 [00:30<01:00, 1886.25it/s][A
 34%|███▍      | 58027/171017 [00:30<00:59, 1886.70it/s][A
 34%|███▍      | 58225/171017 [00:30<00:59, 1886.50it/s][A
 34%|███▍      | 58433/171017 [00:30<00:59, 1887.13it/s][A
 34%|███▍      | 58631/171017 [00:31<00:

 49%|████▉     | 83626/171017 [00:43<00:45, 1913.40it/s][A
 49%|████▉     | 83827/171017 [00:43<00:45, 1913.09it/s][A
 49%|████▉     | 84023/171017 [00:43<00:45, 1913.17it/s][A
 49%|████▉     | 84218/171017 [00:44<00:45, 1912.45it/s][A
 49%|████▉     | 84404/171017 [00:44<00:45, 1911.79it/s][A
 49%|████▉     | 84613/171017 [00:44<00:45, 1912.16it/s][A
 50%|████▉     | 84802/171017 [00:44<00:45, 1911.80it/s][A
 50%|████▉     | 85024/171017 [00:44<00:44, 1912.47it/s][A
 50%|████▉     | 85220/171017 [00:44<00:44, 1912.56it/s][A
 50%|████▉     | 85444/171017 [00:44<00:44, 1913.27it/s][A
 50%|█████     | 85653/171017 [00:44<00:44, 1913.68it/s][A
 50%|█████     | 85861/171017 [00:44<00:44, 1914.02it/s][A
 50%|█████     | 86086/171017 [00:44<00:44, 1914.76it/s][A
 50%|█████     | 86298/171017 [00:45<00:44, 1915.15it/s][A
 51%|█████     | 86543/171017 [00:45<00:44, 1916.32it/s][A
 51%|█████     | 86764/171017 [00:45<00:43, 1915.31it/s][A
 51%|█████     | 86971/171017 [00:45<00:

 65%|██████▌   | 111717/171017 [00:57<00:30, 1928.34it/s][A
 65%|██████▌   | 111909/171017 [00:58<00:30, 1928.33it/s][A
 66%|██████▌   | 112099/171017 [00:58<00:30, 1927.70it/s][A
 66%|██████▌   | 112289/171017 [00:58<00:30, 1927.59it/s][A
 66%|██████▌   | 112472/171017 [00:58<00:30, 1927.39it/s][A
 66%|██████▌   | 112684/171017 [00:58<00:30, 1927.72it/s][A
 66%|██████▌   | 112876/171017 [00:58<00:30, 1927.69it/s][A
 66%|██████▌   | 113081/171017 [00:58<00:30, 1927.87it/s][A
 66%|██████▌   | 113276/171017 [00:58<00:29, 1927.30it/s][A
 66%|██████▋   | 113485/171017 [00:58<00:29, 1927.57it/s][A
 66%|██████▋   | 113692/171017 [00:58<00:29, 1927.80it/s][A
 67%|██████▋   | 113923/171017 [00:59<00:29, 1928.45it/s][A
 67%|██████▋   | 114130/171017 [00:59<00:29, 1928.52it/s][A
 67%|██████▋   | 114334/171017 [00:59<00:29, 1928.09it/s][A
 67%|██████▋   | 114536/171017 [00:59<00:29, 1928.24it/s][A
 67%|██████▋   | 114733/171017 [00:59<00:29, 1928.10it/s][A
 67%|██████▋   | 114948/

 81%|████████▏ | 139143/171017 [01:12<00:16, 1931.99it/s][A
 81%|████████▏ | 139332/171017 [01:12<00:16, 1931.94it/s][A
 82%|████████▏ | 139520/171017 [01:12<00:16, 1931.82it/s][A
 82%|████████▏ | 139726/171017 [01:12<00:16, 1931.97it/s][A
 82%|████████▏ | 139919/171017 [01:12<00:16, 1931.87it/s][A
 82%|████████▏ | 140134/171017 [01:12<00:15, 1932.17it/s][A
 82%|████████▏ | 140332/171017 [01:12<00:15, 1932.09it/s][A
 82%|████████▏ | 140527/171017 [01:12<00:15, 1931.78it/s][A
 82%|████████▏ | 140718/171017 [01:12<00:15, 1931.75it/s][A
 82%|████████▏ | 140912/171017 [01:12<00:15, 1931.74it/s][A
 83%|████████▎ | 141103/171017 [01:13<00:15, 1931.59it/s][A
 83%|████████▎ | 141305/171017 [01:13<00:15, 1931.69it/s][A
 83%|████████▎ | 141498/171017 [01:13<00:15, 1931.68it/s][A
 83%|████████▎ | 141690/171017 [01:13<00:15, 1931.35it/s][A
 83%|████████▎ | 141880/171017 [01:13<00:15, 1931.29it/s][A
 83%|████████▎ | 142074/171017 [01:13<00:14, 1931.28it/s][A
 83%|████████▎ | 142273/

 98%|█████████▊| 166968/171017 [01:25<00:02, 1941.73it/s][A
 98%|█████████▊| 167171/171017 [01:26<00:01, 1941.79it/s][A
 98%|█████████▊| 167366/171017 [01:26<00:01, 1941.53it/s][A
 98%|█████████▊| 167555/171017 [01:26<00:01, 1941.23it/s][A
 98%|█████████▊| 167765/171017 [01:26<00:01, 1941.40it/s][A
 98%|█████████▊| 167956/171017 [01:26<00:01, 1941.30it/s][A
 98%|█████████▊| 168182/171017 [01:26<00:01, 1941.66it/s][A
 98%|█████████▊| 168383/171017 [01:26<00:01, 1941.57it/s][A
 99%|█████████▊| 168600/171017 [01:26<00:01, 1941.84it/s][A
 99%|█████████▊| 168811/171017 [01:26<00:01, 1942.02it/s][A
 99%|█████████▉| 169016/171017 [01:27<00:01, 1941.84it/s][A
 99%|█████████▉| 169214/171017 [01:27<00:00, 1941.42it/s][A
 99%|█████████▉| 169409/171017 [01:27<00:00, 1941.43it/s][A
 99%|█████████▉| 169599/171017 [01:27<00:00, 1941.04it/s][A
 99%|█████████▉| 169784/171017 [01:27<00:00, 1940.93it/s][A
 99%|█████████▉| 169997/171017 [01:27<00:00, 1941.15it/s][A
100%|█████████▉| 170224/

  9%|▉         | 15590/171017 [00:12<02:02, 1270.15it/s][A
  9%|▉         | 15733/171017 [00:12<02:02, 1271.35it/s][A
  9%|▉         | 15868/171017 [00:12<02:01, 1271.94it/s][A
  9%|▉         | 16021/171017 [00:12<02:01, 1273.88it/s][A
  9%|▉         | 16158/171017 [00:12<02:01, 1274.00it/s][A
 10%|▉         | 16302/171017 [00:12<02:01, 1275.24it/s][A
 10%|▉         | 16440/171017 [00:12<02:01, 1274.70it/s][A
 10%|▉         | 16575/171017 [00:12<02:01, 1275.21it/s][A
 10%|▉         | 16712/171017 [00:13<02:00, 1275.78it/s][A
 10%|▉         | 16848/171017 [00:13<02:00, 1276.37it/s][A
 10%|▉         | 16983/171017 [00:13<02:00, 1276.48it/s][A
 10%|█         | 17116/171017 [00:13<02:00, 1275.44it/s][A
 10%|█         | 17244/171017 [00:13<02:00, 1275.26it/s][A
 10%|█         | 17399/171017 [00:13<02:00, 1277.24it/s][A
 10%|█         | 17535/171017 [00:13<02:00, 1277.17it/s][A
 10%|█         | 17710/171017 [00:13<01:59, 1280.56it/s][A
 10%|█         | 17856/171017 [00:13<01:

 20%|█▉        | 33857/171017 [00:26<01:47, 1270.86it/s][A
 20%|█▉        | 33988/171017 [00:26<01:47, 1271.00it/s][A
 20%|█▉        | 34113/171017 [00:26<01:47, 1270.41it/s][A
 20%|██        | 34235/171017 [00:26<01:47, 1270.08it/s][A
 20%|██        | 34356/171017 [00:27<01:47, 1269.84it/s][A
 20%|██        | 34477/171017 [00:27<01:47, 1269.56it/s][A
 20%|██        | 34610/171017 [00:27<01:47, 1269.72it/s][A
 20%|██        | 34744/171017 [00:27<01:47, 1269.95it/s][A
 20%|██        | 34871/171017 [00:27<01:47, 1269.44it/s][A
 20%|██        | 34994/171017 [00:27<01:47, 1269.12it/s][A
 21%|██        | 35136/171017 [00:27<01:47, 1269.62it/s][A
 21%|██        | 35270/171017 [00:27<01:46, 1269.85it/s][A
 21%|██        | 35417/171017 [00:27<01:46, 1270.56it/s][A
 21%|██        | 35552/171017 [00:27<01:46, 1270.57it/s][A
 21%|██        | 35685/171017 [00:28<01:46, 1270.06it/s][A
 21%|██        | 35813/171017 [00:28<01:46, 1269.96it/s][A
 21%|██        | 35940/171017 [00:28<01:

 30%|███       | 51951/171017 [00:40<01:33, 1269.85it/s][A
 30%|███       | 52087/171017 [00:41<01:33, 1270.05it/s][A
 31%|███       | 52214/171017 [00:41<01:33, 1269.84it/s][A
 31%|███       | 52339/171017 [00:41<01:33, 1269.47it/s][A
 31%|███       | 52463/171017 [00:41<01:33, 1269.39it/s][A
 31%|███       | 52595/171017 [00:41<01:33, 1269.50it/s][A
 31%|███       | 52727/171017 [00:41<01:33, 1269.61it/s][A
 31%|███       | 52854/171017 [00:41<01:33, 1269.11it/s][A
 31%|███       | 52992/171017 [00:41<01:32, 1269.34it/s][A
 31%|███       | 53118/171017 [00:41<01:32, 1269.15it/s][A
 31%|███       | 53250/171017 [00:41<01:32, 1269.24it/s][A
 31%|███       | 53377/171017 [00:42<01:32, 1268.76it/s][A
 31%|███▏      | 53499/171017 [00:42<01:32, 1268.52it/s][A
 31%|███▏      | 53619/171017 [00:42<01:32, 1268.26it/s][A
 31%|███▏      | 53748/171017 [00:42<01:32, 1268.28it/s][A
 32%|███▏      | 53884/171017 [00:42<01:32, 1268.48it/s][A
 32%|███▏      | 54010/171017 [00:42<01:

 41%|████      | 69936/171017 [00:55<01:19, 1264.46it/s][A
 41%|████      | 70068/171017 [00:55<01:19, 1264.54it/s][A
 41%|████      | 70190/171017 [00:55<01:19, 1264.32it/s][A
 41%|████      | 70317/171017 [00:55<01:19, 1264.33it/s][A
 41%|████      | 70439/171017 [00:55<01:19, 1264.17it/s][A
 41%|████▏     | 70560/171017 [00:55<01:19, 1264.04it/s][A
 41%|████▏     | 70686/171017 [00:55<01:19, 1264.00it/s][A
 41%|████▏     | 70812/171017 [00:56<01:19, 1263.98it/s][A
 41%|████▏     | 70935/171017 [00:56<01:19, 1263.90it/s][A
 42%|████▏     | 71064/171017 [00:56<01:19, 1263.93it/s][A
 42%|████▏     | 71189/171017 [00:56<01:18, 1263.80it/s][A
 42%|████▏     | 71313/171017 [00:56<01:18, 1263.75it/s][A
 42%|████▏     | 71443/171017 [00:56<01:18, 1263.80it/s][A
 42%|████▏     | 71571/171017 [00:56<01:18, 1263.82it/s][A
 42%|████▏     | 71718/171017 [00:56<01:18, 1264.17it/s][A
 42%|████▏     | 71850/171017 [00:56<01:18, 1264.04it/s][A
 42%|████▏     | 71979/171017 [00:56<01:

 51%|█████▏    | 87845/171017 [01:09<01:05, 1262.82it/s][A
 51%|█████▏    | 87988/171017 [01:09<01:05, 1263.06it/s][A
 52%|█████▏    | 88130/171017 [01:09<01:05, 1262.91it/s][A
 52%|█████▏    | 88265/171017 [01:09<01:05, 1262.97it/s][A
 52%|█████▏    | 88399/171017 [01:09<01:05, 1263.00it/s][A
 52%|█████▏    | 88541/171017 [01:10<01:05, 1263.22it/s][A
 52%|█████▏    | 88676/171017 [01:10<01:05, 1263.22it/s][A
 52%|█████▏    | 88809/171017 [01:10<01:05, 1263.24it/s][A
 52%|█████▏    | 88949/171017 [01:10<01:04, 1263.41it/s][A
 52%|█████▏    | 89082/171017 [01:10<01:04, 1263.40it/s][A
 52%|█████▏    | 89213/171017 [01:10<01:04, 1263.41it/s][A
 52%|█████▏    | 89343/171017 [01:10<01:04, 1263.22it/s][A
 52%|█████▏    | 89469/171017 [01:10<01:04, 1263.08it/s][A
 52%|█████▏    | 89609/171017 [01:10<01:04, 1263.25it/s][A
 52%|█████▏    | 89758/171017 [01:11<01:04, 1263.56it/s][A
 53%|█████▎    | 89892/171017 [01:11<01:04, 1263.57it/s][A
 53%|█████▎    | 90028/171017 [01:11<01:

 62%|██████▏   | 105796/171017 [01:23<00:51, 1263.26it/s][A
 62%|██████▏   | 105939/171017 [01:23<00:51, 1263.46it/s][A
 62%|██████▏   | 106072/171017 [01:23<00:51, 1263.42it/s][A
 62%|██████▏   | 106203/171017 [01:24<00:51, 1263.43it/s][A
 62%|██████▏   | 106348/171017 [01:24<00:51, 1263.64it/s][A
 62%|██████▏   | 106502/171017 [01:24<00:51, 1263.94it/s][A
 62%|██████▏   | 106641/171017 [01:24<00:50, 1264.08it/s][A
 62%|██████▏   | 106780/171017 [01:24<00:50, 1264.22it/s][A
 63%|██████▎   | 106925/171017 [01:24<00:50, 1264.43it/s][A
 63%|██████▎   | 107066/171017 [01:24<00:50, 1264.31it/s][A
 63%|██████▎   | 107206/171017 [01:24<00:50, 1264.47it/s][A
 63%|██████▎   | 107354/171017 [01:24<00:50, 1264.70it/s][A
 63%|██████▎   | 107493/171017 [01:24<00:50, 1264.65it/s][A
 63%|██████▎   | 107627/171017 [01:25<00:50, 1264.71it/s][A
 63%|██████▎   | 107760/171017 [01:25<00:50, 1264.63it/s][A
 63%|██████▎   | 107906/171017 [01:25<00:49, 1264.85it/s][A
 63%|██████▎   | 108054/

 72%|███████▏  | 123318/171017 [01:37<00:37, 1258.96it/s][A
 72%|███████▏  | 123443/171017 [01:38<00:37, 1258.72it/s][A
 72%|███████▏  | 123564/171017 [01:38<00:37, 1258.66it/s][A
 72%|███████▏  | 123690/171017 [01:38<00:37, 1258.66it/s][A
 72%|███████▏  | 123813/171017 [01:38<00:37, 1258.63it/s][A
 72%|███████▏  | 123945/171017 [01:38<00:37, 1258.69it/s][A
 73%|███████▎  | 124070/171017 [01:38<00:37, 1258.64it/s][A
 73%|███████▎  | 124195/171017 [01:38<00:37, 1258.63it/s][A
 73%|███████▎  | 124334/171017 [01:38<00:37, 1258.76it/s][A
 73%|███████▎  | 124463/171017 [01:38<00:36, 1258.51it/s][A
 73%|███████▎  | 124585/171017 [01:39<00:36, 1258.38it/s][A
 73%|███████▎  | 124708/171017 [01:39<00:36, 1258.34it/s][A
 73%|███████▎  | 124828/171017 [01:39<00:36, 1258.20it/s][A
 73%|███████▎  | 124946/171017 [01:39<00:36, 1258.06it/s][A
 73%|███████▎  | 125066/171017 [01:39<00:36, 1258.00it/s][A
 73%|███████▎  | 125184/171017 [01:39<00:36, 1257.84it/s][A
 73%|███████▎  | 125311/

 82%|████████▏ | 140386/171017 [01:52<00:24, 1252.91it/s][A
 82%|████████▏ | 140511/171017 [01:52<00:24, 1252.84it/s][A
 82%|████████▏ | 140634/171017 [01:52<00:24, 1252.63it/s][A
 82%|████████▏ | 140752/171017 [01:52<00:24, 1252.43it/s][A
 82%|████████▏ | 140866/171017 [01:52<00:24, 1252.31it/s][A
 82%|████████▏ | 140989/171017 [01:52<00:23, 1252.29it/s][A
 83%|████████▎ | 141125/171017 [01:52<00:23, 1252.38it/s][A
 83%|████████▎ | 141248/171017 [01:52<00:23, 1252.36it/s][A
 83%|████████▎ | 141377/171017 [01:52<00:23, 1252.37it/s][A
 83%|████████▎ | 141503/171017 [01:52<00:23, 1252.38it/s][A
 83%|████████▎ | 141628/171017 [01:53<00:23, 1252.23it/s][A
 83%|████████▎ | 141750/171017 [01:53<00:23, 1252.19it/s][A
 83%|████████▎ | 141877/171017 [01:53<00:23, 1252.21it/s][A
 83%|████████▎ | 142000/171017 [01:53<00:23, 1252.11it/s][A
 83%|████████▎ | 142121/171017 [01:53<00:23, 1252.07it/s][A
 83%|████████▎ | 142259/171017 [01:53<00:22, 1252.18it/s][A
 83%|████████▎ | 142385/

 92%|█████████▏| 157865/171017 [02:06<00:10, 1252.49it/s][A
 92%|█████████▏| 157993/171017 [02:06<00:10, 1252.36it/s][A
 92%|█████████▏| 158116/171017 [02:06<00:10, 1252.25it/s][A
 93%|█████████▎| 158236/171017 [02:06<00:10, 1252.17it/s][A
 93%|█████████▎| 158364/171017 [02:06<00:10, 1252.19it/s][A
 93%|█████████▎| 158486/171017 [02:06<00:10, 1252.06it/s][A
 93%|█████████▎| 158623/171017 [02:06<00:09, 1252.12it/s][A
 93%|█████████▎| 158746/171017 [02:06<00:09, 1252.05it/s][A
 93%|█████████▎| 158883/171017 [02:06<00:09, 1252.13it/s][A
 93%|█████████▎| 159044/171017 [02:06<00:09, 1252.42it/s][A
 93%|█████████▎| 159180/171017 [02:07<00:09, 1252.48it/s][A
 93%|█████████▎| 159330/171017 [02:07<00:09, 1252.65it/s][A
 93%|█████████▎| 159472/171017 [02:07<00:09, 1252.78it/s][A
 93%|█████████▎| 159625/171017 [02:07<00:09, 1252.99it/s][A
 93%|█████████▎| 159769/171017 [02:07<00:08, 1253.00it/s][A
 94%|█████████▎| 159908/171017 [02:07<00:08, 1252.85it/s][A
 94%|█████████▎| 160038/

In [16]:
dialogues.to_csv('diag.csv',sep=';',index=False)

In [17]:
# dialogues[dialogues.question.str.find('?')==-1][:10]

In [18]:
# dialogues[dialogues.question.str.find('?')!=-1].shape, dialogues[dialogues.question.str.find('?')==-1].shape

In [19]:
# dialogues.answer.nunique()

In [20]:
# dialogues[dialogues.question.str.find('?')!=-1][:10]

In [21]:
# tfidf = TfidfVectorizer()

In [22]:
# tfidf.fit(list(dialogues.question) + list(dialogues.answer))

In [23]:
# x = tfidf.transform(dialogues.question)

In [24]:
# res = []
# for i in range(5,50):
#     print(i)
#     km = KMeans(n_clusters=i)
#     km.fit_predict(x)
#     res.append(np.sqrt(km.inertia_))

array([0, 0, 0, ..., 0, 0, 0])

### The data consists of 2 files. The first one contains raw logs of some bank's customer support. The second one contains frequently asked questions with answers. 

### 3.2 Building a structured FAQ

### A good place to start would be to turn the FAQ into a more structured form, then find paraphrases for each question from the chat logs.

In [25]:
faq_qna = []
buf = []
for line in faq.split("\n"):
    if line == '' and len(buf):
        faq_qna.append(buf)
        buf = []
    else:
        buf.append(line)
        
FAQ = []
for faq_entry in faq_qna:
    faq_dict = {}
    faq_dict['answer'] = faq_entry[1]
    faq_dict['question'] = faq_entry[0]
    faq_dict['paraphrased_questions'] = []
    FAQ.append(faq_dict)

In [35]:
faq_data = []
for i in tqdm(FAQ):
    faq_data.append([tokenize_and_lemmatize(i['question']), tokenize_and_lemmatize(i['answer'])])


  0%|          | 0/171 [00:00<?, ?it/s][A
 33%|███▎      | 56/171 [00:00<00:00, 549.88it/s][A
 65%|██████▍   | 111/171 [00:00<00:00, 546.99it/s][A
 98%|█████████▊| 168/171 [00:00<00:00, 554.38it/s][A
100%|██████████| 171/171 [00:00<00:00, 555.37it/s][A

In [36]:
pd.DataFrame(faq_data,columns=['question', 'answer']).to_csv('faq.csv',sep=';',index=False)

In [32]:
faq_train = json.load(open("./faq_train.json"))
faq_test = json.load(open("./faq_val.json"))

In [37]:
faq_data = []
for i in tqdm(faq_train):
    faq_data.append([tokenize_and_lemmatize(i['question']), tokenize_and_lemmatize(i['answer'])])
    for j in i['paraphrased_questions']:
        faq_data.append([tokenize_and_lemmatize(j), tokenize_and_lemmatize(i['answer'])])
    
pd.DataFrame(faq_data,columns=['question', 'answer']).to_csv('faq_test.csv',sep=';',index=False)

faq_data = []
for i in tqdm(faq_test):
    faq_data.append([tokenize_and_lemmatize(i['question']), tokenize_and_lemmatize(i['answer'])])
    for j in i['paraphrased_questions']:
        faq_data.append([tokenize_and_lemmatize(j), tokenize_and_lemmatize(i['answer'])])
    
pd.DataFrame(faq_data,columns=['question', 'answer']).to_csv('faq_val.csv',sep=';',index=False)


  0%|          | 0/30 [00:00<?, ?it/s][A
  7%|▋         | 2/30 [00:00<00:01, 17.48it/s][A
 20%|██        | 6/30 [00:00<00:00, 26.67it/s][A
 33%|███▎      | 10/30 [00:00<00:00, 28.13it/s][A
 43%|████▎     | 13/30 [00:00<00:00, 27.77it/s][A
 60%|██████    | 18/30 [00:00<00:00, 26.29it/s][A
 70%|███████   | 21/30 [00:00<00:00, 25.15it/s][A
 80%|████████  | 24/30 [00:01<00:00, 23.94it/s][A
100%|██████████| 30/30 [00:01<00:00, 26.87it/s][A
[A
  0%|          | 0/30 [00:00<?, ?it/s][A
 17%|█▋        | 5/30 [00:00<00:00, 43.43it/s][A
 37%|███▋      | 11/30 [00:00<00:00, 47.63it/s][A
 60%|██████    | 18/30 [00:00<00:00, 48.25it/s][A
 73%|███████▎  | 22/30 [00:00<00:00, 46.48it/s][A
 90%|█████████ | 27/30 [00:00<00:00, 46.05it/s][A
100%|██████████| 30/30 [00:00<00:00, 49.27it/s][A

In [46]:
FAQ

[{'answer': 'Да , если банкомат поддерживает данную функцию .',
  'paraphrased_questions': [],
  'question': '1 . Могу ли я поменять ПИН-код своей карты <bankname> Банка в банкомате ?'},
 {'answer': 'Расплачиваться картой и снимать наличные можно в любой валюте .',
  'paraphrased_questions': [],
  'question': '2 . В каких валютах можно расплачиваться картой <bankname> Банка ?'},
 {'answer': 'Да , возможно . Для этого вам необходимо в магазине , в котором вы приобретаете товар , попросить предоставить дополнительный чек Tax Free . На данный чек необходимо поставить штамп на таможне . Возврат средств по чекам Tax Free возможен в международном аэропорту ( в стране , где совершалась покупка ) в специальном пункте возврата НДС .',
  'paraphrased_questions': [],
  'question': '3 . При оплате товаров за границей , я оплачиваю НДС в казну другого государства , возможно ли вернуть эти средства ?'},
 {'answer': 'Получать выписку по ссылке в письме так же безопасно , как получать выписку в виде в

### From now on, you are on your own. 

![architecture](showme.png)

### 3.3 Find paraphrases for existing questions

### 3.4 Find clusters of frequent questions