In [2]:
import pandas as pd
import pymorphy2
import re
import json
from collections import defaultdict, Counter
from math import log
from tqdm import tqdm

In [3]:
from flask import Flask, render_template, request, url_for

In [4]:
import numpy as np

In [5]:
from gensim.models import Word2Vec, KeyedVectors

# The Project on Information Search

In [6]:
def json_read(filename, encoding='utf-8'):
    with open(filename, 'r', encoding=encoding) as inf:
        res = json.load(inf)
        return res

def json_dump(obj, filename, ea=False, indent=4, encoding='utf-8'):
    with open(filename, 'w', encoding=encoding) as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

Downloading raw data for the corpus

In [7]:
data = pd.read_csv('./quora_question_pairs_rus.csv',encoding='utf-8')

In [8]:
#data.head()

In [9]:
corpus = [item for item in list(data['question1']) if type(item) != float]
corpus.extend([item for item in list(data['question2']) if type(item) != float])

Functions to get lemmas and do preprocessing

In [10]:
morph = pymorphy2.MorphAnalyzer()
def normalize(doc):
    tokens = re.sub('  ', ' ', re.sub(r'[^\w\s]','',doc)).split()
    lemmas = []
    for i, token in enumerate(tokens):
        tokens[i] = token.lower()
        lemmas.append(morph.parse(token)[0].normal_form)
    return lemmas

def preprocessing(corpus):
    raw_texts = {}
    morph = pymorphy2.MorphAnalyzer()
    num = 0
    for doc in corpus:
        if type(doc) != float:
            raw_texts[num] = normalize(doc)
        num += 1
    return raw_texts

In [11]:
#preprocessed = preprocessing(corpus)

In [12]:
# for i in preprocessed:
#     try:
#         json_dump(preprocessed[i], 'lemmatized_corpus.json')
#     except Exception as e:
#         print(e)
#         print(i, preprocessed[i])
#         break

In [13]:
# for i in preprocessed:
#     for j, word in enumerate (preprocessed[i]):
#         if word == 'español':
#             preprocessed[i][j] = 'espanol'

In [14]:
# json_dump(preprocessed, 'lemmatized_corpus.json')

Building inverted index + tf

In [15]:
# ind2 = defaultdict(dict)
# for i in tqdm(preprocessed):
#     for word in preprocessed[i]:
#         c = Counter(preprocessed[i])
#         ind2[word][int(i)] = c[word] / len(preprocessed[i]) #doc num: tf

In [16]:
#ind2['как']

по непонятным причинам в json не хотело ложиться слово español. поменяла:

In [17]:
#for i in ind2.keys():
#     try:
#         json_dump(list(i), 'keys.json')
#     except Exception as e:
#         print(e)
#         print(i)

In [18]:
# ind2['espanol'] = ind2['español']
# del ind2['español']

In [19]:
#'español' in ind2.keys()

In [None]:
#json_dump(ind2, 'index_tf.json')

Loading all saved data

In [None]:
ind_tf = json_read('index_tf.json')

In [None]:
preprocessed = json_read('lemmatized_corpus.json', encoding='utf-8')

In [None]:
#preprocessed['14']

In [None]:
lemma_corp = {int(key): preprocessed[key] for key in tqdm(preprocessed)}

100%|██████████████████████████████████████████████████████████████████████| 808555/808555 [00:02<00:00, 330288.70it/s]


In [None]:
#lemma_corp[0]

In [None]:
#corpus[0]

## TF-IDF and BM-25

Implementing tf and idf functions for TF-IDF and BM-25 search

In [None]:
def tf(term, doc_num, indexed):
    try:
        return indexed[term][str(doc_num)]
    except KeyError:
        return 0

def idf(term, corp, indexed):
    try:
        df = len(indexed[term]) 
    except KeyError:
        df = 0
    finally:
        return log((len(corp) - df + 0.5) / (df + 0.5))

In [None]:
#idf('перестать', lemma_corp)

In [None]:
#tf('президентство', 14)

In [None]:
#idf('президентство', lemma_corp)

avgld одна на весь корпус, поэтому посчитаем её просто в теле программы

In [None]:
avgld = sum([len(doc) for doc in lemma_corp.values()]) / len(lemma_corp)

In [None]:
#avgld

Implementing function that counts BM-25 metric for a term in a doc:

In [None]:
def bm25(term, doc_num, corp, indexed, k=2.0, b=0.75):
    return idf(term, corp, indexed) * (tf(term, doc_num, indexed) * (k + 1)/ (tf(term, doc_num, indexed) \
                                + k * (1 - b + b * (len(lemma_corp[doc_num])/avgld))))

узнаем, высоко ли ранжируется слово "президентство" в одном из запросов про Дональда Козыря))00)0)

In [None]:
#bm25('президентство', 14, lemma_corp)

## FastText

In [None]:
fttxt_model = KeyedVectors.load('models/fasttext/model.model')

Implementing a function for making a vector from a corpus item

In [None]:
def vectorize(model, doc):
    vecs = []
    for word in doc:
        try:
            model.wv[word]
        except AttributeError:
            continue
        vecs.append(model.wv[word])
    return np.mean(vecs, axis=0)

Building FastText vectors from the corpus

In [None]:
ft_matr = []
for i in tqdm(sorted(lemma_corp)):
    ft_matr.append(vectorize(fttxt_model, lemma_corp[i]))

  """
  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|█████████████████████████████████████████████████████████████████████████| 808555/808555 [21:43<00:00, 620.46it/s]


Implementing the cosine similarity function

In [None]:
def cos_sim(v1, v2):
    return np.inner(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

## Search

Implementing additional functions for search

In [None]:
def bm(query, corp, indexed):
    return [[sum([bm25(term, num, corp, indexed) for term in query]), num] for num in corp]

In [None]:
def tfidf(query, corp, indexed):
    return [[sum([tf(term, num, indexed) * idf(term, corp, indexed) for term in query]), num] for num in corp]

In [None]:
def ft(query, corp, indexed):
    ans = []
    for i in corp:
        cs = cos_sim(indexed, ft_matr[i])
        if type(cs) == np.ndarray:
            cs = 0
        ans.append([cs, i])
    return ans

Implementing search

In [None]:
def index(query, model):
    if model in (tfidf, bm):
        indexed = defaultdict(dict)
        for term in query:
            if term in ind_tf:
                indexed[term] = ind_tf[term]
            else:
                indexed[term] = {}
    else:
        indexed = vectorize(fttxt_model, query)
    return indexed
    

def search(query, model):
    tfs = index(query, model)
    return sorted(model(query, corp=lemma_corp, indexed=tfs), reverse=True)[:10]
    
    
def output(query, model):
    l_query = normalize(query)
    return [corpus[num] for num in [doc[1] for doc in search(l_query, model)]]

Adding it to the Flask functionality

In [None]:
app = Flask(__name__)

In [None]:
@app.route('/')
def form():
    if request.args:
        query = request.args['query']
        if request.args['model'] == 'tfidf':
            model = tfidf
        elif request.args['model'] == 'bm':
            model = bm
        elif request.args['model'] == 'fasttext':
            model = ft
        try:
            outlist = output(query, model)
        except Exception as e:
            outlist = e.split(' ')  # обработка 
        return render_template('result.html', output=outlist)
    return render_template('index.html')

In [None]:
%tb
if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

SyntaxError: invalid syntax (<ipython-input-1-2af1c2e2e6b4>, line 1)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [25/Oct/2019 10:31:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [25/Oct/2019 10:32:00] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [25/Oct/2019 10:33:44] "GET /?query=в+чем+смысл+жизни&model=tfidf HTTP/1.1" 200 -
127.0.0.1 - - [25/Oct/2019 10:34:55] "GET /?query=в+чем+смысл+жизни&model=bm HTTP/1.1" 200 -
  """
  
127.0.0.1 - - [25/Oct/2019 10:37:06] "GET /?query=в+чем+смысл+жизни&model=fasttext HTTP/1.1" 200 -
