In [122]:
from flask import Flask
from flask import render_template, request, redirect, url_for

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np 
import joblib
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import pymorphy2
import re
from nltk.corpus import stopwords 

import warnings
import nltk
from gensim.models.keyedvectors import KeyedVectors

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
# X = vect.fit_transform

In [2]:
fast_model = 'fasttext/model.model'
 
fasttext_model = KeyedVectors.load(fast_model)

In [120]:
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import WordPunctTokenizer

In [115]:
def preproc(text): 
    morph = pymorphy2.MorphAnalyzer()
    text = re.sub(r'[A-Za-z0-9<>В«В»\.!\(\)?,;:\-\"]', r'', text)
    text = WordPunctTokenizer().tokenize(text)
    stopword_list = set(stopwords.words('russian'))
    
    preproc_text = ''
    for w in text:
        if w not in stopword_list:
            new_w = morph.parse(w)[0].normal_form + ' '
            preproc_text += new_w

    return preproc_text

def getting_fasttext(filepath):
    fasttext_model = KeyedVectors.load(filepath)
    return fasttext_model

def sent_vectorizer(sent, model):
    if type(sent) != str:
        sent_vector = np.zeros((model.vector_size,))
        return sent_vector
    sent = sent.split()
    lemmas_vectors = np.zeros((len(sent), model.vector_size))
    for idx, lemma in enumerate(sent):
        if lemma in model.vocab: 
            lemmas_vectors[idx] = model[lemma] 
    sent_vector = lemmas_vectors.mean(axis=0) 
    return sent_vector 

In [73]:
warnings.filterwarnings("ignore")

w2v = getting_fasttext('fasttext/model.model')

In [5]:
def calc_metric(query, data):
    cos_sim = data.apply(lambda row: cosine_similarity(row.values.reshape(1, -1), query)[0][0], axis=1) #сложна
    cos_sim = pd.DataFrame(cos_sim, columns=['val'])
    best_cos_sim = cos_sim.nlargest(10, 'val')
    return best_cos_sim

In [70]:
def metric_bm25(query):
    df = pd.read_csv('bm25_index.csv', index_col=None)
    query = query.split(' ')
    lemmas_list = list(df.columns)
    query_bm25 = {}
    for lemma in lemmas_list:
        if lemma in query:
            query_bm25[lemma] = [1]
        else:
            query_bm25[lemma] = [0]
    query_bm25 = pd.DataFrame.from_dict(query_bm25)
    metric_value = calc_metric(query_bm25, df)       
    return metric_value

In [148]:
def metric_tf(query):
    df = pd.read_csv('tf_idf_index.csv', index_col=None)
    vectorizer = joblib.load('tf_idf_vectorizer.pkl')
    query_tfidf = vectorizer.transform([query])
    query_tfidf = pd.DataFrame(query_tfidf.toarray(), columns=vectorizer.get_feature_names())
    
    metric_value = calc_metric(query_tfidf, df)
    return metric_value

In [151]:
df = pd.read_csv('tf_idf_index.csv', index_col=None)

In [154]:
df.shape

(100, 399)

In [155]:
vectorizer = joblib.load('tf_idf_vectorizer.pkl')

In [156]:
query = 'рыба'
query_tfidf = vectorizer.transform([query])

In [159]:
query_tfidf.toarray().shape

(1, 408)

In [109]:
def new_metric_tf(query):
    global data
    docs = data['question2']
    vect = TfidfVectorizer()
    corpus = docs
    X = pd.read_csv('tf_idf_index.csv', index_col=None)
    query = vect.transform([preproc(query)]).transpose()
    e = X.dot(query)
    df_tfIdf = pd.DataFrame(e.toarray(), index=docs)
    dict_tfIdf = {}
    for i in range(len(docs)):
        dict_tfIdf[all_text[i][2]] = df_tfIdf[0][i]
    sorted_tfidf = sorted(dict_tfIdf.items(), key=operator.itemgetter(1))
    return sorted_tfidf[-10:]

In [149]:
primer_tf = metric_tf('геолог')
print (primer_tf)

ValueError: ('Incompatible dimension for X and Y matrices: X.shape[1] == 399 while Y.shape[1] == 408', 'occurred at index 0')

In [8]:
def metric_fast(query):
    df = pd.read_csv('fasttext_index.csv', index_col=None)
    sent_vector = sent_vectorizer(query, w2v)
    query_fasttext = np.asarray(sent_vector).reshape(1, -1)
    metric = calc_metric(query_fasttext, df)
    return metric

In [11]:
data = pd.read_csv("quora_question_pairs_rus.csv", index_col='Unnamed: 0')

In [137]:
def top_docs(result):
    q_dict ={}
    for idx, row in result.iterrows():
        for id_doc, doc in enumerate (data['question2']):
            if idx==id_doc:
#                 q_dict ={}
                q_dict[idx] = [doc, row.val]
#                 print (q_dict)
#                 top5_doc.append (q_dict)
    return (q_dict)

In [142]:
pr_res = top_docs(primer)

In [144]:
for i in pr_res:
    print(i, pr_res[i])

0 ['что произойдет, если правительство Индии украдет кохинор кох-и-ноор-алмаз назад', 0.0]
1 ['как повысить скорость интернета путем взлома через dns', 0.0]
2 ['найти остаток, когда математика 23 ^ 24 математика разделена на 24 23', 0.0]
3 ['какая рыба выживет в соленой воде', 0.0]
4 ['Я тройная луна-козерог и восхождение в козероге, что это говорит обо мне', 0.0]
5 ['что делает детей активными и далеки от телефонных и видеоигр', 0.0]
6 ['что я должен делать, чтобы быть великим геологом?', 0.0]
7 ['когда вы используете вместо', 0.0]
8 ['как я могу взломать motorola dcx3400 для бесплатного интернета', 0.0]
9 ['что некоторые технические специалисты могут рассказать о долговечности и надежности ноутбуков и их компонентов', 0.0]


In [128]:
primer

Unnamed: 0,val
6,0.944287
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
7,0.0
8,0.0
9,0.0


In [58]:
for idx, row in res.iterrows():
        res_dict[idx] = [row.sentence, row.val]

404288


In [140]:
primer = metric_bm25('что будет с геологией')
print (primer)

   val
0  0.0
1  0.0
2  0.0
3  0.0
4  0.0
5  0.0
6  0.0
7  0.0
8  0.0
9  0.0


In [40]:
print (type(primer))

<class 'pandas.core.frame.DataFrame'>


In [150]:
app = Flask(__name__)

@app.route('/')
def query():
    if request.args:
        query = request.args['user_query']
        type_metrics = request.args['type_metrics']
        if type_metrics == 'BM25':
            values = metric_bm25(query)
            metrics = 'Вы выбрали BM25 и вот посмотрите:'
            top10_doc = top_docs(values)
        elif type_metrics == 'tf':
            values = metric_tf(query,data)
            metrics = 'Вы выбрали TFIDF и вот посмотрите:'
            top10_doc = top_docs(values)
        elif type_metrics == 'fasttext':
            values = metric_fast(query)
            metrics = 'Вы выбрали Fasttext и вот посмотрите:'
            top10_doc = top_docs(values)
#         metrics = 'hop'
#         top5_docs = top_docs(values)
#         top5_doc = '12345'
        return render_template('result.html', query=query, metrics=metrics, top10_doc=top10_doc)
    return render_template('query.html')

# @app.route('/result')
# def fact():
#     return render_template('result.html', query=query, metrics=metrics, top5_doc=top5_doc)

if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [25/Oct/2019 12:36:35] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:36:46] "[37mGET /?user_query=геолог&type_metrics=BM25 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:37:03] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:37:11] "[37mGET /?user_query=геолог&type_metrics=fasttext HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:38:29] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:38:43] "[37mGET /?user_query=в+чем+смысл+жизни&type_metrics=BM25 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Oct/2019 12:38:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
[2019-10-25 12:38:51,734] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/flask/app.py", line 2311, in wsgi_app
    response = self.full_dispatch_request()
  File "/Library/Frameworks/Python.framework/Versions/3.7/l