## ДЗ по поиску

Привет! Вам надо реализивать поисковик на базе вопросов-ответов с сайта [pravoved.ru](https://pravoved.ru/questions-archive/).        
Поиск должен работать на трех технологиях:       
1. обратном индексе     
2. word2vec         
3. doc2vec      

Вы должны понять, какой метод и при каких условиях эксперимента на этом корпусе работает лучше.          
Для измерения качества поиска найдите точность (accuracy) выпадания правильного ответа на конкретный вопрос (в этой базе у каждого вопроса есть только один правильный ответ). Точность нужно измерить для всей базы.    
При этом давайте считать, что выпал правильный ответ, если он попал в **топ-5** поисковой выдачи.

> Сделайте ваш поиск максимально качественным, чтобы значение точности стремилось к 1.     
Для этого можно поэкспериментировать со следующим:       
- модель word2vec (можно брать любую из опен сорса или обучить свою)
- способ получения вектора документа через word2vec: простое среднее арифметическое или взвешивать каждый вектор в соответствии с его tf-idf      
- количество эпох у doc2vec (начинайте от 100)
- предобработка документов для обучения doc2vec (удалять / не удалять стоп-слова)
- блендинг методов поиска: соединить результаты обратного индекса и w2v, или (что проще) w2v и d2v

In [560]:
import os
import json
from tqdm import tqdm_notebook
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
import pickle

In [561]:
from judicial_splitter import splitter

In [562]:
import warnings
warnings.filterwarnings('ignore')

In [1125]:
with open('qa_corpus.pkl', 'rb') as file:
    qa_corpus = pickle.load(file)

In [1126]:
questions = []
for q in qa_corpus:
    questions.append(q[0])

In [1127]:
answers = []
for a in qa_corpus:
    answers.append(a[1])

In [756]:
def write_data(filename, array):
    with open(filename, 'w', encoding='utf-8') as f:
        for item in array:
            f.write("%s\n" % item)

In [757]:
write_data('Questions.txt', questions)
write_data('Answers.txt', answers)

In [1337]:
mystem = Mystem()

In [1338]:
def preprocessing(input_text, del_stopwords=True, del_digit=True):
    """
    :input: raw text
        1. lowercase, del punctuation, tokenize
        2. normal form
        3. del stopwords
        4. del digits
    :return: lemmas
    """
    russian_stopwords = set(stopwords.words('russian'))
    words = [x.lower().strip(string.punctuation + '»«–…') for x in word_tokenize(input_text)]
    lemmas = [mystem.lemmatize(x)[0] for x in words if x]

    lemmas_arr = []
    for lemma in lemmas:
        if del_stopwords:
            if lemma in russian_stopwords:
                continue
        if del_digit:
            if lemma.isdigit():
                continue
        lemmas_arr.append(lemma)
        
    return lemmas_arr

In [676]:
def write_data(filename, data):
    with open(filename, 'w') as fout:
        json.dump(data, fout)

### Word2Vec

In [163]:
from gensim.models import Word2Vec

In [164]:
model_path = 'araneum_none_fasttextcbow_300_5_2018.model'
w2v_model = Word2Vec.load(model_path)

In [1181]:
model_path_sg = 'araneum_none_fasttextskipgram_300_5_2018.model'
w2v_sg_model = Word2Vec.load(model_path_sg)

In [1182]:
def get_w2v_vectors(model, lemmas):
    """Получает вектор документа"""
    vectors = []
    
    for lemma in lemmas:
        try:
            vector = model.wv[lemma]
            vectors.append(vector)
        except KeyError as e:
            continue
            
    mean = sum(vectors) / len(vectors)        
    
    return mean

In [1183]:
def save_w2v_base(model, answers):
    """Индексирует всю базу для поиска через word2vec"""
    w2v_result = []
    
    for answer in tqdm_notebook(answers):
        lemmas = preprocessing(answer)
        w2v_vectors = get_w2v_vectors(model, lemmas)
        answer_vector = {'answer_text': answer, 'w2v_vectors': w2v_vectors.tolist()}
        w2v_result.append(answer_vector)
    
    return w2v_result

In [167]:
w2v_res = save_w2v_base(w2v_model, answers)

A Jupyter Widget

In [1184]:
w2v_sg_res = save_w2v_base(w2v_sg_model, answers)

A Jupyter Widget

### Doc2Vec

In [168]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [169]:
def split_d2v(answers):
    d2v_paragraphs = []
    answer_text = {}
    
    for answer in tqdm_notebook(answers):
        answer_text[answer] = answer
        splitted_text = splitter(answer, 1)

        for text in splitted_text:
            lemmas = preprocessing(text, del_stopwords=False)
            d2v_paragraphs.append({'answer_text': answer, 'answer_lemmas': lemmas})
    
    return answer_text, d2v_paragraphs

In [170]:
d2v_a, d2v_p = split_d2v(answers)

A Jupyter Widget

In [225]:
def split_d2v_sw(answers):
    d2v_paragraphs = []
    answer_text = {}
    
    for answer in tqdm_notebook(answers):
        answer_text[answer] = answer
        splitted_text = splitter(answer, 1)

        for text in splitted_text:
            lemmas = preprocessing(text)
            d2v_paragraphs.append({'answer_text': answer, 'answer_lemmas': lemmas})
    
    return answer_text, d2v_paragraphs

In [226]:
d2v_a_sw, d2v_p_sw = split_d2v_sw(answers)

A Jupyter Widget

In [227]:
def train_doc2vec(input_data, epochs):
    d2v_data = [TaggedDocument(words=j['answer_lemmas'], tags=[str(i)])for i, j in enumerate(input_data)]

    model = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, min_count=0, workers=4, epochs=epochs)
    model.build_vocab(d2v_data)
    model.train(d2v_data, total_examples=model.corpus_count, epochs=model.epochs)
    
    return model

In [213]:
d2v_100_model = train_doc2vec(d2v_p, 100)

In [228]:
d2v_100_sw_model = train_doc2vec(d2v_p_sw, 100)

In [214]:
d2v_500_model = train_doc2vec(d2v_p, 500)

In [229]:
d2v_500_sw_model = train_doc2vec(d2v_p_sw, 500)

In [230]:
def get_d2v_vectors(model, lemmas):
    """Получает вектор документа"""
    d2v_vectors = model.infer_vector(lemmas)
    
    return d2v_vectors

In [231]:
def save_d2v_base(model, d2v_data):
    """Индексирует всю базу для поиска через word2vec"""
    d2v_result = []
    answer_vector = {}
    
    for dictionary in tqdm_notebook(d2v_data):
        d2v_vectors = get_d2v_vectors(model, dictionary['answer_lemmas'])
        answer_vector = {'answer_text': dictionary['answer_text'], 'd2v_vectors': d2v_vectors.tolist()}
        d2v_result.append(answer_vector)
    
    return d2v_result

In [385]:
d2v_100_res = save_d2v_base(d2v_100_model, d2v_p)

A Jupyter Widget

In [217]:
d2v_500_res = save_d2v_base(d2v_500_model, d2v_p)

A Jupyter Widget

In [232]:
d2v_100_sw_res = save_d2v_base(d2v_100_sw_model, d2v_p_sw)

A Jupyter Widget

In [233]:
d2v_500_sw_res = save_d2v_base(d2v_500_sw_model, d2v_p_sw)

A Jupyter Widget

In [176]:
write_data('Doc2Vec', d2v_res)

### Inverted Index

In [1130]:
def prepare_data(qa_corpus):
    answers_ii = {}
    questions_ii = {}

    for i, j in enumerate(qa_corpus):
        answers_ii[i] = j[1]
        questions_ii[i] = j[0]

    return answers_ii, questions_ii

In [1131]:
answers_ii, questions_ii = prepare_data(qa_corpus)

In [1132]:
def for_inv_idx(answers):
    ii_data = []
    lengths = {}
    
    for idx, answer in answers.items():
        answer = preprocessing(answer)
        lengths[idx] = len(answer)
        ii_data.append(' '.join(answer))
    
    avgdl = sum(lengths.values()) / len(lengths)
    
    return ii_data, lengths, avgdl

In [1133]:
ii_data, lengths, avgdl = for_inv_idx(answers_ii)

In [1165]:
dl1 = []
for t1, k1 in zip(answers_ii.values(), lengths.values()):
    dl1.append({'answer_text': t1, 'score': k1})

In [1166]:
def get_idf(answers, words, freqs):    
    idf_counter = {}
    N = len(answers)
    
    for idx, word in enumerate(words):
        n = 0
        for freq in freqs:
            count = freq[idx]
            if count != 0:
                n += 1
                
        idf = log((N - n + 0.5) / (n + 0.5))
        idf_counter[word] = idf
    
    return idf_counter

In [1173]:
def get_inv_index(answers):
    ii_data, lengths, avgdl = for_inv_idx(answers)
    count = CountVectorizer()
    X = count.fit_transform(ii_data)
    key = [k for k in answers.keys()]
    term_doc_matrix = pd.DataFrame(X.toarray(), index=key, columns=count.get_feature_names())
    term_doc_matrix = [term_doc_matrix.index.tolist(), term_doc_matrix.columns.tolist(), term_doc_matrix.values.tolist()]
    
    return term_doc_matrix

In [1174]:
term_doc_matrix = get_inv_index(answers_ii)

In [1176]:
answrs = term_doc_matrix[0]
words = term_doc_matrix[1]
freqs = term_doc_matrix[2]

In [1177]:
idf_score = get_idf(answrs, words, freqs)

In [1178]:
k1 = 2.0
b = 0.75

def score_BM25(idf, qf, dl, avgdl, k1, b):
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    return idf * (k1 + 1) * qf / (qf + k1 * (1 - b + b * dl / avgdl))

In [1179]:
def search_inv_index(query, n_results):
    results = []
    final_result_ii = []
    
    for word in query:
        if word in words:
            idx = words.index(word)
            for indx, answ_indx in enumerate(answrs):
                qf = freqs[indx][idx]
                dl = lengths[int(answ_indx)]
                idf = idf_score[word]      
                okapi_score = score_BM25(idf, qf, dl, avgdl, k1, b)
                results.append({'index': answ_indx, 'okapi_score': okapi_score})
    
    res_inv_index = sorted(results, key=lambda k: k['okapi_score'], reverse=True)[:n_results]
    for i, j in zip(res_inv_index, dl1):
        final_result_ii.append({'index': i['index'], 'answer_text': j['answer_text']})
    
    return final_result_ii

### Функция поиска

In [521]:
from gensim import matutils
import numpy as np 

In [522]:
def similarity(v1, v2):
    v1_norm = matutils.unitvec(np.array(v1))
    v2_norm = matutils.unitvec(np.array(v2))
    return np.dot(v1_norm, v2_norm)

In [523]:
def search_w2v(query, model, w2v_res, n_results):
    similarity_result = []
    get_vectors = get_w2v_vectors(model, query)
    key = [k for k in dict(enumerate(w2v_res))]
    i = 0
    
    for w2v_r in w2v_res:
        compare_similarity = similarity(get_vectors, w2v_r['w2v_vectors'])
        index_similarity = {'index': key[i], 'similarity': compare_similarity}
        similarity_result.append(index_similarity)
        i += 1
    
    final_result = sorted(similarity_result, key=lambda k: k['similarity'], reverse=True)[:n_results]
    final_result = [{'index': text['index'], 'answer_text': w2v_res[text['index']]['answer_text']} for text in final_result]
        
    return final_result

In [524]:
def search_d2v(query, model, d2v_res, n_results):
    similarity_result = []
    get_vectors = get_d2v_vectors(model, query)
    key = [k for k in dict(enumerate(d2v_res))]
    i = 0
    
    for d2v_r in d2v_res:
        compare_similarity = similarity(get_vectors, d2v_r['d2v_vectors'])
        index_similarity = {'index': key[i], 'similarity': compare_similarity}
        similarity_result.append(index_similarity)
        i += 1
        
    final_result = sorted(similarity_result, key=lambda k: k['similarity'], reverse=True)[:n_results]
    final_result = [{'index': text['index'], 'answer_text': d2v_res[text['index']]['answer_text']} for text in final_result]
        
    return final_result

In [1348]:
def get_common_result(w2vres, d2vres, common): 
    comm_res = {}
    
    for i, j, c in zip(w2vres, d2vres, common):
        if c['index'] == i['index']:
            print(1)
            w2v_c = c['index']
        else: 
            w2v_c = 0
    
        if c['index'] == j['index']:
            print(2)
            d2v_c = c['index']
        else:
            d2v_c = 0
        
        comm_res[c['index']] = (w2v_c * 0.7 + d2v_c * 0.3) / 2
    
    return comm_res

In [1349]:
def search_w2v_d2v(query, model1, w2v_res, model2, d2v_res, n_results):
    common = []
    w2vres = search_w2v(query, model1, w2v_res, n_results)
    d2vres = search_d2v(query, model2, d2v_res, n_results)
    for i, j in zip(w2vres, d2vres):
        common.append(dict((k, v) for k, v in i.items() if k in j))
        
    common_result = get_common_result(w2vres, d2vres, common)
    common_result = [dict(sorted(common_result.items(), reverse=True, key=lambda k: k[1]))]

    return common_result

In [1330]:
def search(query, search_method, n_results=5):
    if search_method == 'inverted_index':
        query = preprocessing(query)
        final = search_inv_index(query, n_results)
            
    elif search_method == 'word2vec':
        query = preprocessing(query)
        final = search_w2v(query, w2v_model, w2v_res, n_results)
#         for i in final:
#             text = i['answer_text']
#             result_text.append(text)

    elif search_method == 'word2vec_sg':
        query = preprocessing(query)
        final = search_w2v(query, w2v_sg_model, w2v_sg_res, n_results)

    elif search_method == 'doc2vec_100':
        query = preprocessing(query, del_stopwords=False)
        final = search_d2v(query, d2v_100_model, d2v_100_res, n_results)

    elif search_method == 'doc2vec_100_sw':
        query = preprocessing(query)
        final = search_d2v(query, d2v_100_sw_model, d2v_100_sw_res, n_results)

    elif search_method == 'doc2vec_500':
        query = preprocessing(query, del_stopwords=False)
        final = search_d2v(query, d2v_500_model, d2v_500_res, n_results)

    elif search_method == 'doc2vec_500_sw':
        query = preprocessing(query)
        final = search_d2v(query, d2v_500_sw_model, d2v_500_sw_res, n_results)
    
    else:
        raise TypeError('unsupported search method')
        
    return final

In [1211]:
def count_accuracy(questions, search_method):
    accuracy = 0

    for i, j in enumerate(tqdm_notebook(questions)):
        search_result = search(j, search_method)

        for r in search_result:
            res = [r['index']]

            if i in res:
                accuracy += 1

    return accuracy / len(questions)        

**W2V accuracy**

**fastText CBOW (3..5-граммы)**

In [210]:
w2v_accuracy = count_accuracy(questions, 'word2vec')
w2v_accuracy

A Jupyter Widget

0.2658959537572254

**fastText Skipgram (3-граммы)**

In [1187]:
w2v_accuracy_sg = count_accuracy(questions, 'word2vec_sg')
w2v_accuracy_sg

A Jupyter Widget

0.3453757225433526

**D2V accuracy**

**100 эпох, без учета стоп-слов**

In [411]:
d2v_accuracy_100 = count_accuracy(questions, 'doc2vec_100')
d2v_accuracy_100

A Jupyter Widget

0.000722543352601156

**100 эпох, с учетом стоп-слов**

In [412]:
d2v_accuracy_100_sw = count_accuracy(questions, 'doc2vec_100_sw')
d2v_accuracy_100_sw

A Jupyter Widget

0.001445086705202312

**500 эпох, без учета стоп-слов**

In [413]:
d2v_accuracy_500 = count_accuracy(questions, 'doc2vec_500')
d2v_accuracy_500

A Jupyter Widget

0.002167630057803468

### Inverted Index Accuracy

In [1155]:
ii_acc = count_accuracy(questions, 'inverted_index')
ii_acc

A Jupyter Widget

0.5065028901734104

### Word2Vec (0.7) + Doc2Vec (0.3) accuracy

In [1350]:
common_accuracy = 0

for i, j in enumerate(tqdm_notebook(questions)):
    search_result = search_w2v_d2v(preprocessing(j), w2v_sg_model, w2v_sg_res, d2v_500_model, d2v_500_res, n_results=5)

    for r in search_result:
        if i in r.keys():
            common_accuracy += 1
            
common_accuracy = common_accuracy / len(questions)        

A Jupyter Widget

1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [1340]:
common_accuracy

0.3453757225433526