In [1]:
import os
import string

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, TaggedLineDocument

from nltk.tokenize import word_tokenize
punkt = string.punctuation+'»«–…'

from nltk.corpus import stopwords
stop_words = set(stopwords.words('russian'))

import pymorphy2
morph = pymorphy2.MorphAnalyzer()

from judicial_splitter import splitter
from tqdm import tqdm_notebook as tqdm

import json

import numpy as np
import re

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from itertools import zip_longest

In [43]:
import pickle

with open('qa_corpus.pkl', 'rb') as file:
    qa_corpus = pickle.load(file)
    

In [4]:
from heapdict import heapdict
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

# Basic

In [5]:
def preprocessing(text, stop=False):
    global word_tokenize, stop_words, punkt
    text = re.sub(r"([a-zа-я0-9])(.)([A-ZА-Я0-9])", r"\1\2 \3", text)
    text = word_tokenize(text)
    new_text= []
    for word in text:
        word = word.strip(punkt)
        if word:
            if word in punkt: continue
            elif word.isdigit(): continue
            elif stop and word in stop_words: continue
            else: new_text.append(morph.parse(word)[0].normal_form)
    return new_text

In [197]:
def write():
    global qa_corpus
    with open('d2v_answers_3.txt','w', encoding = 'utf-8') as answers:
        with open ('d2v_indexes.txt', 'w', encoding = 'utf-8') as indexes:
            with open('d2v_questions.txt', 'w', encoding = 'utf-8') as questions:
                for key, value in enumerate(tqdm(qa_corpus)):
                    questions.write(' '.join(preprocessing(value[0]))+'\n')
                    for chunk in splitter(value[1], 3):
                        answer = ' '.join(preprocessing(chunk))
                        if len(answer) > 5:
                            answers.write(answer+'\n')
                            indexes.write(str(key)+'\n')

Записать всё в предобработанном варианте, чтобы быстрее проверять.

In [198]:
write()

HBox(children=(IntProgress(value=0, max=1384), HTML(value='')))

# D2V

Итерирование по файлу вместо хранения при обучении.

In [5]:
def docs():
    for key, value in enumerate(zip_longest(open('d2v_indexes.txt','r'), open('d2v_answers_3.txt','r'))):
        yield TaggedDocument(words=value[1].strip().split(), tags=[int(value[0].strip())])

In [8]:
d2v_model = Doc2Vec(vector_size=100, min_count=5, alpha=0.025, seed = 23,
                min_alpha=0.025, epochs=1000, workers=8, dm=1)

%time d2v_model.build_vocab(docs())
print (len(d2v_model.wv.vocab))
%time d2v_model.train(docs(), total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

CPU times: user 125 ms, sys: 0 ns, total: 125 ms
Wall time: 121 ms
2785
CPU times: user 4.02 s, sys: 1.15 s, total: 5.16 s
Wall time: 3.93 s


In [9]:
d2v_model.save('d2v_1000')

In [6]:
d2v_model = Doc2Vec.load('d2v_1000')

In [7]:
def d2v_search(query, n=5):
    global d2v_model
    d2v_model.random.seed(23)
    d2v_vector = d2v_model.infer_vector(query.strip().split(), epochs=1000)
    result = {i[0]: i[1] for i in d2v_model.docvecs.most_similar(positive = [d2v_vector], topn=n)}
    return dict(result)

In [15]:
def check_efficiency_d2v(n = 10):
    with open('d2v_questions.txt', 'r') as f:
        k = 0
        for key, line in enumerate(f):
            result = d2v_search(line.strip(), n=n)
            if key in result:
                k+= 1

        print ('top-{}'.format(n), k, k/1384)

In [17]:
%time check_efficiency_d2v(n = 5)

top-5 6 0.004335260115606936
CPU times: user 6min 39s, sys: 9min 43s, total: 16min 22s
Wall time: 2min 23s


In [18]:
%time check_efficiency_d2v(n = 10)

top-10 13 0.00939306358381503
CPU times: user 6min 30s, sys: 9min 37s, total: 16min 8s
Wall time: 2min 19s


In [20]:
import json
def save_d2v_base():
    global d2v_model
    with open('d2v_vectors','w', encoding = 'utf-8') as vectors:
        for line in tqdm(open('d2v_answers_3.txt','r')):
            d2v_model.random.seed(23)
            d2v = d2v_model.infer_vector(line.strip().split())
            d2v = d2v.tolist()
            vectors.write(json.dumps(d2v)+'\n')
%time save_d2v_base()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 2min 42s, sys: 165 ms, total: 2min 42s
Wall time: 2min 41s


In [8]:
def d2v_search(query, n=10):
    global d2v_model
    result = heapdict()
    d2v_model.random.seed(23)
    q_d2v = d2v_model.infer_vector(query.strip().split())
    for key, value in enumerate(zip_longest(open('d2v_indexes.txt','r'), open('d2v_vectors','r'))):
        d2v = json.loads(value[1])
        x = cosine_similarity([q_d2v], [d2v])[0][0]
        #print (x)
        ordinal = int(value[0])
        if ordinal in result:
            if result[ordinal] < x:
                result[ordinal] = x
        else:
            if len(result) == n:
                z = result.peekitem()
                if x > z[1]:
                    result.popitem()
                    result[ordinal] = x
            else:
                result[ordinal] = x
    return dict(result)

In [24]:
%time check_efficiency_d2v(n = 5)

top-5 61 0.04407514450867052
CPU times: user 11min 23s, sys: 2.86 s, total: 11min 26s
Wall time: 11min 26s


In [None]:
%time check_efficiency_d2v(n = 10)

# W2V

In [9]:
def get_w2v_vectors(text, k=300, prep=True):
    """Получает вектор документа"""
    
    global w2v_model, stop_words, word_tokenize
    
    if prep:
        arr_text = preprocessing(text, stop=True)
    else:
        arr_text = text.split()
    n = 0
    vector = np.array([0]*300)
    
    for word in arr_text:
        if word not in stop_words:
            try:
                vec = np.array(w2v_model.wv[word])
                n += 1 
                vector = vector + vec
            except:
                continue
            
    if n > 0: vector = vector / n
    
    return vector

In [27]:
w2v_model = Word2Vec.load('/home/dkbrz/data/rusvectores/araneum_none_fasttextcbow_300_5_2018.model')

In [10]:
def w2v_search(query, n=10):
    result = heapdict()
    q_w2v = get_w2v_vectors(query, k=300, prep=False)
    for key, value in enumerate(zip_longest(open('d2v_indexes.txt','r'), open('w2v_vectors','r'))):
        w2v = json.loads(value[1])
        x = cosine_similarity([q_w2v], [w2v])[0][0]
        #print (x)
        ordinal = int(value[0])
        if ordinal in result:
            if result[ordinal] < x:
                result[ordinal] = x
        else:
            if len(result) == n:
                z = result.peekitem()
                if x > z[1]:
                    result.popitem()
                    result[ordinal] = x
            else:
                result[ordinal] = x
    return dict(result)

In [29]:
import json
def save_w2v_base():
    with open('w2v_vectors','w', encoding = 'utf-8') as vectors:
        for line in tqdm(open('d2v_answers_3.txt','r')):
            w2v = get_w2v_vectors(line.strip(), k=300, prep=False)
            w2v = w2v.tolist()
            vectors.write(json.dumps(w2v)+'\n')
%time save_w2v_base()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 2.32 s, sys: 47.2 ms, total: 2.37 s
Wall time: 2.36 s


In [31]:
def check_efficiency_w2v(n = 10):
    with open('d2v_questions.txt', 'r') as f:
        k = 0
        for key, line in tqdm(enumerate(f)):
            result = w2v_search(line.strip(), n=n)
            if key in result:
                k+= 1

        print ('top-{}'.format(n), k, k/1384)

In [32]:
check_efficiency_w2v(n = 5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 413 0.2984104046242775


## W2V + TFIDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range = (1,1), stop_words=stop_words)
with open ('d2v_answers_3.txt','r') as f:
    corpus = f.readlines()
tfidf.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'какой', 'хоть', 'ее', 'после', 'и', 'все', 'том', 'потому', 'была', 'почти', 'нет', 'этот', 'конечно', 'бы', 'тебя', 'больше', 'ли', 'в', 'себя', 'при', 'ведь', 'всю', 'разве', 'надо', 'я', 'до', 'иногда', 'им', 'чего', 'будет', 'можно', 'у', 'нибудь', 'них', 'себе', 'ней', 'какая', 'он..., 'над', 'мой', 'впрочем', 'еще', 'по', 'всех', 'когда', 'может', 'же', 'более', 'про', 'раз', 'за'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
def get_w2v_vectors(text, k=300, prep=True):
    """Получает вектор документа"""
    
    global w2v_model, stop_words, word_tokenize, tfidf
    
    if prep:
        arr_text = preprocessing(text, stop=True)
    else:
        arr_text = text.split()
    n = 0
    vector = np.array([0]*300)
    address = {key:value for key, value in enumerate(tfidf.transform([' '.join(arr_text)]).toarray()[0]) if value != 0}
    for word in set(arr_text):
        if word not in stop_words:
            try:
                weight = address[tfidf.vocabulary_[word]]
                vec = np.array(w2v_model.wv[word]*weight)
                vector = vector + vec
                n += weight
            except:
                continue
    if n > 0: vector = vector / n
    
    return vector

In [35]:
%time check_efficiency_w2v(n = 5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 416 0.30057803468208094
CPU times: user 14min 18s, sys: 8.16 s, total: 14min 27s
Wall time: 14min 26s


# OKAPI

In [47]:
def write():
    global qa_corpus
    with open('ok_answers.txt','w', encoding = 'utf-8') as answers:
                for key, value in enumerate(tqdm(qa_corpus)):
                        answer = ' '.join(preprocessing(value[1]))
                        answers.write(answer+'\n')
write()

HBox(children=(IntProgress(value=0, max=1384), HTML(value='')))




In [13]:
from collections import Counter
import math
from collections import defaultdict
from itertools import islice

In [12]:
def get_term_doc_matrix():
    n = 1384
    dictionary = {}
    term_doc_matrix = []
    for key, item in enumerate(open('ok_answers.txt','r')):
        text = Counter(item.strip().split())
        for word in text:
            if word in dictionary:
                term_doc_matrix[dictionary[word]][key] += text[word]
            else:
                dictionary[word] = len(dictionary)
                term_doc_matrix.append(np.zeros(n))
                term_doc_matrix[dictionary[word]][key] += text[word]
    return dictionary, term_doc_matrix

def inverted_index(dictionary, term_doc_matrix) -> dict:
    """
    Create inverted index by input doc collection
    :return: inverted index
    """
    result = {}
    for word in dictionary:
        result[word] = {key: int(value) for key, value in enumerate(term_doc_matrix[dictionary[word]]) if value > 0}
    return result

def score_BM25(qf, dl, avgdl, k1, b, N, n) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    score = math.log(1 + (N-n+0.5)/(n+0.5)) * (k1+1)*qf/(qf+k1*(1-b+b*(dl/avgdl)))
    return score

def compute_sim(word, index, dictionary, term_doc_matrix, doc_length, avgdl, N) -> float:
    """
    Compute similarity score between search query and documents from collection
    :return: score
    """
    if word in dictionary:
        n = len(index[word])
        result = {}
        for doc in index[word]:
            qf = term_doc_matrix[dictionary[word]][doc]/doc_length[doc]
            #qf = term_doc_matrix[dictionary[word]][doc]
            score = score_BM25(qf, doc_length[doc], avgdl, k1, b, N, n)
            result[doc] = score
        return result
    else:
        return {}

def get_okapi(query, n = 30) -> float:
    """
    Compute sim score between search query and all documents in collection
    Collect as pair (doc_id, score)
    :param query: input text
    :return: list of lists with (doc_id, score)
    """
    global index, dictionary, term_doc_matrix, doc_length, avgdl, N
    query = query.strip().split()
    result = defaultdict(int)
    for word in query:
        current = compute_sim(word, index, dictionary, term_doc_matrix, doc_length, avgdl, N)
        for doc in current:
            result[doc] += current[doc]
    return {i[0]:i[1] for i in sorted(result.items(), key = lambda x: x[1], reverse = True)[:n]}

In [57]:
get_okapi('рф')

{1337: 0.7149959628673017,
 698: 0.16460346604070242,
 1210: 0.13970408908850643,
 29: 0.10047578999633326,
 595: 0.09604667442394428,
 485: 0.05594133098187612,
 295: 0.04325844684583166,
 1220: 0.008773665986195623,
 516: 0.006735074200899682,
 200: 0.0062007226286731965}

In [14]:
dictionary, term_doc_matrix = get_term_doc_matrix()
index = inverted_index(dictionary, term_doc_matrix)

k1 = 2.0
b = 0.75

doc_length = {}
for key, value in enumerate(np.transpose(term_doc_matrix)):
    doc_length[key] = sum(value)

avgdl = sum(doc_length.values())/len(doc_length)
N = len(doc_length)

In [59]:
def check_efficiency_ok(n=5):
    with open('d2v_questions.txt', 'r') as f:
        k = 0
        for key, line in tqdm(enumerate(f)):
            result = get_okapi(line.strip(), n=n)
            if key in result:
                k+= 1

        print ('top-{}'.format(n), k, k/1384)

In [93]:
%time check_efficiency_ok(n = 5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 339 0.24494219653179192
CPU times: user 25.1 s, sys: 35 ms, total: 25.1 s
Wall time: 25.1 s


In [115]:
def general_search(query, n=10, prep=True):
    if prep: query = ' '.join(preprocessing(query))
    w2v = w2v_search(query, n=250)
    d2v = d2v_search(query, n=250)
    okapi = get_okapi(query, n=250)
    candidates = set(w2v) | set(d2v) | set(okapi)
    result = heapdict()
    #print (candidates)
    for i in candidates:
        if i in okapi: x = okapi[i]
        else: x = 1
        coef = 0
        if i in w2v: coef += w2v[i]
        if i in d2v: coef += d2v[i]
        coef = coef*(1+math.log(1 +x))
        if len(result) == n and coef > result.peekitem()[1]:
            result.popitem()
            result[i] = coef*(1+math.log(x))
        elif len(result) < n: result[i] = coef*(1+math.log(x))
    return dict(result)

In [94]:
general_search('закон', n=10, prep=True)

{1174: 2.459311588279255,
 156: 2.2463586797997133,
 1190: 2.321729026265036,
 1221: 2.4475711855115523,
 244: 2.3837872030075737,
 547: 2.267009936939927,
 572: 2.403965688945301,
 951: 2.513139969047119,
 981: 2.2786997432105696,
 987: 2.4245083284732454}

In [15]:
def check_efficiency_GN(n=5):
    with open('d2v_questions.txt', 'r') as f:
        k = 0
        for key, line in tqdm(enumerate(f)):
            result = general_search(line.strip(), n=n, prep=False)
            if key in result:
                k+= 1

        print ('top-{}'.format(n), k, k/1384)

In [117]:
check_efficiency_GN(n=5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 246 0.1777456647398844


In [118]:
def general_search(query, n=10, prep=True):
    if prep: query = ' '.join(preprocessing(query))
    w2v = w2v_search(query, n=250)
    #d2v = d2v_search(query, n=250)
    okapi = get_okapi(query, n=250)
    #candidates = set(w2v) | set(d2v) | set(okapi)
    candidates = set(w2v) | set(okapi)
    result = heapdict()
    #print (candidates)
    for i in candidates:
        if i in okapi: x = okapi[i]
        else: x = 1
        coef = 0
        if i in w2v: coef += w2v[i]
        #if i in d2v: coef += d2v[i]
        coef = coef*(1+math.log(1 +x))
        if len(result) == n and coef > result.peekitem()[1]:
            result.popitem()
            result[i] = coef*(1+math.log(x))
        elif len(result) < n: result[i] = coef*(1+math.log(x))
    return dict(result)

In [119]:
check_efficiency_GN(n=5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 432 0.31213872832369943


In [120]:
def general_search(query, n=10, prep=True):
    if prep: query = ' '.join(preprocessing(query))
    w2v = w2v_search(query, n=500)
    #d2v = d2v_search(query, n=250)
    okapi = get_okapi(query, n=500)
    #candidates = set(w2v) | set(d2v) | set(okapi)
    candidates = set(w2v) | set(okapi)
    result = heapdict()
    #print (candidates)
    for i in candidates:
        if i in okapi: x = okapi[i]
        else: x = 1
        coef = 0
        if i in w2v: coef += w2v[i]
        #if i in d2v: coef += d2v[i]
        coef = coef*(1+math.log(1 +x))
        if len(result) == n and coef > result.peekitem()[1]:
            result.popitem()
            result[i] = coef*(1+math.log(x))
        elif len(result) < n: result[i] = coef*(1+math.log(x))
    return dict(result)

In [121]:
check_efficiency_GN(n=5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 432 0.31213872832369943


In [16]:
def general_search(query, n=10, prep=True):
    if prep: query = ' '.join(preprocessing(query))
    w2v = w2v_search(query, n=500)
    #d2v = d2v_search(query, n=250)
    okapi = get_okapi(query, n=500)
    #candidates = set(w2v) | set(d2v) | set(okapi)
    candidates = set(w2v) | set(okapi)
    result = heapdict()
    #print (candidates)
    for i in candidates:
        if i in okapi: x = okapi[i]
        else: x = 1
        coef = 0
        if i in w2v: coef += w2v[i]
        #if i in d2v: coef += d2v[i]
        coef = math.exp(coef)*x
        if len(result) == n and coef > result.peekitem()[1]:
            result.popitem()
            result[i] = coef*(1+math.log(x))
        elif len(result) < n: result[i] = coef*(1+math.log(x))
    return dict(result)

In [19]:
check_efficiency_GN(n=5)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


top-5 324 0.23410404624277456


In [107]:
math.exp(0.3)

1.3498588075760032

In [104]:
math.log(5)

1.6094379124341003