In [186]:
import adagram
from lxml import html
from nltk.corpus import wordnet as wn
from pymorphy2 import MorphAnalyzer
from string import punctuation
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))


def normalize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word != '' and word not in stops]

    return ' '.join(words)

def normalize_tokenize(text):
    
    words = [word.strip(punct) for word in text.lower().split()]
    words = [morph.parse(word)[0].normal_form for word in words if word and word != '' and word not in stops]

    return words

## Задание 1. Протестировать адаграм в определении перефразирования

Прочитаем корпус перефразирования

In [187]:
corpus_xml = html.fromstring(open('paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])
    
data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [188]:
data['text_1_norm'] = data['text_1'].apply(normalize)
data['text_2_norm'] = data['text_2'].apply(normalize)

In [189]:
y = data['label'].values

Загрузим модель, обученную на семинаре

In [194]:
vm = adagram.VectorModel.load("out.pkl")

In [195]:
vm.sense_neighbors('рыба', 0) # первое значение слова мир

[('ловить', 0, 0.8158308),
 ('берег', 0, 0.5251646),
 ('река', 0, 0.46946052),
 ('школа', 0, 0.44177622),
 ('иордан', 0, 0.43631625),
 ('западный', 0, 0.42009193),
 ('оттуда', 0, 0.40351284),
 ('макс-2001', 0, 0.384777),
 ('зверь', 0, 0.37941083),
 ('федеральный', 0, 0.3656322)]

Функция, достающая контекст по слову

In [196]:
def get_context(i, words, window):
    left_context_lb = max(0, i - window)
    left_context_rb = i - 1
    left_context = []
    if left_context_lb <= left_context_rb:
        left_context = words[left_context_lb : left_context_rb + 1]
    right_context_lb = i+1
    right_context_rb = min(len(words), i+1 + window )
    right_context = []
    if right_context_lb < right_context_rb:
        right_context  = words[right_context_lb : right_context_rb]
    return left_context + right_context

def get_words_in_context(words, window=3):
    contexts = []
    for i, word in enumerate(words):
        contexts.append([word] + [get_context(i, words, window)])
    return contexts

In [197]:
words = [0,1,2,3,4,5,6,7,8,9]
get_words_in_context(words)

[[0, [1, 2, 3]],
 [1, [0, 2, 3, 4]],
 [2, [0, 1, 3, 4, 5]],
 [3, [0, 1, 2, 4, 5, 6]],
 [4, [1, 2, 3, 5, 6, 7]],
 [5, [2, 3, 4, 6, 7, 8]],
 [6, [3, 4, 5, 7, 8, 9]],
 [7, [4, 5, 6, 8, 9]],
 [8, [5, 6, 7, 9]],
 [9, [6, 7, 8]]]

Получим вектора из adagram

In [198]:
def get_embedding_adagram(text, model, window, dim):
    text_words = text.split(' ')
    word2context = get_words_in_context(text_words, window)
    vectors = np.zeros((len(word2context), dim))
    for i, (word, context) in enumerate(word2context):
        try:
            best_vector_num = model.disambiguate(word, context).argmax()
            v = vm.sense_vector(word, best_vector_num)
            vectors[i] = v
        except Exception as e:
            #print(e)
            continue

    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))

    return vector

In [199]:
get_embedding_adagram(normalize('Я ловлю рыбу в воде'), vm, 3, 100)

  z = np.log(z)


array([ 0.14275158, -0.22403363,  0.22693477, -0.52934948,  0.51523248,
        0.03478502, -0.37153084, -0.1403941 , -0.22241557,  0.25916778,
        0.32752725, -0.7455971 ,  0.12869103,  0.49492278, -0.04040194,
       -0.05277447,  0.19305277, -0.3290598 , -0.35874386,  0.04778454,
       -0.18476966, -0.32150583, -0.00242544, -0.01663467, -0.26283421,
       -0.40783263, -0.18386782,  0.72621819,  0.19111311,  0.39995851,
        0.05093759,  0.49570619,  0.42697386, -0.49313649,  0.20842485,
       -0.24829472, -0.32408496,  0.34340588, -0.34490345, -0.18698962,
       -0.25114506,  0.44190328, -0.27456576,  0.14071238, -0.48556054,
       -0.56815771,  0.0659041 , -0.09099596, -0.28906764,  0.33360192,
       -0.17318174,  0.60724228, -0.57436242,  0.78905046, -0.41291089,
       -0.28126622, -0.1527587 , -0.48059495, -0.49457   , -0.18323295,
        0.45138641,  0.17131703, -0.09176735,  0.24156274, -0.61263892,
       -0.17049634,  0.33847125, -0.04307706, -0.76868787, -0.46

In [200]:
def get_vectors_by_model(data, model, window, dim):

    X_text_1 = np.zeros((len(data['text_1_norm']), dim))
    X_text_2 = np.zeros((len(data['text_2_norm']), dim))
    for i, text in enumerate(data['text_1_norm'].values):
        X_text_1[i] = get_embedding_adagram(text, model, window, dim)
        if i % 100 == 0:
            print('processed text_1:', i)

    for i, text in enumerate(data['text_2_norm'].values):
        X_text_2[i] = get_embedding_adagram(text, model, window, dim)
        if i % 100 == 0:
            print('processed text2:', i)

    return X_text_1, X_text_2

Получим вектора при помощи adagram

In [201]:
#Раскомментировать, чтобы выполнить
#X_text_1_adagram, X_text_2_adagram = get_vectors_by_model(data, vm, 3, 100)
#Сохраним полученные вектора в pickle
#with open('X_text_1_adagram.pkl', 'wb') as f:
    #pickle.dump(X_text_1_adagram, f)

#Сохраним полученные вектора в pickle
#with open('X_text_2_adagram.pkl', 'wb') as f:
    #pickle.dump(X_text_2_adagram, f)

#Прочитаем из pickle
with open('X_text_1_adagram.pkl', 'rb') as f:
    X_text_1_adagram = pickle.load(f)

#Прочитаем из pickle
with open('X_text_2_adagram.pkl', 'rb') as f:
    X_text_2_adagram = pickle.load(f)

Конкатенируем вектора

In [202]:
X_text_adagram = np.concatenate([X_text_1_adagram, X_text_2_adagram], axis=1)

Выполним классификацию, используя вектора в качестве параметров

In [203]:
clf_adagram = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_leaf=15,
                             class_weight='balanced')
scores_adagram = cross_val_score(clf_adagram, X_text_adagram, y, cv=5, scoring='f1_micro')

In [204]:
scores_adagram

array([0.45335176, 0.51347616, 0.5100346 , 0.3968144 , 0.4065097 ])

## Задание 2. Реализовать алгоритм Леска и проверить его на реальном датасете

In [205]:
def tokenize(text):
    return [word.strip(punct) for word in text.lower().split() if word and word != '']

def get_overlapping_words(sentence1_tokenized, sentence2_tokenized):
    words1 = set(sentence1_tokenized)
    words2 = set(sentence2_tokenized)
    return words1.intersection(words2)

def lesk( word, sentence_tokenized):
    synsets = wn.synsets(word)
    bestsense = None
    bestsynset = None
    best_overlap_num = 0
    for i, synset in enumerate(synsets):
        definition = synset.definition()
        definition_tokenized = tokenize(definition)
        overlapping_words = get_overlapping_words(definition_tokenized, sentence_tokenized)
        #если везде пересечений нет, возьмем первое значение
        if len(overlapping_words) > best_overlap_num or bestsense is None:
            best_overlap_num = len(overlapping_words)
            bestsense = i
            bestsynset = synset
    return bestsense, bestsynset

In [206]:
lesk('day', 'Earth rotation'.split())

(0, Synset('day.n.01'))

In [207]:
lesk('day', 'some point or period in time'.split())

(1, Synset('day.n.02'))

In [208]:
lesk('day', 'I am a writer'.split())

(0, Synset('day.n.01'))

Проверим метод на корпусе WSD (возьмем первые 1000 предложений)

In [209]:
corpus_wsd_part = []
corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for i, sent in enumerate(corpus):
    if i >= 1000:
        break
    corpus_wsd_part.append([s.split('\t') for s in sent.split('\n')])
len(corpus_wsd_part)

1000

Функции для проверки корпуса через алгоритм Леска

In [210]:
def is_multisense(word):
    return word[0] != ''


def get_sentence_tokenized(sentence):
    sentence_tokenized = []
    for word in sentence:
        sentence_tokenized.append(word[1])
    return sentence_tokenized


def evaluate_corpus_with_lesk(corpus_wsd):
    results = []
    for i, sentence in enumerate(corpus_wsd):
        results += evaluate_sentence_with_lesk(sentence)
    return results

def evaluate_sentence_with_lesk(sentence):
    results = []
    sentence_tokenized = get_sentence_tokenized(sentence)
    for word in sentence:
        if is_multisense(word):
            results.append(evaluate_word_with_lesk(word, sentence_tokenized))
    return results


def evaluate_word_with_lesk(word, sentence_tokenized):
    index, synset_by_lesk = lesk(word[1], sentence_tokenized)
    synset_key = word[0]
    synset_real = wn.lemma_from_key(synset_key).synset()
    if synset_by_lesk == synset_real:
        return 1
    return 0

In [211]:
def evaluate_accuracy(results):
    return sum(results)/len(results)

In [212]:
results = evaluate_corpus_with_lesk(corpus_wsd_part)
evaluate_accuracy(results)

0.36300738007380073