# Homework on word embeddings

## Task 1

1) Векторизуйте тексты с помощью Word2vec модели, обученной самостоятельно, и с помощью модели, взятой с rusvectores (например вот этой - http://vectors.nlpl.eu/repository/20/180.zip). Обучите 2 модели по определению перефразирования на получившихся векторах и проверьте, что работает лучше. 

Word2Vec нужно обучить на отдельном корпусе (не на парафразах). Можно взять данные из семинара или любые другие. 
!!!! ВАЖНО: Оценивать модели нужно с помощью кросс-валидации (в семинаре не кросс-валидация)! Метрика - f1.

In [1]:
import pandas as pd
from lxml import html
import numpy as np
from pymystem3 import Mystem
from tqdm import tqdm
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD, NMF, PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.ensemble import RandomForestClassifier
import gensim
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import Counter,defaultdict
from string import punctuation
from razdel import tokenize as razdel_tokenize
import os
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
%matplotlib inline

### Normalization

In [2]:
class Normilizer():

    def __init__(self, morph_type):
        
        self.morpho = MorphAnalyzer() 
        self.cashe = {}
        self.stops = set(stopwords.words('russian'))
        
    
    def normalize(self, text) -> list:
        """
            returns a normalized text with POS tags mapped to udpipe
        """
        
        words = self.tokenize(text)
        
        res=[]
        
        mapping = self.generate_mapping('data/ru-rnc.map.txt')

        for word in words:
            if not word or word in self.stops:# skip stop words
                continue 
            elif word in self.cashe: # check cashed first
                res.append(self.cashe[word])
            else:
                r=self.morpho.parse(word)[0]
                lemma = r.normal_form
                pos = r.tag.POS
                try:
                    pos = mapping[pos]
                    res.append(lemma+'_'+pos)
                    self.cashe[word]=lemma+'_'+pos
                except:
                    res.append('Error')
                
        return res

    
    def tokenize(self, text) -> str:
        """
            tokenizes a text and keeps only alphanumeric tokens
        """
        punct = punctuation+'«»—…“”*№–'
        
        tokens = [token.text.strip(punct).lower() for token in list(razdel_tokenize(text))]
        tokens = [token for token in tokens if token.isalnum()]

        return tokens
    
    
    def generate_mapping(self, path) -> dict:
        """ 
            generates mapping of PoS tags to map mystem and udpipe tags:
            Mapping was update for pymorphy2
        """
        mapping = {}

        for line in open(path):
            ms, ud = line.strip('\n').split()
            mapping[ms] = ud
            
        return mapping

In [3]:
data = open('data/wiki_data.txt', encoding='utf8').read().splitlines()
norm = Normilizer('pymorphy2')
norm.normalize('Обучить классификатор парафразов на предобученной модели вам нужно будет дома')

['обучить_VERB',
 'классификатор_NOUN',
 'парафраз_NOUN',
 'предобученный_ADJ',
 'модель_NOUN',
 'нужно_X',
 'дом_NOUN']

In [4]:
%%time
data_norm = [norm.normalize(text) for text in data]

Wall time: 1min 25s


In [5]:
len(data_norm)

20002

### Modelling

my embedding

In [6]:
w2v = gensim.models.Word2Vec([text for text in data_norm], size=50, sg=1)

In [7]:
w2v.most_similar('полиция_NOUN')

  w2v.most_similar('полиция_NOUN')


[('полицейский_ADJ', 0.8500531911849976),
 ('полицейский_NOUN', 0.8263692855834961),
 ('преступник_NOUN', 0.8263688087463379),
 ('охранник_NOUN', 0.8084696531295776),
 ('гестапо_NOUN', 0.7944058179855347),
 ('подозревать_ADJ', 0.7846692204475403),
 ('милиция_NOUN', 0.7791818976402283),
 ('жандарм_NOUN', 0.7747201919555664),
 ('бандит_NOUN', 0.7740579843521118),
 ('спецназ_NOUN', 0.7737398743629456)]

In [8]:
w2v

<gensim.models.word2vec.Word2Vec at 0x1853b0205e0>

rusvectores http://vectors.nlpl.eu/repository/20/180.zip

In [9]:
rusvec = gensim.models.KeyedVectors.load_word2vec_format('data/models/180/model.bin', binary=True)

In [10]:
rusvec.most_similar('полиция_NOUN')

[('полиция_PROPN', 0.78785240650177),
 ('полицейский_ADJ', 0.7626974582672119),
 ('полицейский_NOUN', 0.6821430921554565),
 ('жандармерия_NOUN', 0.6472468376159668),
 ('жандарм_NOUN', 0.6468209624290466),
 ('городовый_ADJ', 0.6110862493515015),
 ('сыскный_ADJ', 0.6072753667831421),
 ('жандармский_ADJ', 0.6051692962646484),
 ('агент_NOUN', 0.5971013307571411),
 ('градоначальник_NOUN', 0.5876316428184509)]

Vectorizing

In [11]:
class Vectorizer():
    """ word2vec vectorizer """
    
    def __init__(self, model, dim):
        self.model = model # Gensim w2v model
        self.dim = dim
        

    def get_embedding(self, text):
        """ transforms a text into a vector using w2v model """
        
        text = text.split()
        words = Counter(text) # cashe words
        total = len(text)
        vectors = np.zeros((len(words), self.dim))

        for i,word in enumerate(words):
            try:
                v = self.model[word]
                vectors[i] = v*(words[word]/total) # value times word frequency
            except (KeyError, ValueError):
                continue
        
        # modify later to throw error when dimensinalities are not equal         
        if vectors.any():
            vector = np.average(vectors, axis=0)
        else:
            vector = np.zeros((self.dim))
        
        return vector

In [12]:
corpus_xml = html.fromstring(open('data/paraphraser/paraphrases.xml', 'rb').read())
texts_1 = []
texts_2 = []
classes = []

for p in corpus_xml.xpath('//paraphrase'):
    texts_1.append(p.xpath('./value[@name="text_1"]/text()')[0])
    texts_2.append(p.xpath('./value[@name="text_2"]/text()')[0])
    classes.append(p.xpath('./value[@name="class"]/text()')[0])

data = pd.DataFrame({'text_1':texts_1, 'text_2':texts_2, 'label':classes})

In [13]:
data['text_1_norm'] = data['text_1'].apply(norm.normalize)
data['text_2_norm'] = data['text_2'].apply(norm.normalize)
data['text_1_norm'] = data['text_1_norm'].apply(" ".join)
data['text_2_norm'] = data['text_2_norm'].apply(" ".join)

In [14]:
data.head(2)

Unnamed: 0,text_1,text_2,label,text_1_norm,text_2_norm
0,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0,полицейский_NOUN разрешить_VERB стрелять_VERB ...,полиция_NOUN мочь_VERB разрешить_VERB стрелять...
1,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0,право_NOUN полицейский_ADJ проникновение_NOUN ...,правило_NOUN внесудебный_ADJ проникновение_NOU...


In [15]:
data.label.value_counts()

0     2957
-1    2582
1     1688
Name: label, dtype: int64

In [16]:
y = data['label']
y.shape

(7227,)

In [17]:
dim=50

W2V = Vectorizer(w2v, dim)

X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = W2V.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = W2V.get_embedding(text)
    
X_text_w2v = np.concatenate([X_text_1_w2v, X_text_2_w2v], axis=1)

  v = self.model[word]


In [18]:
X_text_w2v.shape

(7227, 100)

In [19]:
X_text_w2v[0]

array([ 0.03318706, -0.04561456,  0.05576942,  0.00259705, -0.04769241,
        0.01200561, -0.03033194,  0.01177125,  0.03747249, -0.00064962,
       -0.00491567, -0.05112004,  0.01183562,  0.01162327,  0.03830356,
        0.00492895, -0.01512252, -0.03033132, -0.01084426,  0.06821928,
       -0.0263701 ,  0.0204648 , -0.03520388, -0.01647543,  0.01431771,
        0.00974876, -0.03096395,  0.12084189,  0.01913359,  0.01874183,
       -0.00452537, -0.01621977,  0.03449763,  0.03314626, -0.04541603,
       -0.06757362, -0.05105894, -0.02299673, -0.03925633,  0.0093741 ,
        0.00426806,  0.07853495, -0.06917067, -0.01067022,  0.0219521 ,
        0.0344793 , -0.01934548,  0.09621828,  0.0409141 ,  0.02076775,
        0.00029674, -0.04170818,  0.04791079, -0.0041259 , -0.03660661,
        0.01266169, -0.01671135,  0.00719931,  0.02956726,  0.00553932,
       -0.01883045, -0.02489705, -0.00198378,  0.01398144,  0.02683648,
        0.01634377, -0.02820855, -0.05409733,  0.02027883,  0.05

In [20]:
dim=300

RV = Vectorizer(rusvec, dim)

X_text_1_rv = np.zeros((len(data['text_1_norm']), dim))
X_text_2_rv = np.zeros((len(data['text_2_norm']), dim))
for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_rv[i] = RV.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_rv[i] = RV.get_embedding(text)
    
X_text_rv = np.concatenate([X_text_1_rv, X_text_2_rv], axis=1)

In [21]:
X_text_rv.shape

(7227, 600)

In [22]:
X_text_rv[0]

array([ 1.64059601e-01,  1.21311058e-01, -1.96751207e-03,  1.23038463e-02,
       -3.22328943e-02, -1.23041797e-01, -3.32447545e-02, -7.48019665e-04,
        1.64932894e-01,  2.05589972e-01, -3.99886680e-02, -4.89076500e-02,
       -4.36700465e-02, -1.50282814e-01, -2.59457119e-02,  1.34450427e-01,
        1.53907708e-02, -2.26598683e-03,  9.73287964e-02,  1.51583243e-01,
        6.47600641e-02,  5.85517474e-02, -1.57137622e-01,  1.16854645e-01,
        1.04694522e-01, -4.70496186e-03,  3.42248657e-01, -1.72740516e-01,
       -5.26267874e-02, -4.68619385e-02,  1.57444389e-02,  6.29507114e-03,
       -9.23639437e-02,  1.26490953e-02,  4.81259639e-02, -9.92993576e-03,
        9.51276583e-02,  5.22257388e-03, -2.30989221e-01,  8.83672635e-02,
        1.33473856e-01, -1.34874692e-01, -9.26923336e-02, -1.11844506e-01,
       -1.22059313e-01, -1.84837302e-01,  3.19754065e-02, -5.73036658e-02,
        5.22557298e-02,  8.78588328e-02,  9.39663301e-02, -6.93004156e-02,
        6.22022151e-02,  

comparing models

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
clf1 = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')
clf2 = LogisticRegression(C=10000, class_weight='balanced')

In [25]:
%%capture --no-stdout _
print(cross_val_score(clf1, X_text_w2v, y, scoring="f1_micro"))
print(cross_val_score(clf2, X_text_w2v, y, scoring="f1_micro"))

[0.45435685 0.46196404 0.50034602 0.34394464 0.35224913]
[0.38934993 0.41493776 0.44013841 0.33564014 0.32249135]


In [26]:
%%capture --no-stdout _
print(cross_val_score(clf1, X_text_rv, y, scoring="f1_micro"))
print(cross_val_score(clf2, X_text_rv, y, scoring="f1_micro"))

[0.4329184  0.47164592 0.49688581 0.36262976 0.37854671]
[0.39488243 0.41217151 0.43460208 0.34186851 0.3550173 ]


В целом, результаты сопоставимы, но на готовой модели результаты чуть более стабильны.

## Task 2

Преобразуйте тексты в векторы в каждой паре 5 методами  - SVD, NMF, Word2Vec (свой и  русвекторовский), Fastext. У вас должно получиться 5 пар векторов для каждой строчки в датасете. Между векторами каждой пары вычислите косинусную близость (получится 5 чисел для каждой пары).

Постройте обучающую выборку из этих близостей . Обучите любую модель (Логрег, Рандом форест или что-то ещё) на этой выборке и оцените качество на кросс-валидации (используйте микросреднюю f1-меру).  Попробуйте улучить метрику, изменив параметры в методах векторизации.
!!УТОЧНЕНИЕ: модель нужно обучить сразу на всех 5 близостях, а не по 1 модели на каждой близости!


In [27]:
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

5 vector types

In [28]:
tfidf = TfidfVectorizer(min_df=3, max_df=0.4, max_features=1000)
tfidf.fit(pd.concat([data['text_1_norm'], data['text_2_norm']]))

TfidfVectorizer(max_df=0.4, max_features=1000, min_df=3)

In [29]:
svd = TruncatedSVD(200)

X_text_1_svd = svd.fit_transform(tfidf.transform(data['text_1_norm']))
X_text_2_svd = svd.fit_transform(tfidf.transform(data['text_2_norm']))

In [30]:
nmf = NMF(100)

X_text_1_nmf = nmf.fit_transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf = nmf.fit_transform(tfidf.transform(data['text_2_norm']))

X_text_nmf = np.concatenate([X_text_1_nmf, X_text_2_nmf], axis=1)

In [31]:
w2v = gensim.models.Word2Vec([text for text in data_norm], size=50, sg=1)

dim=50

W2V = Vectorizer(w2v, dim)

X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = W2V.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = W2V.get_embedding(text)

  v = self.model[word]


In [32]:
fast_text = gensim.models.FastText([text for text in data_norm], size=50, min_n=4, max_n=8) 

dim=50

FT = Vectorizer(fast_text, dim)

X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = FT.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = FT.get_embedding(text)

  v = self.model[word]


In [33]:
dim=300

RV = Vectorizer(rusvec, dim)

X_text_1_rv = np.zeros((len(data['text_1_norm']), dim))
X_text_2_rv = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_rv[i] = RV.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_rv[i] = RV.get_embedding(text)

In [34]:
cosine_similarity(X_text_1_svd[0:1], X_text_2_svd[0:1])

array([[-0.0307519]])

In [35]:
def calculate_row_similarity(A, B):
    """ calculates similarity between each row of matrix A and matrix B """
    output = []
    for row in zip(A, B):
        # add a value only
        output.extend(cosine_similarity([row[0]], [row[1]])[0])
        
    return output

In [36]:
svd_sim = calculate_row_similarity(X_text_1_svd, X_text_2_svd)
nmf_sim = calculate_row_similarity(X_text_1_nmf, X_text_2_nmf)
w2v_sim = calculate_row_similarity(X_text_1_w2v, X_text_2_w2v)
ft_sim = calculate_row_similarity(X_text_1_ft, X_text_2_ft)
rv_sim = calculate_row_similarity(X_text_1_rv, X_text_2_rv)

similarities = pd.DataFrame({
    'svd': svd_sim,
    "nmf": nmf_sim,
    'w2v': w2v_sim,
    'ft' : ft_sim,
    'rv' : rv_sim,
})

similarities.head()

Unnamed: 0,svd,nmf,w2v,ft,rv
0,-0.030752,0.005112,0.911452,0.874739,0.644788
1,0.091922,0.024277,0.920896,0.862422,0.809568
2,0.013451,0.045723,0.942523,0.791031,0.806049
3,-0.199296,0.018446,0.799912,0.552522,0.401639
4,-0.245774,0.012617,0.920818,0.653118,0.584889


Классификаторы 1

In [37]:
clf1 = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')
clf2 = LogisticRegression(C=10000, class_weight='balanced')

In [38]:
%%capture --no-stdout _
print(cross_val_score(clf1, similarities, y, scoring="f1_micro"))
print(cross_val_score(clf2, similarities, y, scoring="f1_micro"))

[0.53941909 0.57053942 0.60553633 0.47128028 0.46366782]
[0.51590595 0.53872752 0.57577855 0.44290657 0.45467128]


Результаты существенно выше, но разброс сохраняется

Классификаторы 2: Изменение гиперпараметров (увеличение или уменьшение) не дает ощутимой разницы в результатах

In [39]:
clf1 = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=10,
                             class_weight='balanced')
clf2 = LogisticRegression(C=1000, class_weight='balanced')

In [40]:
%%capture --no-stdout _
print(cross_val_score(clf1, similarities, y, scoring="f1_micro"))
print(cross_val_score(clf2, similarities, y, scoring="f1_micro"))

[0.53803596 0.57607192 0.61384083 0.46712803 0.46228374]
[0.51452282 0.53803596 0.57577855 0.44290657 0.45467128]


Оба классификатора, обученные на косинусной близости, дают более высокий результат в сравнении с классификаторами обученными только на эмбеддингах ~ +0.1


**Изменение параметров при векторизации**

In [41]:
tfidf = TfidfVectorizer(min_df=5, max_df=0.7, max_features=2000)
tfidf.fit(pd.concat([data['text_1_norm'], data['text_2_norm']]))

TfidfVectorizer(max_df=0.7, max_features=2000, min_df=5)

In [42]:
svd = TruncatedSVD(300)

X_text_1_svd = svd.fit_transform(tfidf.transform(data['text_1_norm']))
X_text_2_svd = svd.fit_transform(tfidf.transform(data['text_2_norm']))

In [43]:
nmf = NMF(150)

X_text_1_nmf = nmf.fit_transform(tfidf.transform(data['text_1_norm']))
X_text_2_nmf = nmf.fit_transform(tfidf.transform(data['text_2_norm']))

X_text_nmf = np.concatenate([X_text_1_nmf, X_text_2_nmf], axis=1)



In [44]:
w2v = gensim.models.Word2Vec([text for text in data_norm], size=100, sg=1)

dim=100

W2V = Vectorizer(w2v, dim)

X_text_1_w2v = np.zeros((len(data['text_1_norm']), dim))
X_text_2_w2v = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_w2v[i] = W2V.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_w2v[i] = W2V.get_embedding(text)

  v = self.model[word]


In [45]:
fast_text = gensim.models.FastText([text for text in data_norm], size=100, min_n=2, max_n=10) 

dim=100

FT = Vectorizer(fast_text, dim)

X_text_1_ft = np.zeros((len(data['text_1_norm']), dim))
X_text_2_ft = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_ft[i] = FT.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_ft[i] = FT.get_embedding(text)

  v = self.model[word]


In [46]:
dim=300

RV = Vectorizer(rusvec, dim)

X_text_1_rv = np.zeros((len(data['text_1_norm']), dim))
X_text_2_rv = np.zeros((len(data['text_2_norm']), dim))

for i, text in enumerate(data['text_1_norm'].values):
    X_text_1_rv[i] = RV.get_embedding(text)
    
for i, text in enumerate(data['text_2_norm'].values):
    X_text_2_rv[i] = RV.get_embedding(text)

In [47]:
svd_sim = calculate_row_similarity(X_text_1_svd, X_text_2_svd)
nmf_sim = calculate_row_similarity(X_text_1_nmf, X_text_2_nmf)
w2v_sim = calculate_row_similarity(X_text_1_w2v, X_text_2_w2v)
ft_sim = calculate_row_similarity(X_text_1_ft, X_text_2_ft)
rv_sim = calculate_row_similarity(X_text_1_rv, X_text_2_rv)

similarities = pd.DataFrame({
    'svd': svd_sim,
    "nmf": nmf_sim,
    'w2v': w2v_sim,
    'ft' : ft_sim,
    'rv' : rv_sim,
})

similarities.head()

Unnamed: 0,svd,nmf,w2v,ft,rv
0,-0.10387,0.0,0.879351,0.880637,0.644788
1,-0.103921,0.031926,0.903738,0.87632,0.809568
2,-0.038017,0.002405,0.907921,0.780492,0.806049
3,-0.11008,0.063408,0.767054,0.583848,0.401639
4,-0.097454,0.0,0.901347,0.660946,0.584889


сравннение

In [48]:
clf1 = RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_leaf=15,
                             class_weight='balanced')
clf2 = LogisticRegression(C=10000, class_weight='balanced')

In [49]:
%%capture --no-stdout _
print(cross_val_score(clf1, similarities, y, scoring="f1_micro"))
print(cross_val_score(clf2, similarities, y, scoring="f1_micro"))

[0.5373444  0.5670816  0.60069204 0.46782007 0.45743945]
[0.52213001 0.53665284 0.57716263 0.44567474 0.44290657]


In [50]:
clf1 = RandomForestClassifier(n_estimators=50, max_depth=5, min_samples_leaf=10,
                             class_weight='balanced')
clf2 = LogisticRegression(C=1000, class_weight='balanced')

In [51]:
%%capture --no-stdout _
print(cross_val_score(clf1, similarities, y, scoring="f1_micro"))
print(cross_val_score(clf2, similarities, y, scoring="f1_micro"))

[0.52766252 0.57261411 0.61245675 0.46020761 0.46920415]
[0.52213001 0.53665284 0.57716263 0.44567474 0.44290657]


Изменение параметров не дает заметной разницы (+-0.03)