# Assignment 3 (alternative version)


Задание: построить feed forward NN модель на pytorch для задачи NER из 4 дз. разрешается использовать эмбеддинги. Необходимо побить бейзлайны.

baseline 1: 0.0604 random labels

baseline 2: 0.3966 PoS features + logistic regression

baseline 3: 0.8122 word2vec cbow embedding + baseline 2 + svm

In [28]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn import model_selection, metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from gensim.models.word2vec import Word2Vec
from sklearn.base import TransformerMixin
from collections import defaultdict
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import torch.nn as nn
import torch as tt

import warnings
warnings.filterwarnings('ignore')


SEED=1337

In [2]:
df = pd.read_csv('../input/assign3/ner_short.csv', index_col=0)
df.head()

Unnamed: 0,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,sentence_idx,word,tag
0,NNS,demonstrators,IN,of,NNS,__START1__,__START2__,__START2__,__START1__,1.0,Thousands,O
1,VBP,have,NNS,demonstrators,IN,NNS,__START1__,__START1__,Thousands,1.0,of,O
2,VBN,marched,VBP,have,NNS,IN,NNS,Thousands,of,1.0,demonstrators,O
3,IN,through,VBN,marched,VBP,NNS,IN,of,demonstrators,1.0,have,O
4,NNP,London,IN,through,VBN,VBP,NNS,demonstrators,have,1.0,marched,O


In [3]:
# number of sentences
df.sentence_idx.max()

1500.0

In [4]:
# class distribution
df.tag.value_counts(normalize=True )

O        0.852828
B-geo    0.027604
B-gpe    0.020935
B-org    0.020247
I-per    0.017795
B-tim    0.016927
B-per    0.015312
I-org    0.013937
I-geo    0.005383
I-tim    0.004247
B-art    0.001376
I-gpe    0.000837
I-art    0.000748
B-eve    0.000628
I-eve    0.000508
B-nat    0.000449
I-nat    0.000239
Name: tag, dtype: float64

In [5]:
# sentence length
tdf = df.set_index('sentence_idx')
tdf['length'] = df.groupby('sentence_idx').tag.count()
df = tdf.reset_index(drop=False)

In [6]:
# encode categorial variables

le = LabelEncoder()
df['pos'] = le.fit_transform(df.pos)
df['next-pos'] = le.fit_transform(df['next-pos'])
df['next-next-pos'] = le.fit_transform(df['next-next-pos'])
df['prev-pos'] = le.fit_transform(df['prev-pos'])
df['prev-prev-pos'] = le.fit_transform(df['prev-prev-pos'])

In [7]:
df.head()

Unnamed: 0,sentence_idx,next-next-pos,next-next-word,next-pos,next-word,pos,prev-pos,prev-prev-pos,prev-prev-word,prev-word,word,tag,length
0,1.0,18,demonstrators,9,of,18,39,40,__START2__,__START1__,Thousands,O,48
1,1.0,33,have,18,demonstrators,9,18,39,__START1__,Thousands,of,O,48
2,1.0,32,marched,33,have,18,9,18,Thousands,of,demonstrators,O,48
3,1.0,9,through,32,marched,33,18,9,of,demonstrators,have,O,48
4,1.0,16,London,9,through,32,33,18,demonstrators,have,marched,O,48


In [8]:
# splitting
y = LabelEncoder().fit_transform(df.tag)

df_train, df_test, y_train, y_test = model_selection.train_test_split(df, y, stratify=y, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)
print('train', df_train.shape[0])
print('test', df_test.shape[0])

train 50155
test 16719


In [9]:
# some wrappers to work with word2vec

class Word2VecWrapper(TransformerMixin):
    def __init__(self, window=5,negative=5, size=100, iter=100, is_cbow=False, random_state=SEED):
        self.window_ = window
        self.negative_ = negative
        self.size_ = size
        self.iter_ = iter
        self.is_cbow_ = is_cbow
        self.w2v = None
        self.random_state = random_state
        
    def get_size(self):
        return self.size_

    def fit(self, X, y=None):
        """
        X: list of strings
        """
        sentences_list = [x.split() for x in X]
        self.w2v = Word2Vec(sentences_list, 
                            window=self.window_,
                            negative=self.negative_, 
                            size=self.size_, 
                            iter=self.iter_,
                            sg=not self.is_cbow_, seed=self.random_state)

        return self
    
    def has(self, word):
        return word in self.w2v

    def transform(self, X):
        """
        X: a list of words
        """
        if self.w2v is None:
            raise Exception('model not fitted')
        return np.array([self.w2v[w] if w in self.w2v else np.zeros(self.size_) for w in X ])

In [10]:
%%time
# here we exploit that word2vec is an unsupervised learning algorithm
# so we can train it on the whole dataset (subject to discussion)

sentences_list = [x.strip() for x in ' '.join(df.word).split('.')]

w2v_cbow = Word2VecWrapper(window=5, negative=5, size=300, iter=300, is_cbow=True, random_state=SEED)
w2v_cbow.fit(sentences_list)

CPU times: user 34.9 s, sys: 288 ms, total: 35.2 s
Wall time: 19.7 s


Аналогично тому, как я делала в дз4: попробуем взять POS-теги, добавить к ним колонку 'sentence_idx', прогнать эти признаки через one-hot encoding, добавить W2V-эмбеддинги для самого слова и его ближайших соседей и загнать все это в нейросеть.

In [16]:
embeding = w2v_cbow
encoder_pos = OneHotEncoder()

X_train = sp.hstack([
    embeding.transform(df_train.word),
    embeding.transform(df_train['next-word']),
    embeding.transform(df_train['prev-word']),
    encoder_pos.fit_transform(df_train[['sentence_idx', 'pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

X_test = sp.hstack([
    embeding.transform(df_test.word),
    embeding.transform(df_test['next-word']),
    embeding.transform(df_test['prev-word']),
    encoder_pos.transform(df_test[['sentence_idx', 'pos','next-pos','next-next-pos','prev-pos','prev-prev-pos']])
])

X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, stratify=y_train, 
                                                                      test_size=0.25, random_state=SEED, shuffle=True)


## Training

In [74]:
class FeedforwardNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNN, self).__init__()    
        self.fc = nn.Linear(input_dim, hidden_dim) 
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        out = self.fc(x)
        out = self.relu(out)
        out = self.dropout(out)
        
        return out

In [78]:
class Trainer:
    def __init__(self, model, train, valid, test, y_train, y_valid, y_test):
        self.model = model
        self.train = train.todense()
        self.test = test.todense()
        self.valid = valid.todense()
        
        self.y_train = y_train
        self.y_valid = y_valid
        self.y_test = y_test
        
        self.optimizer = tt.optim.Adam(self.model.parameters())
        self.scheduler = tt.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, cooldown=5)
        self.criterion = nn.CrossEntropyLoss()

    def nn_train(self, num_epochs=100):
        train_size = self.train.shape[0]
        train_batches = int(np.ceil(train_size / batch_size))
        valid_size = self.valid.shape[0]
        valid_batches = int(np.ceil(valid_size / batch_size))
    
        for epoch in range(num_epochs):
            indices = np.random.choice(train_size, train_size, replace=False)        
            epoch_average_loss = 0
            
            for j in range(train_batches):
                batch_idx = indices[j: j + batch_size]
                batch_x = tt.tensor(self.train[batch_idx], dtype=torch.float)
                batch_y = tt.from_numpy(self.y_train[batch_idx])
                self.optimizer.zero_grad()          
            
                pred = self.model.forward(batch_x)
                loss_1 = self.criterion(pred, batch_y.long())
                loss_1.backward()
                self.optimizer.step()
                epoch_average_loss += loss_1.data.detach().item()
                epoch_average_loss /= train_batches
            
            indices = np.random.choice(valid_size, valid_size, replace=False)        
            epoch_average_loss_2 = 0

            with tt.no_grad():
                for z in range(valid_batches):
                    batch_idx = indices[j: j + batch_size]
                    batch_x = tt.tensor(self.valid[batch_idx], dtype=torch.float)
                    batch_y = tt.from_numpy(self.y_valid[batch_idx])
                    pred = self.model.forward(batch_x)
                    loss = self.criterion(pred, batch_y.long())
                    epoch_average_loss_2 += loss.data.detach().item()
                    epoch_average_loss_2 /= train_batches

            print('Epoch [%d/%d], Loss train: %.4f, Loss valid: %.4f' % (epoch+1, num_epochs, epoch_average_loss, epoch_average_loss_2))

            self.scheduler.step(loss_1)
        
    def predict(self):
        test = tt.tensor(self.test, dtype=torch.float)
        pred = self.model.forward(test)
        pred = tt.softmax(pred, dim=-1)
        pred = pred.detach().numpy()
        y_pred = np.argmax(pred, axis=1)
        
        return y_pred

In [81]:
batch_size = 64

model = FeedforwardNN(2606, 128, 17)
trainer = Trainer(model, X_train, X_val, X_test, y_train, y_val, y_test)
trainer.nn_train()

Epoch [1/100], Loss train: 0.0011, Loss valid: 0.0026
Epoch [2/100], Loss train: 0.0022, Loss valid: 0.0024
Epoch [3/100], Loss train: 0.0016, Loss valid: 0.0028
Epoch [4/100], Loss train: 0.0015, Loss valid: 0.0026
Epoch [5/100], Loss train: 0.0015, Loss valid: 0.0040
Epoch [6/100], Loss train: 0.0012, Loss valid: 0.0015
Epoch [7/100], Loss train: 0.0011, Loss valid: 0.0017
Epoch     6: reducing learning rate of group 0 to 1.0000e-04.
Epoch [8/100], Loss train: 0.0016, Loss valid: 0.0016
Epoch [9/100], Loss train: 0.0017, Loss valid: 0.0021
Epoch [10/100], Loss train: 0.0019, Loss valid: 0.0021
Epoch [11/100], Loss train: 0.0016, Loss valid: 0.0012
Epoch [12/100], Loss train: 0.0010, Loss valid: 0.0020
Epoch [13/100], Loss train: 0.0021, Loss valid: 0.0017
Epoch [14/100], Loss train: 0.0009, Loss valid: 0.0019
Epoch [15/100], Loss train: 0.0019, Loss valid: 0.0027
Epoch [16/100], Loss train: 0.0015, Loss valid: 0.0024
Epoch [17/100], Loss train: 0.0012, Loss valid: 0.0023
Epoch [18/10

## Evaluation

In [82]:
y_pred = trainer.predict()
metrics.f1_score(y_test, y_pred, average='macro')

0.131548871147835

Побился только один бейзлайн :(