In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Subset
from torch import optim
from sklearn.metrics import f1_score
import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt


In [None]:
!pip install navec
from navec import Navec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting navec
  Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Installing collected packages: navec
Successfully installed navec-0.10.0


# Download Data

In [None]:
data  = pd.read_csv('pre_post_question_preparing.csv')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197674 entries, 0 to 197673
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   question  197674 non-null  object
 1   label     197674 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.0+ MB


In [None]:
data.head()

Unnamed: 0,question,label
0,Are boots cold reliable?,0
1,is the main body rubber or plastic?,0
2,How long after placing order until US delivery?,0
3,Will this fit a Stern Monopoly machine?,0
4,Is the power cord detachable?,0


In [None]:
data['label'].value_counts()

0    100000
1     97674
Name: label, dtype: int64

# Embendings

In [None]:
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

--2023-02-12 14:11:36--  https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53012480 (51M) [application/x-tar]
Saving to: ‘navec_hudlit_v1_12B_500K_300d_100q.tar’


2023-02-12 14:11:45 (7.71 MB/s) - ‘navec_hudlit_v1_12B_500K_300d_100q.tar’ saved [53012480/53012480]



In [None]:
navec = Navec.load('/content/navec_hudlit_v1_12B_500K_300d_100q.tar')

In [None]:
len(navec.vocab.word_ids)

500002

In [None]:
navec.vocab.get('hi')

5720

In [None]:
import string
# реализуем предобработку
def preprocess(doc):
    # к нижнему регистру
    doc = doc.lower()
    # убираем пунктуацию, пробелы, прочее
    for p in string.punctuation + string.whitespace + 'http': 
        doc = doc.replace(p, ' ')
    # убираем кавычки
    for p in ['«', '»', '\'', '\"']:
        doc = doc.replace(p, ' ')
    # убираем лишние пробелы, объединяем обратно
    doc = doc.strip()
    doc = ' '.join([w for w in doc.split(' ') if w != ''])
    return doc

data['question'] = data['question'].map(preprocess)
data.head()

Unnamed: 0,question,label
0,are boo s cold reliable,0
1,is e main body rubber or las ic,0
2,ow long af er lacing order un il us delivery,0
3,will is fi a s ern mono oly mac ine,0
4,is e ower cord de ac able,0


In [None]:
# check missing words
missing = 0
total = 0
missing_words = []
for i, row in data.iterrows():
  list_sent = row['question'].split()
  for word in list_sent:
    if navec.vocab.word_ids.get(word) is None:
      missing += 1
      missing_words.append(word)
    total += len(list_sent)

missing/total

0.0088401532776349

In [None]:
missing_words[:10]

['boo',
 'lacing',
 'ern',
 'oly',
 'ine',
 'ower',
 'rime',
 '2016',
 'jee',
 'wrangler']

In [None]:
navec_gensim = navec.as_gensim
print(len(navec_gensim.vocab))
print(navec_gensim.vectors[0].shape[0])


500002
300


In [None]:
std_vectors = navec_gensim.vectors.std(axis=0).mean()
print(std_vectors)

0.30867122016706344


In [None]:
word_list = list(set(missing_words))
vectors_list = np.random.normal(0,0.3,(len(word_list), navec_gensim.vectors[0].shape[0]))
vectors_list.shape

(37505, 300)

In [None]:
navec_gensim.add(word_list, vectors_list)
navec_gensim.vocab['ocola'].index

502598

# Create datasets

In [None]:
class dataset_classif(Dataset):
  def __init__(self, df):
    self.df = df
  
  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
     text = self.df.iloc[idx, 0]
     label = self.df.iloc[idx, 1]
     text = torch.tensor([navec_gensim.vocab[word].index for word in text.split()], dtype=torch.long)
     return text, label


In [None]:
# postprocessing fill on nan for length max
def collate_fn(batch):
  x = [e[0] for e in batch]
  y = [e[1] for e in batch]
  return pad_sequence(x, padding_value=500001, batch_first = True), torch.tensor(y)

In [None]:
data = data.sample(frac=1)
#data = data.sample(50)
data.head()

Unnamed: 0,question,label
29751,are ese exercises accom anied wi music,0
191721,i need ri le swi c la e wi a oggle o en as wel...,1
34607,would ese work for o os i run a o ogra y busin...,0
41367,do air or s le i go roug,0
84193,can i know e c es measures for large size s ir,0


In [None]:
# gen random index
def subset_ind(dataset, ratio: float):
    return np.random.choice(len(dataset), size=int(ratio*len(dataset)), replace=False)

In [None]:
dataset = dataset_classif(data)

val_size = 0.2
val_inds = subset_ind(dataset, val_size)

train_dataset = Subset(dataset, [i for i in range(len(dataset)) if i not in val_inds])
val_dataset = Subset(dataset, val_inds)

In [None]:
# dataloader
train_dataloader = DataLoader(train_dataset, batch_size=16, collate_fn= collate_fn, shuffle=True, pin_memory=True, num_workers=2)
test_dataloader = DataLoader(val_dataset, batch_size=16, collate_fn= collate_fn)

In [None]:
train_iter = iter(train_dataloader)
next(train_iter)

(tensor([[ 13897,   6407,  13990,  13862,      0,  13273,  14138,   3708,   1683,
          503605, 500322,   5020, 500322, 511091,  10929, 500001, 500001, 500001,
          500001, 500001, 500001],
         [  1747,  14178, 517691,   3708,   8881,   4087,  13862,   5016,   3991,
          500001, 500001, 500001, 500001, 500001, 500001, 500001, 500001, 500001,
          500001, 500001, 500001],
         [ 13897,   6024,   4501, 528756,  10842,   1684,    252,  11165,  10929,
            6174, 504666,   4713, 509587, 500001, 500001, 500001, 500001, 500001,
          500001, 500001, 500001],
         [ 13648,  14138,   6407,   8332,  10929,   6024,   6235,   3132,   6072,
            6024,  10929, 534393,    728,   6174,  10929, 504674, 500001, 500001,
          500001, 500001, 500001],
         [   543,    905,      0,   7246,   8754,   7636, 500001, 500001, 500001,
          500001, 500001, 500001, 500001, 500001, 500001, 500001, 500001, 500001,
          500001, 500001, 500001],
     

# Model

In [None]:
class lstm(nn.Module):
  def __init__(self, w2v, padding_idx, dropout, hidden_size):
    super().__init__()
    self.embedding = nn.Embedding.from_pretrained(w2v)
    self.embedding.padding_idx = padding_idx
    self.embedding.weight.requires_grad = True
    self.dropout = nn.Dropout(dropout)
    self.lstm = nn.LSTM(input_size = self.embedding.embedding_dim,
                        hidden_size = hidden_size,
                        dropout = dropout,
                        batch_first=True)
                        
    self.layer = nn.Linear(hidden_size, 2)
  
  def forward(self, x):
    x = self.embedding(x)
    out, (hidden, c) = self.lstm(x)

    x = self.dropout(torch.cat([c[i,:,:] for i in range(c.shape[0])], dim=1))# делает срез со всех lstm и направлений в нем
    x = self.layer(x)
    return x

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device 

device(type='cpu')

In [None]:
w2v = torch.FloatTensor(navec_gensim.vectors)
model_lstm = lstm(w2v, 500001, dropout=0.2, hidden_size=256).to(device)
optimizer = optim.AdamW(model_lstm.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()



In [None]:
def train(model, optimizer, train_dataloader, test_dataloader, loss_fn, n_epoch, device):
  loss_list = []
  best_f1 = 0
  for _ in tqdm(range(n_epoch)):
    for i , batch in enumerate(tqdm(iter(train_dataloader))):
      x, y = batch[0].to(device), batch[1].to(device)

      model.train()
      optimizer.zero_grad()
      pred = model(x)
      loss = loss_fn(pred, y)
      loss_list.append(loss.item())
      loss.backward()
      optimizer.step()

      # clear_output(True)
      # plt.plot(loss_list)
      # plt.title("Training loss")
      # plt.xlabel("Batch")
      # plt.ylabel("Loss")
      # plt.show()
      if  i % 1000 == 0 and i != 0:
        with torch.no_grad():
          predicted_label = []
          real_label = []
          model.eval()
          for batch in iter(test_dataloader):
            x_val, y_val = batch[0].to(device), batch[1].to(device)
            pred_val = model(x_val)
            predicted_label.extend(np.argmax(pred_val.detach().cpu().numpy(), axis=1))
            real_label.extend(y_val.detach().cpu().numpy())
          f1 = f1_score(predicted_label, real_label, average='macro')
          print(f1)
          if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f'model_lstm_f1_{f1}.pt')

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
train(model_lstm, optimizer, train_dataloader, test_dataloader, loss_fn, 2, device)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/9884 [00:00<?, ?it/s]

0.6823386083020082
0.7118151084989357
0.5654371268185475
0.6771018166743412
0.7311869808960207
0.728665178846418
0.7352711710383795
0.7256842511512953


# Check quality

In [None]:
def check_quality_test_data(model, test_loader):
  with torch.no_grad():
    predicted_label = []
    real_label = []
    for x, y in test_loader:
      x, y = x.to(device), y.to(device)
      y_pred = model(x)
      predicted_label.extend(np.argmax(y_pred.detach().cpu().numpy(), axis=1))
      real_label.extend(y.detach().cpu().numpy())
    print(classification_report(predicted_label, real_label))

In [None]:
model_lstm = lstm(w2v, 500001, dropout=0.2, hidden_size=256).to(device)
model_lstm.load_state_dict(torch.load('model_lstm_f1_0.735.pt'))
model_lstm.eval()

lstm(
  (embedding): Embedding(537507, 300, padding_idx=500001)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(300, 256, batch_first=True, dropout=0.2)
  (layer): Linear(in_features=256, out_features=2, bias=True)
)

In [None]:
check_quality_test_data(model_lstm, test_dataloader)

              precision    recall  f1-score   support

           0       0.82      0.68      0.75     24136
           1       0.61      0.77      0.68     15398

    accuracy                           0.72     39534
   macro avg       0.72      0.73      0.71     39534
weighted avg       0.74      0.72      0.72     39534

