In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Subset
from torch import optim
from sklearn.metrics import f1_score
import numpy as np
from IPython.display import clear_output
import matplotlib.pyplot as plt


In [None]:
!pip install navec
from navec import Navec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting navec
  Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Installing collected packages: navec
Successfully installed navec-0.10.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Download data

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/pre_post_question_preparing.csv')

In [None]:
data.head()

Unnamed: 0,question,label
0,Are boots cold reliable?,0
1,is the main body rubber or plastic?,0
2,How long after placing order until US delivery?,0
3,Will this fit a Stern Monopoly machine?,0
4,Is the power cord detachable?,0


In [None]:
import string
# реализуем предобработку
def preprocess(doc):
    # к нижнему регистру
    doc = doc.lower()
    # убираем пунктуацию, пробелы, прочее
    for p in string.punctuation + string.whitespace + 'http': 
        doc = doc.replace(p, ' ')
    # убираем кавычки
    for p in ['«', '»', '\'', '\"']:
        doc = doc.replace(p, ' ')
    # убираем лишние пробелы, объединяем обратно
    doc = doc.strip()
    doc = ' '.join([w for w in doc.split(' ') if w != ''])
    return 'CLS ' + doc

data['question'] = data['question'].map(preprocess)
data.head()

Unnamed: 0,question,label
0,CLS are boo s cold reliable,0
1,CLS is e main body rubber or las ic,0
2,CLS ow long af er lacing order un il us delivery,0
3,CLS will is fi a s ern mono oly mac ine,0
4,CLS is e ower cord de ac able,0


# Embedding

In [None]:
!wget https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar

--2023-02-23 14:37:35--  https://storage.yandexcloud.net/natasha-navec/packs/navec_hudlit_v1_12B_500K_300d_100q.tar
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53012480 (51M) [application/x-tar]
Saving to: ‘navec_hudlit_v1_12B_500K_300d_100q.tar’


2023-02-23 14:37:39 (16.4 MB/s) - ‘navec_hudlit_v1_12B_500K_300d_100q.tar’ saved [53012480/53012480]



In [None]:
navec = Navec.load('/content/navec_hudlit_v1_12B_500K_300d_100q.tar')

In [None]:
# check missing words
missing = 0
total = 0
missing_words = []
for i, row in data.iterrows():
  list_sent = row['question'].split()
  for word in list_sent:
    if navec.vocab.word_ids.get(word) is None:
      missing += 1
      missing_words.append(word)
    total += len(list_sent)

missing/total

0.010922733317557401

In [None]:
missing_words[:10]

['CLS', 'boo', 'CLS', 'CLS', 'lacing', 'CLS', 'ern', 'oly', 'ine', 'CLS']

In [None]:
navec_gensim = navec.as_gensim
print(len(navec_gensim.vocab))
print(navec_gensim.vectors[0].shape[0])

500002
300


In [None]:
std_vectors = navec_gensim.vectors.std(axis=0).mean()
print(std_vectors)

0.30867122016706344


In [None]:
word_list = list(set(missing_words))
vector_list = np.random.normal(0,0.3, (len(word_list), navec_gensim.vectors[0].shape[0]))
vector_list.shape

(37506, 300)

In [None]:
navec_gensim.add(word_list, vector_list)
navec_gensim.vocab['ocola'].index

529109

# Create Dataset

In [None]:
class dataset_classif(Dataset):

  def __init__(self, df):
    self.df = df
  
  def __len__(self):
    return len(self.df)
  
  def __getitem__(self, idx):
    text = self.df.iloc[idx, 0]
    label = self.df.iloc[idx, 1]
    text = torch.tensor([navec_gensim.vocab[word].index for word in text.split()], dtype=torch.long)
    return text, label

In [None]:
# postprocessing fill on nan for length max
def collate_fn(batch):
  x = [e[0] for e in batch]
  y = [e[1] for e in batch]
  return pad_sequence(x, padding_value=500001, batch_first = True), torch.tensor(y)

In [None]:
data = data.sample(frac=1, replace=False)
#data = data.sample(50)
data.head()

Unnamed: 0,question,label
28748,CLS ow long does i ake o ea u food exam le if ...,0
26076,CLS can i use modeling lam con inuously for 2 ...,0
154815,CLS w a e vol age range of i,1
169842,CLS do you carry is same case wi rose gold,1
96676,CLS are e s in guard good soccer,0


In [None]:
# gen random index
def subset_ind(dataset, ratio: float):
    return np.random.choice(len(dataset), size=int(ratio*len(dataset)), replace=False)

In [None]:
dataset = dataset_classif(data)

val_size = 0.2
val_inds = subset_ind(dataset, val_size)

train_dataset = Subset(dataset, [i for i in range(len(dataset)) if i not in val_inds])
val_dataset = Subset(dataset, val_inds)

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=2)
test_dataloader = DataLoader(val_dataset, batch_size=16, collate_fn= collate_fn)

# Model&Train

In [None]:
from torch.nn.modules import transformer
class SimpleTransformer(nn.Module):
  def __init__(self, w2v, padding_idx):
    super().__init__()
    self.embedding = nn.Embedding.from_pretrained(w2v)
    self.embedding.padding_idx = padding_idx
    self.embedding.weight.requires_grad = True
    self.transformer_layer_enc = nn.TransformerEncoderLayer(d_model=300, nhead=2, batch_first=True)
    self.transformer = nn.TransformerEncoder(self.transformer_layer_enc, num_layers=1)
    self.linear_cls = nn.Linear(300, 2)

  def forward(self, x):
    x = self.embedding(x)
    x = self.transformer(x)
    x = x[:,0,:]
    out = self.linear_cls(x)
    return out

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
w2v = torch.FloatTensor(navec_gensim.vectors)
model_transformer = SimpleTransformer(w2v, 500001).to(device)
optimizer = optim.AdamW(model_transformer.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

In [None]:
def train(model, optimizer, train_dataloader, test_dataloader, loss_fn, n_epoch, device):
  loss_list = []
  best_f1 = 0
  for _ in tqdm(range(n_epoch)):
    for i , batch in enumerate(tqdm(iter(train_dataloader))):
      x, y = batch[0].to(device), batch[1].to(device)

      model.train()
      optimizer.zero_grad()
      pred = model(x)
      loss = loss_fn(pred, y)
      loss_list.append(loss.item())
      loss.backward()
      optimizer.step()

      if  i % 500 == 0 and i != 0:
        with torch.no_grad():
          predicted_label = []
          real_label = []
          model.eval()
          for batch in iter(test_dataloader):
            x_val, y_val = batch[0].to(device), batch[1].to(device)
            pred_val = model(x_val)
            predicted_label.extend(np.argmax(pred_val.detach().cpu().numpy(), axis=1))
            real_label.extend(y_val.detach().cpu().numpy())
          f1 = f1_score(predicted_label, real_label, average='macro')
          print(f1)
          if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), f'/content/drive/MyDrive/Colab Notebooks/simple_transformer_f1_{np.round(f1,5)}.pt')

In [None]:
train(model_transformer, optimizer, train_dataloader, test_dataloader, loss_fn, 1, device)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9884 [00:00<?, ?it/s]

0.6745185112793881
0.6907197647047685
0.699116575016392
0.7130926569034337
0.7154010364047536
0.7097941311516749
0.7227241189169612
0.7162177842480952


# Check quality

In [None]:
def check_quality_test_data(model, test_loader):
  with torch.no_grad():
    predicted_label = []
    real_label = []
    for x, y in test_loader:
      x, y = x.to(device), y.to(device)
      y_pred = model(x)
      predicted_label.extend(np.argmax(y_pred.detach().cpu().numpy(), axis=1))
      real_label.extend(y.detach().cpu().numpy())
    print(classification_report(predicted_label, real_label))

In [None]:
model_check = SimpleTransformer(w2v, 500001).to(device)
model_check.load_state_dict(torch.load(f'/content/drive/MyDrive/Colab Notebooks/simple_transformer_f1_0.72272.pt'))
model_check.eval()

SimpleTransformer(
  (embedding): Embedding(537508, 300, padding_idx=500001)
  (transformer_layer_enc): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
    (linear1): Linear(in_features=300, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=300, bias=True)
    (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=2048, bias=Tr

In [None]:
check_quality_test_data(model_check, test_dataloader)

              precision    recall  f1-score   support

           0       0.05      0.81      0.10      1326
           1       0.99      0.51      0.67     38208

    accuracy                           0.52     39534
   macro avg       0.52      0.66      0.39     39534
weighted avg       0.96      0.52      0.65     39534

