In [3]:
%pip install git+https://github.com/fkodom/yet-another-retnet.git

  pid, fd = os.forkpty()


Collecting git+https://github.com/fkodom/yet-another-retnet.git
  Cloning https://github.com/fkodom/yet-another-retnet.git to /tmp/pip-req-build-0gsbz68o
  Running command git clone --filter=blob:none --quiet https://github.com/fkodom/yet-another-retnet.git /tmp/pip-req-build-0gsbz68o
  Resolved https://github.com/fkodom/yet-another-retnet.git to commit fdd1c0e85a5ee64d4556c731879ee5efab9a968a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: yet-another-retnet
  Building wheel for yet-another-retnet (pyproject.toml) ... [?25l[?25hdone
  Created wheel for yet-another-retnet: filename=yet_another_retnet-0.5.1-py3-none-any.whl size=406883 sha256=d4e65fda56ee21951587592b982308397868cd49f9b7378d7f25881d0938c9c1
  Stored in directory: /tmp/pip-ephem-wheel-cache-8fhontav/wheels/23/1c/61/6971408ed03a8c880915076e3008802b3feeafb157cac

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import DistilBertTokenizer, DistilBertModel
#from datasets import Dataset

from torch.utils.data import Dataset, DataLoader

from transformers.pipelines.pt_utils import KeyDataset
from tqdm import tqdm
import pandas as pd

from yet_another_retnet.retnet import RetNet

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
torch.random.manual_seed(0)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = RetNet(
    num_tokens=len(tokenizer), # vocab size, usually taken from tokenizer
    d_model=192, #embedding dimension
    dim_feedforward = 384,
    nhead=4,
    num_layers=8,
    device=device)

model_q = RetNet(
    num_tokens=len(tokenizer), # vocab size, usually taken from tokenizer
    d_model=192, #embedding dimension
    dim_feedforward = 384,
    nhead=1,
    num_layers=3,
    device=device)

#retnet_1_3b(num_tokens=len(tokenizer), device=device)

In [6]:
model

RetNet(
  (embedding): Embedding(50258, 192)
  (decoder): RetNetDecoder(
    (layers): ModuleList(
      (0-7): 8 x RetNetDecoderLayer(
        (dropout): Dropout(p=0.1, inplace=False)
        (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (retention): MultiScaleRetention(
          (q_proj): Linear(in_features=192, out_features=192, bias=True)
          (k_proj): Linear(in_features=192, out_features=192, bias=True)
          (v_proj): Linear(in_features=192, out_features=192, bias=True)
          (group_norm): GroupNorm(4, 192, eps=1e-06, affine=False)
          (g_proj): Linear(in_features=192, out_features=192, bias=True)
          (out_proj): Linear(in_features=192, out_features=192, bias=True)
        )
        (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
        (linear1): Linear(in_features=192, out_features=384, bias=True)
        (linear2): Linear(in_features=384, out_features=192, bias=True)
      )
    )
  )
  (out): Linear(in_featur

In [5]:
train_test_data = pd.read_csv('/kaggle/input/human-vs-qwen25-n-phi3/train_test_data.csv')

In [7]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp
get_n_params(model.decoder), get_n_params(model_q.decoder)

(2672640, 1002240)

In [7]:
import os
import random
import numpy as np


def enable_determinism():
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)

def fix_seeds(seed: int):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.mps.manual_seed(seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(0)

enable_determinism()
fix_seeds(0)

In [8]:
# Класс датасета
class AnswersDataset(Dataset):
  def __init__(self, tokenizer, data_df, sampletype,  max_len=512):
    self.raw_data = data_df[data_df['sample_type']==sampletype]

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs_q = []
    self.inputs_a = []
    
    self.targets = []

    self.class_mapper = {'Human': 0, 'Phi3-mini': 1, 'Qwen25': 2}

    self.class_mapper_inv = {v: k for k, v in self.class_mapper.items()}

    self._build()


  def __len__(self):
    return len(self.inputs_a)

  def __getitem__(self, index):
    question_ids = self.inputs_q[index].squeeze()
    answers_ids = self.inputs_a[index].squeeze()


    target_ids = self.targets[index]


    return  question_ids, answers_ids, target_ids
    #{"question_ids": question_ids, "answers_ids": answers_ids, "target_ids": target_ids}

  def _build(self):
    self._buil_examples_from_files()

  def _buil_examples_from_files(self):
    # REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()]")
    # REPLACE_WITH_SPACE = re.compile("()|(\-)|(\/)")

    for i, row in tqdm(self.raw_data.iterrows(), total=self.raw_data.shape[0]):

      if pd.isna(row['Answers']):
        continue

      text_question = row['Question']
      text_answer = row['Answers']

      line_question = text_question.strip()
      line_answer = text_answer.strip()


      # line = REPLACE_NO_SPACE.sub("", line)
      # line = REPLACE_WITH_SPACE.sub("", line)
      # line = line + ' '

      target = self.class_mapper[row['Author']]

       # tokenize inputs
      tokenized_questions, q_mask = [v for k, v in self.tokenizer.batch_encode_plus(
          [line_question], max_length=self.max_len, padding='max_length', return_tensors="pt",
          truncation=True
      ).items()]
      tokenized_answers, a_mask = [v for k, v in self.tokenizer.batch_encode_plus(
          [line_answer], max_length=self.max_len, padding='max_length', return_tensors="pt",
          truncation=True
      ).items()]

       # tokenize targets


      self.inputs_q.append(tokenized_questions)
      self.inputs_a.append(tokenized_answers)
 
      self.targets.append(target)

In [9]:
max_seq_length = 128 # with 256 one epoch with 2 evaluations take 2 hours together
train_dataset = AnswersDataset(tokenizer, train_test_data, 'train', max_len=max_seq_length)
#test_dataset = AnswersDataset(tokenizer, train_test_data, 'test', max_len=max_seq_length)
val_dataset = AnswersDataset(tokenizer, train_test_data, 'val', max_len=max_seq_length)

100%|██████████| 67699/67699 [01:07<00:00, 995.90it/s] 
100%|██████████| 21233/21233 [00:21<00:00, 971.87it/s] 


In [10]:
g = torch.Generator()
batch_size=128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=g,
                          pin_memory=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        pin_memory=True, num_workers=2)
#test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [11]:
model.decoder

RetNetDecoder(
  (layers): ModuleList(
    (0-7): 8 x RetNetDecoderLayer(
      (dropout): Dropout(p=0.1, inplace=False)
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (retention): MultiScaleRetention(
        (q_proj): Linear(in_features=192, out_features=192, bias=True)
        (k_proj): Linear(in_features=192, out_features=192, bias=True)
        (v_proj): Linear(in_features=192, out_features=192, bias=True)
        (group_norm): GroupNorm(4, 192, eps=1e-06, affine=False)
        (g_proj): Linear(in_features=192, out_features=192, bias=True)
        (out_proj): Linear(in_features=192, out_features=192, bias=True)
      )
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (linear1): Linear(in_features=192, out_features=384, bias=True)
      (linear2): Linear(in_features=384, out_features=192, bias=True)
    )
  )
)

In [9]:
from tqdm import tqdm
import torch.nn as nn
class RetNetClf_FCtuned_ver2(nn.Module):
    def __init__(self, max_seq_length=512):
        super(RetNetClf_FCtuned_ver2, self).__init__()
        self.rnmodel_ans = model
        self.rnmodel_q = model_q.decoder

        self.model_out_features =  model.decoder.layers[-1].linear2.out_features
        self.dropout_rate = model.decoder.layers[-1].dropout.p

        
        self.model_q_out_entrance = torch.nn.Sequential(
            torch.nn.Dropout(p=self.dropout_rate, inplace=False),
            torch.nn.LayerNorm((self.model_out_features,), eps=1e-06, elementwise_affine=True)
        )

        self.retlayer_join_index = 1
        
        self.length=max_seq_length
        self.fc = nn.Linear(in_features=self.length*self.model_out_features, out_features = 3)
        self.activation = nn.Softmax(dim=1)
        #self.freeze_layers()


    def forward(self, qx, ax):
        # parallel mode for training

        q_emb = self.rnmodel_ans.embedding(qx)
        a_emb = self.rnmodel_ans.embedding(ax)

        rnout_a = a_emb
        rnout_q = self.rnmodel_q.forward_parallel(q_emb)
        for layer in self.rnmodel_ans.decoder.layers[:self.retlayer_join_index]:
            rnout_a = layer.forward_parallel(rnout_a)

        # now join question and answer in retention layer
        join_layer = self.rnmodel_ans.decoder.layers[self.retlayer_join_index]
        rnout_q = self.model_q_out_entrance(rnout_q)
        rnout_a = join_layer.dropout(rnout_a)
        rnout_a = join_layer.norm1(rnout_a)
        retout = join_layer.retention.forward_parallel(rnout_q, rnout_a, rnout_a)[0]

        retout = join_layer.norm2(retout)
        retout = join_layer.linear1(retout)
        retout = join_layer.linear2(retout)

        # now the rest retention layers as usual
        for layer in self.rnmodel_ans.decoder.layers[self.retlayer_join_index+1:]:
            retout = layer.forward_parallel(retout)
        #print(rnout_q.shape, rnout_a.shape)
        
        out = retout.reshape(retout.shape[0], -1)
        pred = self.fc(out)

        return self.activation(pred)


class RetNetClf_FCtuned_ver1(nn.Module):
    def __init__(self, max_seq_length=512):
        super(RetNetClf_FCtuned_ver1, self).__init__()
        self.rnmodel_ans = model # some retention layers then join question and answer in one ret. layer
        self.rnmodel_q = model_q.decoder

        
        self.model_out_features =  model.decoder.layers[-1].linear2.out_features

        self.length=max_seq_length
        self.fc = nn.Linear(in_features=2*self.length*self.model_out_features, out_features = 3)
        self.activation = nn.Softmax()
        #self.freeze_layers()


    def forward(self, qx, ax):
        # parallel mode for training

        q_emb = self.rnmodel_ans.embedding(qx)
        a_emb = self.rnmodel_ans.embedding(ax)
        
        rnout_q = self.rnmodel_q.forward_parallel(q_emb)
        rnout_a = self.rnmodel_ans.decoder.forward_parallel(a_emb)

        #print(rnout_q.shape, rnout_a.shape)
        concat = torch.concatenate([rnout_q, rnout_a], axis=1)
        out = concat.reshape(concat.shape[0], -1)
        pred = self.fc(out)

        return self.activation(pred)

In [11]:
#model1 = RetNetClf_FCtuned(max_seq_length)
model_1, model_2 = RetNetClf_FCtuned_ver1(max_seq_length), RetNetClf_FCtuned_ver2(max_seq_length)
get_n_params(model_1.rnmodel_ans)+get_n_params(model_1.rnmodel_q)+get_n_params(model_1.fc), get_n_params(model_1)

(23171669, 23171669)

In [13]:
get_n_params(model_2.rnmodel_ans)+get_n_params(model_2.rnmodel_q)\
+get_n_params(model_2.model_q_out_entrance)+get_n_params(model_2.fc), get_n_params(model_2)

(23098325, 23098325)

In [13]:
!pip install torcheval

  pid, fd = os.forkpty()


Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [14]:
from torcheval.metrics.functional import multiclass_confusion_matrix as conf_matrix

def evaluation_epoch(model, loader, loss_obj):
  n_correct, n_total = 0, 0
  n_correct_oo = 0
  total_loss = 0

  total_conf_matrix = torch.zeros(size=(3, 3))

  with torch.no_grad():
    for chunk in tqdm(loader):
      qx, ax, y = chunk
      pred = model(qx.to(device), ax.to(device)).cpu()
      apred = torch.argmax(pred, 1) # for all-vs-all classification

      oopred = torch.where(apred > 0, 1, 0) # for one-vs-others classification
      ooy = torch.where(y > 0, 1, 0)

      n_correct += (apred == y).sum()
      n_correct_oo += (oopred == ooy).sum()

      n_total += y.shape[0]

      total_loss += y.shape[0] * loss_obj(pred, y).item()
      total_conf_matrix+=conf_matrix(apred, y, num_classes=3)

  return {'accuracy_ava': n_correct/n_total, 'loss': total_loss/n_total,
          'accuracy_ovo':n_correct_oo/n_total, 'conf_matrix': total_conf_matrix}

def train_neural_net(model, train_loader, test_loader):
  loss = nn.CrossEntropyLoss()
  optimizer=torch.optim.Adam([{'params': model.rnmodel_ans.parameters()},
                              {'params': model.rnmodel_q.parameters()},
                              {'params': model.model_q_out_entrance.parameters()},
                              {'params': model.fc.parameters()}], lr=1e-4)
  n_epochs=8
  loss_train_history = [] # логируется всегда

  # логируются каждую эпоху

  train_epoch_evals = []
  test_epoch_evals = []

  for _ in range(n_epochs):
    i=0
    model.train().to(device)
    for train_chunk in tqdm(train_loader):
        qx, ax, y = train_chunk
        optimizer.zero_grad(set_to_none=True)


        pred = model(qx.to(device), ax.to(device))
        loss_val = loss(pred, y.to(device)) #.long()
        loss_val.backward()

        loss_val_item = loss_val.detach().cpu().item()

        optimizer.step()
        loss_train_history.append(loss_val_item)
        i+=1
        if i % 100 == 0:
            print(f'train step {i}: train loss = {loss_val_item :.3f}')
            #inspect retention weights
            # for j, layer in enumerate(model.rnmodel_ans.decoder.layers):
            #     if j == 0:
            #         print('gradient:{}\n----------\n{}'.format(j,torch.norm(layer.retention.q_proj.weight.grad)))
            # print('gradient:{}\n----------\n{}'.format(j,torch.norm(model.fc.weight.grad)))
        #break

    model.eval()
    #model.cpu()

    #train
    print('train evaluation')
    train_eval = evaluation_epoch(model, train_loader, loss)
    print(f"epoch {_}: train ava accuracy = {train_eval['accuracy_ava'] :.3f}")
    print(f"epoch {_}: train loss = {train_eval['loss'] :.3f}")
    print(f"epoch {_}: train ovo accuracy = {train_eval['accuracy_ovo'] :.3f}")
    train_epoch_evals.append(train_eval)

    #test
    print('test evaluation')
    test_eval = evaluation_epoch(model, test_loader, loss)
    print(f"epoch {_}: test ava accuracy = {test_eval['accuracy_ava'] :.3f}")
    print(f"epoch {_}: test loss = {test_eval['loss'] :.3f}")
    print(f"epoch {_}: test ovo accuracy = {test_eval['accuracy_ovo'] :.3f}")
    test_epoch_evals.append(test_eval)

  return {'training_loss_history': loss_train_history,
          'train_epochs_res': train_epoch_evals,
          'test_epochs_res': test_epoch_evals
          }

In [15]:
model1 = RetNetClf_FCtuned_ver2(max_seq_length)
train_hist = train_neural_net(model1, train_loader, val_loader) # потом перезапустить обучение Т5, т.к. забыл там активацию

  self.pid = os.fork()
 19%|█▉        | 100/529 [00:28<01:57,  3.64it/s]

train step 100: train loss = 0.874


 38%|███▊      | 200/529 [00:57<01:38,  3.34it/s]

train step 200: train loss = 0.942


 57%|█████▋    | 300/529 [01:29<01:18,  2.92it/s]

train step 300: train loss = 0.879


 76%|███████▌  | 400/529 [02:04<00:43,  2.94it/s]

train step 400: train loss = 0.871


 95%|█████████▍| 500/529 [02:37<00:09,  3.09it/s]

train step 500: train loss = 0.812


100%|██████████| 529/529 [02:47<00:00,  3.16it/s]


train evaluation


100%|██████████| 529/529 [00:59<00:00,  8.89it/s]


epoch 0: train ava accuracy = 0.717
epoch 0: train loss = 0.832
epoch 0: train ovo accuracy = 0.837
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.96it/s]


epoch 0: test ava accuracy = 0.704
epoch 0: test loss = 0.846
epoch 0: test ovo accuracy = 0.825


 19%|█▉        | 100/529 [00:33<02:23,  3.00it/s]

train step 100: train loss = 0.783


 38%|███▊      | 200/529 [01:06<01:51,  2.96it/s]

train step 200: train loss = 0.900


 57%|█████▋    | 300/529 [01:39<01:15,  3.05it/s]

train step 300: train loss = 0.846


 76%|███████▌  | 400/529 [02:12<00:42,  3.04it/s]

train step 400: train loss = 0.896


 95%|█████████▍| 500/529 [02:44<00:09,  3.15it/s]

train step 500: train loss = 0.879


100%|██████████| 529/529 [02:54<00:00,  3.04it/s]


train evaluation


100%|██████████| 529/529 [00:58<00:00,  8.99it/s]


epoch 1: train ava accuracy = 0.740
epoch 1: train loss = 0.811
epoch 1: train ovo accuracy = 0.846
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.98it/s]


epoch 1: test ava accuracy = 0.727
epoch 1: test loss = 0.823
epoch 1: test ovo accuracy = 0.836


 19%|█▉        | 100/529 [00:32<02:18,  3.09it/s]

train step 100: train loss = 0.819


 38%|███▊      | 200/529 [01:04<01:47,  3.07it/s]

train step 200: train loss = 0.784


 57%|█████▋    | 300/529 [01:36<01:13,  3.11it/s]

train step 300: train loss = 0.816


 76%|███████▌  | 400/529 [02:08<00:40,  3.19it/s]

train step 400: train loss = 0.794


 95%|█████████▍| 500/529 [02:40<00:09,  3.16it/s]

train step 500: train loss = 0.830


100%|██████████| 529/529 [02:49<00:00,  3.12it/s]


train evaluation


100%|██████████| 529/529 [00:58<00:00,  9.01it/s]


epoch 2: train ava accuracy = 0.764
epoch 2: train loss = 0.787
epoch 2: train ovo accuracy = 0.867
test evaluation


100%|██████████| 166/166 [00:18<00:00,  9.01it/s]


epoch 2: test ava accuracy = 0.745
epoch 2: test loss = 0.805
epoch 2: test ovo accuracy = 0.857


 19%|█▉        | 100/529 [00:31<02:09,  3.30it/s]

train step 100: train loss = 0.778


 38%|███▊      | 200/529 [01:01<01:40,  3.27it/s]

train step 200: train loss = 0.790


 57%|█████▋    | 300/529 [01:33<01:11,  3.18it/s]

train step 300: train loss = 0.840


 76%|███████▌  | 400/529 [02:04<00:39,  3.29it/s]

train step 400: train loss = 0.825


 95%|█████████▍| 500/529 [02:34<00:08,  3.37it/s]

train step 500: train loss = 0.794


100%|██████████| 529/529 [02:42<00:00,  3.25it/s]


train evaluation


100%|██████████| 529/529 [00:58<00:00,  9.04it/s]


epoch 3: train ava accuracy = 0.740
epoch 3: train loss = 0.811
epoch 3: train ovo accuracy = 0.856
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.99it/s]


epoch 3: test ava accuracy = 0.708
epoch 3: test loss = 0.844
epoch 3: test ovo accuracy = 0.843


 19%|█▉        | 100/529 [00:30<02:09,  3.32it/s]

train step 100: train loss = 0.778


 38%|███▊      | 200/529 [01:00<01:42,  3.20it/s]

train step 200: train loss = 0.762


 57%|█████▋    | 300/529 [01:30<01:08,  3.36it/s]

train step 300: train loss = 0.817


 76%|███████▌  | 400/529 [02:00<00:38,  3.36it/s]

train step 400: train loss = 0.825


 95%|█████████▍| 500/529 [02:30<00:08,  3.35it/s]

train step 500: train loss = 0.817


100%|██████████| 529/529 [02:39<00:00,  3.31it/s]


train evaluation


100%|██████████| 529/529 [00:59<00:00,  8.94it/s]


epoch 4: train ava accuracy = 0.745
epoch 4: train loss = 0.806
epoch 4: train ovo accuracy = 0.882
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.96it/s]


epoch 4: test ava accuracy = 0.743
epoch 4: test loss = 0.808
epoch 4: test ovo accuracy = 0.877


 19%|█▉        | 100/529 [00:30<02:10,  3.28it/s]

train step 100: train loss = 0.798


 38%|███▊      | 200/529 [01:00<01:39,  3.29it/s]

train step 200: train loss = 0.801


 57%|█████▋    | 300/529 [01:30<01:09,  3.31it/s]

train step 300: train loss = 0.762


 76%|███████▌  | 400/529 [02:00<00:38,  3.35it/s]

train step 400: train loss = 0.853


 95%|█████████▍| 500/529 [02:30<00:08,  3.36it/s]

train step 500: train loss = 0.755


100%|██████████| 529/529 [02:39<00:00,  3.33it/s]


train evaluation


100%|██████████| 529/529 [00:59<00:00,  8.96it/s]


epoch 5: train ava accuracy = 0.756
epoch 5: train loss = 0.795
epoch 5: train ovo accuracy = 0.882
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.96it/s]


epoch 5: test ava accuracy = 0.757
epoch 5: test loss = 0.794
epoch 5: test ovo accuracy = 0.884


 19%|█▉        | 100/529 [00:29<02:07,  3.36it/s]

train step 100: train loss = 0.778


 38%|███▊      | 200/529 [00:59<01:37,  3.37it/s]

train step 200: train loss = 0.801


 57%|█████▋    | 300/529 [01:29<01:07,  3.39it/s]

train step 300: train loss = 0.786


 76%|███████▌  | 400/529 [01:58<00:38,  3.39it/s]

train step 400: train loss = 0.864


 95%|█████████▍| 500/529 [02:28<00:08,  3.38it/s]

train step 500: train loss = 0.903


100%|██████████| 529/529 [02:36<00:00,  3.37it/s]


train evaluation


100%|██████████| 529/529 [00:58<00:00,  8.99it/s]


epoch 6: train ava accuracy = 0.760
epoch 6: train loss = 0.791
epoch 6: train ovo accuracy = 0.872
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.96it/s]


epoch 6: test ava accuracy = 0.748
epoch 6: test loss = 0.804
epoch 6: test ovo accuracy = 0.866


 19%|█▉        | 100/529 [00:29<02:07,  3.37it/s]

train step 100: train loss = 0.794


 38%|███▊      | 200/529 [00:59<01:37,  3.38it/s]

train step 200: train loss = 0.819


 57%|█████▋    | 300/529 [01:28<01:07,  3.39it/s]

train step 300: train loss = 0.887


 76%|███████▌  | 400/529 [01:58<00:38,  3.39it/s]

train step 400: train loss = 0.755


 95%|█████████▍| 500/529 [02:27<00:08,  3.38it/s]

train step 500: train loss = 0.786


100%|██████████| 529/529 [02:36<00:00,  3.38it/s]


train evaluation


100%|██████████| 529/529 [00:58<00:00,  8.98it/s]


epoch 7: train ava accuracy = 0.763
epoch 7: train loss = 0.789
epoch 7: train ovo accuracy = 0.882
test evaluation


100%|██████████| 166/166 [00:18<00:00,  8.96it/s]

epoch 7: test ava accuracy = 0.740
epoch 7: test loss = 0.812
epoch 7: test ovo accuracy = 0.878





In [16]:
def precision_macro(conf_matrix):
  #macro = averaged across precisions for each class
  n_classes = conf_matrix.shape[0]
  by_class = []
  for i in range(n_classes):
    val = conf_matrix[i, i]/conf_matrix[:, i].sum()
    by_class.append(val)
  return by_class, sum(by_class)/n_classes # class-wise precision and macro

def recall_macro(conf_matrix):
  #macro = averaged across recalls for each class
  n_classes = conf_matrix.shape[0]
  by_class = []
  for i in range(n_classes):
    val = conf_matrix[i, i]/conf_matrix[i, :].sum()
    by_class.append(val)
  return by_class, sum(by_class)/n_classes # class-wise recall and macro

def precision_ovo(conf_matrix, one_label=0): #one_label -- the label of the class which is opposed to other ones
  sub_matrix = conf_matrix[:, one_label]
  denum = conf_matrix.sum() - sub_matrix.sum()
  num = conf_matrix.sum() - sub_matrix.sum() - conf_matrix[one_label, :].sum() + conf_matrix[one_label, one_label]
  return num/denum
def recall_ovo(conf_matrix, one_label=0):
  return conf_matrix[one_label, one_label]/conf_matrix[one_label, :].sum()


In [None]:
test_dataset = AnswersDataset(tokenizer, train_test_data, 'test', max_len=max_seq_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                        pin_memory=True, num_workers=2)

 43%|████▎     | 7677/18019 [00:06<00:12, 840.03it/s] 

In [None]:
def evaluation_epoch_test(model, loader):
  n_correct, n_total = 0, 0
  n_correct_oo = 0
  total_loss = 0

  total_conf_matrix = torch.zeros(size=(3, 3))

  with torch.no_grad():
    for chunk in tqdm(loader):
      qx, ax, y = chunk
      pred = model(qx.to(device), ax.to(device)).cpu()
      apred = torch.argmax(pred, 1) # for all-vs-all classification

      oopred = torch.where(apred > 0, 1, 0) # for one-vs-others classification
      ooy = torch.where(y > 0, 1, 0)

      n_correct += (apred == y).sum()
      n_correct_oo += (oopred == ooy).sum()

      n_total += y.shape[0]

      #total_loss += y.shape[0] * loss_obj(pred, y).item()
      total_conf_matrix+=conf_matrix(apred, y, num_classes=3)

  return {'accuracy_ava': n_correct/n_total, 'loss': -1,
          'accuracy_ovo':n_correct_oo/n_total, 'conf_matrix': total_conf_matrix}

In [None]:
test_results = evaluation_epoch_test(model1, test_loader)

v2: precision_ovo, recall_ovo = (0.9707504510879517, 0.9477296471595764)

v1: precision_ovo, recall_ovo = (0.9473173022270203, 0.8949313759803772)

In [None]:
mtx = test_results['conf_matrix']
precision_ovo(mtx).item(), recall_ovo(mtx).item() #v2

v2: (([tensor(0.6849), tensor(0.7429), tensor(0.8725)], tensor(0.7668)),
 ([tensor(0.9477), tensor(0.7545), tensor(0.5211)], tensor(0.7411)))

v1: (([tensor(0.7607), tensor(0.8228), tensor(0.7750)], tensor(0.7861)),
 ([tensor(0.8949), tensor(0.7158), tensor(0.7563)], tensor(0.7890)))

In [None]:
precision_macro(mtx), recall_macro(mtx) #v2

In [None]:
mtx#v2

In [24]:
mtx#v1

tensor([[5085.,  401.,  196.],
        [ 843., 4768., 1050.],
        [ 757.,  626., 4291.]])

In [None]:
train_hist['real_test_res'] = [test_results]

In [None]:
def to_python_types(results):
  output = {}
  output['training_loss_history'] = results['training_loss_history']
  for x in ['train_epochs_res', 'test_epochs_res', 'real_test_res']:
    if x in [y for y in results.keys()]:
        output[x] = []
        for hist_log in results[x]:
          res = {}
          res['accuracy_ava'] = hist_log['accuracy_ava'].item()
          if 'loss' in [y for y in hist_log.keys()]:
            res['loss'] = hist_log['loss']
          res['accuracy_ovo'] = hist_log['accuracy_ovo'].item()
          if 'conf_matrix' in [y for y in hist_log.keys()]:
            res['conf_matrix'] = hist_log['conf_matrix'].tolist()
          output[x].append(res)

  return output

In [None]:
import json
with open('retnet_train_val_res_v2.json', 'w') as f:
    json.dump(to_python_types(train_hist), f)