In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive/ (forcing a remount) so later cells
# can read the dataset stored there.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
!pip install datasets transformers evaluate accelerate wandb -q

In [3]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (rendered as a widget in the cell output).
# Presumably required because the training arguments set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import wandb
# Authenticate with Weights & Biases; the training run reports its metrics
# there (report_to="wandb" in the training arguments).
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mbagor123[0m ([33mgreatakela[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Обучение кросс-энкодера

Для переранжирования кандидатов используем BERT-модель, обученную на задаче классификации. Модель оценивает предложенных кандидатов, состоящих из контекста, вопроса и ответа, на предмет того, является ли ответ продолжением контекста + вопроса.

Для ранжирования ответов используем уверенность модели при классификации.

Ниже представлен код для обучения модели и сохранения её на Hugging Face для использования в чат-боте.

In [5]:
import gc

# Report whether the automatic collector is currently switched on.
gc_enabled = gc.isenabled()
print("Garbage Collection Enabled:", gc_enabled)

# Run a full collection right away; as the last expression of the cell,
# the value returned by gc.collect() is what the notebook displays.
gc.collect()

Garbage Collection Enabled: True


30

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments, set_seed
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
import accelerate
import evaluate
from transformers import EvalPrediction
import os
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, which makes upcoming API removals easy to miss.
warnings.filterwarnings("ignore")

In [7]:
# Load the pickled reranker dataset from the mounted Google Drive.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
data = pd.read_pickle("/content/drive/MyDrive/GNLP/HW1/spock_lines_reranker.pkl")

# Stratified 85/15 split on LABEL. The fixed random_state makes the
# train/valid partition identical on every Restart & Run All, so metrics
# from different runs are comparable.
df_train, df_valid = train_test_split(
    data,
    test_size=0.15,
    stratify=data['LABEL'],
    random_state=42,
)

# Wrap both splits as Hugging Face datasets; the pandas index is reset and
# dropped so it does not become an extra column.
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(df_train.reset_index(drop=True)),
        "valid": Dataset.from_pandas(df_valid.reset_index(drop=True)),
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['QUESTION', 'ANSWER', 'CONTEXT', 'LABEL', 'CQA'],
        num_rows: 8851
    })
    valid: Dataset({
        features: ['QUESTION', 'ANSWER', 'CONTEXT', 'LABEL', 'CQA'],
        num_rows: 1563
    })
})

In [8]:
# Display the source DataFrame as loaded, before the train/valid split.
data

Unnamed: 0,QUESTION,ANSWER,CONTEXT,LABEL,CQA
0,Scott here.,[voice rising] How can you Even support your ...,Get on with it. Why did you let him do it? I w...,1,Get on with it. Why did you let him do it? I w...
1,Speculation?,Because it's effective doesn't make it right.,What about medicine? Why no doctors? We haven'...,1,What about medicine? Why no doctors? We haven'...
2,Direct hit amidships by photon torpedo.,"In the beginning, but you can't tell me you d...","I'm sorry, Captain. Yes. You should've been a ...",1,"I'm sorry, Captain. Yes. You should've been a ..."
3,Too quickly. Bridge.,Freefall!,Aren't there certain mathematical problems whi...,0,Aren't there certain mathematical problems whi...
4,"All decks and divisions confirm, status red.",It's a good thing we didn't. Tightness in her...,The disease certainly is. How long do we have ...,1,The disease certainly is. How long do we have ...
...,...,...,...,...,...
10959,"Apparently it was not, Captain. Our tractor be...",And we are still increasing speed. Contact wit...,"Get ready to execute course change, Mister Sul...",0,"Get ready to execute course change, Mister Sul..."
10960,Why not?,Then go get yourself one and leave me alone.,"I'm weak, Captain, but not in difficulty. He m...",1,"I'm weak, Captain, but not in difficulty. He m..."
10961,The very reason for the existence of our stars...,"Captain, since I was included in the invitatio...","Speculation is unnecessary, Captain. The answe...",0,"Speculation is unnecessary, Captain. The answe..."
10962,"On audio, sir.",This is Spock.,"No more than an hour now, sir. Put all seconda...",0,"No more than an hour now, sir. Put all seconda..."


In [9]:
from collections import Counter

# Class-balance check: count how many training rows carry each LABEL value.
train_labels = dataset['train']['LABEL']
Counter(train_labels)

Counter({1: 4426, 0: 4425})

In [10]:
# Inspect a single training example (row 3) with all of its fields.
dataset['train'][3]

{'QUESTION': "Regulations. Don't give me regulations. You've wanted command all along. First little excuse you get",
 'ANSWER': ' [Turns to Foreman.] Half hour to remove the pRobe? [Foreman nods.]',
 'CONTEXT': "Ah, yes, that's more what we want. The dashing warrior and his elegant lady. That mirror. It's part of his audience, his ego. He never wanders from it. Is it ego, or something else? Explain. The mirror.",
 'LABEL': 1,
 'CQA': "Ah, yes, that's more what we want. The dashing warrior and his elegant lady. That mirror. It's part of his audience, his ego. He never wanders from it. Is it ego, or something else? Explain. The mirror.[SEP]Regulations. Don't give me regulations. You've wanted command all along. First little excuse you get[SEP] [Turns to Foreman.] Half hour to remove the pRobe? [Foreman nods.]"}

In [11]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [12]:
def preprocess_data(examples):
    """Tokenize the concatenated context/question/answer text.

    Args:
        examples: Mapping with a 'CQA' entry holding the text(s) to encode
            (a batch of rows when called via ``Dataset.map(batched=True)``).

    Returns:
        The encoding returned by the module-level ``tokenizer``.
    """
    encoding = tokenizer(
        examples['CQA'],
        add_special_tokens=True,
        # Cut inputs that exceed the tokenizer's maximum length. Without this,
        # an over-long dialogue would produce more tokens than the model has
        # position embeddings and crash the forward pass.
        truncation=True,
    )
    return encoding

In [13]:
# Accuracy metric from the `evaluate` library, loaded once and reused by
# every call to compute_metrics.
acc = evaluate.load("accuracy")


def compute_metrics(p: EvalPrediction):
    """Compute the accuracy reported at evaluation time.

    Args:
        p: Evaluation output exposing ``predictions`` and ``label_ids``.
            ``predictions`` is presumably shaped (n_examples, n_classes);
            verify if the model head changes.

    Returns:
        dict with a single "accuracy" entry.
    """
    # The predicted class is the index of the largest score along axis 1.
    predicted_classes = np.argmax(p.predictions, axis=1)
    scores = acc.compute(predictions=predicted_classes, references=p.label_ids)
    return {"accuracy": scores["accuracy"]}

In [14]:
# Tokenizer of the checkpoint being fine-tuned
# ("distilroberta-base" is noted by the author as an alternative).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Tokenize both splits, drop the raw text columns and rename the target
# column to "labels".
raw_text_columns = ["CONTEXT", "QUESTION", "ANSWER", "CQA"]
encoded_dataset = (
    dataset
    .map(preprocess_data, batched=True)
    .remove_columns(raw_text_columns)
    .rename_column("LABEL", "labels")
)
# Make indexing return torch tensors.
encoded_dataset.set_format("torch")
encoded_dataset

Map:   0%|          | 0/8851 [00:00<?, ? examples/s]

Map:   0%|          | 0/1563 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8851
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1563
    })
})

In [15]:
# Token length of one encoded training example (row 3). Indexing the row
# first avoids materialising the whole 'input_ids' column to read one entry.
encoded_dataset["train"][3]["input_ids"].shape

torch.Size([99])

In [16]:
# Seed Python/NumPy/torch before the model is built: the classification head
# is newly initialised (not loaded from the checkpoint), so without a seed
# its starting weights, and therefore the results, differ on every run.
set_seed(42)

# Pretrained BERT encoder with a two-class classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# Weights & Biases settings read from the environment: target project name
# and the model-logging mode.
os.environ["WANDB_PROJECT"] = "reranker_train"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# Alternative checkpoints noted by the author:
# "distilroberta-base", "distilbert-base-uncased"

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Hyper-parameters and run settings for fine-tuning the reranker.
training_args = TrainingArguments(
    output_dir="gnlp_hw1_reranker",
    overwrite_output_dir=True,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the old spelling is rejected by recent transformers releases.
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.001,
    num_train_epochs=3,
    warmup_ratio=0.1,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    # No intermediate checkpoints are written during training.
    # NOTE(review): with save_strategy="no", save_total_limit presumably has
    # no effect, and the Hub upload likely needs an explicit save/push after
    # training; confirm against the rest of the notebook.
    save_strategy="no",
    save_total_limit=1,
    # Batch together examples of similar length.
    group_by_length=True,
    push_to_hub=True,
    report_to="wandb",
)

In [18]:
# Gather the Trainer wiring in one place, then hand it over in a single call.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning; the value returned by train() is shown as the cell result.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss,Accuracy
1,0.335,0.329746,0.90659
2,0.1634,0.247001,0.945617
3,0.0332,0.267415,0.952015


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=3321, training_loss=0.19066020000128098, metrics={'train_runtime': 662.9986, 'train_samples_per_second': 40.05, 'train_steps_per_second': 5.009, 'total_flos': 1605448673707260.0, 'train_loss': 0.19066020000128098, 'epoch': 3.0})

In [19]:
# Finish the active Weights & Biases run; the run history and summary are
# printed in the cell output.
wandb.finish()

0,1
eval/accuracy,▁▇█
eval/loss,█▁▃
eval/runtime,█▃▁
eval/samples_per_second,▁▅█
eval/steps_per_second,▁▅█
train/epoch,▁▂▃▃▅▅▆▇██
train/global_step,▁▂▃▃▅▅▆▇██
train/grad_norm,▂▁█▁▁▁
train/learning_rate,█▇▆▄▂▁
train/loss,█▆▃▃▁▁

0,1
eval/accuracy,0.95202
eval/loss,0.26742
eval/runtime,10.1799
eval/samples_per_second,153.537
eval/steps_per_second,19.254
total_flos,1605448673707260.0
train/epoch,3.0
train/global_step,3321.0
train/grad_norm,0.01018
train/learning_rate,0.0


In [20]:
# Display the module structure of the fine-tuned model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e