# Imports

In [None]:
import wandb

wandb.login(
    key="123",  # Replace with your actual API key
)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnomark[0m ([33mnomark-igor-sikorsky-kyiv-polytechnic-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import warnings

warnings.filterwarnings("ignore")


In [None]:
import os

import torch
import torch.nn as nn
from datasets import Dataset, concatenate_datasets, load_dataset
from tqdm import tqdm
from transformers import (
    BertConfig,
    BertForSequenceClassification,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import SequenceClassifierOutput


2025-06-19 20:57:46.149672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750366666.174822    7437 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750366666.182507    7437 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
subreddits = ["askphysics"]

splits = ["train", "validation", "test"]

all_data = {split: [] for split in splits}

for subreddit in subreddits:
    data = load_dataset("stanfordnlp/shp", data_dir=subreddit)
    for split in splits:
        all_data[split].append(data[split])

final_dataset = {split: concatenate_datasets(all_data[split]) for split in splits}

train_dataset = final_dataset["train"]
val_dataset = final_dataset["validation"]
test_dataset = final_dataset["test"]


# Load Model

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class BertScoringModel(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.score_head = nn.Linear(config.hidden_size, 1)
        self.post_init()

    def forward(
        self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        cls_output = outputs.last_hidden_state[:, 0, :]
        scores = self.score_head(cls_output).squeeze(-1)

        loss = None
        if labels is not None:
            labels = labels.float()
            loss = nn.MSELoss()(scores, labels)

        return SequenceClassifierOutput(loss=loss, logits=scores)


config = BertConfig.from_pretrained("bert-base-uncased")
model = BertScoringModel(config)
model.to("cuda" if torch.cuda.is_available() else "cpu")

BertScoringModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

# Zero eval

In [None]:
def eval_model(model, dataset, num_samples):
    samples = dataset.select(range(num_samples))

    model.eval()

    successful = 0

    def get_score(prompt, response):
        combined_text = f"{prompt} {tokenizer.sep_token} {response}"
        inputs = tokenizer(
            combined_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():
            output = model(**inputs)
            score = output.logits.item()

        return score

    for ex in tqdm(samples):
        prompt = ex["history"].strip()
        response_a = ex["human_ref_A"].strip()
        response_b = ex["human_ref_B"].strip()

        score_a = get_score(prompt, response_a)
        score_b = get_score(prompt, response_b)

        successful += 1 if score_a > score_b and ex["labels"] == 1 else 0
        successful += 1 if score_b > score_a and ex["labels"] == 0 else 0

    print("Successful:", successful)
    print("Total:", len(samples))
    print("Accuracy:", successful / len(samples))


In [None]:
eval_model(model, test_dataset, 587)

100%|██████████| 587/587 [00:47<00:00, 12.32it/s]

Successful: 215
Total: 587
Accuracy: 0.36626916524701875





# Fine tune model

In [None]:
def map_data_to_samples(dataset):
    def expand_example(example):
        prompt = example["history"]
        label = example["labels"]
        return {
            "prompt": [prompt, prompt],
            "response": [example["human_ref_A"], example["human_ref_B"]],
            "label": [int(label == 1), int(label == 0)],
        }

    expanded = dataset.map(
        expand_example, batched=False, remove_columns=dataset.column_names
    )
    flattened = Dataset.from_dict(
        {
            "prompt": sum(expanded["prompt"], []),
            "response": sum(expanded["response"], []),
            "label": sum(expanded["label"], []),
        }
    )
    return flattened


In [None]:
def tokenize_samples(dataset, tokenizer):
    sep_token = tokenizer.sep_token

    def tokenize_batch(batch):
        combined = [
            f"{p} {sep_token} {r}" for p, r in zip(batch["prompt"], batch["response"])
        ]
        tokenized = tokenizer(
            combined, truncation=True, padding="max_length", max_length=512
        )
        tokenized["label"] = batch["label"]
        return tokenized

    tokenized_dataset = dataset.map(
        tokenize_batch, batched=True, remove_columns=dataset.column_names
    )
    tokenized_dataset.set_format(
        type="torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"]
    )
    return tokenized_dataset


In [None]:
wandb.init(project="nlp_lab2", name="bert-finetune", reinit=True)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [12]:
mapped_train_dataset = map_data_to_samples(train_dataset)
mapped_val_dataset = map_data_to_samples(val_dataset)

tokenized_train_dataset = tokenize_samples(mapped_train_dataset, tokenizer)
tokenized_val_dataset = tokenize_samples(mapped_val_dataset, tokenizer)

Map:   0%|          | 0/7364 [00:00<?, ? examples/s]

Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Map:   0%|          | 0/14728 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
model.train()

training_args = TrainingArguments(
    output_dir="./bert-scoring",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to=["wandb"],
    run_name="bert-finetune",
    # # Multi-GPU optimizations
    fp16=True,  # Use mixed precision for better performance
    dataloader_num_workers=4,  # Parallel data loading
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

print("Evaluating original model before fine-tuning...")
pre_tuning_metrics = trainer.evaluate()
print("Pre-tuning metrics:", pre_tuning_metrics)

print("Starting fine-tuning...")
trainer.train()

trainer.save_model("./bert-finetuned-final")
print("Fine-tuning complete. Model saved to './bert-finetuned-final'")

print("Evaluating fine-tuned model...")
post_tuning_metrics = trainer.evaluate()
print("Post-tuning metrics:", post_tuning_metrics)


Evaluating original model before fine-tuning...


Pre-tuning metrics: {'eval_loss': 0.3922080099582672, 'eval_runtime': 17.2475, 'eval_samples_per_second': 47.427, 'eval_steps_per_second': 0.754}
Starting fine-tuning...


Epoch,Training Loss,Validation Loss
1,0.259,0.250514


Fine-tuning complete. Model saved to './bert-finetuned-final'
Evaluating fine-tuned model...


Post-tuning metrics: {'eval_loss': 0.25051364302635193, 'eval_runtime': 15.8342, 'eval_samples_per_second': 51.66, 'eval_steps_per_second': 0.821, 'epoch': 1.0}


# Final eval

In [None]:
eval_model(model, test_dataset, 587)

100%|██████████| 587/587 [00:47<00:00, 12.36it/s]

Successful: 329
Total: 587
Accuracy: 0.5604770017035775





In [None]:
if os.path.exists("/kaggle/working/NLP/Lab2/bert-finetuned-final"):
    model = BertScoringModel.from_pretrained(
        "/kaggle/working/NLP/Lab2/bert-finetuned-final", local_files_only=True
    )
    model.to("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    eval_model(model, test_dataset, 587)

100%|██████████| 587/587 [00:47<00:00, 12.35it/s]

Successful: 329
Total: 587
Accuracy: 0.5604770017035775



