In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
df_processed = pd.read_csv("datasets/cleaned_dataset.csv")
print(f"Dataset shape: {df_processed.shape}")

Dataset shape: (407047, 5)


In [7]:
premises = df_processed['premise'].tolist()
hypotheses = df_processed['hypothesis'].tolist()
labels = df_processed['label'].tolist()

In [8]:
X_temp_premise, X_test_premise, X_temp_hypothesis, X_test_hypothesis, y_temp, y_test = train_test_split(
    premises, hypotheses, labels, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels
)

# Second split: train vs validation
X_train_premise, X_val_premise, X_train_hypothesis, X_val_hypothesis, y_train, y_val = train_test_split(
    X_temp_premise, X_temp_hypothesis, y_temp,
    test_size=0.25,  # 0.25 * 0.8 = 0.2 of total data for validation
    random_state=42,
    stratify=y_temp )


In [12]:
from datasets import Dataset, DatasetDict

train_dict = {
    "premises": X_train_premise,
    "hypotheses": X_train_hypothesis,
    "labels": y_train
}

val_dict = {
    "premises": X_val_premise,
    "hypotheses": X_val_hypothesis,
    "labels": y_val
}

test_dict = {
    "premises": X_test_premise,
    "hypotheses": X_test_hypothesis,
    "labels": y_test
}


In [13]:
# Create Dataset objects for each split
train_ds = Dataset.from_dict(train_dict)
val_ds = Dataset.from_dict(val_dict)
test_ds = Dataset.from_dict(test_dict)

# Combine into a DatasetDict for convenience
ds = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})

# Access individual splits
print(ds["train"][0])
print(ds["validation"][0])
print(ds["test"][0])


{'premises': 'every single one of them is a tax-cutting, reform-the-government, conservative republican, gingrich declared on abc.', 'hypotheses': 'gingrich was made that he did not get a seat at the time.', 'labels': 1}
{'premises': 'and uh so i had her baby sitting but she was six months pregnant and it was getting too much for her so i just quit i would rather quit and take care of my own kids than let somebody else raise them', 'hypotheses': 'my babysitter was approaching her third trimester and struggling so decided to look after my kids instead', 'labels': 0}
{'premises': 'uh you can you can buy bags of silver coins a a bag has a thousand dollars face value in it and it is traded for silver', 'hypotheses': 'the bags are available for sale and you get them for just a thousand dollars.', 'labels': 0}


In [14]:
print(f"Train samples: {len(y_train)}")
print(f"Validation samples: {len(y_val)}")
print(f"Test samples: {len(y_test)}")

Train samples: 244227
Validation samples: 81410
Test samples: 81410


In [15]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
max_length = 128

def preprocess_function(examples):
    inputs = [
        f"Premise: {p} Hypothesis: {h}" 
        for p, h in zip(examples["premises"], examples["hypotheses"])
    ]
    model_inputs = tokenizer(
        inputs,
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
    # Include labels in the output dictionary if available
    model_inputs["labels"] = examples["labels"]  # or adjust key name if needed
    return model_inputs


In [24]:
train_encodings = ds["train"].map(
    preprocess_function,
    batched=True,
    load_from_cache_file=False,
    remove_columns=ds["train"].column_names,
    desc="Tokenizing train"
)
val_encodings = ds["validation"].map(
    preprocess_function,
    batched=True,
    load_from_cache_file=False,
    remove_columns=ds["validation"].column_names,
    desc="Tokenizing val"
)
test_encodings = ds["test"].map(
    preprocess_function,
    batched=True,
    load_from_cache_file=False,
    remove_columns=ds["test"].column_names,
    desc="Tokenizing test"
)

Tokenizing train:   0%|          | 0/244227 [00:00<?, ? examples/s]

Tokenizing val:   0%|          | 0/81410 [00:00<?, ? examples/s]

Tokenizing test:   0%|          | 0/81410 [00:00<?, ? examples/s]

In [25]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

In [26]:
batch_size = 16
train_dataloader = DataLoader(train_encodings, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
eval_dataloader = DataLoader(val_encodings, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

In [27]:
from peft import PromptEncoderConfig, get_peft_model

In [21]:
pip install peft

Collecting peft
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting accelerate>=0.21.0 (from peft)
  Using cached accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.17.1-py3-none-any.whl (504 kB)
Using cached accelerate-1.10.0-py3-none-any.whl (374 kB)
Installing collected packages: accelerate, peft

   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   ---------------------------------------- 0/2 [accelerate]
   -------------------- ------------------- 1/2 [peft]
   -------------------- ------------------- 1/2 [peft]
   -------------------- ------------------- 1/2 [peft]
   -------------------- ------------------- 1/2 [peft]
   -------------------- ------------------- 1/2 [peft]
   -------------------- ------------------- 1/2 [peft]
   -------------------- 

In [29]:
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 822,275 || all params: 278,868,230 || trainable%: 0.2949


In [58]:
from transformers import get_linear_schedule_with_warmup
import torch

lr = 1e-5
num_epochs = 10
dataset_size = 244227
total_steps = num_epochs * (dataset_size // batch_size)
warmup_steps = int(0.1 * total_steps)  # 10% warmup steps


optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs)
)

In [59]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, 
        average='weighted', 
        zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)
    
    precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
        labels, predictions, 
        average=None, 
        zero_division=0
    )
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'precision_per_class': precision_per_class.tolist(),
        'recall_per_class': recall_per_class.tolist(),
        'f1_per_class': f1_per_class.tolist(),
        'support_per_class': support.tolist()
    }

In [60]:
from tqdm import tqdm

device = "cuda"
model = model.to(device)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    all_logits = []
    all_labels = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        eval_loss += outputs.loss.item()
        all_logits.append(outputs.logits.detach().cpu().numpy())
        all_labels.append(batch["labels"].detach().cpu().numpy())

    eval_loss /= len(eval_dataloader)

    # Concatenate all predictions and labels
    all_logits = np.concatenate(all_logits, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    metrics = compute_metrics((all_logits, all_labels))

    train_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch}: Eval loss: {eval_loss:.4f}, Train Loss: {train_loss} Accuracy: {metrics['accuracy']:.4f}, F1: {metrics['f1']:.4f}")

    


100%|██████████| 15265/15265 [26:12<00:00,  9.71it/s]
100%|██████████| 5089/5089 [03:57<00:00, 21.43it/s]


Epoch 0: Eval loss: 1.0796, Train Loss: 1.0940337251804713 Accuracy: 0.4181, F1: 0.4109


100%|██████████| 15265/15265 [26:14<00:00,  9.70it/s]
100%|██████████| 5089/5089 [03:56<00:00, 21.54it/s]


Epoch 1: Eval loss: 1.0840, Train Loss: 1.093210575383802 Accuracy: 0.3809, F1: 0.3160


100%|██████████| 15265/15265 [26:15<00:00,  9.69it/s]
100%|██████████| 5089/5089 [03:59<00:00, 21.28it/s]


Epoch 2: Eval loss: 1.0861, Train Loss: 1.0924456885787959 Accuracy: 0.3702, F1: 0.2865


100%|██████████| 15265/15265 [26:16<00:00,  9.68it/s]
100%|██████████| 5089/5089 [04:01<00:00, 21.08it/s]


Epoch 3: Eval loss: 1.0831, Train Loss: 1.091860765911812 Accuracy: 0.3835, F1: 0.3200


100%|██████████| 15265/15265 [26:15<00:00,  9.69it/s]
100%|██████████| 5089/5089 [03:59<00:00, 21.28it/s]


Epoch 4: Eval loss: 1.0824, Train Loss: 1.0919149455811756 Accuracy: 0.3883, F1: 0.3335


100%|██████████| 15265/15265 [26:16<00:00,  9.68it/s]
100%|██████████| 5089/5089 [03:58<00:00, 21.36it/s]


Epoch 5: Eval loss: 1.0801, Train Loss: 1.0912611850784757 Accuracy: 0.3970, F1: 0.3533


100%|██████████| 15265/15265 [26:16<00:00,  9.68it/s]
100%|██████████| 5089/5089 [03:58<00:00, 21.32it/s]


Epoch 6: Eval loss: 1.0812, Train Loss: 1.090796236120362 Accuracy: 0.3929, F1: 0.3454


100%|██████████| 15265/15265 [26:16<00:00,  9.68it/s]
100%|██████████| 5089/5089 [03:58<00:00, 21.36it/s]


Epoch 7: Eval loss: 1.0792, Train Loss: 1.0905510258721478 Accuracy: 0.4013, F1: 0.3659


100%|██████████| 15265/15265 [26:15<00:00,  9.69it/s]
100%|██████████| 5089/5089 [03:58<00:00, 21.36it/s]


Epoch 8: Eval loss: 1.0817, Train Loss: 1.0906518046324112 Accuracy: 0.3884, F1: 0.3332


100%|██████████| 15265/15265 [26:15<00:00,  9.69it/s]
100%|██████████| 5089/5089 [03:58<00:00, 21.35it/s]

Epoch 9: Eval loss: 1.0820, Train Loss: 1.0903775750751226 Accuracy: 0.3868, F1: 0.3291



