In [6]:
DATASET_PATH  = "Augmented Datasets/"

In [5]:
import torch
import torch.nn as nn
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

class TwinnedModel(nn.Module):
    def __init__(self, do_finetuning, model_name1='microsoft/deberta-v3-xsmall', model_name2='microsoft/deberta-v3-xsmall', num_classes=3):
        super(TwinnedModel, self).__init__()

        # Load pre-trained models
        self.model1 = DebertaV2ForSequenceClassification.from_pretrained(model_name1, num_labels=num_classes)
        self.model2 = DebertaV2ForSequenceClassification.from_pretrained(model_name2, num_labels=num_classes)

        # Freeze the models if you don't want to train them further
        for param in self.model1.parameters():
            param.requires_grad = do_finetuning
        for param in self.model2.parameters():
            param.requires_grad = do_finetuning

        # Define a fully connected layer to combine the hidden states
        combined_hidden_size = self.model1.config.hidden_size + self.model2.config.hidden_size
        self.fc = nn.Linear(combined_hidden_size, num_classes)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        # Get outputs from both models (hidden states instead of logits)
        outputs1 = self.model1(input_ids1, attention_mask=attention_mask1, output_hidden_states=True)
        outputs2 = self.model2(input_ids2, attention_mask=attention_mask2, output_hidden_states=True)

        # Get the last hidden state (hidden_states[-1])
        hidden_state1 = outputs1.hidden_states[-1][:, 0, :]  # [CLS] token representation
        hidden_state2 = outputs2.hidden_states[-1][:, 0, :]  # [CLS] token representation

        # Concatenate the hidden states
        combined_hidden = torch.cat((hidden_state1, hidden_state2), dim=1)

        # Pass the combined hidden states through the fully connected layer
        logits = self.fc(combined_hidden)

        return logits

# Example usage
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-xsmall')

# Define sample input sentences
premise = "A man is eating food."
hypothesis = "The man is having a meal."

# Tokenize the input sentences for both models
inputs1 = tokenizer(premise, hypothesis, return_tensors='pt', padding=True, truncation=True, max_length=256)
inputs2 = tokenizer(hypothesis, premise, return_tensors='pt', padding=True, truncation=True, max_length=256)

# Instantiate the model
model = TwinnedModel(do_finetuning=True)

# Pass the inputs through the model
logits = model(inputs1['input_ids'], inputs1['attention_mask'], inputs2['input_ids'], inputs2['attention_mask'])

# Output logits for each class (entailment, contradiction, neutral)
print("Logits:", logits)

# Get the predicted class by applying torch.argmax
predicted_class = torch.argmax(logits, dim=1).item()

# Define the mapping of index to label
label_map = {0: "entailment", 1: "contradiction", 2: "neutral"}

# Get the corresponding label
predicted_label = label_map[predicted_class]

print(f"Predicted class: {predicted_class} : {predicted_label}")


  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[0.2114, 0.6161, 0.1677]], grad_fn=<AddmmBackward0>)
Predicted class: 1 : contradiction


In [10]:
import torch
import torch.nn as nn
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import json

class TwinnedModelForHF(nn.Module):
    def __init__(self, do_finetuning, model_name1='microsoft/deberta-v3-xsmall', model_name2='microsoft/deberta-v3-xsmall', num_classes=3):
        super(TwinnedModelForHF, self).__init__()

        # Load pre-trained models
        self.model1 = DebertaV2ForSequenceClassification.from_pretrained(model_name1, num_labels=num_classes)
        self.model2 = DebertaV2ForSequenceClassification.from_pretrained(model_name2, num_labels=num_classes)

        # Freeze the models if fine-tuning is not needed
        for param in self.model1.parameters():
            param.requires_grad = do_finetuning
        for param in self.model2.parameters():
            param.requires_grad = do_finetuning

        # Define a fully connected layer to combine the hidden states
        combined_hidden_size = self.model1.config.hidden_size + self.model2.config.hidden_size
        self.fc = nn.Linear(combined_hidden_size, num_classes)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2, labels=None):
        # Get outputs from both models (hidden states instead of logits)
        outputs1 = self.model1(input_ids1, attention_mask=attention_mask1, output_hidden_states=True)
        outputs2 = self.model2(input_ids2, attention_mask=attention_mask2, output_hidden_states=True)

        # Get the last hidden state (hidden_states[-1])
        hidden_state1 = outputs1.hidden_states[-1][:, 0, :]  # [CLS] token representation
        hidden_state2 = outputs2.hidden_states[-1][:, 0, :]  # [CLS] token representation

        # Concatenate the hidden states
        combined_hidden = torch.cat((hidden_state1, hidden_state2), dim=1)

        # Pass the combined hidden states through the fully connected layer
        logits = self.fc(combined_hidden)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {"loss": loss, "logits": logits}

# Function to load .jsonl dataset
def load_jsonl_dataset(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

# Function to tokenize and format the dataset
def encode_data(data, tokenizer, max_length=256):
    inputs1_list = []
    attention_mask1_list = []
    input_ids2_list = []
    attention_mask2_list = []
    labels_list = []

    # Map labels as integers representing the classes
    label_map = {'ENTAILMENT': 0, 'CONTRADICTION': 1, 'NEUTRAL': 2}

    
    for item in data:
        premise = item['premise']
        hypothesis = item['hypothesis']
        label = item['label']
        
        inputs1 = tokenizer(premise, hypothesis, truncation=True, padding='max_length', max_length=max_length)
        inputs2 = tokenizer(hypothesis, premise, truncation=True, padding='max_length', max_length=max_length)
        
        inputs1_list.append(inputs1['input_ids'])
        attention_mask1_list.append(inputs1['attention_mask'])
        input_ids2_list.append(inputs2['input_ids'])
        attention_mask2_list.append(inputs2['attention_mask'])
        labels_list.append(label_map[label])
    
    # Create a dictionary of lists
    dataset_dict = {
        "input_ids1": inputs1_list,
        "attention_mask1": attention_mask1_list,
        "input_ids2": input_ids2_list,
        "attention_mask2": attention_mask2_list,
        "labels": labels_list
    }
    
    return Dataset.from_dict(dataset_dict)


# Load the tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-xsmall')

# Load datasets from jsonl files
train_data = load_jsonl_dataset(DATASET_PATH+'fever_test_syn.jsonl')
val_data = load_jsonl_dataset(DATASET_PATH+'fever_test_syn.jsonl')
test_data = load_jsonl_dataset(DATASET_PATH+'fever_test_syn.jsonl')

# Encode datasets
train_dataset = encode_data(train_data, tokenizer)
val_dataset = encode_data(val_data, tokenizer)
test_dataset = encode_data(test_data, tokenizer)

# Custom data collator
def data_collator(features):
    input_ids1 = torch.tensor([f["input_ids1"] for f in features], dtype=torch.long)
    attention_mask1 = torch.tensor([f["attention_mask1"] for f in features], dtype=torch.long)
    input_ids2 = torch.tensor([f["input_ids2"] for f in features], dtype=torch.long)
    attention_mask2 = torch.tensor([f["attention_mask2"] for f in features], dtype=torch.long)
    labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)

    return {
        "input_ids1": input_ids1,
        "attention_mask1": attention_mask1,
        "input_ids2": input_ids2,
        "attention_mask2": attention_mask2,
        "labels": labels
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
     per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # Simulate batch size of 8*4=32 but without loading all at once
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
)

# Initialize the model
model = TwinnedModelForHF(do_finetuning=False)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate the model on the test dataset
trainer.evaluate(test_dataset)


  0%|          | 0/429 [04:31<?, ?it/s]
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the se

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 2.00 GiB total capacity; 1.67 GiB already allocated; 0 bytes free; 1.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF