In [1]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid that makes CUDA kernel
# launches synchronous so errors surface at the failing call; it also slows
# training down. Presumably a leftover from debugging — consider removing it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig, RobertaForSequenceClassification
from datasets import load_dataset
from evaluate import load
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
#  You can install and import any other libraries if needed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the selected device as the cell output.
device

'cuda'

In [3]:
# Full-width (Chinese) punctuation marks and the ASCII marks that replace them.
# The original author notes that the full-width forms may be tokenized as [UNK].
# Each entry is [original, replacement]; entries are applied in the listed order.
token_replacement = [
    [full_width, ascii_mark]
    for full_width, ascii_mark in (
        ("：", ":"),
        ("，", ","),
        ("“", '"'),
        ("”", '"'),
        ("？", "?"),
        ("……", "..."),
        ("！", "!"),
    )
]

In [4]:

# Load the "roberta-base" tokenizer; downloaded files are cached under ./cache/.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", cache_dir="./cache/")

In [5]:
# Quick look at how a sentence pair is encoded (ids and attention mask only).
tokenizer(
    text="Testing the output.",
    text_pair="Testing second sentence.",
    padding=True,
    truncation=True,
    return_tensors="pt",
    return_attention_mask=True,
    return_token_type_ids=False,
)

{'input_ids': tensor([[    0, 47446,     5,  4195,     4,     2,     2, 47446,   200,  3645,
             4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
class SemevalDataset(Dataset):
    """Map-style dataset over one split of ``sem_eval_2014_task_1``.

    The split is loaded once and kept in ``self.data`` as a list of dicts.
    Each record carries (at least) the keys ``premise``, ``hypothesis``,
    ``relatedness_score`` and ``entailment_judgment``.
    """

    def __init__(self, split="train") -> None:
        """Load the requested split.

        Args:
            split: One of ``"train"``, ``"validation"`` or ``"test"``.

        Raises:
            AssertionError: If ``split`` is not one of the three names above.
        """
        super().__init__()
        assert split in ["train", "validation", "test"]
        # NOTE(review): trust_remote_code=True lets the dataset's loading script
        # run locally — acceptable for this known dataset, but do not copy this
        # flag blindly to untrusted datasets.
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, trust_remote_code=True, cache_dir="./cache/"
        ).to_list()

    def __getitem__(self, index):
        """Return record ``index`` with full-width punctuation replaced.

        The replacement is done on a shallow copy, so the records stored in
        ``self.data`` are never modified by reading from the dataset.
        """
        record = dict(self.data[index])
        # Replace Chinese punctuation marks with their English counterparts
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                record[k] = record[k].replace(tok[0], tok[1])

        return record

    def __len__(self):
        """Number of records in the loaded split."""
        return len(self.data)

# Peek at the first three training records. They are read straight from `.data`,
# so they are shown as loaded, without going through `__getitem__`.
data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Downloading data: 100%|██████████| 87.3k/87.3k [00:00<00:00, 124kB/s] 
Downloading data: 100%|██████████| 93.4k/93.4k [00:00<00:00, 131kB/s] 
Downloading data: 100%|██████████| 16.4k/16.4k [00:00<00:00, 32.1MB/s]
Generating train split: 100%|██████████| 4500/4500 [00:00<00:00, 8934.71 examples/s]
Generating test split: 100%|██████████| 4927/4927 [00:00<00:00, 33425.59 examples/s]
Generating validation split: 100%|██████████| 500/500 [00:00<00:00, 27384.40 examples/s]


Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [7]:
# Fix PyTorch's random seed so that the freshly initialised heads, dropout and
# DataLoader shuffling are repeatable on "Restart & Run All" (as far as the
# underlying CUDA kernels allow).
seed = 42
torch.manual_seed(seed)

# Define the hyperparameters
# You can modify these values if needed
lr = 3e-5                  # AdamW learning rate
epochs = 3                 # number of passes over the training split
train_batch_size = 8
validation_batch_size = 8  # also used for the test DataLoader

In [8]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    """Pack a list of dataset records into tokenized inputs and label tensors.

    Args:
        batch: Sequence of records, each providing ``premise``, ``hypothesis``,
            ``relatedness_score`` and ``entailment_judgment``.

    Returns:
        A 3-tuple ``(encoded_pairs, relatedness_scores, entailment_judgments)``:
        the tokenizer output for the (premise, hypothesis) pairs, padded to the
        longest pair in the batch, a float tensor of relatedness scores and a
        long tensor of entailment class ids.
    """
    # Gather each field of the records into its own column.
    premise_texts = []
    hypothesis_texts = []
    score_values = []
    judgment_values = []
    for record in batch:
        premise_texts.append(record["premise"])
        hypothesis_texts.append(record["hypothesis"])
        score_values.append(record["relatedness_score"])
        judgment_values.append(record["entailment_judgment"])

    # Encode premise/hypothesis as sentence pairs; token type ids are not requested.
    encoded_pairs = tokenizer(
        premise_texts,
        hypothesis_texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False,
    )

    return (
        encoded_pairs,
        torch.FloatTensor(score_values),
        torch.LongTensor(judgment_values),
    )

# TODO1-2: Define your DataLoader
# Never ask for more worker processes than the machine has CPU cores: PyTorch
# warns about oversubscription and data loading can become slower or stall.
# On machines with 32+ cores this keeps the original value of 32.
num_loader_workers = min(32, os.cpu_count() or 1)

# Training batches are reshuffled every epoch.
dl_train = DataLoader(
    dataset=SemevalDataset(split="train"),
    collate_fn=collate_fn,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=num_loader_workers,
)
# Validation and test keep the dataset order and share the validation batch size.
dl_validation = DataLoader(
    dataset=SemevalDataset(split="validation"),
    collate_fn=collate_fn,
    batch_size=validation_batch_size,
    shuffle=False,
    num_workers=num_loader_workers,
)
dl_test = DataLoader(
    dataset=SemevalDataset(split="test"),
    collate_fn=collate_fn,
    batch_size=validation_batch_size,
    shuffle=False,
    num_workers=num_loader_workers,
)



In [9]:
# Pull a single batch from the training DataLoader to inspect what collate_fn produces.
print(next(iter(dl_train)))

({'input_ids': tensor([[    0,   250,   313,    16,   816,     5,  8669,  1945,     2,     2,
           250,   621,    16,   816,    10, 32909,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,   250,   664,   693,    16,   562,    10, 12904,    15,    69,
           865,     2,     2,   250,  1816,    16,   562,    10, 12904,    15,
            69,   865,     2,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,   970,    16,   117,  4758, 10601, 42922,  4835,  5803,     2,
             2,   133,  4758,    16, 10601, 42922,  4835,  5803,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,   250,   313,    16,   816,     5,  8669,     2,     2,   250,
           621,    16,  6288,     7,    10, 32909,   816,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1, 

In [10]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    """RoBERTa encoder with a regression head and a classification head.

    Both heads read the same representation: the last-layer hidden state at
    position 0 of each sequence, passed through dropout.

    Args:
        num_classes: Number of entailment classes predicted by the
            classification head (3 for this dataset).
        dropout: Dropout probability applied before both heads.
        *args, **kwargs: Forwarded unchanged to ``torch.nn.Module.__init__``.
    """

    def __init__(self, *args, num_classes=3, dropout=0.3, **kwargs):
        super().__init__(*args, **kwargs)

        # NOTE: the assignment template mentions "google-bert/bert-base-uncased",
        # but this notebook tokenizes with the "roberta-base" tokenizer, so the
        # matching "roberta-base" encoder is what is actually loaded here.
        # Its pooler weights are reported as newly initialised at load time;
        # that is harmless because forward() never uses the pooler output.
        self.roberta_model = RobertaModel.from_pretrained(
            "roberta-base",
            cache_dir="./cache/",
        )
        self.dropout = torch.nn.Dropout(dropout)

        hidden_size = self.roberta_model.config.hidden_size
        # One scalar per example (relatedness score).
        self.regression_head = torch.nn.Linear(hidden_size, 1)
        # One logit per entailment class.
        self.classification_head = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Encode a batch and apply both heads.

        Args:
            input_ids: Token ids as produced by the tokenizer.
            attention_mask: Attention mask as produced by the tokenizer.
            **kwargs: Any other tokenizer outputs; accepted and ignored so the
                model can still be called as ``model(**batch)``.

        Returns:
            ``(output_regression, output_classification)``: the regression head
            output with a trailing dimension of size 1, and the class logits
            with a trailing dimension of size ``num_classes``.
        """
        output = self.roberta_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at the first position of every sequence.
        cls_token_output = output.last_hidden_state[:, 0, :]
        output_dropout = self.dropout(cls_token_output)

        output_regression = self.regression_head(output_dropout)
        output_classification = self.classification_head(output_dropout)

        return output_regression, output_classification

In [11]:
# TODO3: Define your optimizer and loss function

# Fresh model instance, moved to the selected device.
model = MultiLabelModel().to(device)

# TODO3-1: Optimizer — AdamW over every model parameter, using the configured learning rate.
optimizer = AdamW(model.parameters(), lr=lr)

# TODO3-2: One loss per sub-task.
regression_criterion = torch.nn.MSELoss()               # relatedness score
classification_criterion = torch.nn.CrossEntropyLoss()  # entailment class

# Scoring functions used for validation and test.
psr = load("pearsonr")
acc = load("accuracy")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Display the Pearson metric module (its repr includes the usage text).
psr

EvaluationModule(name: "pearsonr", module_type: "metric", features: {'predictions': Value(dtype='float32', id=None), 'references': Value(dtype='float32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted class labels, as returned by a model.
    references (`list` of `int`): Ground truth labels.
    return_pvalue (`boolean`): If `True`, returns the p-value, along with the correlation coefficient. If `False`, returns only the correlation coefficient. Defaults to `False`.

Returns:
    pearsonr (`float`): Pearson correlation coefficient. Minimum possible value is -1. Maximum possible value is 1. Values of 1 and -1 indicate exact linear positive and negative relationships, respectively. A value of 0 implies no correlation.
    p-value (`float`): P-value, which roughly indicates the probability of an The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from t

In [13]:
import numpy as np
# Sanity check of the Pearson metric on two small, almost perfectly linear series.
x = np.array([10, 12, 15, 18, 20])
y = np.array([25, 30, 38, 45, 50])
# Calculate the Pearson correlation and, because return_pvalue=True, its p-value
results = psr.compute(references = x, predictions = y, return_pvalue=True)
# Print the raw result dict, then the two values rounded to three decimals
print(results)
print(f"Pearson correlation coefficient (r): {results['pearsonr']:.3f}")
print(f"P-value: {results['p-value']:.3f}")

{'pearsonr': np.float64(0.9997647888947827), 'p-value': np.float64(4.330173202906213e-06)}
Pearson correlation coefficient (r): 1.000
P-value: 0.000


In [14]:
# Display the accuracy metric module (its repr includes the usage text).
acc

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [15]:
# Sanity check of the accuracy metric: 3 of the 6 predictions match the
# references, so the expected result is 0.5.
results = acc.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
results

{'accuracy': 0.5}

In [17]:
import numpy as np
import os
# Best (Pearson + accuracy) seen on the validation split so far; the model is
# checkpointed every time this value improves.
best_score = 0.0

for ep in range(epochs):
    # ------------------------------------------------------------------
    # TODO4: training pass
    # ------------------------------------------------------------------
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    for batch_train_index, (input_batch, rel_score_batch, entail_judge_batch) in enumerate(pbar, start=1):
        input_batch = input_batch.to(device)
        rel_score_batch = rel_score_batch.to(device)
        entail_judge_batch = entail_judge_batch.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass through both heads.
        rel_score_preds, entail_judge_preds = model(**input_batch)

        # squeeze(-1) removes only the trailing size-1 dimension of the
        # regression output. A bare squeeze() would also drop the batch
        # dimension when a batch holds a single example, leaving a 0-d tensor
        # that no longer matches the target's shape.
        regression_loss = regression_criterion(rel_score_preds.squeeze(-1), rel_score_batch)
        classification_loss = classification_criterion(entail_judge_preds, entail_judge_batch)

        # Both sub-tasks are weighted equally.
        overall_loss = regression_loss + classification_loss

        # Back-propagation and parameter update.
        overall_loss.backward()
        optimizer.step()

        # Refresh the displayed loss only every 50 batches.
        if batch_train_index % 50 == 0:
            pbar.set_postfix(loss=overall_loss.item())

    # ------------------------------------------------------------------
    # TODO5: validation pass
    # ------------------------------------------------------------------
    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()

    real_rel_scores = []
    pred_rel_scores = []
    real_entail_classes = []
    pred_entail_classes = []

    with torch.no_grad():
        for input_batch, rel_score_real_batch, entail_judge_real_batch in pbar:
            input_batch = input_batch.to(device)

            rel_score_pred_batch, entail_judge_pred_batch = model(**input_batch)
            entailment_predicted_labels = torch.argmax(entail_judge_pred_batch, dim=1)

            pred_rel_scores.append(rel_score_pred_batch.squeeze(-1).cpu())
            pred_entail_classes.append(entailment_predicted_labels.cpu())
            # The labels are created on the CPU by collate_fn and are only
            # needed for scoring, so they are never moved to the GPU.
            real_rel_scores.append(rel_score_real_batch)
            real_entail_classes.append(entail_judge_real_batch)

    # One flat tensor per quantity over the whole validation split.
    pred_rel_scores = torch.cat(pred_rel_scores)
    real_rel_scores = torch.cat(real_rel_scores)
    pred_entail_classes = torch.cat(pred_entail_classes)
    real_entail_classes = torch.cat(real_entail_classes)

    pearson_corr = psr.compute(references=real_rel_scores, predictions=pred_rel_scores)['pearsonr']
    accuracy = acc.compute(references=real_entail_classes, predictions=pred_entail_classes)['accuracy']

    print(f"Epoch no. {ep} - Pearson Correlation: {pearson_corr} - Accuracy: {accuracy}")

    # Keep the checkpoint with the best combined validation score.
    if pearson_corr + accuracy > best_score:
        best_score = pearson_corr + accuracy
        os.makedirs("./saved_models", exist_ok=True)
        torch.save(model.state_dict(), './saved_models/best_model.ckpt')

Training epoch [1/3]: 100%|██████████| 563/563 [00:40<00:00, 13.88it/s, loss=0.534]
Validation epoch [1/3]: 100%|██████████| 63/63 [00:01<00:00, 39.63it/s]


Epoch no. 0 - Pearson Correlation: 0.896378647662928 - Accuracy: 0.876


Training epoch [2/3]: 100%|██████████| 563/563 [00:39<00:00, 14.09it/s, loss=0.169]
Validation epoch [2/3]: 100%|██████████| 63/63 [00:01<00:00, 38.24it/s]


Epoch no. 1 - Pearson Correlation: 0.8616918713520976 - Accuracy: 0.89


Training epoch [3/3]: 100%|██████████| 563/563 [00:39<00:00, 14.34it/s, loss=0.452]
Validation epoch [3/3]: 100%|██████████| 63/63 [00:01<00:00, 38.71it/s]


Epoch no. 2 - Pearson Correlation: 0.8864331895514849 - Accuracy: 0.89


In [18]:
# Load the model
# A fresh model is built and the best validation checkpoint is restored into it.
# map_location=device lets a checkpoint saved on the GPU be loaded on a
# CPU-only machine as well.
model = MultiLabelModel().to(device)
model.load_state_dict(
    torch.load("./saved_models/best_model.ckpt", weights_only=True, map_location=device)
)

# Test Loop
pbar = tqdm(dl_test, desc="Test")
model.eval()

# TODO6: evaluate the restored model on the test split
real_rel_scores = []
pred_rel_scores = []
real_entail_classes = []
pred_entail_classes = []

with torch.no_grad():
    for input_batch, rel_score_real_batch, entail_judge_real_batch in pbar:
        input_batch = input_batch.to(device)

        rel_score_pred_batch, entail_judge_pred_batch = model(**input_batch)
        entailment_predicted_labels = torch.argmax(entail_judge_pred_batch, dim=1)

        # squeeze(-1) removes only the trailing size-1 dimension, so the batch
        # dimension survives even for a batch holding a single example.
        pred_rel_scores.append(rel_score_pred_batch.squeeze(-1).cpu())
        pred_entail_classes.append(entailment_predicted_labels.cpu())
        # The labels are created on the CPU by collate_fn and are only needed
        # for scoring, so they are never moved to the GPU.
        real_rel_scores.append(rel_score_real_batch)
        real_entail_classes.append(entail_judge_real_batch)

# One flat tensor per quantity over the whole test split.
pred_rel_scores = torch.cat(pred_rel_scores)
real_rel_scores = torch.cat(real_rel_scores)
pred_entail_classes = torch.cat(pred_entail_classes)
real_entail_classes = torch.cat(real_entail_classes)

pearson_corr = psr.compute(references=real_rel_scores, predictions=pred_rel_scores)['pearsonr']
accuracy = acc.compute(references=real_entail_classes, predictions=pred_entail_classes)['accuracy']

print(f"Test Set - Pearson Correlation: {pearson_corr} - Accuracy: {accuracy}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test: 100%|██████████| 616/616 [00:09<00:00, 66.05it/s]


Test Set - Pearson Correlation: 0.8888255144218109 - Accuracy: 0.901968743657398
