# 0. Constants

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* PEP-3K Dataset Path

In [2]:
BASE_DIR_PEP = '/content/drive/MyDrive/semantic plausibility/datasets/pep-3k/train-dev-test-split'

In [3]:
TRAIN_PEP = 'train.csv'
DEV_PEP = 'dev.csv'
TEST_PEP = 'test.csv'

* PAP Dataset Path

In [4]:
BASE_DIR_PAP = '/content/drive/MyDrive/semantic plausibility/datasets/pap/train-dev-test-split-filtered/binary'

# 1. ROBERTA Fine-tuning in PEP

Implemented by Wen Wen & Chih-Yi Lin

## Prerpocessing

* Install

In [None]:
!pip install torch
!pip install transformers
!pip install datasets
!pip install accelerate

In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
from huggingface_hub import notebook_login

* Loading and Preprocessing Data

In [None]:
def load_data(base_dir, train_file, dev_file, test_file):

    # Joins file paths
    train_path = os.path.join(base_dir, train_file)
    dev_path = os.path.join(base_dir, dev_file)
    test_path = os.path.join(base_dir, test_file)

    train_data = pd.read_csv(train_path)
    dev_data = pd.read_csv(dev_path)
    test_data = pd.read_csv(test_path)

    # Converts text and label columns from DataFrames to lists
    train_texts, train_labels = train_data['text'].tolist(), train_data['label'].tolist()
    dev_texts, dev_labels = dev_data['text'].tolist(), dev_data['label'].tolist()
    test_texts, test_labels = test_data['text'].tolist(), test_data['label'].tolist()

    return train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels

## Needed Functions

* Tokenization and Encoding

In [None]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
def tokenize_data(texts, labels, tokenizer):

    # Tokenize texts with truncation and padding
    encodings = tokenizer(texts, truncation=True, padding=True)
    dataset = CustomDataset(encodings, labels)

    return dataset

* Training

In [None]:
# for pushing trained model
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def train_model(model, train_loader, dev_loader, optimizer, device, output_dir, num_epochs=3):

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=8,
        learning_rate=1e-5,
        num_train_epochs=num_epochs,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_loader.dataset,
        eval_dataset=dev_loader.dataset,
    )

    # Train the model
    trainer.train()

    # Push the model to the Hugging Face Model Hub
    trainer.push_to_hub("melodyzen168/")

In [None]:
def get_training_arguments(base_dir, train_file, dev_file, test_file):

    # Define a tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Load data
    train_texts, train_labels, dev_texts, dev_labels, test_texts, test_labels = load_data(base_dir, train_file, dev_file, test_file)

    # Tokenize data
    train_dataset = tokenize_data(train_texts, train_labels, tokenizer)
    dev_dataset = tokenize_data(dev_texts, dev_labels, tokenizer)
    test_dataset = tokenize_data(test_texts, test_labels, tokenizer)

    # Creates data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
    dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=8)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8)


    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    # Here sets the learning rate: lr=1e-5
    optimizer = AdamW(model.parameters(), lr=1e-5)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    output_directory = "./ROBERTA_WILL_BE_FINE"

    return train_loader, dev_loader, test_loader, tokenizer, model, optimizer, device, output_directory


* Evaluation function

In [None]:
def evaluate_model(model, loader, device):
    predictions = []
    true_labels = []
    predicted_probs = []

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            predicted_probs.extend(probabilities[:, 1].tolist())
            predictions.extend(torch.argmax(logits, dim=1).tolist())
            true_labels.extend(labels.tolist())

    accuracy = accuracy_score(true_labels, predictions)
    auc_score = roc_auc_score(true_labels, predicted_probs)

    print(f'Accuracy: {accuracy:.3f}')
    print(f'AUC Score: {auc_score:.3f}')

    return accuracy, auc_score


## Tuning in PEP Data

In [None]:
train_loader, dev_loader, test_loader, tokenizer, model, optimizer, device, output_directory = get_training_arguments(BASE_DIR_PEP, TRAIN_PEP, DEV_PEP, TEST_PEP)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_model(model, train_loader, dev_loader, optimizer, device, output_directory)

Step,Training Loss
500,0.6277


## Evaluation

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("melodyzen168/ROBERTA_WILL_BE_FINE")
trainer = Trainer(model=model)
print("Dev Score:")
dev_accuracy = evaluate_model(model, dev_loader, device)

print("Test Score:")
test_accuracy = evaluate_model(model, test_loader, device)

Dev Score:
Accuracy: 0.725
AUC Score: 0.827
Test Score:
Accuracy: 0.759
AUC Score: 0.825


# 2. PAP with RoBERTa and Distilled BERT

Implemented by Quy Nguyen

### Import dependencies

In [5]:
# Import the notebook_login function from the huggingface_hub module
# This function is used to authenticate a user in a notebook environment,
# allowing them to access their Hugging Face account and associated models and datasets.
# COMMENT THIS IF IF NEEDED
# from huggingface_hub import notebook_login
# notebook_login()

In [6]:
!pip install -q transformers datasets evaluate accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
from typing import Optional
import numpy as np
import pandas as pd
import torch
from dataclasses import dataclass, field

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from sklearn.metrics import precision_score, recall_score, roc_curve, auc

## Load the dataset

* Make sure to run Constants at the beginning of the Notebook

In [8]:
pap = load_dataset('csv', data_files={
    'train': f'{BASE_DIR_PAP}/train.csv',
    'dev': f'{BASE_DIR_PAP}/dev.csv',
    'test': f'{BASE_DIR_PAP}/test.csv'
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [9]:
# create a map of the expected ids to their labels
id2label = {0: "IMPLAUSIBLE", 1: "PLAUSIBLE"}
label2id = {"IMPLAUSIBLE": 0, "PLAUSIBLE": 1}

In [10]:
# look at 1 example
pap["train"][0]

{'text': 'group releases album', 'original_label': 'plausible', 'label': 1}

## Evaluation

In [11]:
# Load the 'accuracy' metric from the 'evaluate' library.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
  "compute metrics based on evaluation predictions."
  # Unpack the tuple containing predictions and true labels.
  predictions, labels = eval_pred
  # Use numpy's argmax function to get the index of the maximum value along axis 1.
  # This converts the predicted probabilities into the predicted class (0 or 1).
  predictions = np.argmax(predictions, axis=1)

  # Call the metric function loaded from the 'evaluate' library.
  # It compares the predicted classes to the true labels.
  return accuracy.compute(predictions=predictions, references=labels)


In [13]:
texts = ['man eats bridge', 'camel rides lake', 'paper kills leaves', 'man knits shirt']

In [14]:
def predict_text_plausibility(text, tokenizer, model):
  print(text)
  inputs = tokenizer(text, return_tensors="pt")
  with torch.no_grad():
      logits = model(**inputs).logits
  predicted_class_id = logits.argmax().item()
  # pred = model.config.id2label[predicted_class_id]
  print(predicted_class_id)
  return predicted_class_id

In [15]:
def evaluate_prediction(data_df):
  y_eval = data_df['label']
  y_pred = data_df['prediction']
  precision = precision_score(y_eval, y_pred)
  recall = recall_score(y_eval, y_pred)
  print(f'Precision: {precision:.3f} / Recall: {recall:.3f} / Accuracy: {(y_pred==y_eval).sum()/len(y_pred):.3f}')
  # Compute False Positive Rate, True Positive Rate, and AUC score
  fpr, tpr, thresholds = roc_curve(y_eval, y_pred)
  auc_score = auc(fpr, tpr)
  print(f'AUC: {auc_score:.3f}')

# Precision: 0.822 / Recall: 0.779 / Accuracy: 0.722
# AUC: 0.680

In [16]:
def evaluate_model(model_name, texts):
  print("make predict on some examples")
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  print("predict some random triplets\n")
  for t in texts:
    predict_text_plausibility(t, tokenizer, model)
  print("*"*10, "predict the test set", "*"*10, "\n")
  pap_df = pd.read_csv(f'{BASE_DIR_PAP}/test.csv')
  pap_df['prediction'] = pap_df['text'].apply(lambda x: predict_text_plausibility(x, tokenizer, model))
  evaluate_prediction(pap_df)


## Preprocessing

In [17]:
def preprocess_function(examples):
  "tokenize the text and truncate sequences to be no longer than BERT maximum input length"
  return tokenizer(examples["text"], truncation=True)

In [18]:
def preprocess(data_set):
  tokenized_data = data_set.map(preprocess_function, batched=True)

  return tokenized_data


## Experiment configurations

In [19]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="roberta-base", metadata={"help": "the model name"})
    output_dir: Optional[str] = field(default=f"./output", metadata={"help": "the output directory"})
    learning_rate: Optional[float] = field(default=2e-5, metadata={"help": "the learning rate"})
    per_device_train_batch_size: Optional[int] = field(default=16, metadata={"help": "batch size per device during training"})
    per_device_eval_batch_size: Optional[int] = field(default=16, metadata={"help": "batch size per device during evaluation"})
    seq_length: Optional[int] = field(default=512, metadata={"help": "Input sequence length"})
    num_train_epochs: Optional[int] = field(default=4, metadata={"help": "the number of training epochs"})
    weight_decay: Optional[float] = field(default=0.01, metadata={"help": "using weight decay"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the number of logging steps"})
    num_train_epochs: Optional[int] = field(default=3, metadata={"help": "the number of training epochs"})
    evaluation_strategy: Optional[str] = field(default="epoch", metadata={"help": "the number of training epochs"})
    save_strategy: Optional[str] = field(default="epoch", metadata={"help": "the number of training epochs"})
    load_best_model_at_end: Optional[bool] = field(default=True, metadata={"help": "Load the best model"})
    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the model to HF Hub"})

In [20]:
def run_experiment(script_args, data_set):
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
  tokenized_data = preprocess(data_set)
  training_args = TrainingArguments(
    output_dir=f"./{script_args.model_name}-semantic-plausibility",
    learning_rate=script_args.learning_rate,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    num_train_epochs=script_args.num_train_epochs,
    weight_decay=script_args.weight_decay,
    evaluation_strategy=script_args.evaluation_strategy,
    save_strategy=script_args.save_strategy,
    load_best_model_at_end=script_args.load_best_model_at_end,
    push_to_hub=script_args.push_to_hub,
  )
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
  )
  trainer.train()
  # trainer.push_to_hub() # uncomment if wish to push to hub

### ROBERTa

In [21]:
script_args = ScriptArguments()
model = AutoModelForSequenceClassification.from_pretrained(
  script_args.model_name, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
run_experiment(script_args, pap)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/174 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.513576,0.722543
2,No log,0.469197,0.786127
3,No log,0.484478,0.780347


In [22]:
model_name = "nguyenhongquy/roberta-base-semantic-plausibility"

In [23]:
evaluate_model(model_name, texts)

make predict on some examples


tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

predict some random triplets

man eats bridge
1
camel rides lake
1
paper kills leaves
1
man knits shirt
1
********** predict the test set ********** 

album makes debut
1
album breaks genre
1
lack produces form
1
inclusion expands range
1
candidacy encodes appreciation
1
literature comprises poetry
1
interpretation buries railway
0
invasion trims uniform
0
population begins period
1
saga injures courtesy
1
ratio outnumbers name
1
health gleans sweatshirt
1
growth implies ground
1
experimenter participates collision
1
delegation violates doctrine
1
attendee disengages norm
1
rafter accentuates tranquility
1
airship provides compliance
1
ordination withholds taboo
1
principle constrains wool
1
memory improves fitness
1
body gathers suspension
1
designer wins challenge
1
guest watches detonation
1
recommendation stimulates discussion
1
route utilizes part
1
eminence corroborates area
1
newspaper stimulates tradition
1
collision abandons value
1
pipe incorporates layer
1
letter asserts ant

### distilbert-base-uncased

In [24]:
script_args = ScriptArguments(model_name="distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
  script_args.model_name, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
run_experiment(script_args, pap)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1386 [00:00<?, ? examples/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/174 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.540567,0.710983
2,No log,0.521082,0.745665
3,No log,0.541128,0.734104


In [25]:
model_name = "nguyenhongquy/distilbert-base-uncased-semantic-plausibility"

In [26]:
evaluate_model(model_name, texts=texts)

make predict on some examples


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

predict some random triplets

man eats bridge
0
camel rides lake
1
paper kills leaves
0
man knits shirt
1
********** predict the test set ********** 

album makes debut
1
album breaks genre
1
lack produces form
1
inclusion expands range
1
candidacy encodes appreciation
1
literature comprises poetry
1
interpretation buries railway
1
invasion trims uniform
1
population begins period
1
saga injures courtesy
1
ratio outnumbers name
1
health gleans sweatshirt
0
growth implies ground
1
experimenter participates collision
1
delegation violates doctrine
1
attendee disengages norm
1
rafter accentuates tranquility
1
airship provides compliance
1
ordination withholds taboo
1
principle constrains wool
1
memory improves fitness
1
body gathers suspension
1
designer wins challenge
1
guest watches detonation
1
recommendation stimulates discussion
1
route utilizes part
1
eminence corroborates area
1
newspaper stimulates tradition
1
collision abandons value
1
pipe incorporates layer
1
letter asserts ant

## Interim Conclusion

*Roberta*

* All triples are predicted as Plausible, including "man eats bridge", "camel rides lake", "paper kills leaves". RoBERTa has a very high recall, suggesting that the model always predict Plausible for any triples.
* For the test set: Precision: 0.729 / Recall: 0.976 / Accuracy: 0.724
AUC: 0.538

Distilbert*

* All triples are predicted as Plausible, including "camel rides lake", but correctly predict "man eats bridge" as Implausible.
* For the test set: Precision: 0.739 / Recall: 0.960 / Accuracy: 0.730
AUC: 0.560
* Performance is better compared with RoBERTa, especially with AUC metric.

# Reference

* AdamW
https://keras.io/api/optimizers/adamw/

* Roberta https://huggingface.co/docs/transformers/v4.36.1/en/model_doc/roberta#transformers.RobertaForSequenceClassification

* The finetuning script is based on HuggingFace tutorial https://huggingface.co/docs/transformers/training