In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/CS-7643-EfficiencyLane/

/content/drive/MyDrive/CS-7643-EfficiencyLane


In [3]:
!pip install -qq adapters datasets

In [5]:
!pip install accelerate



In [4]:
!pip install transformers[torch]



In [20]:
dataset_name = 'citation_intent'
path = f"data/{dataset_name}/"
model_name = 'roberta-base'

In [21]:
from datasets import load_dataset
# Load all splits at once
data_files = {
    "train": path + "train.jsonl",
    "test": path + "test.jsonl",
    "dev": path + "dev.jsonl"
}
dataset = load_dataset("json", data_files=data_files)

In [22]:
import json

def extract_labels(file_path):
    labels = set()  # Using a set to store unique labels
    with open(file_path, 'r') as file:
        for line in file:
            data = json.loads(line)
            labels.add(data["label"])
    return labels

train_labels = extract_labels(f'data/{dataset_name}/train.jsonl')
dev_labels = extract_labels(f'data/{dataset_name}/dev.jsonl')
test_labels = extract_labels(f'data/{dataset_name}/test.jsonl')

all_labels = train_labels.union(dev_labels).union(test_labels)
print("All unique labels in the dataset:", all_labels)
label_encoder = {label: idx for idx, label in enumerate(all_labels)}
print("Label Encoder:", label_encoder)
num_labels = len(label_encoder)

All unique labels in the dataset: {'Future', 'Extends', 'CompareOrContrast', 'Motivation', 'Uses', 'Background'}
Label Encoder: {'Future': 0, 'Extends': 1, 'CompareOrContrast': 2, 'Motivation': 3, 'Uses': 4, 'Background': 5}


In [23]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(model_name)



def encode_batch(batch):
  """Encodes a batch of input data using the model tokenizer."""
  return tokenizer(batch["text"], max_length=80, truncation=True, padding="max_length")

# Encode the input data
dataset = dataset.map(encode_batch, batched=True)

#dataset['train'][0]

def convert_labels_to_integers(example):
    return {'label': label_encoder[example['label']]}

dataset = dataset.map(convert_labels_to_integers)

# The transformers model expects the target class column to be named "labels"
dataset = dataset.rename_column(original_column_name="label", new_column_name="labels")

# Transform to pytorch tensors and only output the required columns
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

In [24]:
from transformers import RobertaConfig
from adapters import AutoAdapterModel

config = RobertaConfig.from_pretrained(
    model_name,
    num_labels=num_labels
)
model = AutoAdapterModel.from_pretrained(
    model_name,
    config=config,
)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Add a new adapter
adapter_name = model_name+"_"+dataset_name
model.add_adapter(adapter_name, config="pfeiffer")

# Add a matching classification head
model.add_classification_head(
    adapter_name,
    num_labels=num_labels
  )

# Activate the adapter
model.train_adapter(adapter_name)

In [26]:
import numpy as np
from transformers import TrainingArguments, EvalPrediction, Trainer
#from adapters import AdapterTrainer

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_accuracy,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [27]:
trainer.train()

Step,Training Loss
200,1.2322


TrainOutput(global_step=318, training_loss=1.1314510609368857, metrics={'train_runtime': 84.9239, 'train_samples_per_second': 119.26, 'train_steps_per_second': 3.745, 'total_flos': 426730975695360.0, 'train_loss': 1.1314510609368857, 'epoch': 6.0})

In [28]:
trainer.train()

Step,Training Loss
200,0.8551


TrainOutput(global_step=318, training_loss=0.8002688809760712, metrics={'train_runtime': 84.95, 'train_samples_per_second': 119.223, 'train_steps_per_second': 3.743, 'total_flos': 426730975695360.0, 'train_loss': 0.8002688809760712, 'epoch': 6.0})

In [29]:
trainer.evaluate()

{'eval_runtime': 2.0327,
 'eval_samples_per_second': 68.381,
 'eval_steps_per_second': 2.46,
 'epoch': 6.0}

In [33]:
model_name = 'allenai/cs_roberta_base'

config = RobertaConfig.from_pretrained(
    model_name,
    num_labels=num_labels
)
model = AutoAdapterModel.from_pretrained(
    model_name,
    config=config,
)

# Add a new adapter
adapter_name = model_name+"_"+dataset_name
model.add_adapter(adapter_name, config="pfeiffer")

# Add a matching classification head
model.add_classification_head(
    adapter_name,
    num_labels=num_labels
  )

# Activate the adapter
model.train_adapter(adapter_name)

training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=False,
)

def compute_accuracy(p: EvalPrediction):
  preds = np.argmax(p.predictions, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_accuracy,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [31]:
trainer.train()

Step,Training Loss
200,1.2606


TrainOutput(global_step=318, training_loss=1.177172966723172, metrics={'train_runtime': 87.8677, 'train_samples_per_second': 115.264, 'train_steps_per_second': 3.619, 'total_flos': 423608020669440.0, 'train_loss': 1.177172966723172, 'epoch': 6.0})

In [34]:
trainer.evaluate()

{'eval_runtime': 1.4986,
 'eval_samples_per_second': 92.754,
 'eval_steps_per_second': 3.336}