# Connect to Google Drive and set the folder

In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!mkdir gp

mkdir: cannot create directory ‘gp’: File exists


In [3]:
!cp /content/drive/MyDrive/CS-7643-EfficiencyLane/gp.py /content/gp

In [4]:
cd gp

/content/gp


In [5]:
!ls -ltr

total 16
drwx------ 11 root root  4096 Apr 25 12:36 data
-rw-------  1 root root 10478 Apr 25 12:41 gp.py


In [6]:
!cp -r /content/drive/MyDrive/CS-7643-EfficiencyLane/data /content/gp

In [7]:
!pwd

/content/gp


# Install libraries

In [8]:
!pip install -qq adapters datasets

In [9]:
!pip install accelerate



In [10]:
!pip install transformers[torch]



In [11]:
from transformers import RobertaTokenizer, RobertaConfig
from transformers import RobertaForSequenceClassification, Trainer, EvalPrediction, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from adapters import AutoAdapterModel, RobertaAdapterModel, AdapterTrainer, AdapterConfig
from transformers import RobertaTokenizer, RobertaConfig
from gp import setup_dataset
import torch
import numpy as np
import matplotlib.pyplot as plt

def compute_f1(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return {"macro_f1": f1_score(p.label_ids, preds, average='macro')}

model_name = 'roberta-base'
dataset_name1 = 'citation_intent'
dataset_name2 = 'sciie'
model_type = 'base'

base_model = RobertaAdapterModel.from_pretrained(model_name)

# Define adapter configuration
adapter_config = AdapterConfig.load("pfeiffer")  # Using Pfeiffer configuration for simplicity

dataset1, num_labels1 = setup_dataset(dataset_name1)
dataset2, num_labels2 = setup_dataset(dataset_name2)

adapter_name1 = model_type+"_"+dataset_name1
adapter_name2 = model_type+"_"+dataset_name2

# Add two adapters, one for each task
base_model.add_adapter(adapter_name1, config=adapter_config)


"""config1 = RobertaConfig.from_pretrained(
        model_name,
        num_labels=num_labels1
)
config2 = RobertaConfig.from_pretrained(
        model_name,
        num_labels=num_labels2
)"""

adapter_training_args = TrainingArguments(
        learning_rate=5e-4,
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir=f"./fusion_{model_type}_adapter_logs",
        warmup_steps=500,
        logging_steps=10,
        output_dir=f"./fusion_{model_type}_adapter_output",
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        remove_unused_columns=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_macro_f1"
    )


#Activate the 1st adapter and train it
base_model.add_classification_head(
        adapter_name1,
        num_labels=num_labels1,
        overwrite_ok=True
    )
base_model.train_adapter(adapter_name1)
adapter_trainer1 = AdapterTrainer(
        model=base_model,
        args=adapter_training_args,
        train_dataset=dataset1["train"],
        eval_dataset=dataset1["dev"],
        compute_metrics=compute_f1,
    )
adapter_trainer1.train()


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaAdapterModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading dataset:citation_intent
Starting to load data...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Finished loading
All unique labels in the dataset: {'CompareOrContrast', 'Uses', 'Extends', 'Background', 'Future', 'Motivation'}
Label Encoder: {'CompareOrContrast': 0, 'Uses': 1, 'Extends': 2, 'Background': 3, 'Future': 4, 'Motivation': 5}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Loading dataset:sciie
Starting to load data...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Finished loading
All unique labels in the dataset: {'HYPONYM-OF', 'PART-OF', 'FEATURE-OF', 'COMPARE', 'USED-FOR', 'CONJUNCTION', 'EVALUATE-FOR'}
Label Encoder: {'HYPONYM-OF': 0, 'PART-OF': 1, 'FEATURE-OF': 2, 'COMPARE': 3, 'USED-FOR': 4, 'CONJUNCTION': 5, 'EVALUATE-FOR': 6}


Map:   0%|          | 0/3219 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

Map:   0%|          | 0/455 [00:00<?, ? examples/s]

Map:   0%|          | 0/3219 [00:00<?, ? examples/s]

Map:   0%|          | 0/974 [00:00<?, ? examples/s]

Map:   0%|          | 0/455 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Macro F1
1,1.3564,1.307702,0.11368
2,1.1627,1.124139,0.323295
3,1.1344,0.914606,0.40151
4,0.8646,0.787648,0.544861
5,0.5856,0.717973,0.593079
6,0.6685,0.721648,0.650529
7,0.4838,0.645197,0.69602
8,0.3351,0.756882,0.676529
9,0.306,0.865144,0.693713
10,0.1152,0.878396,0.693713




NameError: name 'fine_tuning_model' is not defined

In [12]:
for name, param in base_model.named_parameters():
    print(f"{name}: {param.numel()}")

roberta.embeddings.word_embeddings.weight: 38603520
roberta.embeddings.position_embeddings.weight: 394752
roberta.embeddings.token_type_embeddings.weight: 768
roberta.embeddings.LayerNorm.weight: 768
roberta.embeddings.LayerNorm.bias: 768
roberta.encoder.layer.0.attention.self.query.weight: 589824
roberta.encoder.layer.0.attention.self.query.bias: 768
roberta.encoder.layer.0.attention.self.key.weight: 589824
roberta.encoder.layer.0.attention.self.key.bias: 768
roberta.encoder.layer.0.attention.self.value.weight: 589824
roberta.encoder.layer.0.attention.self.value.bias: 768
roberta.encoder.layer.0.attention.output.dense.weight: 589824
roberta.encoder.layer.0.attention.output.dense.bias: 768
roberta.encoder.layer.0.attention.output.LayerNorm.weight: 768
roberta.encoder.layer.0.attention.output.LayerNorm.bias: 768
roberta.encoder.layer.0.intermediate.dense.weight: 2359296
roberta.encoder.layer.0.intermediate.dense.bias: 3072
roberta.encoder.layer.0.output.dense.weight: 2359296
roberta.enc

In [13]:
base_model.add_adapter(adapter_name2, config=adapter_config)

# Now train the second adapter
base_model.add_classification_head(
        adapter_name2,
        num_labels=num_labels2,
        overwrite_ok=True
    )
base_model.train_adapter(adapter_name2)
adapter_trainer2 = AdapterTrainer(
        model=base_model,
        args=adapter_training_args,
        train_dataset=dataset2["train"],
        eval_dataset=dataset2["dev"],
        compute_metrics=compute_f1,
    )
adapter_trainer2.train()



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Macro F1
1,1.3724,1.378178,0.22113
2,0.6373,0.603502,0.702372
3,0.5536,0.531738,0.784442
4,0.5865,0.48519,0.787648
5,0.2841,0.479583,0.819737
6,0.1599,0.441486,0.849281
7,0.0631,0.457161,0.826352
8,0.1141,0.469063,0.8592
9,0.0849,0.499128,0.854583
10,0.0713,0.501069,0.848801




TrainOutput(global_step=2020, training_loss=0.44856462642402933, metrics={'train_runtime': 339.0775, 'train_samples_per_second': 94.934, 'train_steps_per_second': 5.957, 'total_flos': 1379316602966400.0, 'train_loss': 0.44856462642402933, 'epoch': 10.0})

In [14]:
for name, param in base_model.named_parameters():
    print(f"{name}: {param.numel()}")

roberta.embeddings.word_embeddings.weight: 38603520
roberta.embeddings.position_embeddings.weight: 394752
roberta.embeddings.token_type_embeddings.weight: 768
roberta.embeddings.LayerNorm.weight: 768
roberta.embeddings.LayerNorm.bias: 768
roberta.encoder.layer.0.attention.self.query.weight: 589824
roberta.encoder.layer.0.attention.self.query.bias: 768
roberta.encoder.layer.0.attention.self.key.weight: 589824
roberta.encoder.layer.0.attention.self.key.bias: 768
roberta.encoder.layer.0.attention.self.value.weight: 589824
roberta.encoder.layer.0.attention.self.value.bias: 768
roberta.encoder.layer.0.attention.output.dense.weight: 589824
roberta.encoder.layer.0.attention.output.dense.bias: 768
roberta.encoder.layer.0.attention.output.LayerNorm.weight: 768
roberta.encoder.layer.0.attention.output.LayerNorm.bias: 768
roberta.encoder.layer.0.intermediate.dense.weight: 2359296
roberta.encoder.layer.0.intermediate.dense.bias: 3072
roberta.encoder.layer.0.output.dense.weight: 2359296
roberta.enc

In [18]:
# We evaluate the adapter1 first
base_model.set_active_adapters(adapter_name1)

adapter_trainer = AdapterTrainer(
    model=base_model,
    eval_dataset=dataset1['test'],
    compute_metrics=compute_f1,
)

# Perform the evaluation on the test dataset
evaluation_results = adapter_trainer.evaluate()

print("Evaluation Results for Citation Intent Dataset:")
print(evaluation_results)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Evaluation Results for Citation Intent Dataset:
{'eval_loss': 0.9000866413116455, 'eval_macro_f1': 0.6445163583461455, 'eval_runtime': 0.8038, 'eval_samples_per_second': 172.937, 'eval_steps_per_second': 22.395}


In [19]:
# Next we evaluate the second adapter
base_model.set_active_adapters(adapter_name2)

adapter_trainer = AdapterTrainer(
    model=base_model,
    eval_dataset=dataset2['test'],
    compute_metrics=compute_f1,
)

# Perform the evaluation on the test dataset
evaluation_results = adapter_trainer.evaluate()

print("Evaluation Results for Citation Intent Dataset:")
print(evaluation_results)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Evaluation Results for Citation Intent Dataset:
{'eval_loss': 0.6637074947357178, 'eval_macro_f1': 0.7976293959521256, 'eval_runtime': 6.4193, 'eval_samples_per_second': 151.729, 'eval_steps_per_second': 19.005}


In [21]:
# Iterate through the model's layers to get the parameter count for each layer
layer_info = []

for name, param in base_model.named_parameters():
    # Get the number of parameters in each layer
    param_count = param.numel()
    # Collect layer name and parameter count
    layer_info.append((name, param_count))

# Print the total number of parameters
total_params = sum(param_count for _, param_count in layer_info)

print("Total Parameter Count:", total_params)

Total Parameter Count: 128268262
