# Language Adapters

## Plug in model and add adapter

In [None]:
import adapters
from adapters import AdapterModelInterface
from transformers import AutoModelForMaskedLM
import os

In [None]:
# Plugin interface: maps AdapterModelInterface fields to the module names of the
# loaded model. Field reference:
# https://github.com/adapter-hub/adapters/blob/main/src/adapters/wrappers/interfaces.py
# NOTE(review): the module names below are presumably those of the mmBERT
# checkpoint loaded further down — verify against the model's layer layout.
plugin_interface = AdapterModelInterface(
    # Enable bottleneck (task) and invertible (language) adapters.
    adapter_methods=["bottleneck", "invertible"],
    # Top-level modules.
    model_embeddings="embeddings",
    model_layers="layers",
    # Self-attention module and its projections; no cross-attention is declared.
    layer_self_attn="attn",
    attn_qkv_proj="Wqkv",
    attn_o_proj="Wo",
    layer_cross_attn=None,
    # Feed-forward projections.
    layer_intermediate_proj="mlp.Wi",
    layer_output_proj="mlp.Wo",
    # Hook points and layer norms.
    layer_pre_self_attn="attn",
    layer_pre_cross_attn=None,
    layer_pre_ffn="mlp",
    layer_ln_1="mlp_norm",
    layer_ln_2=None,
)

# Load the pretrained masked-LM checkpoint and attach adapter support to it
# through the interface defined above.
model = AutoModelForMaskedLM.from_pretrained("jhu-clsp/mmBERT-base")
adapters.init(model, interface=plugin_interface)

In [None]:
from adapters import SeqBnInvConfig

In [None]:
# The AdapterHub training guide (https://docs.adapterhub.ml/training.html) trains
# language adapters with the "seq_bn_inv" configuration, so we do the same.
# The available configuration names are listed at https://docs.adapterhub.ml/overview.html
config = SeqBnInvConfig()
model.add_adapter("English_adapter_2", config=config)

In [None]:
# Print the adapter summary to check which adapters the model currently holds.
print(model.adapter_summary())

In [None]:
# Make "English_adapter_2" the active adapter on the model.
model.set_active_adapters("English_adapter_2")

In [None]:
# Put the model into adapter-training mode for "English_adapter_2".
# NOTE(review): presumably this freezes the base model so that only the adapter
# weights are updated — confirm against the AdapterHub training docs.
model.train_adapter("English_adapter_2")

In [None]:
# Cleanup utility, disabled by default.
# Running the deletion as part of a top-to-bottom execution would remove
# "English_adapter_2" right after it was added, while the later cells still
# train the model and save the adapter under that name.
# Set RESET_ADAPTER = True and re-run this cell only to discard the adapter.
RESET_ADAPTER = False
if RESET_ADAPTER:
    model.delete_adapter("English_adapter_2")

## Get dataset

The language adapters published by the authors of the library were trained on Wikipedia articles, so we do the same, using the `wikimedia/wikipedia` dataset from Hugging Face. \
Every instance (e.g. `dataset['train'][0]`) is a whole article. TODO: find out whether the articles should be shortened.

Reference:
https://huggingface.co/docs/datasets/use_dataset - shows how to extract the text we want from each data point (here we keep the `text` column and ignore the `title` and `url` columns) and tokenize it.
    
Next, we want to concatenate the dataset and split it anew to get chunks that are readable by the model. Then, we mask them using Hugging Face's pipeline. Link: https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base")

In [None]:
# This dataset only has one split, "train"; there is no "test" split.
# The first 10,000 rows of the "20231101.en" configuration become the training data.
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split = 'train[:10000]')

In [None]:
# Rows 10000-11999 of the same split: a slice that does not overlap with `ds`,
# later chunked into the evaluation set.
ds1 = load_dataset("wikimedia/wikipedia", "20231101.en", split = 'train[10000:12000]')

In [None]:
# Inspect the raw text of the first article.
ds[0]['text']

In [None]:
# Check the Python type of the "text" field of the first article.
type(ds[0]['text'])

In [None]:
# Run the tokenizer on a single article to inspect the encoded output.
tokenizer(ds[0]['text'])

In [None]:
# Display the training dataset object.
ds

In [None]:
def tokenize_and_chunk(example, max_length=1024, min_length=100, tokenizer_fn=None):
    """Tokenize a batch of articles and split long ones into chunks.

    The chunking approach follows the video in the Hugging Face masked language
    modeling guide, which recommends it for very long inputs:
    https://huggingface.co/docs/transformers/main/tasks/masked_language_modeling

    Texts longer than ``max_length`` tokens are not cut off: the overflowing
    tokens are returned as additional chunks, so one input text can yield
    several output rows.

    Args:
        example: Batch mapping whose ``"text"`` entry holds the texts to encode.
        max_length: Maximum number of tokens per chunk. The default of 1024 was
            chosen by the author to be smaller than the model's maximum size in
            order to save memory.
        min_length: Chunks with fewer tokens than this are discarded.
        tokenizer_fn: Callable used for tokenization. When omitted, the
            module-level ``tokenizer`` is used.

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"``, each a list with
        one entry per kept chunk.
    """
    tokenize = tokenizer if tokenizer_fn is None else tokenizer_fn
    tokenized = tokenize(
        example["text"],
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Keep only chunks that are long enough; short leftovers are dropped.
    kept = [
        (ids, mask)
        for ids, mask, length in zip(
            tokenized["input_ids"],
            tokenized["attention_mask"],
            tokenized["length"],
        )
        if length >= min_length
    ]

    return {
        "input_ids": [ids for ids, _ in kept],
        "attention_mask": [mask for _, mask in kept],
    }

In [None]:
# Chunk the training articles. The original columns are dropped so that only the
# fields returned by tokenize_and_chunk remain.
train_batched = ds.map(tokenize_and_chunk, batched = True, remove_columns = ['id', 'url', 'title', 'text'])

In [None]:
# Chunk the held-out articles the same way to build the evaluation set.
test_batched = ds1.map(tokenize_and_chunk, batched = True, remove_columns = ['id', 'url', 'title', 'text'])

In [None]:
train_batched  # So this should work. An earlier run gave 4006 rows, i.e. more rows than articles, since long articles are split into several chunks.
# NOTE(review): the earlier note said every row is 2048 long, but tokenize_and_chunk now uses max_length=1024, so that figure looks stale — confirm.

In [None]:
# Have the training set return torch tensors when rows are accessed.
train_batched.set_format(type ='torch')

In [None]:
# Same output format for the evaluation set.
test_batched.set_format(type ='torch')

In [None]:
# Preview the first ten chunked rows.
train_batched[:10]

## Train adapter

In [None]:
#The adapter trainer does take a collate function: https://docs.adapterhub.ml/classes/adapter_training.html

In [None]:
import torch

In [None]:
from transformers import DataCollatorForLanguageModeling

# Use the end-of-sequence token for padding, as done in the Hugging Face
# masked language modeling guide.
# NOTE(review): this overwrites any pad token the tokenizer already defines —
# confirm that this is intended for the mmBERT tokenizer.
tokenizer.pad_token = tokenizer.eos_token

# Collator for masked language modeling with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
#Based on Github code and https://docs.adapterhub.ml/training.html

import numpy as np
from transformers import TrainingArguments, EvalPrediction
from adapters import AdapterTrainer

training_args = TrainingArguments(
    # Output location; an existing directory is overwritten.
    output_dir="./training_output_English_2",
    overwrite_output_dir=True,
    # Optimisation: a comparatively high learning rate, chosen to converge faster.
    learning_rate=1e-4,
    num_train_epochs=10,
    # Batching: 8 samples per device and step, with gradients accumulated over
    # 16 steps before each update (8 * 16 = 128 samples per device per update).
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=16,
    # gradient_checkpointing is not enabled: it would save memory but slow training.
    logging_steps=200,
    # Important: keep all dataset columns so that the dataset labels are
    # properly passed to the model.
    remove_unused_columns=False,
)


In [None]:
# Trainer from the adapters library; batches are assembled by data_collator.
# NOTE(review): training_args sets no evaluation strategy, so eval_dataset is
# presumably only used by an explicit trainer.evaluate() call — confirm.
trainer = AdapterTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_batched,
    eval_dataset=test_batched,
)

In [None]:
# Sanity check: pull one batch from the training dataloader and confirm that
# the collator produced a "labels" entry.
test = trainer.get_train_dataloader()
batch = next(iter(test))

print(batch.keys(), batch["labels"], sep="\n")

In [None]:
# Run training with the settings defined in training_args.
trainer.train()

In [None]:
# Save the adapter "English_adapter_2" to disk.
# NOTE(review): the directory name presumably records the loss of this run (20.57) — confirm.
model.save_adapter("./English_adapter_loss_20.57", "English_adapter_2")