In [1]:
packages_to_install = {
    "ipywidgets": "ipywidgets",
    "numpy": "numpy=1.24.0",
    "torch": "torch",
    "matplotlib": "matplotlib",
    "sentencepiece": "sentencepiece",
    "protobuf": "protobuf",
    "datasets": "datasets",
    "transformers": "transformers",
    "diffusers": "diffusers",
    "peft": "peft",
    "h5py": "h5py",
    "scikit-learn": "scikit-learn",
    "scipy": "scipy",
    "wandb": "wandb",
    "mwparserfromhell": "mwparserfromhell",
    "apache_beam": "apache_beam"
}

In [2]:
%%time
import importlib

for package_name, install_command in packages_to_install.items():
    try:
        importlib.import_module(package_name)
        print(f"{package_name} is already installed.")
    except ImportError:
        print(f"{package_name} is not installed. Installing it now...")
        !conda install -y {install_command}

ipywidgets is already installed.
numpy is already installed.
torch is already installed.
matplotlib is already installed.
sentencepiece is already installed.
protobuf is not installed. Installing it now...
Retrieving notices: ...working... done
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.1.2



# All requested packages already installed.

datasets is already installed.
transformers is already installed.
diffusers is already installed.
peft is already installed.
h5py is already installed.
scikit-learn is not installed. Installing it now...
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages

In [3]:
# import dependencies
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Build Dataset

## Training Dataset

In [28]:
from datasets import load_dataset

qu_data = load_dataset("wikipedia", language="qu", date="20240301", trust_remote_code=True)

In [34]:
filtered_dataset = qu_data.filter(lambda example: len(example['text']) <= 2048)

In [35]:
shuffled_dataset = filtered_dataset.shuffle(seed=42)
filtered_dataset = shuffled_dataset["train"].select(range(2500))

In [36]:
MODEL_NAME = "facebook/xglm-564M" # specify model name

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, device_map = 'cuda')

def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_qu_data = filtered_dataset.map(tokenize_function, batched=True, num_proc=8, remove_columns=filtered_dataset.column_names)

In [37]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [56]:
lm_datasets = tokenized_qu_data.map(
    group_texts,
    batched=True,
    batch_size=64,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/2500 [00:00<?, ? examples/s]

In [57]:
tokenizer.decode(lm_datasets[1]["input_ids"])

"m) 153 km Yunkay (2.500 m) 163 km Qaras (2.290 m) 205 km Wallanka (1.820 m) 215 km Yuramarka (1.420 m) 343 km Santa (20 m) Kaypipas qhaway Patu Wayq'u Waylas Pukyukuna Instituto Nacional Geográfico Mayu (Piruw) Mayu (Anqash suyu) Mayu (Qispi kay suyu) Rikuway pruwinsya Santa pruwinsya Waras pruwinsya Waylas pruwinsya</s> Nonato Rufino Chuquimamani Valer sutiyuq runaqa (1946 watapi pa"

## Validation Dataset

In [58]:
DATA_SET_NAME = "facebook/flores"

In [59]:
# specify languages
LANGUAGES = [
    "eng_Latn",
    "spa_Latn",
    "ita_Latn",
    "deu_Latn",
    "arb_Arab",
    "tel_Telu",
    "tam_Taml",
    "quy_Latn"
]

In [60]:
# load flores data for each language
# TODO: your code goes here
multilang_dataset = {}
for language in LANGUAGES:
    multilang_dataset[language] = load_dataset(DATA_SET_NAME, language, trust_remote_code=True)

In [61]:
# tokenize the data
from transformers import DataCollatorForLanguageModeling

# load a pre-trained tokenizer from the huggingface hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, device_map = 'cuda')

# specify the tokenization function
def tokenization(example):
    return tokenizer(example['sentence'])

# TODO: your code goes here
tokenization(multilang_dataset["eng_Latn"]["dev"])
tokenized_multilang_dataset = {}
for key, data in multilang_dataset.items():
    tokenized_multilang_dataset[key] = data.map(tokenization, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [62]:
for key, data in tokenized_multilang_dataset.items():
    tokenized_multilang_dataset[key] = tokenized_multilang_dataset[key]["dev"].remove_columns(["id", "URL", "domain", "topic", "has_image", "has_hyperlink", "sentence"])
    tokenized_multilang_dataset[key].set_format("torch")

In [63]:
tokenized_multilang_dataset

{'eng_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'spa_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'ita_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'deu_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'arb_Arab': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'tel_Telu': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'tam_Taml': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'quy_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 })}

In [64]:
# # construct a pytorch data loader for each dataset
# BATCH_SIZE = 2 # for testing purposes, we start with a batch size of 2. You can change this later.

# from torch.utils.data import DataLoader

# # random_test_dataloader = DataLoader(random_test_dataset, batch_size=1, shuffle=False)
# # for i,data in random_test_dataloader:
# #     print(i)

# def make_dataloaders(multilang_dataset):
#     dataloaders_dict = {}
#     for key, dataset_dict in multilang_dataset.items():
#         dataloaders_dict[key] = {"dev":DataLoader(multilang_dataset[key]["dev"], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle = True),
#                                  "devtest": DataLoader(multilang_dataset[key]["devtest"], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle = True)}
#     return dataloaders_dict

# dataloaders_dict = make_dataloaders(tokenized_multilang_dataset)

# Model

In [65]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [66]:
# load pre-trained model from the huggingface hub
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map = 'cuda')

In [67]:
model

XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

# Bitfit:
### Version 1:
turns off params param.requires_grad = False
turns on biases required_grad = True

### Version 2:
turns off params
Trains only biases with high gradients

### Version 2: Only biases with high gradients are trained

In [68]:
for param in model.parameters():
    param.requires_grad = False

names = ["final_layer_norm.bias", "self_attn_layer_norm.bias", "model.layer_norm.bias"]
for i in model.named_parameters():
    for name in names:
        if name in i[0]:
            i[1].requires_grad = True

In [69]:
a = [param.requires_grad for param in model.parameters()]
print(sum(a))
print(len(a))

49
387


# Training

In [70]:
import os

os.environ["WANDB_PROJECT"] = "XGLM finetuning"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"]="all"
# os.environ["WANDB_SILENT"]="true"

In [74]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "finetuned-bitfit-two",
    evaluation_strategy = "steps",
    eval_steps=200,
    save_total_limit=2,
    save_steps=200,
    load_best_model_at_end=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to=["wandb"],
    run_name="BITFIT_TWO_1",
    logging_strategy="steps",
    logging_steps=1,
    metric_for_best_model="quy_Latn_loss",
    num_train_epochs=3,
    gradient_accumulation_steps=4,
)

In [75]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets,
    eval_dataset=tokenized_multilang_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [76]:
import wandb
trainer.train()
wandb.finish()

Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,5.2768,No log,5.123478,4.719269,4.766665,4.79219,4.885338,4.505744,4.322783,6.760704


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,5.2768,No log,5.123478,4.719269,4.766665,4.79219,4.885338,4.505744,4.322783,6.760704


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,5.2768,No log,5.123478,4.719269,4.766665,4.79219,4.885338,4.505744,4.322783,6.760704


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,5.2768,No log,5.123478,4.719269,4.766665,4.79219,4.885338,4.505744,4.322783,6.760704


VBox(children=(Label(value='87.234 MB of 4349.327 MB uploaded\r'), FloatProgress(value=0.020056909081894208, m…

wandb: Network error (TransientError), entering retry loop.


0,1
eval/arb_Arab_loss,▁
eval/arb_Arab_runtime,▁
eval/arb_Arab_samples_per_second,▁
eval/arb_Arab_steps_per_second,▁
eval/deu_Latn_loss,▁
eval/deu_Latn_runtime,▁
eval/deu_Latn_samples_per_second,▁
eval/deu_Latn_steps_per_second,▁
eval/eng_Latn_loss,▁
eval/eng_Latn_runtime,▁

0,1
eval/arb_Arab_loss,4.88534
eval/arb_Arab_runtime,19.7808
eval/arb_Arab_samples_per_second,50.402
eval/arb_Arab_steps_per_second,6.319
eval/deu_Latn_loss,4.79219
eval/deu_Latn_runtime,25.6286
eval/deu_Latn_samples_per_second,38.902
eval/deu_Latn_steps_per_second,4.877
eval/eng_Latn_loss,5.12348
eval/eng_Latn_runtime,16.3286


In [None]:
# model.save_pretrained("./pretrained_FFT_1")

In [None]:
# torch.cuda.empty_cache()