In [6]:
packages_to_install = {
    "ipywidgets": "ipywidgets",
    "numpy": "numpy=1.24.0",
    "torch": "torch",
    "matplotlib": "matplotlib",
    "sentencepiece": "sentencepiece",
    "protobuf": "protobuf",
    "datasets": "datasets",
    "transformers": "transformers",
    "diffusers": "diffusers",
    "peft": "peft",
    "h5py": "h5py",
    "scikit-learn": "scikit-learn",
    "scipy": "scipy",
    "wandb": "wandb",
    "mwparserfromhell": "mwparserfromhell",
    "apache_beam": "apache_beam"
}

In [7]:
%%time
import importlib

for package_name, install_command in packages_to_install.items():
    try:
        importlib.import_module(package_name)
        print(f"{package_name} is already installed.")
    except ImportError:
        print(f"{package_name} is not installed. Installing it now...")
        !conda install -y {install_command}

ipywidgets is already installed.
numpy is already installed.
torch is already installed.
matplotlib is already installed.
sentencepiece is already installed.
protobuf is not installed. Installing it now...
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.1.2



# All requested packages already installed.

datasets is already installed.
transformers is already installed.
diffusers is already installed.
peft is already installed.
h5py is already installed.
scikit-learn is not installed. Installing it now...
done
Solving environment: done


  current version: 23.7.4
  latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

    

In [8]:
import h5py

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [9]:
# import dependencies
import matplotlib.pyplot as plt
import numpy as np
import torch

# from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, get_dataset_config_names
from transformers import XGLMTokenizer, XGLMTokenizerFast, XGLMForCausalLM, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, DataCollatorWithPadding

# Build Dataset

## Training Dataset

In [10]:
from datasets import load_dataset

qu_data = load_dataset("wikipedia", language="qu", date="20240301", trust_remote_code=True)

Generating train split: 0 examples [00:00, ? examples/s]

Extracting content from /home/reni/.cache/huggingface/datasets/downloads/d7201af134283fb091b2110c88fe778c686b0d9c49f1d3a573d40a87cce26d13


In [11]:
filtered_dataset = qu_data.filter(lambda example: len(example['text']) <= 2048)

Filter:   0%|          | 0/24240 [00:00<?, ? examples/s]

In [12]:
shuffled_dataset = filtered_dataset.shuffle(seed=42)
filtered_dataset = shuffled_dataset["train"].select(range(2500))

In [13]:
MODEL_NAME = "facebook/xglm-564M" # specify model name

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, device_map = 'cuda')

def tokenize_function(examples):
    return tokenizer(examples["text"])
# tokenized_qu_data = filtered_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=filtered_dataset["train"].column_names)
tokenized_qu_data = filtered_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=filtered_dataset.column_names)

Map (num_proc=4):   0%|          | 0/2500 [00:00<?, ? examples/s]

In [14]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
lm_datasets = tokenized_qu_data.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/2500 [00:00<?, ? examples/s]

In [16]:
# tokenizer.decode(lm_datasets["train"][1]["input_ids"])
tokenizer.decode(lm_datasets[1]["input_ids"])

"m) 153 km Yunkay (2.500 m) 163 km Qaras (2.290 m) 205 km Wallanka (1.820 m) 215 km Yuramarka (1.420 m) 343 km Santa (20 m) Kaypipas qhaway Patu Wayq'u Waylas Pukyukuna Instituto Nacional Geográfico Mayu (Piruw) Mayu (Anqash suyu) Mayu (Qispi kay suyu) Rikuway pruwinsya Santa pruwinsya Waras pruwinsya Waylas pruwinsya</s> Nonato Rufino Chuquimamani Valer sutiyuq runaqa (1946 watapi pa"

## Validation Dataset

In [17]:
DATA_SET_NAME = "facebook/flores"

In [18]:
# specify languages
LANGUAGES = [
    "eng_Latn",
    "spa_Latn",
    "ita_Latn",
    "deu_Latn",
    "arb_Arab",
    "tel_Telu",
    "tam_Taml",
    "quy_Latn"
]

In [19]:
# load flores data for each language
# TODO: your code goes here
multilang_dataset = {}
for language in LANGUAGES:
    multilang_dataset[language] = load_dataset(DATA_SET_NAME, language)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [20]:
# tokenize the data
from transformers import DataCollatorForLanguageModeling

# load a pre-trained tokenizer from the huggingface hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, device_map = 'cuda')

# gpt2 does not have a padding token, so we have to add it manually
if MODEL_NAME == "gpt2":
    tokenizer.add_special_tokens({'pad_token': tokenizer.unk_token})

# specify the tokenization function
def tokenization(example):
    return tokenizer(example['sentence'])

# TODO: your code goes here
tokenization(multilang_dataset["eng_Latn"]["dev"])
tokenized_multilang_dataset = {}
for key, data in multilang_dataset.items():
    tokenized_multilang_dataset[key] = data.map(tokenization, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/997 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [21]:
for key, data in tokenized_multilang_dataset.items():
    tokenized_multilang_dataset[key] = tokenized_multilang_dataset[key]["dev"].remove_columns(["id", "URL", "domain", "topic", "has_image", "has_hyperlink", "sentence"])
    tokenized_multilang_dataset[key].set_format("torch")

In [22]:
tokenized_multilang_dataset

{'eng_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'spa_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'ita_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'deu_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'arb_Arab': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'tel_Telu': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'tam_Taml': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 }),
 'quy_Latn': Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 997
 })}

In [23]:
# # construct a pytorch data loader for each dataset
# BATCH_SIZE = 2 # for testing purposes, we start with a batch size of 2. You can change this later.

# from torch.utils.data import DataLoader

# # random_test_dataloader = DataLoader(random_test_dataset, batch_size=1, shuffle=False)
# # for i,data in random_test_dataloader:
# #     print(i)

# def make_dataloaders(multilang_dataset):
#     dataloaders_dict = {}
#     for key, dataset_dict in multilang_dataset.items():
#         dataloaders_dict[key] = {"dev":DataLoader(multilang_dataset[key]["dev"], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle = True),
#                                  "devtest": DataLoader(multilang_dataset[key]["devtest"], batch_size=BATCH_SIZE, collate_fn=data_collator, shuffle = True)}
#     return dataloaders_dict

# dataloaders_dict = make_dataloaders(tokenized_multilang_dataset)

# Model

In [24]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [32]:
# load pre-trained model from the huggingface hub
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map = 'cuda') # HERE , device_map = 'cuda'
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# put the model into evaluation mode
# TODO: your code goes here
# model.eval()

In [33]:
model

XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

In [34]:
import os

os.environ["WANDB_PROJECT"] = "XGLM finetuning"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"]="all"
# os.environ["WANDB_SILENT"]="true"

In [35]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "finetuned-full",
    evaluation_strategy = "steps",
    eval_steps=200,
    save_total_limit=4,
    save_steps=200,
    load_best_model_at_end=True,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="wandb",
    run_name="FullFT_1",
    logging_strategy="steps",
    logging_steps=1,
    metric_for_best_model="quy_Latn_loss"
)

In [36]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=lm_datasets["train"],
#     eval_dataset=tokenized_multilang_dataset,
#     data_collator=data_collator,
#     tokenizer=tokenizer
# )

In [37]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets,
    eval_dataset=tokenized_multilang_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [38]:
import wandb
trainer.train()
wandb.finish()

Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406
800,3.3207,No log,5.381832,4.768534,4.988316,4.988153,5.026425,4.709599,4.444465,6.205238


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406
800,3.3207,No log,5.381832,4.768534,4.988316,4.988153,5.026425,4.709599,4.444465,6.205238
1000,3.4461,No log,5.417552,4.788678,5.015732,5.013163,5.035207,4.71515,4.447172,6.195675


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406
800,3.3207,No log,5.381832,4.768534,4.988316,4.988153,5.026425,4.709599,4.444465,6.205238
1000,3.4461,No log,5.417552,4.788678,5.015732,5.013163,5.035207,4.71515,4.447172,6.195675


Step,Training Loss,Validation Loss,Eng Latn Loss,Spa Latn Loss,Ita Latn Loss,Deu Latn Loss,Arb Arab Loss,Tel Telu Loss,Tam Taml Loss,Quy Latn Loss
200,3.8772,No log,5.139099,4.662704,4.814327,4.831931,4.995979,4.636125,4.420599,6.30055
400,3.1966,No log,5.206576,4.687542,4.879029,4.879976,5.010794,4.66202,4.423977,6.27011
600,3.6784,No log,5.307767,4.731968,4.946666,4.941481,5.012454,4.689486,4.434676,6.220406
800,3.3207,No log,5.381832,4.768534,4.988316,4.988153,5.026425,4.709599,4.444465,6.205238
1000,3.4461,No log,5.417552,4.788678,5.015732,5.013163,5.035207,4.71515,4.447172,6.195675


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


VBox(children=(Label(value='16381.026 MB of 34581.274 MB uploaded (84.518 MB deduped)\r'), FloatProgress(value…

wandb: Network error (TransientError), entering retry loop.


0,1
eval/arb_Arab_loss,▁▄▄▆█
eval/arb_Arab_runtime,▅█▂▁▁
eval/arb_Arab_samples_per_second,▃▁▇██
eval/arb_Arab_steps_per_second,▃▁▇██
eval/deu_Latn_loss,▁▃▅▇█
eval/deu_Latn_runtime,▆█▃▁▁
eval/deu_Latn_samples_per_second,▃▁▆██
eval/deu_Latn_steps_per_second,▃▁▆██
eval/eng_Latn_loss,▁▃▅▇█
eval/eng_Latn_runtime,▇█▂▁▁

0,1
eval/arb_Arab_loss,5.03521
eval/arb_Arab_runtime,245.9169
eval/arb_Arab_samples_per_second,4.054
eval/arb_Arab_steps_per_second,0.508
eval/deu_Latn_loss,5.01316
eval/deu_Latn_runtime,231.2773
eval/deu_Latn_samples_per_second,4.311
eval/deu_Latn_steps_per_second,0.54
eval/eng_Latn_loss,5.41755
eval/eng_Latn_runtime,197.881


In [41]:
model.save_pretrained("./pretrained_FFT_1")

In [49]:
torch.cuda.empty_cache()