In [1]:
import time
from functools import partial
from itertools import chain

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Hugging Face Hub dataset id passed to load_dataset in getDataset.
DATASET = 'hackathon-pln-es/spanish-to-quechua'
# Pretrained checkpoint name; used for both the tokenizer and the causal LM.
MODEL_NAME = 'facebook/xglm-564M'
# Number of tokens per training block (passed to group_texts as block_size).
SEQ_LEN   = 32

In [2]:
def getDataset():
    """Load the corpus and convert its Quechua text into fixed-length LM blocks.

    Steps:
      1. Load every split of ``DATASET``.
      2. Tokenize the ``'qu'`` column with the ``MODEL_NAME`` tokenizer and
         drop the original text columns.
      3. Concatenate the token lists and cut them into ``SEQ_LEN``-token
         blocks, adding a ``labels`` column (see ``group_texts``).

    Returns:
        The mapped dataset (one entry per split) whose rows hold the
        tokenizer outputs plus ``labels``.
    """
    print('\nin getDataset')

    # data and tokenizer
    data = load_dataset(DATASET)
    tokenizer = getTokenizer(MODEL_NAME)

    print(data)

    # Tokenize in batches: `preprocess` forwards the whole list of 'qu'
    # strings to the tokenizer in a single call, which avoids one Python
    # round-trip per row while producing the same per-row token lists.
    tokenized = data.map(
        preprocess,
        batched=True,
        fn_kwargs={'tokenizer': tokenizer},
        # Only the token columns are needed downstream; drop the raw text.
        remove_columns=data['train'].column_names,
    )

    # Regroup the variable-length token lists into SEQ_LEN-sized blocks.
    lm_dataset = tokenized.map(
        group_texts,
        batched=True,
        num_proc=4,
        fn_kwargs={'block_size': SEQ_LEN},
    )

    print(lm_dataset['train'])
    print(lm_dataset['train'][0])

    return lm_dataset

def getTokenizer(TOKENIZER):
    """Load the pretrained tokenizer identified by ``TOKENIZER``.

    Args:
        TOKENIZER: Name or path forwarded to ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer exactly as loaded; no pad-token override is applied.
    """
    return AutoTokenizer.from_pretrained(TOKENIZER)


def preprocess(data_row, tokenizer):
    """Tokenize the ``'qu'`` field of ``data_row``.

    Args:
        data_row: Mapping that contains a ``'qu'`` entry.
        tokenizer: Callable applied to that entry.

    Returns:
        Whatever ``tokenizer`` returns for ``data_row['qu']``.
    """
    quechua_text = data_row['qu']
    encoded = tokenizer(quechua_text)
    return encoded

def group_texts(examples, block_size):
    """Concatenate a batch of tokenized rows and re-split it into equal blocks.

    Args:
        examples: Mapping from column name to a list of per-row lists
            (for example ``input_ids`` and ``attention_mask``). It must
            contain an ``input_ids`` column.
        block_size: Positive number of items per output block.

    Returns:
        A dict with the same keys as ``examples`` plus ``labels``. Each value
        is a list of ``block_size``-long chunks. Trailing items that do not
        fill a whole block are dropped. ``labels`` is a shallow copy of the
        ``input_ids`` chunk list, because the model expects the targets under
        that name.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    # Flatten each column in linear time. `sum(rows, [])` would rebuild the
    # accumulated list for every row, which is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the small remainder so every block has exactly block_size items.
    total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # labels because the model expects the argument to be named labels
    result["labels"] = result["input_ids"].copy()
    return result

In [3]:
class LoRALayer(torch.nn.Module):
    """Low-rank adapter that computes ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and starts as standard-normal noise
    scaled by ``1 / sqrt(rank)``. ``B`` has shape ``(rank, out_dim)`` and
    starts at zero, so a freshly built layer outputs zeros.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        down_init = torch.randn(in_dim, rank) * init_scale
        up_init = torch.zeros(rank, out_dim)
        self.A = torch.nn.Parameter(down_init)
        self.B = torch.nn.Parameter(up_init)
        self.alpha = alpha

    def forward(self, x):
        # Project down to `rank` dimensions first, then back up to out_dim.
        low_rank = x @ self.A
        update = low_rank @ self.B
        return self.alpha * update

class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer so its output gets an additive LoRA correction.

    The wrapped ``linear`` module is stored as-is; a ``LoRALayer`` with
    matching input/output sizes is created next to it.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        in_features, out_features = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_features, out_features, rank, alpha)

    def forward(self, x):
        base_out = self.linear(x)
        lora_out = self.lora(x)
        return base_out + lora_out


def get_lora_model(model, lora_r=8, lora_alpha=16, lora_query=True,
                   lora_key=True, lora_value=True, lora_projection=True,
                   lora_mlp=True, lora_head=False):
    """Freeze ``model`` and wrap selected linear layers with LoRA adapters.

    The model is modified in place and also returned. All existing parameters
    get ``requires_grad = False``. The adapters are created afterwards from
    new ``torch.nn.Parameter`` objects, so they are the only tensors left
    trainable. No dropout is applied on the LoRA path, because
    ``LinearWithLoRA`` takes only ``rank`` and ``alpha``.

    Args:
        model: Causal LM exposing ``model.model.layers`` (each layer with
            ``self_attn.{q,k,v,out}_proj``, ``fc1``, ``fc2``) and ``lm_head``.
        lora_r: Rank of every adapter.
        lora_alpha: Scaling factor of every adapter.
        lora_query: Wrap the attention query projection.
        lora_key: Wrap the attention key projection.
        lora_value: Wrap the attention value projection.
        lora_projection: Wrap the attention output projection.
        lora_mlp: Wrap both feed-forward linears (``fc1`` and ``fc2``).
        lora_head: Wrap the language-model head.

    Returns:
        The same ``model`` object, now carrying the adapters.
    """
    assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

    # Freeze the pretrained weights before any adapter exists.
    for param in model.parameters():
        param.requires_grad = False

    for layer in model.model.layers:
        attn = layer.self_attn
        if lora_query:
            attn.q_proj = assign_lora(attn.q_proj)
        if lora_key:
            attn.k_proj = assign_lora(attn.k_proj)
        if lora_value:
            attn.v_proj = assign_lora(attn.v_proj)
        if lora_projection:
            attn.out_proj = assign_lora(attn.out_proj)
        if lora_mlp:
            layer.fc1 = assign_lora(layer.fc1)
            layer.fc2 = assign_lora(layer.fc2)

    if lora_head:
        # The head lives on the causal-LM wrapper itself, not on the inner
        # decoder stack at `model.model`.
        model.lm_head = assign_lora(model.lm_head)

    return model

def train_XGLM_lora():
    """Load the base model, add LoRA adapters, and fine-tune on the dataset.

    Prints the model before and after adapter injection together with the
    trainable-parameter counts, then trains with output directory
    ``"xglm_lora"``.
    """
    def trainable_count(module):
        # Total element count over parameters that still require gradients.
        return sum(p.numel() for p in module.parameters() if p.requires_grad)

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    print(model)
    model_params = trainable_count(model)
    print(f"Total trainable parameters in model : {model_params}")

    # The count above must be taken first: get_lora_model freezes the
    # pretrained weights on this same object.
    lora_model = get_lora_model(model)
    print(lora_model)
    lora_model_params = trainable_count(lora_model)
    print(f"Total trainable parameters in lora model : {lora_model_params} and are {(lora_model_params/model_params)*100} % of the original model")

    train_XGLM(lora_model, getDataset(), "xglm_lora")


def train_XGLM(model, lm_dataset, output_dir):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and print the elapsed time.

    Args:
        model: Model handed to ``Trainer``.
        lm_dataset: Mapping that provides ``"train"`` and ``"validation"`` splits.
        output_dir: Directory passed to ``TrainingArguments`` as ``output_dir``.
    """
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead of `evaluation_strategy` - confirm against the installed version.
    hyperparams = dict(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparams),
        train_dataset=lm_dataset["train"],
        eval_dataset=lm_dataset["validation"],
    )

    # Time only the training call itself.
    started = time.time()
    trainer.train()
    elapsed = time.time() - started

    print(f"total training time : {elapsed} sec.")

In [4]:
# Run the end-to-end LoRA fine-tuning pipeline (model load, dataset build, training).
train_XGLM_lora()

  return self.fget.__get__(instance, owner)()


XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

Downloading readme:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/945 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 11.7M/11.7M [00:02<00:00, 4.88MB/s]
Downloading data: 100%|██████████| 1.46M/1.46M [00:00<00:00, 4.67MB/s]
Downloading data: 100%|██████████| 1.45M/1.45M [00:00<00:00, 4.91MB/s]


Generating train split:   0%|          | 0/102747 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12844 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12843 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['es', 'qu'],
        num_rows: 102747
    })
    validation: Dataset({
        features: ['es', 'qu'],
        num_rows: 12844
    })
    test: Dataset({
        features: ['es', 'qu'],
        num_rows: 12843
    })
})


Map:   0%|          | 0/102747 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4235 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/12844 [00:00<?, ? examples/s]

Map:   0%|          | 0/12843 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/102747 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12844 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12843 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 80787
})
{'input_ids': [2, 4049, 39822, 27076, 2800, 3451, 27076, 7382, 106026, 129598, 2597, 6580, 10988, 81990, 78702, 247, 134073, 5, 78511, 1190, 21167, 133189, 78702, 116, 118, 42783, 162637, 80, 65704, 81990, 6606, 134073], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [2, 4049, 39822, 27076, 2800, 3451, 27076, 7382, 106026, 129598, 2597, 6580, 10988, 81990, 78702, 247, 134073, 5, 78511, 1190, 21167, 133189, 78702, 116, 118, 42783, 162637, 80, 65704, 81990, 6606, 134073]}


[34m[1mwandb[0m: Currently logged in as: [33maleksey-morshnev[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,3.5991,3.42414
2,3.4199,3.266045
3,3.3207,3.188933


total training time : 5304.447983503342 sec.
