In [None]:
# Report the GPU available to this runtime before doing any work.
!nvidia-smi

Tue Dec 19 09:28:42 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
## Install dependencies
!pip install datasets transformers sentencepiece accelerate peft bitsandbytes evaluate




## Prepare dataset

In [None]:
import os
import sys
import transformers
import torch
import datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AdamWeightDecay, BitsAndBytesConfig
from datasets import get_dataset_split_names

In [None]:
from datasets import get_dataset_config_names

# List the configuration names available for the "opus100" dataset
# (the printed names are language pairs such as 'en-fr').
configs = get_dataset_config_names("opus100")
print(configs)

Downloading builder script:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/192k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/46.7k [00:00<?, ?B/s]

['af-en', 'am-en', 'an-en', 'ar-en', 'as-en', 'az-en', 'be-en', 'bg-en', 'bn-en', 'br-en', 'bs-en', 'ca-en', 'cs-en', 'cy-en', 'da-en', 'de-en', 'dz-en', 'el-en', 'en-eo', 'en-es', 'en-et', 'en-eu', 'en-fa', 'en-fi', 'en-fr', 'en-fy', 'en-ga', 'en-gd', 'en-gl', 'en-gu', 'en-ha', 'en-he', 'en-hi', 'en-hr', 'en-hu', 'en-hy', 'en-id', 'en-ig', 'en-is', 'en-it', 'en-ja', 'en-ka', 'en-kk', 'en-km', 'en-ko', 'en-kn', 'en-ku', 'en-ky', 'en-li', 'en-lt', 'en-lv', 'en-mg', 'en-mk', 'en-ml', 'en-mn', 'en-mr', 'en-ms', 'en-mt', 'en-my', 'en-nb', 'en-ne', 'en-nl', 'en-nn', 'en-no', 'en-oc', 'en-or', 'en-pa', 'en-pl', 'en-ps', 'en-pt', 'en-ro', 'en-ru', 'en-rw', 'en-se', 'en-sh', 'en-si', 'en-sk', 'en-sl', 'en-sq', 'en-sr', 'en-sv', 'en-ta', 'en-te', 'en-tg', 'en-th', 'en-tk', 'en-tr', 'en-tt', 'en-ug', 'en-uk', 'en-ur', 'en-uz', 'en-vi', 'en-wa', 'en-xh', 'en-yi', 'en-yo', 'en-zh', 'en-zu', 'ar-de', 'ar-fr', 'ar-nl', 'ar-ru', 'ar-zh', 'de-fr', 'de-nl', 'de-ru', 'de-zh', 'fr-nl', 'fr-ru', 'fr-zh', 

In [None]:
from datasets import load_dataset
# Load the "en-fr" configuration of opus100; the bare `dataset` expression
# below displays the resulting splits and their row counts.
dataset = load_dataset("opus100", "en-fr")
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [None]:
# Preview the first five translation pairs. Slice the rows first and pick the
# column afterwards, so only five examples are read instead of the whole
# 'translation' column of the training split.
dataset['train'][:5]['translation']

[{'en': 'The time now is 05:08 .', 'fr': 'The time now is 05:05 .'},
 {'en': 'This Regulation shall enter into force on the seventh day following its publication in the Official Journal of the European Union.',
  'fr': "Le présent règlement entre en vigueur le septième jour suivant celui de sa publication au Journal officiel de l'Union européenne."},
 {'en': "Hello, what's that?", 'fr': "Qu'est-ce que c'est que ça ?"},
 {'en': 'And then I will teach you everything i know.',
  'fr': "Et alors, je t'apprendrai tout ce que je sais."},
 {'en': 'Did you find something?', 'fr': 'Par ici !'}]

In [None]:
# Write every split of the dataset to the local "data" directory.
dataset.save_to_disk("data")

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2000 [00:00<?, ? examples/s]

## Preprocessing the data

In [None]:
checkpoint = "t5-small"
# A tokenizer carries no weights, so model-loading options such as
# load_in_8bit / device_map are not passed here; they belong on the model's
# from_pretrained call.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Sanity check: tokenize two sample strings and print the resulting encoding.
print("Sample token:\n",tokenizer(["My name is Wolfgang and I live in Berlin", "hi thre"]))

Sample token:
 {'input_ids': [[499, 564, 19, 26513, 11, 27, 619, 16, 4308, 1], [7102, 3, 189, 60, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


In [None]:
def preprocess_function(data):
  """Tokenize a batch of English/French translation pairs.

  Args:
    data: batch mapping whose 'translation' entry is a list of dicts, each
      holding an 'en' (source) and an 'fr' (target) string.

  Returns:
    The tokenizer output for the English sentences, extended with a
    "labels" entry containing the token ids of the French sentences.
  """
  sources, references = [], []
  for pair in data['translation']:
    sources.append(pair['en'])
    references.append(pair['fr'])

  # Source side first, then target side; both are truncated by the tokenizer.
  encoded = tokenizer(sources, truncation=True)
  encoded["labels"] = tokenizer(references, truncation=True)["input_ids"]
  return encoded

# Smoke-test the function on the first training example.
preprocess_function(dataset['train'][:1])


{'input_ids': [[37, 97, 230, 19, 3, 3076, 10, 4018, 3, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[37, 97, 230, 19, 3, 3076, 10, 3076, 3, 5, 1]]}

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of examples per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Train on a reproducible 2,000-example sample of the tokenized training split.
shuffled_train = tokenized_dataset['train'].shuffle(seed=42)
train_dataset = shuffled_train.select(range(2000))

# Validate on the complete validation split.
val_dataset = tokenized_dataset['validation']

## Download Model

In [None]:
model_id = "t5-small"

# Request 8-bit weights through an explicit quantization config; passing
# load_in_8bit directly to from_pretrained is the deprecated spelling.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

In [None]:
# Save the loaded base model under "models/".
model.save_pretrained("models/")

## Training

In [None]:
from peft import PeftModel,prepare_model_for_int8_training, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics below uses it.
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    """Compute token-level accuracy for an evaluation pass.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` is either an array
            of scores with one extra trailing (vocabulary) axis compared to
            ``labels``, a tuple whose first element is such an array, or an
            array of token ids with the same shape as ``labels``. ``labels``
            holds the reference token ids, with ``-100`` marking padded
            positions.

    Returns:
        The dict produced by the ``accuracy`` metric, i.e. a flat
        ``{"accuracy": value}`` mapping.
    """
    predictions, labels = p

    # Some models return several arrays; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Scores -> token ids: pick the best entry along the last (vocabulary) axis.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # Padded label positions (-100) must not count towards the score; the
    # boolean mask also flattens both arrays to 1-D for the metric.
    keep = labels != -100

    # The metric already returns {"accuracy": value}; return it unwrapped.
    return accuracy.compute(predictions=predictions[keep], references=labels[keep])



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# prepare_model_for_int8_training is deprecated in PEFT;
# prepare_model_for_kbit_training is its replacement. Imported here so the
# cell works on its own.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query/key/value projections.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["k", "q", "v"],
)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 221,184 || all params: 60,727,808 || trainable%: 0.36422193931320557




In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions that are only padding receive -100 so the loss ignores them.
label_pad_token_id = -100

# Collator options: the PEFT model, the label padding id, and padding to a multiple of 8.
collator_options = dict(
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer
import torch

torch.set_default_dtype(torch.float32)

output_dir="en2fr"
num_epochs = 20
batch_size=8

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # Wire the batch_size setting into the run so that editing it above has an
    # effect; with auto_find_batch_size it acts as the starting value.
    per_device_train_batch_size=batch_size,
    auto_find_batch_size=True,
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=num_epochs,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",  # no intermediate checkpoints are written
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
peft_model.config.use_cache = False # silence the warnings. Please re-enable for inference!


In [None]:
# Run fine-tuning with the trainer configured above; the cell displays the returned training summary.
trainer.train()




Step,Training Loss
500,1.4966
1000,1.3986
1500,1.3508
2000,1.3017
2500,1.2851
3000,1.2603
3500,1.2383
4000,1.2193
4500,1.2057
5000,1.1965


TrainOutput(global_step=5000, training_loss=1.295289501953125, metrics={'train_runtime': 1672.4371, 'train_samples_per_second': 23.917, 'train_steps_per_second': 2.99, 'total_flos': 685071170273280.0, 'train_loss': 1.295289501953125, 'epoch': 20.0})

In [None]:
# save model
# Both calls target the same directory, so the PEFT model files and the
# tokenizer files end up side by side in "translation/en2fr".
peft_model.save_pretrained("translation/en2fr")
tokenizer.save_pretrained("translation/en2fr")

('translation/en2fr/tokenizer_config.json',
 'translation/en2fr/special_tokens_map.json',
 'translation/en2fr/spiece.model',
 'translation/en2fr/added_tokens.json',
 'translation/en2fr/tokenizer.json')

In [None]:
# load model
from peft import PeftModel
from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer

# "translation/en2fr" was written by peft_model.save_pretrained, so reload it
# the same way the Hub copy is loaded: build the t5-small seq2seq base model
# and attach the saved adapter to it. AutoModel alone would give a model class
# without the language-modeling head needed for generation.
# The path is relative so the cell does not depend on a /content working directory.
m = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained("t5-small"),
    "translation/en2fr",
)
t = AutoTokenizer.from_pretrained("translation/en2fr")

In [None]:
# Re-enable the cache that was switched off for training; this cell runs inference.
peft_model.config.use_cache = True

# Tokenize one English sentence, generate, and decode the first returned sequence.
context = tokenizer(["Do you want coffee?"], return_tensors='pt')
output = peft_model.generate(
    input_ids=context["input_ids"],
    attention_mask=context["attention_mask"],
)
tokenizer.decode(output[0], skip_special_tokens=True)



'Vous voulez du café?'

## Push to HF hub

In [None]:
# Log in to the Hugging Face Hub so the push cells below can upload.
!huggingface-cli login

In [None]:
## push using trainer
# NOTE(review): the string argument is presumably the commit message, and the
# target repository is presumably derived from the trainer's output_dir — confirm.
trainer.push_to_hub("End of training")

In [None]:
## push manually
# Upload the PEFT model and the tokenizer to the same Hub repository.
peft_model.push_to_hub("dmedhi/eng2french-t5-small")
tokenizer.push_to_hub("dmedhi/eng2french-t5-small")

## Try out model

In [None]:
!pip install transformers peft

Collecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.25.0 peft-0.7.1


In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Tokenizer comes from the published repository.
tokenizer = AutoTokenizer.from_pretrained("dmedhi/eng2french-t5-small")

# Load the t5-small base model, then wrap it with the published adapter.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained("t5-small"),
    "dmedhi/eng2french-t5-small",
)

In [None]:
# Translate a sample sentence with the model loaded from the Hub.
context = tokenizer(["Do you want coffee?"], return_tensors='pt')
output = model.generate(
    input_ids=context["input_ids"],
    attention_mask=context["attention_mask"],
)
tokenizer.decode(output[0], skip_special_tokens=True)



'Tu veux du café?'

In [None]:
# Generate again from the `context` tokenized in the previous cell and decode the first sequence.
output = model.generate(**context)
tokenizer.decode(output[0], skip_special_tokens=True)



'Tu veux du café?'