In [1]:
!pip install transformers==4.28.0 sentencepiece sacremoses datasets sacrebleu GPUtil

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l- \ done
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: sacremoses, GPUtil
  Building wheel for sacremoses (setup.py

In [2]:
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache() 

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 |  9% |  2% |


In [3]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
secret_value = user_secrets.get_secret("HF_KEY")
login(secret_value)
secret_value1 = user_secrets.get_secret("WANDB_KEY")
wandb.login(key=secret_value1)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
from datasets import load_dataset, Dataset
training_data = load_dataset("ethansimrm/wmt_16_19_22_biomed_train_processed", split = "train") 
validation_data = load_dataset("ethansimrm/wmt_20_21_biomed_validation", split = "validation")

Downloading and preparing dataset text/ethansimrm--wmt_16_19_22_biomed_train_processed to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_16_19_22_biomed_train_processed-99315a1c01bbbf4e/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_16_19_22_biomed_train_processed-99315a1c01bbbf4e/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.
Downloading and preparing dataset text/ethansimrm--wmt_20_21_biomed_validation to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_20_21_biomed_validation-727b0ab23cf9429a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/303k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/ethansimrm--wmt_20_21_biomed_validation-727b0ab23cf9429a/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


In [5]:
from datasets import load_metric
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [6]:
#Converts from SRC [TAB] TGT [NEWLINE] to the usual dictionary format
def convertToDictFormat(data): #input should be of the form data['text'], etc.
    source = []
    target = []
    for example in data:
        example = example.strip()
        sentences = example.split("\t")
        source.append(sentences[0])
        target.append(sentences[1])
    ready = Dataset.from_dict({"en":source, "fr":target})
    return ready

In [7]:
train_data_ready = convertToDictFormat(training_data['text'])
val_data_ready = convertToDictFormat(validation_data['text'])

In [8]:
#Load correct tokenizer
from transformers import AutoTokenizer
checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

In [9]:
source_lang = 'en'
target_lang = 'fr'
def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, text_target=targets, padding="longest")
    #Pad to longest sequence in batch, no truncation - we filtered too-long sentences out already
    return model_inputs

In [10]:
tokenized_train = train_data_ready.map(preprocess_function, batched=True)
tokenized_val = val_data_ready.map(preprocess_function, batched=True)

  0%|          | 0/412 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
#Builds batches from dataset
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [12]:
import numpy as np

def postprocess_text(preds, labels): 
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) #Convert back into words

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) #Ignore padded labels added by the data collator to the test set
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) #Remove leading and trailing spaces

    result = metric.compute(predictions=decoded_preds, references=decoded_labels) #BLEU score for provided input and references
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens) #Compute mean prediction length
    result = {k: round(v, 4) for k, v in result.items()} #Round score to 4dp
    return result

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [14]:
from transformers import EarlyStoppingCallback, IntervalStrategy
import torch

batch_size = 16 #Set as high as possible per Popel & Bojar (2018)

training_args = Seq2SeqTrainingArguments( #Collects hyperparameters
    output_dir="opus_wmt_finetuned_enfr",
    evaluation_strategy=IntervalStrategy.STEPS, #Evaluates every N steps
    save_steps=8000, #Save model every evaluation
    eval_steps=8000, #Evaluate every ~128k sentences
    num_train_epochs=3, #About 1hr per eval interval,4 eval intervals per epoch, 12hr compute budget
    learning_rate=2e-5, #Initial learning rate for AdamW
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01, #Weight decay for loss computation; Loss = Loss + WD * sum (weights squared)
    save_total_limit=3, #Number of checkpoints to save
    predict_with_generate=True, #Use with ROUGE/BLEU and other translation metrics (see below)
    fp16=True, #Remove fp16 = True if not using CUDA
    push_to_hub=True,
    metric_for_best_model='bleu', #Determines our best model
    load_best_model_at_end=True, #We will choose the best model from among our 5 checkpoints
)

trainer = Seq2SeqTrainer( #Saves us from writing our own training loops
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.1)]
    #If we don't improve by at least 0.1 BLEU on the validation set for 3 evaluations, we stop and find the best 
    #model of the 5 we have saved
)

Cloning https://huggingface.co/ethansimrm/opus_wmt_finetuned_enfr into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.40k/285M [00:00<?, ?B/s]

Download file runs/Jun19_16-53-26_52f716f83ae7/1687193616.1802845/events.out.tfevents.1687193616.52f716f83ae7.…

Download file training_args.bin: 100%|##########| 3.68k/3.68k [00:00<?, ?B/s]

Download file runs/Jun19_19-35-50_9eb095590415/events.out.tfevents.1687203446.9eb095590415.22.0: 100%|########…

Download file runs/Jun19_19-35-50_9eb095590415/1687203446.305718/events.out.tfevents.1687203446.9eb095590415.2…

Download file source.spm:   0%|          | 1.40k/760k [00:00<?, ?B/s]

Download file runs/Jun19_16-53-26_52f716f83ae7/events.out.tfevents.1687193616.52f716f83ae7.23.0: 100%|########…

Download file target.spm:   0%|          | 1.40k/784k [00:00<?, ?B/s]

Clean file runs/Jun19_19-35-50_9eb095590415/1687203446.305718/events.out.tfevents.1687203446.9eb095590415.22.1…

Clean file runs/Jun19_16-53-26_52f716f83ae7/events.out.tfevents.1687193616.52f716f83ae7.23.0:  12%|#2        |…

Clean file training_args.bin:  27%|##7       | 1.00k/3.68k [00:00<?, ?B/s]

Clean file runs/Jun19_16-53-26_52f716f83ae7/1687193616.1802845/events.out.tfevents.1687193616.52f716f83ae7.23.…

Clean file runs/Jun19_19-35-50_9eb095590415/events.out.tfevents.1687203446.9eb095590415.22.0:  10%|9         |…

Clean file source.spm:   0%|          | 1.00k/760k [00:00<?, ?B/s]

Clean file target.spm:   0%|          | 1.00k/784k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/285M [00:00<?, ?B/s]

In [15]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33methansimrm[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.15.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230619_213643-zc09ij3j[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgolden-gorge-21[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ethansimrm/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ethansimrm/huggingface/runs/zc09ij3j[0m


Step,Training Loss,Validation Loss,Bleu,Gen Len
8000,0.1675,0.10904,40.7464,35.9616
16000,0.1567,0.106454,38.989,38.3327
24000,0.1531,0.104749,41.3302,34.6251
32000,0.1402,0.104637,40.0711,36.7565
40000,0.1403,0.103343,41.426,36.0259
48000,0.1378,0.103084,41.6555,35.7747
56000,0.1298,0.10294,42.4603,35.2848
64000,0.133,0.102609,42.3371,34.4746
72000,0.1274,0.10227,42.5076,35.2454


TrainOutput(global_step=77190, training_loss=0.1463828679270225, metrics={'train_runtime': 28602.7969, 'train_samples_per_second': 43.178, 'train_steps_per_second': 2.699, 'total_flos': 6.379696034788147e+16, 'train_loss': 0.1463828679270225, 'epoch': 3.0})