<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_006_task2_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata

# Hugging Face tokens are read from the Colab secrets store, so no credential
# is ever written into the notebook itself.
read_access_token, write_access_token = (
    userdata.get(secret_name) for secret_name in ('hf_read', 'hf_write')
)

### Dependencies

In [2]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec("wandb") is None:
  !pip install wandb -q

Collecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.18.0)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.18.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets==2.18.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
S

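If the cell above installed or re-pinned any package (`torch`, `transformers`, `accelerate`, ...), execute `Runtime > Restart session and run all` in the Colab UI menu above: modules that were already imported keep their old version until the session restarts.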

In [3]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict

import torch.cuda.amp as amp  # PyTorch's native automatic mixed precision (autocast, GradScaler)


import evaluate
import wandb

In [4]:
# ====================================================
# CFG
# ====================================================

class CFG:
    """Static configuration for the masked-language-model pre-training run.

    The public attributes are what gets recorded as the wandb run config, so
    each value should describe what the run actually does.
    """

    # NOTE(review): main() builds its DataLoader with num_workers=0, so this
    # value is currently recorded but not applied.
    num_workers = 4
    project = "IOAI_Task2"
    name = "redrock_006_task2_pretrain"

    # model
    base_model_name = "google-bert/bert-base-multilingual-uncased"
    tokenizer_name = "ioai2024japan/redrock_006_task2_tokenizer"
    mlm_probability = 0.15

    # training
    epochs = 1

    # Learning-rate schedule. main() requests get_scheduler(name="linear"); the
    # previous value 'CosineAnnealingLR' was never used and made the logged
    # config misreport the schedule.
    scheduler = 'linear'

    lr = 5e-05

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 32

    seed = 42
    train = True

In [5]:
# Authenticate with Weights & Biases using the key stored in the Colab secret `wandb_token`.
wandb.login(key=userdata.get('wandb_token'))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# Plain-dict snapshot of CFG for wandb: every class attribute whose name
# contains a double underscore (e.g. `__module__`) is left out.
cfg = {name: value for name, value in vars(CFG).items() if "__" not in name}

In [7]:
# classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
# tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)
# Lookup table: one Brahmi character -> its Devanagari replacement string.
brahmi_to_devanagari = {
    # letters
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ',
    '𑀘': 'च', '𑀙': 'छ', '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ',
    '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ', '𑀡': 'ण',
    '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न',
    '𑀧': 'प', '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म',
    '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल', '𑀯': 'व',
    '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह',
    # numerals: a zero digit, and one sign that expands to the two-character string '90'
    '𑁦': '०',
    '𑁣': '90',
}

def transliterate_brahmi_to_devanagari(text):
    """Return ``text`` with each character listed in ``brahmi_to_devanagari`` replaced.

    The input is processed one character at a time; characters that have no
    entry in the table are copied to the result unchanged.
    """
    return ''.join(brahmi_to_devanagari.get(symbol, symbol) for symbol in text)

# Devanagari string -> single Latin letter. The i-th key below is paired with
# the i-th letter of the alphabet string (a-z first, then A-I).
transliterate_dict = dict(zip(
    [
        'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ',
        'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
        'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श',
        'ष', 'स', 'ह', '०', '90',
    ],
    'abcdefghijklmnopqrstuvwxyzABCDEFGHI',
))

def transliterate_text(text):
    """Apply every substitution in ``transliterate_dict`` to ``text`` and return the result.

    The substitutions run one after another, in the dictionary's insertion
    order, each as a plain substring replacement over the whole string.
    """
    converted = text
    for source in transliterate_dict:
        converted = converted.replace(source, transliterate_dict[source])
    return converted

def transliterate_to_latin(text):
    """Convert ``text`` to its Latin-letter encoding, one character at a time.

    Each character is first swapped for its ``brahmi_to_devanagari`` entry when
    it has one (otherwise kept as is), and the outcome is then passed through
    ``transliterate_text``. The per-character pieces are concatenated in order.
    """
    return ''.join(
        transliterate_text(brahmi_to_devanagari.get(symbol, symbol))
        for symbol in text
    )

def to_device(batch, device):
    """Return a new dict with every movable value of ``batch`` placed on ``device``.

    Args:
        batch: Mapping of field name to value (anything exposing ``.items()``).
        device: Target handed unchanged to each value's ``.to(...)``.

    Returns:
        A fresh ``dict`` with the same keys. Values that expose a callable
        ``.to`` attribute are replaced by ``value.to(device)``; all other
        values are passed through untouched.

    Raises:
        Any exception raised by ``value.to(device)``. Failures of the move
        itself are no longer silenced, so a failed transfer surfaces here
        rather than later as a confusing downstream error.
    """
    output = {}
    for key, value in batch.items():
        mover = getattr(value, "to", None)
        # Only values that can actually be moved are moved; plain Python
        # objects (strings, lists, ...) are kept as they are.
        output[key] = mover(device) if callable(mover) else value
    return output

In [8]:
# The corpus and the tokenizer are both fetched from the Hugging Face Hub with the read token.
raw_dataset = load_dataset(
    'InternationalOlympiadAI/NLP_problem_raw',
    token=read_access_token,
)
tokenizer = AutoTokenizer.from_pretrained(
    CFG.tokenizer_name,
    token=read_access_token,
)

Downloading readme:   0%|          | 0.00/281 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/611245 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/816k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
def transform(example_batch):
    """Transliterate and tokenize one batch of examples.

    The batch's ``"text"`` entry is overwritten in place with its
    Latin-transliterated form. The texts are then tokenized with truncation
    and padding to ``CFG.max_length``, and the tokenizer output (PyTorch
    tensors) is returned.
    """
    latin_texts = list(map(transliterate_to_latin, example_batch["text"]))
    example_batch["text"] = latin_texts
    return tokenizer(
        latin_texts,
        truncation=True,
        max_length=CFG.max_length,
        padding="max_length",
        return_tensors="pt",
    )


# Register `transform` on the dataset rather than materialising a tokenized copy.
tokenized_data = raw_dataset.with_transform(transform)

In [10]:
def main():
    """Run masked-language-model pre-training and return the trained model.

    The function starts a wandb run, builds a shuffled ``DataLoader`` over
    ``tokenized_data["train"]`` with an MLM collator, loads
    ``CFG.base_model_name`` as ``BertForMaskedLM`` and trains it for
    ``CFG.epochs`` epochs with AdamW and a linear learning-rate schedule.

    Mixed precision (autocast + ``GradScaler``) is enabled only when CUDA is
    available; on CPU both are switched off, so the CPU fallback chosen for
    ``device`` actually works.

    Returns:
        The trained ``BertForMaskedLM`` instance, left on the training device.

    Raises:
        Any exception raised while building or training the model; the wandb
        run is marked as failed before the exception propagates.
    """
    # Seed torch's global RNG so that shuffling and MLM masking are reproducible.
    torch.manual_seed(CFG.seed)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    use_amp = device.type == "cuda"

    wandb.init(
        name=CFG.name,
        project=CFG.project,
        config=cfg
    )

    try:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=CFG.mlm_probability
        )

        # dataset
        train_dataset = tokenized_data["train"]
        train_loader = DataLoader(
            train_dataset,
            batch_size=CFG.train_batch_size,
            num_workers=0,
            pin_memory=use_amp,  # pinned host memory is only useful for CUDA transfers
            shuffle=True,
            drop_last=True,
            collate_fn=data_collator,
        )

        # Place the model on the selected device instead of calling .cuda()
        # unconditionally, which fails on a machine without a GPU.
        model = BertForMaskedLM.from_pretrained(
            CFG.base_model_name
        ).to(device)

        num_training_steps = CFG.epochs * len(train_loader)
        optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
        scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

        # With enabled=False the scaler is a pass-through.
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

        def train_one_epoch(model, scheduler, train_loader, optimizer):
            """Run one pass over ``train_loader``, logging loss and lr to wandb each step."""
            model.train()
            progress_bar = tqdm(train_loader, dynamic_ncols=True)

            for step, batch in enumerate(progress_bar):
                batch = to_device(batch, device)

                # Forward pass under autocast (fp16 on CUDA, disabled on CPU)
                with amp.autocast(enabled=use_amp):
                    outputs = model(**batch)
                    loss = outputs.loss

                # Scale the loss before backward so fp16 gradients do not underflow
                scaler.scale(loss).backward()

                # Log a plain float: handing wandb the tensor would pass along
                # an object that is still attached to the autograd graph.
                loss_value = loss.item()
                text = f"step {step}, loss: {loss_value:.5f}"
                progress_bar.set_description(text)

                wandb.log(
                    {
                        "train_loss": loss_value,
                        "lr": optimizer.param_groups[0]["lr"],
                        "step": step,
                    }
                )

                # Optimizer step with gradient scaling
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        # Train the model
        for i in range(CFG.epochs):
            train_one_epoch(model, scheduler, train_loader, optimizer)
            print(f'Epoch {i+1}')
    except BaseException:
        # Close the run as failed so it does not stay open on the wandb side.
        wandb.finish(exit_code=1)
        raise

    # Flush and close the run explicitly; the notebook later shuts the runtime down.
    wandb.finish()
    return model

In [12]:
# Run the whole pre-training loop; `model` is the object returned by main().
model = main()

VBox(children=(Label(value='0.013 MB of 0.013 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

model.safetensors:  37%|###7      | 252M/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at google-bert/bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/19101 [00:00<?, ?it/s]



Epoch 1


In [13]:
# Publish the trained weights to a private Hub repository named after the run.
# The write token fetched in the first cell is reused here instead of reading
# the `hf_write` secret a second time.
model.push_to_hub(
    f"ioai2024japan/{CFG.name}",
    token=write_access_token, private=True
)

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock_006_task2_pretrain/commit/17e13c97a042884cd78c021ec6ad869c4a1d11d5', commit_message='Upload BertForMaskedLM', commit_description='', oid='17e13c97a042884cd78c021ec6ad869c4a1d11d5', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
def terminate_session():
    """Terminate this Colab session by unassigning its runtime."""
    # Imported at call time, so the rest of the notebook does not need this module up front.
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()

In [15]:
# End the session once the model has been pushed to the Hub.
# NOTE(review): presumably any cell placed after this one cannot run in the same session.
terminate_session()

In [None]:
import gc
import torch

def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    for release in (gc.collect, torch.cuda.empty_cache):
        release()


flush()