<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_006_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers=4
    project = "IOAI_Task2_classification"
    name = "redrock_006_task2_classification"

    # model
    base_model_name = "ioai2024japan/redrock_006_task2_pretrain"
    tokenizer_name = "ioai2024japan/redrock_006_task2_tokenizer"
    num_classes = 5

    # training
    epochs = 30

    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']

    lr = 1e-05

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 64
    eval_batch_size = 64

    seed=42
    train=True

# for wandb
cfg = dict(vars(CFG))
cfg = {k: v for k, v in cfg.items() if "__" not in k}

In [2]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

### Dependencies

In [3]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec('wandb') is None:
  !pip install wandb -q

If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [4]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict

import evaluate

import wandb

from huggingface_hub import login

wandb.login(key=userdata.get('wandb_token'))
login(token=read_access_token)


from huggingface_hub import login

login(token=read_access_token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
# tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)
brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'०', '𑁣': '90'
}

def transliterate_brahmi_to_devanagari(text):
    transliterated_text = ''
    for char in text:
        if char in brahmi_to_devanagari:
            transliterated_text += brahmi_to_devanagari[char]
        else:
            transliterated_text += char
    return transliterated_text

transliterate_dict = {
    'क': 'a', 'ख': 'b', 'ग': 'c', 'घ': 'd', 'ङ': 'e', 'च': 'f', 'छ': 'g',
    'ज': 'h', 'झ': 'i', 'ञ': 'j', 'ट': 'k', 'ठ': 'l', 'ड': 'm', 'ढ': 'n',
    'ण': 'o', 'त': 'p', 'थ': 'q', 'द': 'r', 'ध': 's', 'न': 't', 'प': 'u',
    'फ': 'v', 'ब': 'w', 'भ': 'x', 'म': 'y', 'य': 'z', 'र': 'A', 'ल': 'B',
    'व': 'C', 'श': 'D', 'ष': 'E', 'स': 'F', 'ह': 'G', '०': 'H', '90': 'I'
}

def transliterate_text(text):
    for key, value in transliterate_dict.items():
        text = text.replace(key, value)
    return text

def transliterate_to_latin(text):
    transliterated_text = ''
    for char in text:
        if char in brahmi_to_devanagari:
            transliterated_text += transliterate_text(brahmi_to_devanagari[char])
        else:
            transliterated_text += transliterate_text(char)
    return transliterated_text


f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except:
            output[k] = v
    return output


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [6]:
def main():

    wandb.init(
        name=CFG.name,
        project=CFG.project,
        config=cfg
    )
    classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
    tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name, token=read_access_token)

    def transform(example_batch):
        example_batch["text"] = [transliterate_to_latin(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        inputs["labels"] = example_batch["label"]
        return inputs


    tokenized_data = classification_dataset.with_transform(transform)

    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.base_model_name, num_labels=CFG.num_classes, token=read_access_token
    ).cuda()



    # dataset
    train_dataset = tokenized_data["train"]
    eval_dataset = tokenized_data["dev"]
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
    )

    epochs = CFG.epochs
    num_training_steps = epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
    scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    def train_one_epoch(model, scheduler, train_loader, optimizer):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, dynamic_ncols=True)

        for step, batch in enumerate(progress_bar):
            batch = to_device(batch, device)
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )
            loss = outputs.loss
            loss.backward()

            text = f"step {step}, loss: {loss:.5f}"
            progress_bar.set_description(text)

            wandb.log(
                {
                    "train_loss": loss,
                    "lr": optimizer.param_groups[0]["lr"],
                    "step": step,
                }
            )
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    def evaluate_model(model, test_loader):
        model.eval()
        for batch in eval_loader:
            batch = to_device(batch, device)
            with torch.no_grad():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    labels=batch["labels"],
                )

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        return f1.compute(predictions=predictions, references=batch["labels"], average='macro')

    # Train and evaluate the model
    model.to(device)
    for i in range(CFG.epochs):
        train_one_epoch(model, scheduler, train_loader, optimizer)
        accuracy = evaluate_model(model, eval_loader)
        wandb.log(
            {
                "epoch": i+1,
                "accuracy": accuracy
            }
        )
        print(f'Epoch {i+1} {accuracy}')
    return model

In [7]:
model = main()

[34m[1mwandb[0m: Currently logged in as: [33masiatic-cheetah[0m ([33masiatic-cheetah-a[0m). Use [1m`wandb login --relogin`[0m to force relogin


Downloading readme:   0%|          | 0.00/397 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/126k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1524 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/218 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/816k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ioai2024japan/redrock_006_task2_pretrain and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 1 {'f1': 0.2376470588235294}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 2 {'f1': 0.45681818181818185}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 3 {'f1': 0.5351158645276292}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 4 {'f1': 0.5498989898989899}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 5 {'f1': 0.5145743145743145}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 6 {'f1': 0.7444444444444445}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 7 {'f1': 0.7444444444444445}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 8 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 9 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 10 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 11 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 12 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 13 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 14 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 15 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 16 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 17 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 18 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 19 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 20 {'f1': 0.7847058823529413}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 21 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 22 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 23 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 24 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 25 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 26 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 27 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 28 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 29 {'f1': 0.752584670231729}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 30 {'f1': 0.752584670231729}


In [8]:
model.push_to_hub(
    f"ioai2024japan/{CFG.name}",
    token=userdata.get('hf_write'), private=True
)

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock_006_task2_classification/commit/2839ea355eb6454632fe9a985ab1da08e3fcae20', commit_message='Upload BertForSequenceClassification', commit_description='', oid='2839ea355eb6454632fe9a985ab1da08e3fcae20', pr_url=None, pr_revision=None, pr_num=None)