<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/chizu_013_task2_chizu007_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers=4
    project = "IOAI_Task2_classification"
    name = "task2_chizu_013_task2_chizu007_base"

    # model
    base_model_name = "ioai2024japan/Task2_chizu_007_pretrain"
    tokenizer_name = "google-bert/bert-base-multilingual-uncased"
    num_classes = 5

    # training
    epochs = 20

    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']

    lr = 1e-05

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 64
    eval_batch_size = 64

    seed=42
    train=True

# for wandb
cfg = dict(vars(CFG))
cfg = {k: v for k, v in cfg.items() if "__" not in k}

In [2]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

### Dependencies

In [3]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec('wandb') is None:
  !pip install wandb -q

If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [10]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict

import evaluate

import wandb

from huggingface_hub import login

wandb.login(key=userdata.get('wandb_token'))
login(token=read_access_token)


from huggingface_hub import login

login(token=read_access_token)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:


brahmi_to_devanagari = {
    'ëÄì': '‡§ï', 'ëÄî': '‡§ñ', 'ëÄï': '‡§ó', 'ëÄñ': '‡§ò', 'ëÄó': '‡§ô', 'ëÄò': '‡§ö', 'ëÄô': '‡§õ',
    'ëÄö': '‡§ú', 'ëÄõ': '‡§ù', 'ëÄú': '‡§û', 'ëÄù': '‡§ü', 'ëÄû': '‡§†', 'ëÄü': '‡§°', 'ëÄ†': '‡§¢',
    'ëÄ°': '‡§£', 'ëÄ¢': '‡§§', 'ëÄ£': '‡§•', 'ëÄ§': '‡§¶', 'ëÄ•': '‡§ß', 'ëÄ¶': '‡§®', 'ëÄß': '‡§™',
    'ëÄ®': '‡§´', 'ëÄ©': '‡§¨', 'ëÄ™': '‡§≠', 'ëÄ´': '‡§Æ', 'ëÄ¨': '‡§Ø', 'ëÄ≠': '‡§∞', 'ëÄÆ': '‡§≤',
    'ëÄØ': '‡§µ', 'ëÄ∞': '‡§∂', 'ëÄ±': '‡§∑', 'ëÄ≤': '‡§∏', 'ëÄ≥': '‡§π', 'ëÅ¶':'‡•¶', 'ëÅ£': '‡§®‡§¨‡•ç‡§¨‡•á'
}

def transliterate_brahmi_to_latin(text):
    transliterated_text = ''
    for char in text:
        if char in brahmi_to_devanagari:
            transliterated_text += brahmi_to_devanagari[char]
        else:
            transliterated_text += char
    return transliterated_text


f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except:
            output[k] = v
    return output


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [16]:
def main():

    wandb.init(
        name=CFG.name,
        project=CFG.project,
        config=cfg
    )
    classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
    tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)

    def transform(example_batch):
        example_batch["text"] = [transliterate_brahmi_to_latin(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        inputs["labels"] = example_batch["label"]
        return inputs


    tokenized_data = classification_dataset.with_transform(transform)

    model = AutoModelForSequenceClassification.from_pretrained(
        CFG.base_model_name, num_labels=CFG.num_classes, token=read_access_token
    ).cuda()



    # dataset
    train_dataset = tokenized_data["train"]
    eval_dataset = tokenized_data["dev"]
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
    )

    epochs = CFG.epochs
    num_training_steps = epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
    scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    def train_one_epoch(model, scheduler, train_loader, optimizer):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, dynamic_ncols=True)

        for step, batch in enumerate(progress_bar):
            batch = to_device(batch, device)
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )
            loss = outputs.loss
            loss.backward()

            text = f"step {step}, loss: {loss:.5f}"
            progress_bar.set_description(text)

            wandb.log(
                {
                    "train_loss": loss,
                    "lr": optimizer.param_groups[0]["lr"],
                    "step": step,
                }
            )
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    def evaluate_model(model, test_loader):
        model.eval()
        for batch in eval_loader:
            batch = to_device(batch, device)
            with torch.no_grad():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    labels=batch["labels"],
                )

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        return f1.compute(predictions=predictions, references=batch["labels"], average='macro')

    # Train and evaluate the model
    model.to(device)
    for i in range(CFG.epochs):
        train_one_epoch(model, scheduler, train_loader, optimizer)
        accuracy = evaluate_model(model, eval_loader)
        wandb.log(
            {
                "epoch": i+1,
                "accuracy": accuracy
            }
        )
        print(f'Epoch {i+1} {accuracy}')
    return model

In [17]:
model = main()

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ioai2024japan/Task2_chizu_007_pretrain and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 1 {'f1': 0.3719480519480519}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 2 {'f1': 0.5406060606060605}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 3 {'f1': 0.6682539682539683}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 4 {'f1': 0.8533868092691621}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 5 {'f1': 0.7955555555555556}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 6 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 7 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 8 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 9 {'f1': 0.8277777777777778}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 10 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 11 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 12 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 13 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 14 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 15 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 16 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 17 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 18 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 19 {'f1': 0.888888888888889}


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 20 {'f1': 0.888888888888889}


In [None]:
model.push_to_hub(
    f"ioai2024japan/{CFG.name}",
    token=userdata.get('hf_write'), private=True
)

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/task2_chizu_012_redrock006_base/commit/5c403fdf5fad00dc17e50f62f4a2e2c6872ff19a', commit_message='Upload BertForSequenceClassification', commit_description='', oid='5c403fdf5fad00dc17e50f62f4a2e2c6872ff19a', pr_url=None, pr_revision=None, pr_num=None)