<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_003_alpha_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

In [2]:
model_url = 'ioai2024japan/redrock-002-pretrained-model'

### Dependencies

In [3]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U


If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [4]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict

import evaluate

In [5]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers=4
    project = "IOAI_Task2"
    name = "redrock_002"

    tokenizer_name = "google-bert/bert-base-multilingual-uncased"

    # model
    base_model_name = model_url
    num_classes = 5

    # training
    epochs = 20

    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']

    lr = 1e-05

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 32
    eval_batch_size = 32

    seed=42
    train=True

In [6]:
classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)

brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'०', '𑁣': 'नब्बे'
}

def transliterate_brahmi_to_latin(text):
    transliterated_text = ''
    for char in text:
        if char in brahmi_to_devanagari:
            transliterated_text += brahmi_to_devanagari[char]
        else:
            transliterated_text += char
    return transliterated_text

def transform(example_batch):
    example_batch["text"] = [transliterate_brahmi_to_latin(x) for x in example_batch["text"]]
    inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
    inputs["labels"] = example_batch["label"]
    return inputs


tokenized_data = classification_dataset.with_transform(transform)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    CFG.base_model_name, num_labels=CFG.num_classes, token=read_access_token
).cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ioai2024japan/redrock-002-pretrained-model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except:
            output[k] = v
    return output

In [10]:
# dataset
train_dataset = tokenized_data["train"]
eval_dataset = tokenized_data["dev"]
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG.train_batch_size,
    num_workers=0,
    pin_memory=True,
    shuffle=True,
    drop_last=True,
)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=CFG.eval_batch_size,
    num_workers=0,
    pin_memory=True,
    shuffle=False,
)

epochs = 20
num_training_steps = epochs * len(train_loader)
optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [11]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def train_one_epoch(model, scheduler, train_loader, optimizer):
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader):
        batch = to_device(batch, device)
        #print(batch)
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            token_type_ids=batch["token_type_ids"],
            labels=batch["labels"],
        )
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

def evaluate_model(model, test_loader):
    model.eval()
    for batch in eval_loader:
        batch = to_device(batch, device)
        with torch.no_grad():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return f1.compute(predictions=predictions, references=batch["labels"], average='macro')

# Train and evaluate the model
model.to(device)
for i in range(CFG.epochs):
    train_one_epoch(model, scheduler, train_loader, optimizer)
    accuracy = evaluate_model(model, eval_loader)
    print(f'Epoch {i+1} {accuracy}')

  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 1 {'f1': 0.28524712002972874}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 2 {'f1': 0.6313279857397504}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 3 {'f1': 0.8502392344497608}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 4 {'f1': 0.7883422459893048}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 5 {'f1': 0.8033868092691622}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 6 {'f1': 0.8616755793226382}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 7 {'f1': 0.8631002331002332}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 8 {'f1': 0.8668181818181818}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 9 {'f1': 0.8501515151515152}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 10 {'f1': 0.8668181818181818}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 11 {'f1': 0.8157192807192807}


  0%|          | 0/47 [00:00<?, ?it/s]

Epoch 12 {'f1': 0.8668181818181818}


  0%|          | 0/47 [00:00<?, ?it/s]

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

# Data

In [None]:
# write the predictions to a file
"""
with open('{}_predictions.txt'.format(data_split), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions.tolist()]))
"""

In [None]:
import gc
import torch

def flush():
  gc.collect()
  torch.cuda.empty_cache()
flush()