<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/002_redrock_complete_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive under /content/drive; CFG.fasttext_path points into this mount.
from google.colab import drive
drive.mount('/content/drive')

# !cp /content/drive/MyDrive/fasttext_model.bin .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ====================================================
# CFG
# ====================================================

class CFG:
    # Central experiment configuration, read as class attributes (CFG.<name>) throughout the notebook.

    # NOTE(review): the DataLoaders in this notebook pass num_workers=0; this value is not read in the visible code.
    num_workers=4
    # NOTE(review): wandb.init() in main() uses literal project names, so `project` is not read in the visible code.
    project = "IOAI_Task2_classification"
    # Run name: used for wandb runs, local checkpoint directory prefixes and the Hub repo name.
    name = "002_redrock_complete_task2"

    # pseudo_base_model_name = "ioai2024japan/redrock_015_task2_finetune"
    # Checkpoint loaded by pretrain() as the starting point for masked-LM training.
    base_model_name = "ioai2024japan/mbert_fasttext"
    # Tokenizer used as the template when train_tokenizer() learns a new vocabulary.
    base_tokenizer_name = "google-bert/bert-base-multilingual-uncased"

    # None -> run MLM pre-training in main(); otherwise -> skip it and fine-tune from this checkpoint
    pretrained_model_name = "ioai2024japan/fast_chizu_024_task2_complete_pretrain"
    # NOTE(review): not read anywhere in the visible code.
    tokenizer_name = "google-bert/bert-base-multilingual-uncased"

    # fastText model whose word list becomes the tokenizer vocabulary when if_fasttext_tokenizer is True.
    fasttext_path = "/content/drive/MyDrive/fasttext_model_BDAIO.bin"

    # Passed as num_labels when building the sequence-classification model.
    num_classes = 5

    # training
    pretrain_epochs = 2
    # Epochs per (lr, scheduler) combination during grid search.
    grid_epochs = 8
    # Epochs for the final fine-tuning run.
    classification_epochs = 20
    mlm_probability = 0.15

    # True -> choose lr/scheduler by grid search; False -> use `lr` and `scheduler` below.
    if_grid = True
    scheduler='cosine' # names accepted by finetune()/param_test(): 'cosine_with_warmup', 'linear_with_warmup', or any name understood by transformers.get_scheduler (e.g. 'linear', 'cosine')
    lr = 1e-5

    # dataset
    # Every text is truncated/padded to exactly this many tokens.
    max_length = 256

    train_batch_size = 64
    eval_batch_size = 64

    # NOTE(review): no seeding call is visible in this notebook, so `seed` currently has no effect — confirm.
    seed=42
    # NOTE(review): not read anywhere in the visible code.
    train=True

    # NOTE(review): only referenced by the commented-out pseudo-labelling code.
    pseudo_size = 60000
    pseudo_select_size = 1500

    # Enables wandb login, run creation and metric logging.
    if_wandb = True

    # True -> build a BertTokenizer from the fastText word list; False -> train a new tokenizer on the raw corpus.
    if_fasttext_tokenizer = True

# for wandb
# Plain-dict snapshot of the CFG attributes (names containing "__" are dropped), passed as the wandb run config.
cfg = dict(vars(CFG))
cfg = {k: v for k, v in cfg.items() if "__" not in k}

In [None]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec('wandb') is None:
  !pip install wandb -q

if importlib.util.find_spec('fasttext') is None:
  !pip install fasttext -q

import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F
import torch.cuda.amp as amp

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm
from transformers import DataCollatorForLanguageModeling

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

import evaluate

import wandb

import fasttext

from joblib import Parallel, delayed


from huggingface_hub import login

# Authenticate against Weights & Biases (only when logging is enabled) using the Colab secret 'wandb_token'.
if CFG.if_wandb:
    wandb.login(key=userdata.get('wandb_token'))

# Log in to the Hugging Face Hub with the read token so later from_pretrained()/load_dataset() calls are authorised.
login(token=read_access_token)

# brahmi_to_devanagari = {
#     '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
#     '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
#     '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
#     '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
#     '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'०', '𑁣': '90'
# }

# def transliterate_brahmi_to_devanagari(text):
#     transliterated_text = ''
#     for char in text:
#         if char in brahmi_to_devanagari:
#             transliterated_text += brahmi_to_devanagari[char]
#         else:
#             transliterated_text += char
#     return transliterated_text

f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for a ``(scores, labels)`` pair.

    Args:
        eval_pred: two-item sequence; the first item holds per-class scores
            (argmax is taken over axis 1), the second the reference labels.

    Returns:
        The result of ``f1.compute(..., average='macro')`` from the
        module-level ``f1`` metric.
    """
    class_scores, references = eval_pred
    predicted_classes = np.argmax(class_scores, axis=1)
    return f1.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )

def to_device(batch, device):
    """Return a copy of ``batch`` with every movable value placed on ``device``.

    Values exposing a callable ``.to`` (e.g. tensors) are moved; anything else
    (strings, ints, lists, ...) is passed through unchanged.

    Unlike the previous bare ``except:`` version, an error raised by an actual
    transfer (out-of-memory, invalid device, ...) now propagates instead of
    silently leaving the value on its original device.

    Args:
        batch: mapping of field name -> value.
        device: target device, in any form accepted by ``.to``.

    Returns:
        A new dict with the same keys as ``batch``.
    """
    return {
        key: value.to(device) if callable(getattr(value, "to", None)) else value
        for key, value in batch.items()
    }


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/73.4 kB[0m [31m723.1 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m985.4 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m874.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone


[34m[1mwandb[0m: Currently logged in as: [33masiatic-cheetah[0m ([33masiatic-cheetah-a[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
def train_tokenizer(raw_dataset):
    """Learn a new tokenizer vocabulary from the raw training texts.

    The tokenizer named by ``CFG.base_tokenizer_name`` serves as the template,
    and the new vocabulary is trained to the template's own ``vocab_size``.

    Args:
        raw_dataset: dataset dict whose ``'train'`` split has a ``"text"`` column.

    Returns:
        The tokenizer produced by ``train_new_from_iterator``.
    """
    corpus = list(raw_dataset['train']["text"])
    template = AutoTokenizer.from_pretrained(CFG.base_tokenizer_name)
    return template.train_new_from_iterator(corpus, template.vocab_size)

In [None]:
def train_one_epoch(model, scheduler, train_loader, optimizer, if_grid, fp16=False):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: module called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose ``.loss``.
        scheduler: learning-rate scheduler, stepped once per batch.
        train_loader: iterable of batches (dicts containing the four keys above).
        optimizer: optimizer for ``model``'s parameters.
        if_grid: True when called from the grid search; the progress bar is then
            not left on screen and nothing is logged to wandb.
        fp16: when True, the forward pass runs under autocast and gradients go
            through a ``GradScaler``.

    Returns:
        None. ``model``, ``optimizer`` and ``scheduler`` are updated in place.
    """
    model.train()
    # Follow the model instead of hard-coding "cuda", so batches always land
    # on the device the parameters actually live on.
    device = next(model.parameters()).device
    # The scaler is only needed (and only meaningful) for mixed precision.
    scaler = torch.cuda.amp.GradScaler() if fp16 else None
    progress_bar = tqdm(train_loader, dynamic_ncols=True, leave=(not if_grid))

    for step, batch in enumerate(progress_bar):
        batch = to_device(batch, device)
        model_inputs = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "token_type_ids": batch["token_type_ids"],
            "labels": batch["labels"],
        }

        if fp16:
            with amp.autocast():
                loss = model(**model_inputs).loss

            # Scale the loss so small fp16 gradients do not underflow, then let
            # the scaler unscale and apply the optimizer step.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss = model(**model_inputs).loss
            loss.backward()
            optimizer.step()

        scheduler.step()
        optimizer.zero_grad()

        # Convert once to a Python float: avoids handing a live tensor to wandb
        # and is reused for the progress-bar text.
        loss_value = loss.item()

        if CFG.if_wandb and (not if_grid):
            wandb.log(
                {
                    "train_loss": loss_value,
                    "lr": optimizer.param_groups[0]["lr"],
                    "step": step,
                }
            )

        text = f"step {step}, loss: {loss_value:.5f}"
        progress_bar.set_description(text)

def evaluate_model(model, eval_loader):
    """Score ``model`` on a labelled loader with macro F1.

    Args:
        model: classifier whose output exposes ``.logits``.
        eval_loader: iterable of batches containing ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.

    Returns:
        ``(f1_score, predictions)`` where ``f1_score`` is the result of
        ``f1.compute(..., average='macro')`` and ``predictions`` is a NumPy
        array of predicted class ids in loader order.
    """
    model.eval()
    # Use the model's own device rather than a hard-coded "cuda".
    device = next(model.parameters()).device
    predictions = []
    labels = []

    with torch.no_grad():
        for batch in eval_loader:
            batch = to_device(batch, device)
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )
            prediction = torch.argmax(outputs.logits, dim=-1)
            predictions.append(prediction.cpu().numpy())
            labels.append(batch["labels"].cpu().numpy())

    predictions = np.concatenate(predictions)
    labels = np.concatenate(labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average='macro')
    return f1_score, predictions

def test_model(model, eval_loader):
    """Predict class ids for every example in an unlabelled loader.

    Args:
        model: classifier whose output exposes ``.logits``.
        eval_loader: iterable of batches containing ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.

    Returns:
        NumPy array of predicted class ids in loader order.
    """
    model.eval()
    # Use the model's own device rather than a hard-coded "cuda".
    device = next(model.parameters()).device
    predictions = []

    with torch.no_grad():
        for batch in eval_loader:
            batch = to_device(batch, device)
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
            )
            prediction = torch.argmax(outputs.logits, dim=-1)
            predictions.append(prediction.cpu().numpy())

    return np.concatenate(predictions)

In [None]:
def pretrain(raw_dataset, tokenizer, transform_raw, fp16=True):
    """Continue masked-language-model training on the raw corpus.

    Args:
        raw_dataset: dataset dict with a ``"train"`` split of raw texts.
        tokenizer: tokenizer handed to the MLM data collator.
        transform_raw: on-the-fly transform that tokenizes a batch of examples.
        fp16: forwarded to ``train_one_epoch`` to enable mixed precision.

    Returns:
        Local directory name (``"{CFG.name}_pretrain"``) where the model was saved.
    """
    print("=== Pretrain ===")

    # The collator masks tokens with probability CFG.mlm_probability.
    mlm_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=CFG.mlm_probability
    )

    mlm_train_split = raw_dataset.with_transform(transform_raw)["train"]

    train_loader = DataLoader(
        mlm_train_split,
        batch_size=CFG.train_batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=0,
        pin_memory=True,
        collate_fn=mlm_collator,
    )

    model = BertForMaskedLM.from_pretrained(CFG.base_model_name).cuda()

    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr, eps=1e-08, betas=(0.9, 0.999))
    total_steps = CFG.pretrain_epochs * len(train_loader)
    scheduler = get_scheduler(
        name=CFG.scheduler,
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    for epoch_number in range(1, CFG.pretrain_epochs + 1):
        train_one_epoch(model, scheduler, train_loader, optimizer, False, fp16)
        print(f'Epoch {epoch_number}')

    output_dir = f"{CFG.name}_pretrain"
    model.save_pretrained(output_dir)
    return output_dir

In [None]:
def finetune(base_model, train_dataset, eval_dataset, device, lr, scheduler, fp16=False):
    """Fine-tune a sequence classifier and keep the best checkpoint by dev macro F1.

    Args:
        base_model: Hub id or local path of the checkpoint to start from.
        train_dataset: labelled training dataset (already tokenized on access).
        eval_dataset: labelled evaluation dataset.
        device: torch device the model is trained on.
        lr: AdamW learning rate.
        scheduler: ``'cosine_with_warmup'`` or ``'linear_with_warmup'`` (warm-up
            over the first 10% of steps), or any name accepted by
            ``transformers.get_scheduler`` (no warm-up).
        fp16: forwarded to ``train_one_epoch`` (previously accepted but ignored).

    Returns:
        Directory of the best checkpoint. Every epoch that strictly improves the
        F1 is saved to its own ``"{CFG.name}_finetune_epoch_{n}"`` directory.
    """
    print("=== Finetune ===")
    # Place the model on the requested device instead of unconditionally
    # calling .cuda() first.
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model, num_labels=CFG.num_classes
    ).to(device)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

    num_training_steps = CFG.classification_epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=lr, eps=1e-08, betas=(0.9, 0.999))

    if scheduler == 'cosine_with_warmup':
        lr_scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    elif scheduler == 'linear_with_warmup':
        lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    else:
        lr_scheduler = get_scheduler(name=scheduler, optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Start below any reachable F1 so the first epoch is always checkpointed;
    # starting at 0.0 left best_model_path as None when F1 never rose above 0.
    best_f1 = float("-inf")
    best_model_path = None

    for i in range(CFG.classification_epochs):
        train_one_epoch(model, lr_scheduler, train_loader, optimizer, False, fp16)
        f1_score, _ = evaluate_model(model, eval_loader)
        f1_score = f1_score["f1"]

        if f1_score > best_f1:
            best_f1 = f1_score
            best_model_path = f"{CFG.name}_finetune_epoch_{i+1}"
            model.save_pretrained(best_model_path)

        if CFG.if_wandb:
            wandb.log(
                {
                    "epoch": i+1,
                    "f1": f1_score
                }
            )
        print(f'Epoch {i+1} {f1_score}')

    return best_model_path

In [None]:
def up_to_hub(model_name, model_path, tokenizer):
    """Publish a checkpoint and its tokenizer to a private Hub repository.

    The checkpoint at ``model_path`` is loaded as a sequence classifier with
    ``CFG.num_classes`` labels and pushed, followed by ``tokenizer``, to
    ``ioai2024japan/{model_name}`` using the write token.

    Args:
        model_name: repository name under the ``ioai2024japan`` namespace.
        model_path: local path or Hub id of the checkpoint to upload.
        tokenizer: tokenizer to upload alongside the model.
    """
    repo_id = f"ioai2024japan/{model_name}"
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=CFG.num_classes
    )
    # Model first, then tokenizer — both into the same repository.
    for artifact in (classifier, tokenizer):
        artifact.push_to_hub(repo_id, token=write_access_token, private=True)

In [None]:
def param_test(base_model, train_dataset, eval_dataset, device, lr, scheduler, fp16=False):
    """Train briefly with one (lr, scheduler) setting and report its best dev F1.

    Trains for ``CFG.grid_epochs`` epochs. Nothing is saved to disk.

    Args:
        base_model: Hub id or local path of the checkpoint to start from.
        train_dataset: labelled training dataset.
        eval_dataset: labelled evaluation dataset.
        device: torch device the model is trained on.
        lr: AdamW learning rate under test.
        scheduler: ``'cosine_with_warmup'`` or ``'linear_with_warmup'`` (warm-up
            over the first 10% of steps), or any name accepted by
            ``transformers.get_scheduler`` (no warm-up).
        fp16: forwarded to ``train_one_epoch`` (previously accepted but ignored).

    Returns:
        Highest macro F1 observed over the epochs (0.0 if none exceeded it).
    """
    # Place the model on the requested device instead of unconditionally
    # calling .cuda() first.
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model, num_labels=CFG.num_classes
    ).to(device)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

    num_training_steps = CFG.grid_epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=lr, eps=1e-08, betas=(0.9, 0.999))
    if scheduler == 'cosine_with_warmup':
        lr_scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    elif scheduler == 'linear_with_warmup':
        lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    else:
        lr_scheduler = get_scheduler(name=scheduler, optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    best_f1 = 0.0

    for _ in range(CFG.grid_epochs):
        # if_grid=True: transient progress bar, no wandb logging.
        train_one_epoch(model, lr_scheduler, train_loader, optimizer, True, fp16)
        f1_score, _ = evaluate_model(model, eval_loader)
        best_f1 = max(best_f1, f1_score["f1"])

    return best_f1

In [None]:
def grid_search(grid_search_params, base_model, train_dataset, eval_dataset, device):
    """Try every (learning rate, scheduler) pair and return the best one.

    Args:
        grid_search_params: two-item sequence ``[learning_rates, scheduler_names]``.
        base_model: checkpoint each trial starts from.
        train_dataset: labelled training dataset.
        eval_dataset: labelled evaluation dataset.
        device: torch device used for training.

    Returns:
        ``(best_lr, best_scheduler)``: the pair with the highest F1 reported by
        ``param_test``; on ties the first pair tried wins.

    Raises:
        ValueError: if the grid contains no (lr, scheduler) combination.
    """
    print("=== GRID SEARCH===")

    # Start below any reachable score so the first trial always becomes the
    # incumbent. With a 0.0 start, a grid whose trials all scored 0.0 left
    # best_lr/best_scheduler unbound and crashed at the return.
    best_f1 = float("-inf")
    best_lr = None
    best_scheduler = None
    evaluated_any = False

    for lr in grid_search_params[0]:
        print(f"lr: {lr}")
        for scheduler in grid_search_params[1]:
            f1_score = param_test(base_model, train_dataset, eval_dataset, device, lr=lr, scheduler=scheduler)
            print(f'scheduler: {scheduler}, f1: {f1_score}')
            evaluated_any = True
            if f1_score > best_f1:
                best_f1 = f1_score
                best_scheduler = scheduler
                best_lr = lr

    if not evaluated_any:
        raise ValueError("grid_search_params must contain at least one learning rate and one scheduler")

    return best_lr, best_scheduler

In [None]:
def main():
    """Run the full pipeline: tokenizer -> (optional) MLM pre-training -> fine-tuning.

    Steps:
        1. Load the raw and the labelled datasets from the Hub.
        2. Build the tokenizer, either from the fastText word list
           (``CFG.if_fasttext_tokenizer``) or by training a new one.
        3. If ``CFG.pretrained_model_name`` is None, run MLM pre-training and push
           the result to the Hub; otherwise start from that checkpoint.
        4. Pick lr/scheduler by grid search (``CFG.if_grid``) or from CFG.
        5. Fine-tune on the ``train`` split, selecting by F1 on the ``dev`` split.

    Returns:
        ``(finetuned_model_path, tokenizer)``.
    """
    # Local imports: only needed for the vocabulary sanity check below.
    import warnings
    from transformers import AutoConfig

    raw_dataset = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)
    classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Train tokenizer to Lang X
    if CFG.if_fasttext_tokenizer:
        fasttext_model = fasttext.load_model(CFG.fasttext_path)

        # dict.fromkeys keeps first-seen order and drops repeats, matching the
        # word -> index dict that was built here before.
        words = dict.fromkeys(fasttext_model.get_words())
        special_tokens = [
            "[PAD]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
            "[UNK]"
        ]
        vocab_file = "vocab.txt"  # Temporary vocab file name

        # One token per line, special tokens first, so token ids follow line
        # order. UTF-8 is set explicitly so the file does not depend on the
        # platform's default encoding.
        with open(vocab_file, "w", encoding="utf-8") as f:
            for word in special_tokens:
                f.write(word + "\n")
            for word in words:
                f.write(word + "\n")

        tokenizer = BertTokenizer(
            vocab_file=vocab_file,
            do_lower_case=False,
            unk_token='[UNK]',
            sep_token='[SEP]',
            pad_token='[PAD]',
            cls_token='[CLS]',
            mask_token='[MASK]',
        )
    else:
        tokenizer = train_tokenizer(raw_dataset)

    # for raw set
    def transform_raw(example_batch):
        # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        return inputs

    # for problem set
    def transform(example_batch):
        # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        inputs["labels"] = example_batch["label"]
        return inputs

    tokenized_data = classification_dataset.with_transform(transform)

    train_dataset = tokenized_data["train"]
    eval_dataset = tokenized_data["dev"]

    # Continual Pre-Training of MLM
    if CFG.pretrained_model_name is None:
        if CFG.if_wandb:
            wandb.init(
                name=CFG.name,
                project="IOAI_Task2_pretrain",
                config=cfg
            )
        pretrained_model_path = pretrain(raw_dataset, tokenizer, transform_raw, fp16=True)
        up_to_hub(f"{CFG.name}_pretrain", pretrained_model_path, tokenizer)
    else:
        pretrained_model_path = CFG.pretrained_model_name

    # Sanity check: a tokenizer that can emit ids beyond the checkpoint's
    # vocabulary makes the embedding lookup index out of range, which on GPU
    # usually shows up only as an opaque "device-side assert triggered".
    # Only a warning, so a run that worked before is never blocked.
    model_vocab_size = AutoConfig.from_pretrained(pretrained_model_path).vocab_size
    if len(tokenizer) > model_vocab_size:
        warnings.warn(
            f"Tokenizer has {len(tokenizer)} tokens but checkpoint '{pretrained_model_path}' "
            f"declares vocab_size={model_vocab_size}; token ids >= {model_vocab_size} will be "
            "out of range for the embedding table."
        )

    if CFG.if_wandb:
        wandb.init(
            name=CFG.name,
            project="IOAI_Task2_finetune",
            config=cfg
        )

    # Finetune with normal dataset

    # [learning rates, scheduler names] explored by grid_search().
    grid_search_params = [
        [6e-6, 7e-6, 8e-6, 9e-6, 1e-5, 2e-5, 3e-5, 5e-5, 1e-4],
        ['linear', 'cosine', 'linear_with_warmup', 'cosine_with_warmup']
    ]

    if CFG.if_grid:
        best_lr, best_scheduler = grid_search(grid_search_params, pretrained_model_path, train_dataset, eval_dataset, device)
    else:
        best_lr = CFG.lr
        best_scheduler = CFG.scheduler

    finetuned_model_path = finetune(pretrained_model_path, train_dataset, eval_dataset, device, best_lr, best_scheduler)

    if CFG.if_wandb:
        wandb.finish()

    # # Get pseudo label
    # pseudo_data, confidences = pseudo_get_data(raw_dataset, transform_raw, finetuned_model_path, device)
    # pseudo_labeled_tokens = pseudo_data.with_transform(transform)

    # combined_train_dataset = concatenate_datasets([pseudo_labeled_tokens, train_dataset])

    # # Finetune with pseudo dataset and normal dataset
    # final_model_name = finetune(pretrained_model_path, combined_train_dataset, eval_dataset, device)

    return finetuned_model_path, tokenizer

In [None]:
final_model_name, tokenizer = main()

VBox(children=(Label(value='0.002 MB of 0.015 MB uploaded\r'), FloatProgress(value=0.1419737663960025, max=1.0…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112305933334534, max=1.0…

=== GRID SEARCH===
lr: 6e-06


  0%|          | 0/23 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
up_to_hub(CFG.name, final_model_name, tokenizer)

In [None]:
# Reload the checkpoint whose path main() returned (final_model_name) as a classifier for inference.
model = AutoModelForSequenceClassification.from_pretrained(
    final_model_name, num_labels=CFG.num_classes
)

In [None]:
# run the trained model on a dev/test split
classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)

def transform_raw(example_batch):
    """Tokenize a batch of texts to fixed-length (CFG.max_length) PyTorch tensors; labels are not attached."""
    # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
    return tokenizer(
        list(example_batch["text"]),
        truncation=True,
        max_length=CFG.max_length,
        padding="max_length",
        return_tensors="pt",
    )

# Split to run inference on; also used in the name of the predictions file.
data_split = "dev"
tokenized_data = classification_dataset.with_transform(transform_raw)
test_dataset = tokenized_data[data_split]

# Order is preserved (no shuffling, nothing dropped) so predictions line up with the split.
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.eval_batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,
    pin_memory=True,
)
model.cuda()

In [None]:
predictions = test_model(model, test_loader)

In [None]:
# write the predictions to a file
with open('{}_predictions.txt'.format(data_split), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions.tolist()]))

In [None]:
predictions

In [None]:
# UPDATE THIS CELL ACCORDINGLY

# define a function to load your tokenizer and model from a HF path
# the path variables can be strings or lists of strings (for ensemble solutions)
def load_model(path_to_tokenizer, path_to_model, token):
  """Load a tokenizer and a sequence classifier from the Hugging Face Hub.

  Args:
    path_to_tokenizer: repository id/path passed to AutoTokenizer.
    path_to_model: repository id/path passed to AutoModelForSequenceClassification.
    token: access token used for both downloads.

  Returns:
    (tokenizer, model), with the model switched to eval mode.
  """
  loaded_tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer, token=token)
  classifier = AutoModelForSequenceClassification.from_pretrained(path_to_model, token=token)
  classifier.eval()
  return loaded_tokenizer, classifier

# define a "predict" function that takes the model and a list of input strings
# and returns the outputs as a list of integer classes
def predict(tokenizer, model, input_texts):
  """Classify each text on its own and return the predicted class ids.

  Every text is truncated/padded to CFG.max_length, the same tokenizer
  settings used by the notebook's training transforms.

  Args:
    tokenizer: callable tokenizer returning PyTorch tensors.
    model: classifier whose output exposes `.logits`.
    input_texts: iterable of input strings.

  Returns:
    List of integer class ids, one per input text, in input order.
  """
  def classify(text):
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=CFG.max_length, padding="max_length")
    with torch.no_grad():
      scores = model(**encoded).logits
    return scores.argmax().item()

  return [classify(text) for text in input_texts]


# set variables
# NOTE(review): the next cell calls load_model(path_to_model, path_to_tokenizer, ...) positionally,
# while load_model is declared as (path_to_tokenizer, path_to_model, token). Keep both paths
# pointing at the same repository so the swapped order stays harmless.
# NOTE(review): this repository differs from the one the notebook uploads to (CFG.name) — presumably intentional; confirm.
path_to_model = "ioai2024japan/chizu_010_task2_complete" # can be a list instead
path_to_tokenizer = "ioai2024japan/chizu_010_task2_complete" # can be a list instead
model_access_token = read_access_token # a fine-grained token with read rights for your model repository
# Used in the name of the predictions file written by the next cell.
data_split = "test"

In [None]:
# DO NOT CHANGE THIS CELL!!!

tokenizer, model = load_model(path_to_model, path_to_tokenizer, token=model_access_token)

test_data = load_dataset("InternationalOlympiadAI/NLP_problem_test")['test']['text']

predictions = predict(tokenizer, model, test_data)

with open('{}_predictions.txt'.format(data_split), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions]))

In [None]:
def terminate_session():
    """Release the current Colab runtime by calling ``runtime.unassign()``."""
    # Imported here so the Colab-only module is only needed when this runs.
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()

terminate_session()