<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_009_task2_pseudo_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 準備

In [1]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec("wandb") is None:
  !pip install wandb -q

In [2]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import torch.nn.functional as F

import torch.cuda.amp as amp # or import torch.cuda.amp as amp for PyTorch's native amp


import evaluate
import wandb

In [3]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers=4
    project = "IOAI_Task2_classification"
    name = "redrock_009_task2_pseudo_from_pretrained"

    # model
    base_model_name = "ioai2024japan/redrock_005_task2_pretrain_wandb"
    num_classes = 5
    tokenizer_name = "ioai2024japan/redrock_005_task2_tokenizer"

    # training
    epochs = 20

    scheduler='CosineAnnealingLR' # ['ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts']

    lr = 1e-05

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 32
    eval_batch_size = 32

    pseudo_size = 2000

    seed=42
    train=True

In [4]:
wandb.login(key=userdata.get('wandb_token'))
cfg = dict(vars(CFG))
cfg = {k: v for k, v in cfg.items() if "__" not in k}

[34m[1mwandb[0m: Currently logged in as: [33masiatic-cheetah[0m ([33masiatic-cheetah-a[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# print(transliterate_brahmi_to_devanagari(raw_dataset["train"]["text"][0]))

brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'ऻ', '𑁣': 'ॉ'
}

def transliterate_brahmi_to_devanagari(text):
    transliterated_text = ''
    for char in text:
        if char in brahmi_to_devanagari:
            transliterated_text += brahmi_to_devanagari[char]
        else:
            transliterated_text += char
    return transliterated_text

def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except:
            output[k] = v
    return output

def transform_raw(example_batch):
        example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        return inputs

def transform_class(example_batch):
        example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        inputs["labels"] = example_batch["label"]
        return inputs

f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

def evaluate_model(model, eval_loader):
    model.eval()
    for batch in eval_loader:
        batch = to_device(batch, device)
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    return f1.compute(predictions=predictions, references=batch["labels"], average='macro')


# データ

In [7]:
classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
raw_dataset = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)

tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name, token=read_access_token)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
def pred_pseudo(model, train_loader):
    model.eval()
    predictions_list = []
    confidences_list = []
    for batch in tqdm(train_loader):
        batch = to_device(batch, device)
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        confidences = F.softmax(logits, dim=-1)
        pred_confidence,_ = confidences.max(dim=-1)
        #pred_confidence,_ = pred_confidence_temp.max(dim=-1)
        # print(pred_confidence.shape)
        predictions = torch.argmax(logits, dim=-1)
        #predictions = torch.mode(predictions, dim=0).values
        # print(predictions.shape)
        predictions_list.extend(predictions.cpu().numpy())
        confidences_list.extend(pred_confidence.cpu().numpy())

    return predictions_list, confidences_list

# 推論

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
        CFG.base_model_name,
        token=read_access_token,
        num_labels=CFG.num_classes
).cuda()

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ioai2024japan/redrock_005_task2_pretrain_wandb and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
pseudo_size = CFG.pseudo_size

raw_eval_batch = raw_dataset["train"].select(range(0, pseudo_size))

print(raw_eval_batch)

tokenized_eval_batch = raw_eval_batch.with_transform(transform_raw)

raw_loader = DataLoader(
    tokenized_eval_batch,
    batch_size=CFG.train_batch_size,
    num_workers=0,
    pin_memory=True,
    shuffle=True,
    drop_last=True,
)

predictions, confidences = pred_pseudo(model, raw_loader)

Dataset({
    features: ['text'],
    num_rows: 2000
})


  0%|          | 0/62 [00:00<?, ?it/s]

In [11]:
print(predictions[0])
print(confidences[0])

0
0.23173913


In [12]:
def main():
    top_conf = np.argsort(confidences)[-500:]


    selected_texts = [transliterate_brahmi_to_devanagari(raw_eval_batch[int(i)]["text"]) for i in top_conf]
    selected_labels = [predictions[int(i)] for i in top_conf]

    pseudo_labeled_dataset = Dataset.from_dict({
        'text': selected_texts,
        'label': selected_labels
        })


    pseudo_labeled_tokens = pseudo_labeled_dataset.with_transform(transform_class)

    tokenized_data = classification_dataset.with_transform(transform_class)
    print(tokenized_data)

    train_dataset = tokenized_data["train"]
    eval_dataset = tokenized_data["dev"]

    print(pseudo_labeled_tokens.features)
    print(train_dataset.features)

    combined_train_dataset = concatenate_datasets([pseudo_labeled_tokens, train_dataset])
    combined_train_tokens = combined_train_dataset.with_transform(transform_class)
    print(combined_train_tokens)

    combined_train_loader = DataLoader(
        combined_train_tokens,
        batch_size=CFG.train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    print(eval_dataset.shape)

    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
    )
    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
    )

    wandb.init(
        name=CFG.name,
        project=CFG.project,
        config=cfg
    )

    num_training_steps = CFG.epochs * len(combined_train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
    scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    scaler = torch.cuda.amp.GradScaler()

    def train_one_epoch(model, scheduler, train_loader, optimizer):
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, dynamic_ncols=True)

        for step, batch in enumerate(progress_bar):
            batch = to_device(batch, device)
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )
            loss = outputs.loss
            loss.backward()

            text = f"step {step}, loss: {loss:.5f}"
            progress_bar.set_description(text)

            wandb.log(
                {
                    "train_loss": loss,
                    "lr": optimizer.param_groups[0]["lr"],
                    "step": step,
                }
            )
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    # Train and evaluate the model
    model.to(device)
    for i in range(CFG.epochs):
        train_one_epoch(model, scheduler, combined_train_loader, optimizer)
        accuracy = evaluate_model(model, eval_loader)
        wandb.log(
            {
                "epoch": i+1,
                "accuracy": accuracy
            }
        )
        print(f'Epoch {i+1} {accuracy}')

    return model

In [13]:
model = main()

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1524
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 218
    })
})
{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}
Dataset({
    features: ['text', 'label'],
    num_rows: 2024
})
(218, 2)


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 1 {'f1': 0.5480033416875523}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 2 {'f1': 0.41666666666666663}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 3 {'f1': 0.5168864468864469}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 4 {'f1': 0.5734343434343434}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 5 {'f1': 0.61003663003663}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 6 {'f1': 0.6841025641025641}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 7 {'f1': 0.5408369408369408}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 8 {'f1': 0.5851592851592852}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 9 {'f1': 0.5476190476190477}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 10 {'f1': 0.6257942057942059}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 11 {'f1': 0.48673796791443846}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 12 {'f1': 0.5852813852813853}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 13 {'f1': 0.6205128205128204}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 14 {'f1': 0.5829370629370629}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 15 {'f1': 0.5957575757575757}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 16 {'f1': 0.6333333333333332}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 17 {'f1': 0.5852813852813853}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 18 {'f1': 0.5957575757575757}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 19 {'f1': 0.5852813852813853}


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 20 {'f1': 0.5957575757575757}


In [None]:
model.push_to_hub(
    f"ioai2024japan/{CFG.name}",
    token=userdata.get('hf_write'), private=True
)

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock_009_task2_pseudo_from_pretrained/commit/49fa80426907c2172a871c7742d16c4a554388cc', commit_message='Upload BertForSequenceClassification', commit_description='', oid='49fa80426907c2172a871c7742d16c4a554388cc', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
def terminate_session():
    # Terminate this session

    from google.colab import runtime
    runtime.unassign()

In [None]:
terminate_session()

In [None]:
import gc
import torch

def flush():
  gc.collect()
  torch.cuda.empty_cache()
flush()