<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_004_task2_pretrain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata

# Hugging Face tokens are read from the Colab secret store rather than being
# written into the notebook: one secret for reading, one for writing.
read_access_token = userdata.get("hf_read")
write_access_token = userdata.get("hf_write")

### Dependencies

In [None]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U


If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [None]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict

import evaluate

In [None]:
# ====================================================
# CFG
# ====================================================

class CFG:
    """Settings for the masked-language-model pretraining run, kept in one place."""

    # --- run identification / loading ---
    num_workers = 4
    project = "IOAI_Task2_classification"
    name = "task2_redrock_004_pretrain"

    # --- model ---
    base_model_name = "google-bert/bert-base-multilingual-uncased"
    mlm_probability = 0.15

    # --- optimisation ---
    epochs = 1
    # one of: 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts'
    scheduler = "CosineAnnealingLR"
    lr = 1e-5

    # --- data ---
    max_length = 256
    # batch sizes noted per GPU type: T4 -> 32, L4 -> 64
    train_batch_size = 32
    eval_batch_size = 32

    # --- misc ---
    seed = 42
    train = True

In [None]:
# Load the base checkpoint together with its masked-LM head.
model = BertForMaskedLM.from_pretrained(
    CFG.base_model_name,
)
# Move to the GPU only when one is present: an unconditional `.cuda()` raises on
# a CPU-only runtime. (A `.half()` cast is intentionally left disabled here.)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at google-bert/bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expe

In [None]:
# Raw text dataset; the Hub request is authenticated with the read token.
raw_dataset = load_dataset(
    'InternationalOlympiadAI/NLP_problem_raw',
    token=read_access_token,
)

# classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
# tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)
# Character substitution table: each single-character key is replaced by the
# string on the right. The final entry maps to the two-character ASCII string
# '90', so the replacement is not always one character for one character.
brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦': '०', '𑁣': '90'
}


def transliterate_brahmi_to_devanagari(text):
    """Replace every mapped character of ``text`` using ``brahmi_to_devanagari``.

    Characters without an entry in the table are copied through unchanged.

    Args:
        text: The string to convert (iterated character by character).

    Returns:
        The converted string. It can be longer than ``text`` because one table
        entry expands to two characters.
    """
    # One join instead of growing a string with `+=` for every character.
    return ''.join(brahmi_to_devanagari.get(char, char) for char in text)

In [None]:
file_name = "raw_text.txt"

# One transliterated line per training row, each terminated by a newline.
connected_text = "".join(
    transliterate_brahmi_to_devanagari(row) + "\n"
    for row in raw_dataset['train']["text"]
)

# Write the whole corpus to disk as UTF-8.
with open(file_name, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write(connected_text)

In [None]:
!pip install sentencepiece
import sentencepiece as spm

spm.SentencePieceTrainer.train(input='raw_text.txt', model_prefix='tokenize', vocab_size=1000, character_coverage=0.9995, user_defined_symbols=['[CLS]', '[SEP]', '[PAD]', '[UNK]', '[MASK]'])

sp = spm.SentencePieceProcessor(model_file='tokenize.model')



In [None]:
!pip install protobuf
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py

from tokenizers.implementations import SentencePieceUnigramTokenizer
from tokenizers.processors import BertProcessing
from transformers import PreTrainedTokenizerFast

# Build a `tokenizers` Unigram tokenizer from the trained SentencePiece model file.
spm_tokenizer = SentencePieceUnigramTokenizer.from_spm('tokenize.model')

# Add any BERT-style marker that is not yet present in the vocabulary.
special_tokens = ['[CLS]', '[SEP]', '[PAD]', '[UNK]', '[MASK]']
for marker in special_tokens:
    already_known = marker in spm_tokenizer.get_vocab()
    if not already_known:
        spm_tokenizer.add_special_tokens([marker])

# Post-process encodings with the [CLS] / [SEP] markers and their ids.
cls_pair = ("[CLS]", spm_tokenizer.token_to_id('[CLS]'))
sep_pair = ("[SEP]", spm_tokenizer.token_to_id('[SEP]'))
spm_tokenizer.post_processor = BertProcessing(cls=cls_pair, sep=sep_pair)

# Wrap the backend tokenizer in the transformers fast-tokenizer class and
# declare which strings play each special-token role.
bert_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=spm_tokenizer._tokenizer,
    unk_token='<unk>',
    bos_token='<s>',
    eos_token='</s>',
    cls_token='[CLS]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    mask_token='[MASK]',
)


"""
print(
    bert_tokenizer.batch_encode_plus(
        ["च𑀪त𑀫च𑁦𑁣ल𑁣बण 𑀣च तचध𑀢पचल𑀢𑀳𑀠𑀕 𑀣चबच 𑀣च𑀙च 𑀤न𑀱च 𑀳𑀢णच𑀳च𑀯", "𑀱च𑀟𑀣च 𑀫च𑀞च𑀟 𑀟च 𑀞च𑀱𑁣 𑀲चञच 𑀣च 𑀲𑀢प𑀢𑀟च𑀯"], padding=True
    )
)
"""

def transform(example_batch):
    """Transliterate and tokenize one batch of examples.

    Args:
        example_batch: Mapping with a "text" entry holding a list of strings.
            That entry is overwritten in place with the transliterated strings.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        ``CFG.max_length`` and returned as PyTorch tensors.
    """
    transliterated = list(map(transliterate_brahmi_to_devanagari, example_batch["text"]))
    example_batch["text"] = transliterated
    return bert_tokenizer.batch_encode_plus(
        list(transliterated),
        truncation=True,
        max_length=CFG.max_length,
        padding="max_length",
        return_tensors="pt",
    )


# Attach `transform` to the dataset via `with_transform`.
tokenized_data = raw_dataset.with_transform(transform)

--2024-07-11 13:10:00--  https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6257 (6.1K) [text/plain]
Saving to: ‘sentencepiece_model_pb2.py.1’


2024-07-11 13:10:00 (58.1 MB/s) - ‘sentencepiece_model_pb2.py.1’ saved [6257/6257]



In [None]:
def to_device(batch, device):
    """Return a new dict with every movable value of ``batch`` placed on ``device``.

    Args:
        batch: Mapping of names to values (tensors or arbitrary other objects).
        device: Target handed unchanged to each value's ``.to()`` method.

    Returns:
        A dict with the same keys. Values that expose a callable ``.to`` are
        replaced by ``value.to(device)``; every other value is kept as-is.

    Raises:
        Any exception raised by ``value.to(device)``. A failed transfer is
        reported instead of silently leaving the value where it was.
    """
    # An explicit capability check replaces the former bare `except:`, which
    # also hid genuine transfer errors and keyboard interrupts.
    return {
        key: value.to(device) if callable(getattr(value, "to", None)) else value
        for key, value in batch.items()
    }

In [None]:
# Seed torch's global RNG before the loader and collator are used, so that
# CFG.seed (previously defined but never applied) actually takes effect.
torch.manual_seed(CFG.seed)

# Collator that applies masked-LM masking with the configured probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer,
    mlm=True,
    mlm_probability=CFG.mlm_probability
)

# dataset
train_dataset = tokenized_data["train"]
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG.train_batch_size,
    num_workers=0,
    pin_memory=False,
    shuffle=True,
    drop_last=True,
    collate_fn=data_collator,
)

# Take the epoch count from CFG (it used to be a separate hard-coded 1) so the
# scheduler length always matches the number of epochs the training loop runs.
epochs = CFG.epochs
num_training_steps = epochs * len(train_loader)
optimizer = optim.AdamW(model.parameters(), lr=CFG.lr)
scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

In [None]:
import torch.cuda.amp as amp  # PyTorch's native automatic mixed precision

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Gradient scaler used together with autocast in the training loop.
scaler = amp.GradScaler()

def train_one_epoch(model, scheduler, train_loader, optimizer):
    """Run one mixed-precision training pass over ``train_loader``.

    Uses the notebook-level ``device``, ``scaler`` and ``amp`` objects.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        scheduler: Learning-rate scheduler, stepped once per batch.
        train_loader: Iterable of batches (mappings) to train on.
        optimizer: Optimizer stepped through the gradient scaler.

    Returns:
        The mean loss over the batches of this epoch, or 0.0 when the loader
        yields no batches. (Earlier versions returned None; callers that ignore
        the return value are unaffected.)
    """
    model.train()
    running_loss = 0.0
    num_steps = 0
    progress_bar = tqdm(train_loader, dynamic_ncols=True)

    for step, batch in enumerate(progress_bar):
        batch = to_device(batch, device)

        # Forward pass under autocast; the inputs themselves are not converted.
        with amp.autocast():
            outputs = model(**batch)
            loss = outputs.loss

        # Backpropagate the scaled loss.
        scaler.scale(loss).backward()

        # Read the loss once and reuse it for the progress bar and the running
        # total (the total was previously initialised but never updated).
        step_loss = loss.item()
        running_loss += step_loss
        num_steps += 1
        progress_bar.set_description(f"step {step}, loss: {step_loss:.5f}")

        # Optimizer step through the scaler, then reset gradients and advance the schedule.
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

    return running_loss / num_steps if num_steps else 0.0


# Place the model on the selected device, then train for the configured number of epochs.
model.to(device)
for epoch_number in range(1, CFG.epochs + 1):
    train_one_epoch(model, scheduler, train_loader, optimizer)
    print(f'Epoch {epoch_number}')

  0%|          | 0/19101 [00:00<?, ?it/s]



Epoch 1


In [None]:
# Upload the trained model to a private Hub repository using the write token.
model.push_to_hub(
    "ioai2024japan/redrock-004-pretrained-model",
    token=write_access_token,
    private=True,
)

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock-004-pretrained-model/commit/e8a98343c3d8c4c2f8c9bc523aba92ca756d7cd8', commit_message='Upload BertForMaskedLM', commit_description='', oid='e8a98343c3d8c4c2f8c9bc523aba92ca756d7cd8', pr_url=None, pr_revision=None, pr_num=None)

# Data

In [None]:
import gc
import torch

def flush():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()


flush()