<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_007_task2_devanagari_tokenizer_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Read the Hugging Face access tokens from Colab's secret store (`userdata`)
# instead of writing them into the notebook.
from google.colab import userdata

# 'hf_read' is passed to load_dataset(); 'hf_write' is passed to push_to_hub().
read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

### Dependencies

In [None]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec("wandb") is None:
  !pip install wandb -q

Collecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.18.0)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.18.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets==2.18.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, dataset

If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [None]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict

import torch.cuda.amp as amp # PyTorch's native automatic mixed precision (AMP) module


import evaluate
import wandb

In [None]:
# ====================================================
# CFG
# ====================================================

class CFG:
    """Static configuration namespace for this notebook.

    The values are read as plain class attributes (``CFG.lr`` etc.); the class
    is never instantiated in this notebook.
    """

    # --- experiment bookkeeping -------------------------------------------
    num_workers = 4
    project = "IOAI_Task2"
    name = "redrock_006_task2_pretrain_wandb"

    # --- model ------------------------------------------------------------
    base_model_name = "google-bert/bert-base-multilingual-uncased"
    tokenizer_name = "google-bert/bert-base-multilingual-uncased"
    mlm_probability = 0.15

    # --- training ---------------------------------------------------------
    epochs = 1

    # One of: 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts'
    scheduler = "CosineAnnealingLR"

    lr = 5e-05

    # --- dataset ----------------------------------------------------------
    max_length = 256

    # Batch sizes noted by the author per GPU type: T4 -> 32, L4 -> 64
    train_batch_size = 32

    seed = 42
    train = True

In [None]:
# Log in to Weights & Biases with the API key stored in the Colab secret 'wandb_token'.
wandb.login(key=userdata.get('wandb_token'))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# for wandb: plain-dict snapshot of CFG's attributes, skipping every entry
# whose name contains "__" (the dunder entries that vars() also returns).
cfg = {
    attr: value
    for attr, value in vars(CFG).items()
    if "__" not in attr
}

In [None]:
# classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
# tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)
# Character-level lookup table consumed by transliterate_brahmi_to_devanagari():
# each key is a single Brahmi character and each value is the text that
# replaces it. Characters that are not listed here are left untouched by the
# transliteration function.
# NOTE(review): the last two entries differ in kind — '𑁦' maps to a Devanagari
# digit while '𑁣' maps to the two ASCII characters '90'. Confirm this asymmetry
# is intended before changing it; any tokenizer trained on the output depends on it.
brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'०', '𑁣': '90'
}

def transliterate_brahmi_to_devanagari(text):
    """Replace every mapped character of ``text`` using ``brahmi_to_devanagari``.

    Characters that are not keys of the module-level ``brahmi_to_devanagari``
    table (spaces, punctuation, anything unmapped) are copied through unchanged.

    Args:
        text: The string to transliterate.

    Returns:
        A new string in which each mapped character has been substituted.
        An empty input yields an empty string.
    """
    # A single join avoids building the result with repeated `+=`, which only
    # stays linear thanks to a CPython-specific in-place optimisation.
    return ''.join(brahmi_to_devanagari.get(char, char) for char in text)

def to_device(batch, device):
    """Return a new dict with every movable value of ``batch`` sent to ``device``.

    Args:
        batch: Mapping of field name to value. Values exposing a callable
            ``.to`` attribute are replaced by ``value.to(device)``; all other
            values (strings, lists, ints, ...) are passed through unchanged.
        device: Target forwarded as-is to each value's ``.to``.

    Returns:
        A new dict with the same keys as ``batch``.

    Raises:
        Whatever ``value.to(device)`` raises. These errors used to be
        swallowed, which left the value on its original device silently.
    """
    output = {}
    for key, value in batch.items():
        mover = getattr(value, 'to', None)
        # Only values that cannot be moved are passed through. The previous
        # bare `except:` also hid genuine transfer failures (and even
        # KeyboardInterrupt) behind the same fallback.
        output[key] = mover(device) if callable(mover) else value
    return output

In [None]:
# Load the raw text corpus by its Hugging Face dataset id, authenticating with the read token.
raw_dataset = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)

Downloading readme:   0%|          | 0.00/281 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/611245 [00:00<?, ? examples/s]

In [None]:
# Sanity check: show the first training document after transliteration.
# Index the row before the column — `["text"][0]` would first materialise the
# entire text column as a Python list just to read a single element.
print(transliterate_brahmi_to_devanagari(raw_dataset["train"][0]["text"]))

  मचठचडचड दचछच तणच चढसचडत थच ब०ड90पणध० थ90ड ठ90ढचषच दनषच चलल०ल०ह ठ90 ढचढढचडत०षढचढढचडत०ड थच ढनपनढ ठ० ञचनठच च ततठतड षचडत पचठचढचतढचड दनढतड मचलतपपच ठ90 षनभतड ठषचण90णतड मचलतपपचव


'\nfile_name = "to_latin.txt"\n\nfor text in tqdm(raw_dataset["train"]["text"][:1000]):\n    with open(file_name, "a") as f:\n        f.write(transliterate_to_latin(text) + "\n")\n'

In [None]:
train_corpus = []  # placeholder; rebound further down once the per-document chunks are flattened

from joblib import Parallel, delayed

num_cores = 8  # NOTE(review): not referenced by the Parallel call in this cell, which uses n_jobs=-1


def process_text(x, chunk_size=1000):
    """Transliterate one document, split into consecutive pieces.

    The document is cut into slices of at most ``chunk_size`` characters
    (measured on the untransliterated text) and each slice is passed through
    ``transliterate_brahmi_to_devanagari``. Concatenating the slices in order
    reproduces the whole document: nothing is skipped and no empty trailing
    chunk is produced.

    The previous loop both truncated the remaining text and advanced a slice
    index on every iteration, so from the second iteration on it skipped
    ``chunk_size * i`` characters and could append empty strings.

    Args:
        x: The document text.
        chunk_size: Maximum number of source characters per chunk. Defaults to
            1000, the value that used to be hard-coded.

    Returns:
        A list of transliterated chunks. A document no longer than
        ``chunk_size`` (including the empty string) yields a one-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Short documents (and the empty string) stay a single chunk, as before.
    if len(x) <= chunk_size:
        return [transliterate_brahmi_to_devanagari(x)]

    return [
        transliterate_brahmi_to_devanagari(x[start:start + chunk_size])
        for start in range(0, len(x), chunk_size)
    ]

# Run process_text over every training document in parallel; n_jobs=-1 asks
# joblib to use all available CPUs. Each call returns a list of chunks for one document.
train_corpus_list = Parallel(n_jobs=-1)(delayed(process_text)(x) for x in tqdm(raw_dataset['train']["text"]))

# Flatten the per-document chunk lists into one flat list of strings.
train_corpus = [item for sublist in train_corpus_list for item in sublist]

# Peek at the first few chunks.
print(train_corpus[:5])

  0%|          | 0/611245 [00:00<?, ?it/s]

  pid = os.fork()


['  मचठचडचड दचछच तणच चढसचडत थच ब०ड90पणध० थ90ड ठ90ढचषच दनषच चलल०ल०ह ठ90 ढचढढचडत०षढचढढचडत०ड थच ढनपनढ ठ० ञचनठच च ततठतड षचडत पचठचढचतढचड दनढतड मचलतपपच ठ90 षनभतड ठषचण90णतड मचलतपपचव', ' च बचहचभ षचहचडडतड 90लणढधततह छणचड षचहचडहन हनड हचढन लचढढ90ढतड णचढ90 रठन बचढत थच दतडचभ० घव', 'भचणढ90डथ चडपम90डण चल०90बम90 थ90ठध०हत टचड मचतस० हमत च भचडचभ ठघ बच षचपचड 90ठप90ढच डच हम०ठचभचभ रथघर ष ठथ ढचणन ठपठशल च बचभतड तढचथचड थचड ठचहनषच ड० डच ठचसचस०ड णचथच लचढचभचत डच डचच०भतणचव', 'णचथथच चठ० ठतष90ड पचडपचढचभचथनठ ढनपनढतड थच णच पचह90 ततठतड ठचभठचभच षचप90 झचनण० णचडच थच तभभतड ठतष90ड थच णच ठचभढ० हमतव', 'भचहमतड थचतथचतप90 डच हचढनड ठनथतड हमतबच ठचढचभ णचथथच च हम०ठचभच पच ठपरपन बतडत त90०ससततत०डप डच डचच०भतणच चड ठतणचहपच ढचपहचठचतततन च पव']


In [None]:
# Start from the base model's tokenizer so the new one inherits its
# configuration, then retrain it on the transliterated corpus.
old_tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

# Keep the vocabulary the same size as the original tokenizer's.
tokenizer = old_tokenizer.train_new_from_iterator(
    text_iterator=train_corpus,
    vocab_size=old_tokenizer.vocab_size,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

In [None]:
# Compare the transliterated sample with how the newly trained tokenizer splits it.
# Row-first indexing (`[0]["text"]`) reads a single record; column-first
# (`["text"][0]`) would load the whole text column into memory for each lookup.
print(transliterate_brahmi_to_devanagari(raw_dataset["train"][0]["text"]))
print(tokenizer.tokenize(transliterate_brahmi_to_devanagari(raw_dataset["train"][0]["text"])))

  मचठचडचड दचछच तणच चढसचडत थच ब०ड90पणध० थ90ड ठ90ढचषच दनषच चलल०ल०ह ठ90 ढचढढचडत०षढचढढचडत०ड थच ढनपनढ ठ० ञचनठच च ततठतड षचडत पचठचढचतढचड दनढतड मचलतपपच ठ90 षनभतड ठषचण90णतड मचलतपपचव
['मचठचडचड', 'दचछच', 'तणच', 'चढसचडत', 'थच', 'ब०ड90पणध०', 'थ90ड', 'ठ90ढचषच', 'दनषच', 'चलल०ल०ह', 'ठ90', 'ढचढढचडत०षढचढढचडत०ड', 'थच', 'ढनपनढ', 'ठ०', 'ञचनठच', 'च', 'ततठतड', 'षचडत', 'पचठचढचतढचड', 'दनढतड', 'मचलतपपच', 'ठ90', 'षनभतड', 'ठषचण90णतड', 'मचलतपपचव']


In [None]:
# Upload the retrained tokenizer to a private Hub repository, authenticating with the write token.
tokenizer.push_to_hub("ioai2024japan/redrock_007_task2_tokenizer_devanagari", private=True, token=write_access_token)

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock_007_task2_tokenizer_devanagari/commit/d377e5ae6aba5a3b78272f674080b18fa763c30e', commit_message='Upload tokenizer', commit_description='', oid='d377e5ae6aba5a3b78272f674080b18fa763c30e', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
def terminate_session():
    """Terminate this Colab session by unassigning its runtime."""
    # Imported here rather than at the top of the notebook, so defining the
    # function does not itself require google.colab.
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()

In [None]:
# Release the Colab runtime; this cell comes after the tokenizer upload.
terminate_session()

In [None]:
import gc
import torch

def flush():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first so objects that have just become unreachable are freed
    # before the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
# NOTE(review): this cell sits after terminate_session(), so on a top-to-bottom
# run the runtime has presumably been released before this executes — confirm
# whether the call is still needed here.
flush()