<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/redrock_008_task2_tokenizer_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata

# Hugging Face tokens read from the Colab secrets named 'hf_read' and 'hf_write'.
# The read token is passed to load_dataset and the write token to push_to_hub
# in later cells of this notebook.
read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

### Dependencies

In [2]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec("wandb") is None:
  !pip install wandb -q

If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

In [3]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict

import torch.cuda.amp as amp # or import torch.cuda.amp as amp for PyTorch's native amp


import evaluate
import wandb

In [4]:
# ====================================================
# CFG
# ====================================================

class CFG:
    """Static settings for this notebook (tracking, model, training, data)."""

    # data-loading workers and experiment-tracking identifiers
    num_workers = 4
    project = "IOAI_Task2"
    name = "redrock_008_task2_tokenizer_train"

    # model
    base_model_name = "google-bert/bert-base-multilingual-uncased"
    tokenizer_name = "google-bert/bert-base-multilingual-uncased"
    mlm_probability = 0.15

    # training
    epochs = 1

    # one of: 'ReduceLROnPlateau', 'CosineAnnealingLR', 'CosineAnnealingWarmRestarts'
    scheduler = "CosineAnnealingLR"

    lr = 5e-5

    # dataset
    max_length = 256

    # T4: 32
    # L4: 64
    train_batch_size = 32

    seed = 42
    train = True

In [5]:
# Log in to Weights & Biases with the API key stored in the Colab secret 'wandb_token'.
wandb.login(key=userdata.get('wandb_token'))

[34m[1mwandb[0m: Currently logged in as: [33masiatic-cheetah[0m ([33masiatic-cheetah-a[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
# for wandb: plain-dict snapshot of the CFG class attributes, leaving out every
# class-namespace entry whose name contains a double underscore
cfg = {name: value for name, value in vars(CFG).items() if "__" not in name}

In [41]:
# classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)
# tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)
# Single-character lookup table from Brahmi-script characters to Devanagari
# characters. The helpers in this notebook consult it one character at a time
# and copy through any character that has no entry here.
brahmi_to_devanagari = {
    '𑀓': 'क', '𑀔': 'ख', '𑀕': 'ग', '𑀖': 'घ', '𑀗': 'ङ', '𑀘': 'च', '𑀙': 'छ',
    '𑀚': 'ज', '𑀛': 'झ', '𑀜': 'ञ', '𑀝': 'ट', '𑀞': 'ठ', '𑀟': 'ड', '𑀠': 'ढ',
    '𑀡': 'ण', '𑀢': 'त', '𑀣': 'थ', '𑀤': 'द', '𑀥': 'ध', '𑀦': 'न', '𑀧': 'प',
    '𑀨': 'फ', '𑀩': 'ब', '𑀪': 'भ', '𑀫': 'म', '𑀬': 'य', '𑀭': 'र', '𑀮': 'ल',
    '𑀯': 'व', '𑀰': 'श', '𑀱': 'ष', '𑀲': 'स', '𑀳': 'ह', '𑁦':'ऻ', '𑁣': 'ॉ'
}

def transliterate_brahmi_to_devanagari(text):
    """Replace each character of ``text`` that has an entry in ``brahmi_to_devanagari``.

    Characters without an entry in the table are copied through unchanged.
    Returns the converted string.
    """
    return ''.join(brahmi_to_devanagari.get(symbol, symbol) for symbol in text)

In [8]:
# Load the 'InternationalOlympiadAI/NLP_problem_raw' dataset, authenticating with the read token.
raw_dataset = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)

In [9]:
!git clone https://github.com/libindic/indic-trans.git

fatal: destination path 'indic-trans' already exists and is not an empty directory.


In [10]:
# Install the requirements listed by the cloned indic-trans repository, then
# build and install the indictrans package itself from that local checkout.
!pip install -r ./indic-trans/requirements.txt
!pip install ./indic-trans/

Processing ./indic-trans
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: indictrans
  Building wheel for indictrans (pyproject.toml) ... [?25l[?25hdone
  Created wheel for indictrans: filename=indictrans-1.2.3-cp310-cp310-linux_x86_64.whl size=337821164 sha256=ee2d7dedaf67dba2504c4de78d0cac738cad68d5e6c4e3db255268d26a664c72
  Stored in directory: /root/.cache/pip/wheels/3e/c9/43/39c5aaa9a570043089bac219a37343294fa5b47f1350dea53a
Successfully built indictrans
Installing collected packages: indictrans
  Attempting uninstall: indictrans
    Found existing installation: indictrans 1.2.3
    Uninstalling indictrans-1.2.3:
      Successfully uninstalled indictrans-1.2.3
Successfully installed indictrans-1.2.3


In [42]:
from indictrans import Transliterator

# indic-trans transliterator configured with source='hin' and target='eng'.
trn = Transliterator(source='hin', target='eng', build_lookup=True, rb=True)

# Preview the first training text after the Brahmi -> Devanagari step.
# Index the row first: raw_dataset["train"]["text"][0] would materialise the
# whole text column just to read a single element.
print(transliterate_brahmi_to_devanagari(raw_dataset["train"][0]["text"]))

# Devanagari character -> transliterated string, one entry per value of
# brahmi_to_devanagari. When the transliterator returns an empty result the
# placeholder "o" is stored instead, so no character maps to nothing.
transliterate_dict = {
    devanagari_char: trn.transform(devanagari_char) or "o"
    for devanagari_char in brahmi_to_devanagari.values()
}

print(transliterate_dict)

def transliterate_text(text):
    """Apply every ``transliterate_dict`` replacement to ``text``, in table order.

    Replacements are applied one after another, so output produced by an
    earlier entry is visible to the entries that follow it.
    """
    result = text
    for source, target in transliterate_dict.items():
        # Replacing an absent substring is a no-op, so skip it outright.
        if source in result:
            result = result.replace(source, target)
    return result

def transliterate_to_latin(text):
    """Transliterate ``text`` character by character.

    A character with an entry in ``brahmi_to_devanagari`` is first mapped
    through that table; the resulting character (or the original one when there
    is no entry) is then passed to ``transliterate_text``. The per-character
    results are concatenated in the original order and returned.

    Each distinct character is converted only once per call instead of once
    per occurrence, and the pieces are joined in a single pass instead of being
    appended to a growing string.
    """
    converted = {
        char: transliterate_text(brahmi_to_devanagari.get(char, char))
        for char in set(text)
    }
    return ''.join(converted[char] for char in text)

# Preview the first training text after transliteration. Indexing the row
# first avoids loading the entire text column to read one element.
print(transliterate_to_latin(raw_dataset["train"][0]["text"]))


  मचठचडचड दचछच तणच चढसचडत थच बऻडॉपणधऻ थॉड ठॉढचषच दनषच चललऻलऻह ठॉ ढचढढचडतऻषढचढढचडतऻड थच ढनपनढ ठऻ ञचनठच च ततठतड षचडत पचठचढचतढचड दनढतड मचलतपपच ठॉ षनभतड ठषचणॉणतड मचलतपपचव
{'क': 'k', 'ख': 'kha', 'ग': 'ga', 'घ': 'gha', 'ङ': 'ng', 'च': 'c', 'छ': 'chha', 'ज': 'ja', 'झ': 'jha', 'ञ': 'na', 'ट': 't', 'ठ': 'tha', 'ड': 'da', 'ढ': 'dha', 'ण': 'na', 'त': 't', 'थ': 'tha', 'द': 'da', 'ध': 'dha', 'न': 'na', 'प': 'pa', 'फ': 'pha', 'ब': 'ba', 'भ': 'bha', 'म': 'm', 'य': 'ya', 'र': 'r', 'ल': 'l', 'व': 'va', 'श': 'sha', 'ष': 'sha', 'स': 'sa', 'ह': 'ha', 'ऻ': 'o', 'ॉ': 'o'}
  mcthacdacda dacchhac tnac cdhasacdat thac baodaopanadhao thaoda thaodhacshac danashac clloloha thao dhacdhadhacdatoshadhacdhadhacdatoda thac dhanapanadha thao nacnathac c ttthatda shacdat pacthacdhactdhacda danadhatda mcltpapac thao shanabhatda thashacnaonatda mcltpapacva


In [43]:
from joblib import Parallel, delayed

# Worker count handed to joblib; -1 is joblib's convention for "use every
# available core". Previously this knob was declared but ignored because
# n_jobs was hard-coded; the effective setting (-1) is unchanged.
num_cores = -1

def process_text(x):
    """Transliterate one raw text; thin wrapper used as the joblib task."""
    return transliterate_to_latin(x)

# Transliterate every text of the training split in parallel. No placeholder
# list is assigned beforehand, so a failure here cannot leave an empty corpus
# behind for the tokenizer-training cell to pick up.
train_corpus = Parallel(n_jobs=num_cores)(
    delayed(process_text)(x) for x in tqdm(raw_dataset['train']["text"])
)

  0%|          | 0/611245 [00:00<?, ?it/s]

In [44]:
# Load the pretrained tokenizer through the dedicated CFG.tokenizer_name
# setting (it currently holds the same checkpoint id as CFG.base_model_name).
old_tokenizer = AutoTokenizer.from_pretrained(CFG.tokenizer_name)

# Train a new tokenizer on the transliterated corpus, requesting the same
# vocabulary size as the pretrained one.
tokenizer = old_tokenizer.train_new_from_iterator(
    train_corpus,
    vocab_size=old_tokenizer.vocab_size,
)

In [46]:
# Transliterate the first training text once and show both the text and its
# tokenisation. Row-first indexing reads a single example instead of
# materialising the whole text column.
sample_latin = transliterate_to_latin(raw_dataset["train"][0]["text"])
print(sample_latin)
print(tokenizer.tokenize(sample_latin))

  mcthacdacda dacchhac tnac cdhasacdat thac baodaopanadhao thaoda thaodhacshac danashac clloloha thao dhacdhadhacdatoshadhacdhadhacdatoda thac dhanapanadha thao nacnathac c ttthatda shacdat pacthacdhactdhacda danadhatda mcltpapac thao shanabhatda thashacnaonatda mcltpapacva
['mcthacdacda', 'dacchhac', 'tnac', 'cdhasacdat', 'thac', 'baodaopanadhao', 'thaoda', 'thaodhacshac', 'danashac', 'clloloha', 'thao', 'dhacdhadhacdatoshadhacdhadhacdatoda', 'thac', 'dhanapanadha', 'thao', 'nacnathac', 'c', 'ttthatda', 'shacdat', 'pacthacdhactdhacda', 'danadhatda', 'mcltpapac', 'thao', 'shanabhatda', 'thashacnaonatda', 'mcltpapacva']


In [47]:
# Upload the trained tokenizer to the Hub repo below (private=True), authenticating with the write token.
tokenizer.push_to_hub("ioai2024japan/redrock_008_task2_tokenizer", private=True, token=write_access_token)

CommitInfo(commit_url='https://huggingface.co/ioai2024japan/redrock_008_task2_tokenizer/commit/5b33175eb1f8ac33748faa9d23c130665502d8f1', commit_message='Upload tokenizer', commit_description='', oid='5b33175eb1f8ac33748faa9d23c130665502d8f1', pr_url=None, pr_revision=None, pr_num=None)

In [48]:
def terminate_session():
    """Terminate this Colab session by unassigning its runtime."""
    # Imported here so that merely defining the function does not need google.colab.
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()

In [49]:
# Terminate the Colab session; this comes after the cell that pushes the tokenizer to the Hub.
# NOTE(review): the cell after this one has no execution count — presumably it never runs once the runtime is released.
terminate_session()

In [None]:
import gc
import torch

def flush():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()


flush()