#Creating tokenizer

In [31]:
!pip install datasets transformers[sentencepiece]



In [32]:
from transformers.utils import send_example_telemetry

send_example_telemetry("tokenizer_training_notebook", framework="none")

In [33]:
from datasets import load_dataset

# Load your text file
dataset = load_dataset('text', data_files={'train': '/content/un_labelled_output.txt'}, split='train')

# Print the first few examples
print(dataset[:5])

{'text': ['Kohima nung nisung asem Tang tashi nung tashidak nisung 121 temang anepaludar, tashidak agi nisung 94 süogo aser keta aliba temaitsü terara sayudar Nagaland nung tang tashi nung Nagaland nung aliba nüburtem Iba benjung amungba mapang Iba tebur tajung benjung benjung amungba mapanganema aliba, yimji yimjung agi', 'asenok ajak semdangtsü imlaa Paisa, nüburtem dang benjung kanga kümdanga amungtsü lematepogo aser iba benjung amungba ajanga nisung ajak ASI Thejangulie-i anir aliba ajanga tenzüktsüba sülen iba asalentong timi yanglutsü atema Nagaland nung aliba tribal memelenshitsü maneni nenshiaashi, kechiaser iba anogo nung', 'Iba shisadokden senden nung atema senden nung adenertemi hoho tem dang, iba standing Anungji ILP/RIIN mapa kuma inyaka aotsü aliba kanga sashia renlokba mechi kongshir aliba mapang iba osang linük nung cases ajak agi mezüng thinen aser iba sülen atema jembishinü akatsü mechi', 'Iba senden nung Chief menogo. Iba senden nung benoka aliba asadangshi. Iba denO

In [34]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4319
})

In [35]:
dataset[:5]

{'text': ['Kohima nung nisung asem Tang tashi nung tashidak nisung 121 temang anepaludar, tashidak agi nisung 94 süogo aser keta aliba temaitsü terara sayudar Nagaland nung tang tashi nung Nagaland nung aliba nüburtem Iba benjung amungba mapang Iba tebur tajung benjung benjung amungba mapanganema aliba, yimji yimjung agi',
  'asenok ajak semdangtsü imlaa Paisa, nüburtem dang benjung kanga kümdanga amungtsü lematepogo aser iba benjung amungba ajanga nisung ajak ASI Thejangulie-i anir aliba ajanga tenzüktsüba sülen iba asalentong timi yanglutsü atema Nagaland nung aliba tribal memelenshitsü maneni nenshiaashi, kechiaser iba anogo nung',
  'Iba shisadokden senden nung atema senden nung adenertemi hoho tem dang, iba standing Anungji ILP/RIIN mapa kuma inyaka aotsü aliba kanga sashia renlokba mechi kongshir aliba mapang iba osang linük nung cases ajak agi mezüng thinen aser iba sülen atema jembishinü akatsü mechi',
  'Iba senden nung Chief menogo. Iba senden nung benoka aliba asadangshi. Ib

In [36]:
batch_size = 1000
all_texts = [dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size)]

In [37]:
def batch_iterator():
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

In [38]:
for batch in batch_iterator():
    print(batch[:5])  # Print the first 5 texts in each batch

['Kohima nung nisung asem Tang tashi nung tashidak nisung 121 temang anepaludar, tashidak agi nisung 94 süogo aser keta aliba temaitsü terara sayudar Nagaland nung tang tashi nung Nagaland nung aliba nüburtem Iba benjung amungba mapang Iba tebur tajung benjung benjung amungba mapanganema aliba, yimji yimjung agi', 'asenok ajak semdangtsü imlaa Paisa, nüburtem dang benjung kanga kümdanga amungtsü lematepogo aser iba benjung amungba ajanga nisung ajak ASI Thejangulie-i anir aliba ajanga tenzüktsüba sülen iba asalentong timi yanglutsü atema Nagaland nung aliba tribal memelenshitsü maneni nenshiaashi, kechiaser iba anogo nung', 'Iba shisadokden senden nung atema senden nung adenertemi hoho tem dang, iba standing Anungji ILP/RIIN mapa kuma inyaka aotsü aliba kanga sashia renlokba mechi kongshir aliba mapang iba osang linük nung cases ajak agi mezüng thinen aser iba sülen atema jembishinü akatsü mechi', 'Iba senden nung Chief menogo. Iba senden nung benoka aliba asadangshi. Iba denOngpangkon

In [39]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [40]:
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

In [41]:
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

In [42]:
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [43]:
tokenizer.pre_tokenizer.pre_tokenize_str("Sikkim kübok Namchi Central Jail nung puoka alir aser staff sentepa nisung")

[('Sikkim', (0, 6)),
 ('kübok', (7, 12)),
 ('Namchi', (13, 19)),
 ('Central', (20, 27)),
 ('Jail', (28, 32)),
 ('nung', (33, 37)),
 ('puoka', (38, 43)),
 ('alir', (44, 48)),
 ('aser', (49, 53)),
 ('staff', (54, 59)),
 ('sentepa', (60, 67)),
 ('nisung', (68, 74))]

In [44]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

In [45]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

In [46]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [47]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [48]:
encoding = tokenizer.encode("Sikkim kübok Namchi Central Jail nung puoka alir aser staff sentepa nisung")

In [49]:
encoding.tokens

['[CLS]',
 'sikkim',
 'kubok',
 'namchi',
 'central',
 'jail',
 'nung',
 'puoka',
 'alir',
 'aser',
 'staff',
 'sentepa',
 'nisung',
 '[SEP]']

In [50]:
print(encoding.tokens)

['[CLS]', 'sikkim', 'kubok', 'namchi', 'central', 'jail', 'nung', 'puoka', 'alir', 'aser', 'staff', 'sentepa', 'nisung', '[SEP]']


In [51]:
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [52]:
from transformers import BertTokenizerFast
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [53]:
# Example sentences to encode
sentences = [
    "Sikkim kübok Namchi Central Jail nung puoka alir aser staff sentepa nisung",
    "Another example sentence to test the tokenizer.",
    "Let's see how this tokenizer handles different inputs."
]

# Encode the sentences
for sentence in sentences:
    encoding = tokenizer.encode(sentence)
    print(f"Sentence: {sentence}")
    print(f"Tokens: {encoding.tokens}")
    print(f"Type IDs: {encoding.type_ids}")
    print()

Sentence: Sikkim kübok Namchi Central Jail nung puoka alir aser staff sentepa nisung
Tokens: ['[CLS]', 'sikkim', 'kubok', 'namchi', 'central', 'jail', 'nung', 'puoka', 'alir', 'aser', 'staff', 'sentepa', 'nisung', '[SEP]']
Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Sentence: Another example sentence to test the tokenizer.
Tokens: ['[CLS]', 'ano', '##ther', 'exam', '##ple', 'sen', '##ten', '##ce', 'to', 'test', 'the', 'tok', '##eni', '##zer', '.', '[SEP]']
Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Sentence: Let's see how this tokenizer handles different inputs.
Tokens: ['[CLS]', 'let', "'", 's', 'se', '##e', 'how', 'this', 'tok', '##eni', '##zer', 'hand', '##les', 'di', '##ff', '##ere', '##nt', 'in', '##pu', '##ts', '.', '[SEP]']
Type IDs: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



In [54]:
from transformers import BertTokenizerFast
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [55]:
!pip install huggingface_hub



In [56]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [57]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("Blue7Bird/chungli_Ao_tokenizer")

In [58]:
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, Dataset
import pandas as pd
from huggingface_hub import HfApi, HfFolder
from transformers import BertTokenizerFast
# Load the dataset from a file
def load_data(file_path):
    data = {'text': [], 'label': []}
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines[1:]:  # Skip header if there is one
            parts = line.strip().split('\t')  # Assuming tab-separated values
            if len(parts) == 2:
                text, sentiment = parts
                data['text'].append(text)
                data['label'].append(sentiment)
    return data

# Path to your dataset file
file_path = '/content/output_file.txt'
data = load_data(file_path)

# Convert sentiment labels to numerical values
label_map = {"NEGATIVE": 0, "POSITIVE": 1}
data["label"] = [label_map[label] for label in data["label"]]

# Create a DataFrame and then a Hugging Face Dataset
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# Load your tokenizer
tokenizer = BertTokenizerFast.from_pretrained("Blue7Bird/chungli_Ao_tokenizer")

# Add special tokens if they're missing
special_tokens_dict = {
    'unk_token': '[UNK]',
    'sep_token': '[SEP]',
    'pad_token': '[PAD]',
    'cls_token': '[CLS]',
    'mask_token': '[MASK]'
}
tokenizer.add_special_tokens(special_tokens_dict)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # Assuming binary classification
model.resize_token_embeddings(len(tokenizer))

# Training arguments
training_args = TrainingArguments(
    output_dir="./chungli_ao_bert_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

Map:   0%|          | 0/230153 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.3275
1000,0.3173
1500,0.3179
2000,0.3226
2500,0.3203
3000,0.3181
3500,0.313
4000,0.318
4500,0.3125
5000,0.3204


TrainOutput(global_step=14385, training_loss=0.3150605144374769, metrics={'train_runtime': 5037.513, 'train_samples_per_second': 45.688, 'train_steps_per_second': 2.856, 'total_flos': 1.513894968106752e+16, 'train_loss': 0.3150605144374769, 'epoch': 1.0})

In [59]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [70]:
from huggingface_hub import HfApi, HfFolder

# Login using your token
HfFolder.save_token("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")

In [74]:
model.save_pretrained("chungli_ao_bert_model")
tokenizer.save_pretrained("chungli_ao_bert_model")

('chungli_ao_bert_model/tokenizer_config.json',
 'chungli_ao_bert_model/special_tokens_map.json',
 'chungli_ao_bert_model/vocab.txt',
 'chungli_ao_bert_model/added_tokens.json',
 'chungli_ao_bert_model/tokenizer.json')

In [75]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("chungli_ao_bert_model")
model = AutoModelForSequenceClassification.from_pretrained("chungli_ao_bert_model")

In [76]:
model.push_to_hub("Blue7Bird/chungli_ao_bert_model")
tokenizer.push_to_hub("Blue7Bird/chungli_ao_tokenizer")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Blue7Bird/chungli_ao_tokenizer/commit/50306be7063037ebd723bbb6f8e48c659682a6b2', commit_message='Upload tokenizer', commit_description='', oid='50306be7063037ebd723bbb6f8e48c659682a6b2', pr_url=None, pr_revision=None, pr_num=None)

In [77]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Blue7Bird/chungli_ao_bert_model")
model = AutoModelForSequenceClassification.from_pretrained("Blue7Bird/chungli_ao_bert_model")

tokenizer_config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/169k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/504k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]