## Library

In [1]:
import pandas as pd
import copy
import json
import logging
import os
import json
import logging
logger = logging.getLogger(__name__)


# Preprocessing Data

In [2]:
train_dataset = pd.read_csv("/kaggle/input/ner-dataset-location/train_df.csv", encoding='utf-8')
dev_dataset = pd.read_csv("/kaggle/input/ner-dataset-location/val_df.csv", encoding='utf-8')
test_dataset = pd.read_csv("/kaggle/input/ner-dataset-location/test_df.csv", encoding='utf-8')

In [3]:
from datasets import Dataset

"""
    Convert dataframe to list
"""
def prepare_dataset(df):
    df['tokens'] = df['tokens'].apply(lambda x: ' '.join(eval(x)))
    df['labels'] = df['labels'].apply(lambda x: ' '.join(eval(x)))
    return df
    
train_dataset = prepare_dataset(train_dataset)
dev_dataset = prepare_dataset(dev_dataset)
test_dataset = prepare_dataset(test_dataset)

print(train_dataset.sample(5))

"""
    Convert strings of tokens and labels into arrays
"""
def process_string_to_array(dataset):
    return {
        'tokens': [sentence.split() for sentence in dataset['tokens']],
        'labels': [label_seq.split() for label_seq in dataset['labels']]
    }

"""
    Process the dataset for token and label lists
"""
train_dataset = process_string_to_array(train_dataset)
dev_dataset = process_string_to_array(dev_dataset)
test_dataset = process_string_to_array(test_dataset)

"""
    Convert processed datasets into Hugging Face Dataset objects
"""
train_dataset = Dataset.from_dict(train_dataset)
dev_dataset = Dataset.from_dict(dev_dataset)
test_dataset = Dataset.from_dict(test_dataset)

"""
    Print the size of each dataset and a sample for verification
"""
print(f"Train dataset size: {len(train_dataset)}")
print(f"Dev dataset size: {len(dev_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print("Train dataset sample:", train_dataset[0])
print("Dev dataset sample:", dev_dataset[0])
print("Test dataset sample:", test_dataset[0])

"""
    Define an Example class
"""
class Example:
    def __init__(self, words, slot_labels, guid=None):
        self.words = words
        self.slot_labels = slot_labels
        self.guid = guid

"""
    Convert the dataset to Example objects
"""
def convert_to_examples(dataset):
    return [
        Example(words=tokens, slot_labels=labels, guid=i)
        for i, (tokens, labels) in enumerate(zip(dataset['tokens'], dataset['labels']))
    ]

"""
    Convert datasets into Example objects
"""
train_examples = convert_to_examples(train_dataset)
dev_examples = convert_to_examples(dev_dataset)
test_examples = convert_to_examples(test_dataset)

                                                    labels  \
1828149  B-STREET I-STREET I-STREET I-STREET B-WARD I-W...   
231442   B-STREET I-STREET I-STREET I-STREET I-STREET B...   
1407341  B-STREET I-STREET I-STREET B-WARD I-WARD B-DIS...   
727953   B-STREET I-STREET I-STREET B-WARD I-WARD B-DIS...   
644422   B-STREET I-STREET I-STREET I-STREET I-STREET I...   

                                                    tokens  
1828149  140 Chu_Văn_An_Khu phố 1 Phường Ba_Đồn Thị_Xã ...  
231442   Số nhà 90 Đường Ngô_Miễn Phường Phúc_Thắng Thị...  
1407341        Số 68/11 Lê_Lợi Phường 5 TP Tuy_Hoà Phú_Yên  
727953   133/5/34 KP Long_Đức_1 Phường Tam_Phước Thành_...  
644422   Số 160 Tổ 9 Khu_phố 2 Thị_trấn Chơn_Thành Huyệ...  
Train dataset size: 2281260
Dev dataset size: 439958
Test dataset size: 537726
Train dataset sample: {'tokens': ['Số', '9', 'ngõ', '156/14', 'phố', 'Hồng_Mai', ',', 'Phường', 'Bạch_Mai', ',', 'Quận', 'Hai_Bà_Trưng', ',', 'Hà_Nội'], 'labels': ['B-STREET', 'I-STR

# Create Dataset 

In [4]:
def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_segment_id=0,
    mask_padding_with_zero=True,
):
    """
        Get special tokens from the tokenizer
    """
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    """
        List to hold the converted features
    """
    features = []

    for example_index, example in enumerate(examples):
        # Log progress every 5000 examples
        if example_index % 400 == 0:
            logger.info(f"Processing example {example_index} of {len(examples)}")

        # Tokenize each word and align its corresponding label
        tokens = []
        label_ids = []

        for word, label in zip(example.words, example.slot_labels):
            word_tokens = tokenizer.tokenize(word)

            # If the word cannot be tokenized, use [UNK] token
            if not word_tokens:
                word_tokens = [unk_token]

            tokens.extend(word_tokens)

            # Map string label to integer ID, apply pad_label_id for subword tokens
            label_id = label_map[label]
            label_ids.extend([label_id] + [pad_label_id] * (len(word_tokens) - 1))

        # Handle sequence truncation for [CLS] and [SEP] tokens
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:max_seq_len - special_tokens_count]
            label_ids = label_ids[:max_seq_len - special_tokens_count]

        # Add [SEP] token at the end of the sentence
        tokens.append(sep_token)
        label_ids.append(pad_label_id)
        token_type_ids = [sequence_segment_id] * len(tokens)

        # Add [CLS] token at the start of the sentence
        tokens = [cls_token] + tokens
        label_ids = [pad_label_id] + label_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # Convert tokens to input IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Create attention masks (1 for real tokens, 0 for padding tokens)
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)


        # Pad sequences to the maximum sequence length
        padding_length = max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        attention_mask += [0 if mask_padding_with_zero else 1] * padding_length
        token_type_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_label_id] * padding_length

        # Create InputFeatures object and append it to the list of features
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                slot_labels_ids=label_ids,
            )
        )

    return features


In [5]:
# Define the label list
label_list = ['B-DISTRICT', 'B-PROVINCE', 'B-STREET', 'B-WARD', 'I-DISTRICT', 'I-PROVINCE', 'I-STREET', 'I-WARD', 'O']

# Create a mapping from label strings to integers
label_map = {label: i for i, label in enumerate(label_list)}


In [6]:
class InputFeatures(object):
    """
        A single set of features of data.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.slot_labels_ids = slot_labels_ids

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """
            Serializes this instance to a Python dictionary.
        """
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """
            Serializes this instance to a JSON string.
        """
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [7]:
from transformers import RobertaTokenizerFast

# Initialize the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

# Set the maximum sequence length
max_seq_len = 64

# Convert examples to features
train_features = convert_examples_to_features(train_examples, max_seq_len, tokenizer)
dev_features = convert_examples_to_features(dev_examples, max_seq_len, tokenizer)
test_features = convert_examples_to_features(test_examples, max_seq_len, tokenizer)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
tokenizer.cls_token, tokenizer.sep_token, tokenizer.unk_token, tokenizer.pad_token_id

('<s>', '</s>', '<unk>', 1)

In [9]:
import torch
from torch.utils.data import Dataset

# Define a Dataset class to wrap the tokenized features for training
class NERDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        return {
            'input_ids': torch.tensor(feature.input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(feature.attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(feature.token_type_ids, dtype=torch.long),
            'labels': torch.tensor(feature.slot_labels_ids, dtype=torch.long),
        }

# Convert tokenized features into PyTorch datasets
train_dataset = NERDataset(train_features)
dev_dataset = NERDataset(dev_features)
test_dataset = NERDataset(test_features)


In [10]:
train_dataset[0]

{'input_ids': tensor([    0,   208,  1376,  2023,  3602,   361,  6094,  3849,  8906, 25664,
            73,  1570,  7843,  1376,  2023,  3602,   289,  1376,  2023,  9085,
          2590,  1215,   448,  1439,  2156,  4129,  8188,  7487,  1376,  2023,
            46,  2590,   163,  1376,  3070,  5543,   611,  1215,   448,  1439,
          2156,  3232,  1376,  3070, 12410,   282, 22972,  1215,   387,  5269,
          1215, 12667,  8188,  7487,  2590,  2156,   289,  5269,  1215,   487,
          1376,  2023,    27,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Model

In [11]:
from transformers import RobertaForTokenClassification

# Define the number of unique labels (ensure this matches your dataset's label set)
num_labels = len(label_list)  # e.g., the number of unique labels such as O, B-ORG, etc.

# Load the RoBERTa model for token classification
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./ner-results-2',           # output directory to save model checkpoints and results
    evaluation_strategy="epoch",      # evaluation is done at the end of every epoch
    per_device_train_batch_size=32,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    num_train_epochs=2,               # number of epochs to train the model
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=10,                 # log every 10 steps
    save_steps=500,                   # save model checkpoint every 500 steps
    save_total_limit=2,               # limit the number of total checkpoints to save
)



In [13]:
!pip install seqeval

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=694869ee61efa85e703e6e8581b226c6eb7bd22d47dec6b70bdf564d4eb762f7
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [14]:
label_list

['B-DISTRICT',
 'B-PROVINCE',
 'B-STREET',
 'B-WARD',
 'I-DISTRICT',
 'I-PROVINCE',
 'I-STREET',
 'I-WARD',
 'O']

In [15]:
from transformers import EvalPrediction
def compute_metrics(p: EvalPrediction):
    predictions = p.predictions.argmax(axis=2)  # Get predicted label indices
    labels = p.label_ids  # True label IDs

    # Debugging: Print shapes of predictions and labels
    print(f"Shape of predictions: {predictions.shape}")
    print(f"Shape of labels: {labels.shape}")

    # Debugging: Log first few predictions and labels for inspection
    print(f"First few predictions: {predictions[:2]}")
    print(f"First few labels: {labels[:2]}")

    pred_labels = []
    true_labels = []

    # Iterate through predictions and labels
    for i, (pred_seq, true_seq) in enumerate(zip(predictions, labels)):
        pred_label_seq = []
        true_label_seq = []

        # Iterate through each token in the sequence
        for pred_idx, true_idx in zip(pred_seq, true_seq):
            if true_idx == -100:
                # Debugging: Log any padding tokens encountered
                # print(f"Padding token encountered at position {i}")
                continue

            # Check if the indices are within the valid range
            if pred_idx < len(label_list) and true_idx < len(label_list):
                pred_label_seq.append(label_list[pred_idx])
                true_label_seq.append(label_list[true_idx])
            else:
                # Debugging: Log when out-of-bound indices are encountered
                print(f"Index out of range: pred_idx={pred_idx}, true_idx={true_idx} at position {i}")

        pred_labels.append(pred_label_seq)
        true_labels.append(true_label_seq)

    # Debugging: Log final processed predictions and labels
    print(f"Processed pred_labels: {pred_labels[:2]}")
    print(f"Processed true_labels: {true_labels[:2]}")

    # Compute token-level F1, Precision, and Recall
    precision = precision_score(true_labels, pred_labels)
    # Trong 10 lần dự đoán nhãn PER: thì chúng ta đoán đúng 6 lần -> 6/10 = 60%

    recall = recall_score(true_labels, pred_labels)
    # Trong 8 nhãn PER thật: thì chúng ta đoán đúng 6 lần -> 6/8 = 75%

    f1 = f1_score(true_labels, pred_labels)

    # Debugging: Print classification report
    print("Classification Report:")
    print(classification_report(true_labels, pred_labels))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training

In [16]:
from transformers import Trainer
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import EvalPrediction


In [17]:
import wandb

wandb.login(key="fe9e91b7500a838e4177e19426009b50c85f3808")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
# Initialize the Trainer with the modified compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Updated function
)

# Train the model
trainer.train()


  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mdatt2505myethuy5[0m ([33mdatt2505myethuy5-university-of-engineering-and-technolog[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250418_181606-cc1csh05[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./ner-results-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/datt2505myethuy5-university-of-engineering-and-technolog/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/datt2505myethuy5-university-of-engineering-and-technolog/huggingface/runs/cc1csh05[0m


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0271,0.014031,0.994168,0.994173,0.994171
2,0.0055,0.010665,0.995037,0.996057,0.995547


Shape of predictions: (439958, 64)
Shape of labels: (439958, 64)
First few predictions: [[7 2 3 2 6 6 6 6 3 7 7 7 7 7 7 7 7 7 0 0 0 0 4 4 4 4 4 4 4 4 4 4 4 4 1 5
  5 5 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
 [7 3 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8
  0 4 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 8 1 5 5 7 1 1 1]]
First few labels: [[-100    2 -100 -100 -100 -100 -100 -100    3 -100 -100 -100 -100 -100
  -100 -100 -100 -100    0 -100 -100 -100 -100 -100    4 -100 -100 -100
  -100 -100 -100 -100 -100 -100    1 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100 -100]
 [-100    3    7 -100 -100 -100    7    7 -100    7 -100    7 -100 -100
  -100 -100 -100 -100 -100 -100 -100    7 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100    8    0 -100 -100 -100 -100 -100
     4 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
     8  

TrainOutput(global_step=142580, training_loss=0.01724328414659421, metrics={'train_runtime': 31371.3992, 'train_samples_per_second': 145.436, 'train_steps_per_second': 4.545, 'total_flos': 1.4903089087523328e+17, 'train_loss': 0.01724328414659421, 'epoch': 2.0})

# Evaluate

In [19]:
trainer.evaluate(test_dataset)

Shape of predictions: (537726, 64)
Shape of labels: (537726, 64)
First few predictions: [[1 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 3 3 3 7 7 7 7 7 7 7
  7 7 7 7 7 7 7 7 7 7 7 7 7 0 0 0 4 4 4 4 4 4 4 4 4 4 4 1]
 [6 2 6 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 3 7 7 7 7 7 7 7 7 7
  7 7 7 7 7 7 7 7 0 4 4 4 4 4 4 4 4 4 4 4 4 1 5 5 5 5 5 6]]
First few labels: [[-100    2 -100 -100 -100    6 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100    6    6 -100 -100 -100 -100 -100 -100    6    3 -100 -100
  -100 -100 -100 -100    7 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -100 -100    0 -100 -100 -100 -100    4 -100
  -100 -100 -100 -100 -100 -100 -100 -100]
 [-100    2 -100 -100 -100    6 -100 -100    6    6 -100 -100 -100 -100
  -100 -100 -100    6 -100 -100 -100 -100 -100 -100 -100 -100    3 -100
  -100 -100 -100 -100 -100    7 -100 -100 -100 -100 -100 -100 -100 -100
  -100 -100    0 -100    4 -100 -100 -100 -100 -100 -100 -100 -100 -100
  -100  

{'eval_loss': 0.013143617659807205,
 'eval_precision': 0.9939983711801312,
 'eval_recall': 0.9952591695854865,
 'eval_f1': 0.9946283708335835,
 'eval_runtime': 1456.9912,
 'eval_samples_per_second': 369.066,
 'eval_steps_per_second': 23.067,
 'epoch': 2.0}

# Inference

In [20]:
def predict_ner(text, model, tokenizer, label_list, max_seq_len=64):
    # Tokenize input text
    tokens = text.split()  # Giả sử text đã được phân cách bởi khoảng trắng
    encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, padding='max_length', max_length=max_seq_len)
    
    # Chuyển sang device của model (CPU hoặc GPU)
    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)

    # Dự đoán
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # Shape: (batch_size, seq_len, num_labels)
        predictions = torch.argmax(logits, dim=2)  # Lấy nhãn có xác suất cao nhất

    # Chuyển đổi dự đoán thành danh sách nhãn
    pred_labels = [label_list[pred.item()] for pred in predictions[0] if pred.item() < len(label_list)]
    
    # Loại bỏ nhãn của các token đặc biệt ([CLS], [SEP], [PAD])
    word_ids = encoding.word_ids()  # Map token về từ gốc
    filtered_labels = []
    for i, word_id in enumerate(word_ids):
        if word_id is None:  # Bỏ qua [CLS], [SEP]
            continue
        if i > 0 and word_id == word_ids[i-1]:  # Bỏ qua subword
            continue
        filtered_labels.append(pred_labels[i])

    # Ghép token với nhãn dự đoán
    result = list(zip(tokens, filtered_labels[:len(tokens)]))
    return result

In [21]:
# Thử nghiệm suy luận
# test_case = "Số 123 đường Lê Lợi , Quận 1 , TP Hồ Chí Minh"
test_case = "Số 7 đường đê Tả đáy , phường Đồng Mai , Quận Hà Đông , TP Hà Nội"
predictions = predict_ner(test_case, model, tokenizer, label_list)

print("Kết quả suy luận cho test case:")
for token, label in predictions:
    print(f"{token}: {label}")

Kết quả suy luận cho test case:
Số: B-STREET
7: I-STREET
đường: I-STREET
đê: I-STREET
Tả: I-STREET
đáy: I-STREET
,: O
phường: B-WARD
Đồng: I-WARD
Mai: I-WARD
,: O
Quận: B-DISTRICT
Hà: I-DISTRICT
Đông: I-DISTRICT
,: O
TP: B-PROVINCE
Hà: I-PROVINCE
Nội: I-PROVINCE


# Upload Model to HuggingFace

In [22]:
!pip install huggingface_hub

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
!huggingface-cli login --token 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Dat mieu` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Dat mieu`


In [24]:
trainer.push_to_hub(tags="bert-ner-address-3", commit_message="Training complete")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datmieu2k4/ner-results-2/commit/799385d982745b94a227657a71e46cab5d38fcc8', commit_message='Training complete', commit_description='', oid='799385d982745b94a227657a71e46cab5d38fcc8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datmieu2k4/ner-results-2', endpoint='https://huggingface.co', repo_type='model', repo_id='datmieu2k4/ner-results-2'), pr_revision=None, pr_num=None)