In [1]:
# https://wandb.ai/mostafaibrahim17/ml-articles/reports/Named-Entity-Recognition-With-HuggingFace-Using-PyTorch-and-W-B--Vmlldzo0NDgzODA2

In [3]:
# Step 1: Import Dependencies (the packages must already be installed in the kernel's environment)
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Step 2: Check CUDA Availability and Device Information
# Report whether a CUDA device is usable. The device index/name queries are
# only made when CUDA is present: torch.cuda.current_device() raises on a
# CPU-only machine and would otherwise abort the notebook at this cell.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print("Current device index:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("No CUDA device detected.")

CUDA available: True
Current device index: 0
Device name: NVIDIA GeForce GTX 1650 with Max-Q Design


In [5]:
# Step 3: Read and Prepare Data
def read_conll_file(file_path, skip_docstart=False, encoding="utf-8"):
    """Parse a CoNLL-style column file into sentences of token rows.

    The file is expected to hold one token per line, with whitespace-separated
    columns, and blank lines between sentences.

    Args:
        file_path: Path of the file to read.
        skip_docstart: When True, drop single-row sentences whose first column
            is ``-DOCSTART-`` (document boundary markers). Defaults to False,
            which keeps them as ordinary one-token sentences.
        encoding: Text encoding used to open the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A list of sentences. Each sentence is a list of rows, and each row is
        the list of column strings from one non-blank line. An empty file
        gives an empty list.
    """
    sentences = []
    current = []
    with open(file_path, "r", encoding=encoding) as f:
        for line in f:
            fields = line.split()
            if fields:
                current.append(fields)
            elif current:
                # A blank (or whitespace-only) line closes the open sentence.
                # Consecutive blank lines are ignored, so no empty rows or
                # empty sentences are ever emitted.
                sentences.append(current)
                current = []
    # The last sentence is not necessarily followed by a blank line.
    if current:
        sentences.append(current)

    if skip_docstart:
        sentences = [
            rows for rows in sentences
            if not (len(rows) == 1 and rows[0][0] == "-DOCSTART-")
        ]
    return sentences


# Parse the three CoNLL-2003 files (train / testa / testb) in a single
# assignment; the calls run left to right, in the same order as before.
train_data, validation_data, test_data = (
    read_conll_file("./data/CoNLL2003/eng.train"),
    read_conll_file("./data/CoNLL2003/eng.testa"),
    read_conll_file("./data/CoNLL2003/eng.testb"),
)


def convert_to_dataset(data, label_map):
    """Build a ``Dataset`` with ``tokens`` and ``ner_tags`` columns.

    Args:
        data: Iterable of sentences, each a sequence of rows. Column 0 of a
            row is read as the token text and column 3 as its tag string.
        label_map: Mapping from tag string to the value stored in
            ``ner_tags``.

    Returns:
        The result of ``Dataset.from_dict`` with one entry per sentence.
    """
    token_column = []
    tag_column = []
    for rows in data:
        token_column.append([row[0] for row in rows])
        tag_column.append([label_map[row[3]] for row in rows])
    return Dataset.from_dict({"tokens": token_column, "ner_tags": tag_column})


# Label vocabulary: every distinct value found in column 3 of the training
# rows, sorted so that the label -> id assignment is deterministic.
label_list = sorted({row[3] for sentence in train_data for row in sentence})
label_map = dict(zip(label_list, range(len(label_list))))

# Encode each split with the shared label map.
train_dataset = convert_to_dataset(train_data, label_map)
validation_dataset = convert_to_dataset(validation_data, label_map)
test_dataset = convert_to_dataset(test_data, label_map)

# Bundle the three splits under the names used by the rest of the notebook.
datasets = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)


113
109
124


In [5]:
# Step 4: Initialize Tokenizer and Model
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach the real tag names to the model config. Without these mappings the
# config (and any checkpoint saved from it) only carries generic LABEL_n
# names, so predictions cannot be decoded without this notebook's label_list.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Step 5: Define Metrics and Tokenization Function

def compute_metrics(eval_prediction):
    """Compute seqeval precision/recall/F1 plus a text report.

    Args:
        eval_prediction: A pair ``(predictions, labels)``. ``predictions`` is
            arg-maxed over axis 2 (presumably per-token class scores shaped
            (batch, sequence, num_labels) -- confirm against the Trainer
            version in use); ``labels`` holds label ids, with -100 marking
            positions to ignore.

    Returns:
        Dict with keys ``precision``, ``recall``, ``f1`` and
        ``classification_report``.
    """
    scores, gold_ids = eval_prediction
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore index.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to the sub-tokens.

    Args:
        examples: Batch mapping with ``tokens`` (lists of words) and
            ``ner_tags`` (lists of label ids, one per word).

    Returns:
        The tokenizer output with an added ``labels`` entry. For each
        sequence, the first sub-token of every word carries that word's label;
        positions with no word id and later sub-tokens of a word get -100.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    aligned_labels = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        label_ids = []
        last_word_index = None
        for word_index in encoded.word_ids(batch_index=batch_index):
            # Only the first piece of a real word is labelled.
            starts_word = word_index is not None and word_index != last_word_index
            label_ids.append(word_labels[word_index] if starts_word else -100)
            last_word_index = word_index
        aligned_labels.append(label_ids)
    encoded["labels"] = aligned_labels
    return encoded


In [7]:
# Step 6: Tokenize Datasets and Set Training Arguments
# Apply tokenization and label alignment to every split, one batch at a time.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging / evaluation / checkpoint cadence (in optimisation steps)
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    # Model selection: reload the checkpoint with the best "f1" when done
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map: 100%|██████████| 14987/14987 [00:01<00:00, 12547.85 examples/s]
Map: 100%|██████████| 3466/3466 [00:00<00:00, 13115.03 examples/s]
Map: 100%|██████████| 3684/3684 [00:00<00:00, 13419.42 examples/s]


In [8]:
# Step 7: Define Data Collator and Initialize Trainer
def data_collator(data):
    """Collate examples into one right-padded batch of tensors.

    Args:
        data: Sequence of examples, each providing ``input_ids``,
            ``attention_mask`` and ``labels``.

    Returns:
        Dict with ``input_ids`` (padded with the tokenizer's pad token id),
        ``attention_mask`` (padded with 0) and ``labels`` (padded with -100),
        each padded to the longest example in the batch, batch dimension
        first.
    """
    def pad_field(field, fill_value):
        # Tensorise one field of every example, then pad to a common length.
        tensors = [torch.tensor(example[field]) for example in data]
        return torch.nn.utils.rnn.pad_sequence(
            tensors, batch_first=True, padding_value=fill_value
        )

    return {
        "input_ids": pad_field("input_ids", tokenizer.pad_token_id),
        "attention_mask": pad_field("attention_mask", 0),
        "labels": pad_field("labels", -100),
    }


# Assemble the Trainer from the pieces defined in the previous steps.
trainer = Trainer(
    # What to train and how
    model=model,
    args=training_args,
    # Data: train on the training split, evaluate on the validation split
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Batching and scoring hooks
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Step 8: Train the Model

# Run fine-tuning with the `trainer` built in Step 7. As the last expression
# in the cell, the value returned by train() is shown as the cell output.
trainer.train()

  5%|▌         | 100/1874 [00:44<12:58,  2.28it/s]

{'loss': 0.3564, 'learning_rate': 4.7331910352187837e-05, 'epoch': 0.05}


 11%|█         | 200/1874 [01:27<11:10,  2.50it/s]

{'loss': 0.1162, 'learning_rate': 4.466382070437567e-05, 'epoch': 0.11}


 16%|█▌        | 300/1874 [02:11<11:50,  2.21it/s]

{'loss': 0.084, 'learning_rate': 4.1995731056563505e-05, 'epoch': 0.16}


 21%|██▏       | 400/1874 [02:58<12:08,  2.02it/s]

{'loss': 0.0875, 'learning_rate': 3.932764140875134e-05, 'epoch': 0.21}


 27%|██▋       | 500/1874 [03:41<10:49,  2.11it/s]

{'loss': 0.0685, 'learning_rate': 3.665955176093917e-05, 'epoch': 0.27}


                                                  
 27%|██▋       | 500/1874 [04:37<10:49,  2.11it/s]

{'eval_loss': 0.0727742537856102, 'eval_precision': 0.907103825136612, 'eval_recall': 0.9219118142039717, 'eval_f1': 0.9144478758033553, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.92      0.96      0.94      1837\n        MISC       0.82      0.83      0.82       922\n         ORG       0.88      0.86      0.87      1341\n         PER       0.96      0.97      0.96      1842\n\n   micro avg       0.91      0.92      0.91      5942\n   macro avg       0.89      0.91      0.90      5942\nweighted avg       0.91      0.92      0.91      5942\n', 'eval_runtime': 55.8708, 'eval_samples_per_second': 62.036, 'eval_steps_per_second': 7.768, 'epoch': 0.27}


 32%|███▏      | 600/1874 [05:24<08:29,  2.50it/s]  

{'loss': 0.0631, 'learning_rate': 3.3991462113127e-05, 'epoch': 0.32}


 37%|███▋      | 700/1874 [06:08<08:45,  2.23it/s]

{'loss': 0.0749, 'learning_rate': 3.1323372465314835e-05, 'epoch': 0.37}


 43%|████▎     | 800/1874 [06:54<09:15,  1.93it/s]

{'loss': 0.0609, 'learning_rate': 2.8655282817502672e-05, 'epoch': 0.43}


 48%|████▊     | 900/1874 [07:40<06:40,  2.43it/s]

{'loss': 0.0595, 'learning_rate': 2.5987193169690503e-05, 'epoch': 0.48}


 53%|█████▎    | 1000/1874 [08:26<06:05,  2.39it/s]

{'loss': 0.0592, 'learning_rate': 2.3319103521878334e-05, 'epoch': 0.53}


                                                   
 53%|█████▎    | 1000/1874 [09:22<06:05,  2.39it/s]

{'eval_loss': 0.06164441630244255, 'eval_precision': 0.9133101276313609, 'eval_recall': 0.9272972063278357, 'eval_f1': 0.920250521920668, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.94      0.96      0.95      1837\n        MISC       0.78      0.87      0.82       922\n         ORG       0.91      0.86      0.89      1341\n         PER       0.97      0.97      0.97      1842\n\n   micro avg       0.91      0.93      0.92      5942\n   macro avg       0.90      0.92      0.91      5942\nweighted avg       0.92      0.93      0.92      5942\n', 'eval_runtime': 55.8193, 'eval_samples_per_second': 62.093, 'eval_steps_per_second': 7.775, 'epoch': 0.53}


 59%|█████▊    | 1100/1874 [10:06<05:57,  2.17it/s]  

{'loss': 0.0533, 'learning_rate': 2.0651013874066168e-05, 'epoch': 0.59}


 64%|██████▍   | 1200/1874 [10:51<05:11,  2.16it/s]

{'loss': 0.0552, 'learning_rate': 1.7982924226254002e-05, 'epoch': 0.64}


 69%|██████▉   | 1300/1874 [11:36<04:41,  2.04it/s]

{'loss': 0.0433, 'learning_rate': 1.5314834578441836e-05, 'epoch': 0.69}


 75%|███████▍  | 1400/1874 [12:20<03:16,  2.41it/s]

{'loss': 0.0313, 'learning_rate': 1.264674493062967e-05, 'epoch': 0.75}


 80%|████████  | 1500/1874 [13:05<03:02,  2.05it/s]

{'loss': 0.0409, 'learning_rate': 9.978655282817503e-06, 'epoch': 0.8}


                                                   
 80%|████████  | 1500/1874 [14:01<03:02,  2.05it/s]

{'eval_loss': 0.048809733241796494, 'eval_precision': 0.934560669456067, 'eval_recall': 0.9397509256142713, 'eval_f1': 0.9371486112276579, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.96      0.96      0.96      1837\n        MISC       0.85      0.89      0.87       922\n         ORG       0.93      0.90      0.91      1341\n         PER       0.96      0.98      0.97      1842\n\n   micro avg       0.93      0.94      0.94      5942\n   macro avg       0.92      0.93      0.93      5942\nweighted avg       0.94      0.94      0.94      5942\n', 'eval_runtime': 56.0146, 'eval_samples_per_second': 61.877, 'eval_steps_per_second': 7.748, 'epoch': 0.8}


 85%|████████▌ | 1600/1874 [14:48<01:41,  2.70it/s]  

{'loss': 0.0444, 'learning_rate': 7.310565635005337e-06, 'epoch': 0.85}


 91%|█████████ | 1700/1874 [15:32<01:22,  2.10it/s]

{'loss': 0.0408, 'learning_rate': 4.6424759871931695e-06, 'epoch': 0.91}


 96%|█████████▌| 1800/1874 [16:17<00:31,  2.31it/s]

{'loss': 0.0378, 'learning_rate': 1.9743863393810032e-06, 'epoch': 0.96}


100%|██████████| 1874/1874 [16:51<00:00,  1.85it/s]

{'train_runtime': 1011.6662, 'train_samples_per_second': 14.814, 'train_steps_per_second': 1.852, 'train_loss': 0.07503059579570368, 'epoch': 1.0}





TrainOutput(global_step=1874, training_loss=0.07503059579570368, metrics={'train_runtime': 1011.6662, 'train_samples_per_second': 14.814, 'train_steps_per_second': 1.852, 'train_loss': 0.07503059579570368, 'epoch': 1.0})