# Fine-tuning Models

### Changing to the main directory

In [None]:
# Move the working directory up one level; the relative paths used later in this
# notebook (e.g. "logs/...") are resolved from there.
# NOTE(review): not idempotent — re-running this cell climbs one more level each
# time, so restart the kernel before re-running the notebook.
%cd ..

### Importing Necessary Libraries

In [2]:
import os 
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import evaluate
import json

import pandas as pd 

from utilities import EVAL_STRATEGY, LEARNING_RATE, PER_DEVICE_TRAIN_BATCH_SIZE, PER_DEVICE_EVAL_BATCH_SIZE, NUM_TRAIN_EPOCHS, WEIGHT_DECAY
from utilities import MODEL_ID, MODEL_PATH, OUTPUT_DIR, OUTPUT_MODEL, OUTPUT_DATASET_PATH

# Expose only the GPU with index 1 to CUDA for this process.
# NOTE(review): the index is hard-coded — presumably the training host has at
# least two GPUs; confirm before running this notebook on another machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

### Loading the Tokenized Dataset

- Loading the pre-tokenized dataset for training and validation.
- This ensures the dataset is pre-processed and ready for input into the model.


In [3]:
# Load the saved dataset from OUTPUT_DATASET_PATH.
tokenized_electrical_ner_dataset = load_from_disk(dataset_path=OUTPUT_DATASET_PATH)

# Print the split / feature summary so the reader can check what was loaded.
print(tokenized_electrical_ner_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12076
    })
    validation: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1509
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1510
    })
})


Extracting the list of labels and their count from the dataset's features.

In [4]:
# The "ner_tags" feature of the train split carries the ordered tag names;
# a tag's position in this list is its integer id.
ner_tags_feature = tokenized_electrical_ner_dataset["train"].features["ner_tags"]
label_list = ner_tags_feature.feature.names
num_labels = len(label_list)

print(
    f"Labels: {label_list}",
    f"Number of labels: {num_labels}",
    sep="\n",
)

Labels: ['O', 'B-COMPONENT', 'I-COMPONENT', 'B-DESIGN_PARAM', 'I-DESIGN_PARAM', 'B-MATERIAL', 'I-MATERIAL', 'B-EQUIPMENT', 'I-EQUIPMENT', 'B-TECHNOLOGY', 'I-TECHNOLOGY', 'B-SOFTWARE', 'I-SOFTWARE', 'B-STANDARD', 'I-STANDARD', 'B-VENDOR', 'I-VENDOR', 'B-PRODUCT', 'I-PRODUCT']
Number of labels: 19


In [5]:
# Display the (rows, columns) shape of every split as a quick sanity check.
tokenized_electrical_ner_dataset.shape

{'train': (12076, 7), 'validation': (1509, 7), 'test': (1510, 7)}

### Model Training

Initializing the pre-trained model and tokenizer with the specified number of labels.

In [6]:
# Create the token-classification model with the label maps attached from the
# start, so model.config (and anything saved from this model object) reports
# the real tag names instead of generic placeholder labels.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_ID,
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id={label: idx for idx, label in enumerate(label_list)},
)

# Tokenizer that matches the same pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training arguments for the Trainer API:

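- eval_strategy: When evaluation runs during training (e.g. once per epoch).
- learning_rate: Initial learning rate for the optimizer.
- per_device_train_batch_size / per_device_eval_batch_size: Number of examples per device in each training / evaluation batch.
- num_train_epochs: Number of full passes over the training set.
- weight_decay: Weight-decay regularization strength.
- output_dir: Directory where the Trainer writes its checkpoints; the final model is saved separately further below.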

In [7]:
args = TrainingArguments(
    # Where the Trainer writes its outputs.
    output_dir=MODEL_PATH,
    # Length of the run and when evaluation happens.
    num_train_epochs=NUM_TRAIN_EPOCHS,
    eval_strategy=EVAL_STRATEGY,
    # Batch sizes, per device.
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    # Optimizer settings.
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

Using a data collator to dynamically pad each batch's inputs and labels to the same length, so tokens and labels stay aligned during training.

In [8]:
# Batch-level collator for token classification, built around the tokenizer loaded above.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

- The following function computes evaluation metrics such as precision, recall, F1 score, and accuracy.
- It filters out ignored tokens (label=-100) during computation.

In [10]:
def compute_metrics(eval_preds):
    """
    Compute evaluation metrics for a Named Entity Recognition (NER) run.

    Positions whose label is -100 are dropped from both the predictions and the
    references before scoring. The remaining ids are translated to tag names
    through the notebook-level ``label_list`` and scored with the ``seqeval``
    metric.

    Parameters:
    eval_preds (tuple): ``(logits, labels)``. ``logits`` is indexed as
        [example][token][class] (the arg-max is taken over axis 2) and
        ``labels`` as [example][token].

    Returns:
    dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, taken from the
        ``overall_*`` entries of the seqeval result.
    """
    logits, labels = eval_preds

    # Softmax is monotonic, so the arg-max of the raw logits already picks the
    # most probable class; no need to convert to probabilities first.
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions that carry a real label (-100 marks ignored ones).
        kept_pairs = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[label_id] for _, label_id in kept_pairs])

    # Load the metric once and reuse it: this function runs on every evaluation
    # pass, and reloading the metric each time is pure overhead.
    metric = getattr(compute_metrics, "_seqeval_metric", None)
    if metric is None:
        metric = evaluate.load("seqeval")
        compute_metrics._seqeval_metric = metric

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Initializing the Trainer with the model, training arguments, datasets, and evaluation metrics.

In [None]:
# Wire the model, hyper-parameters, data splits, collator and metric function
# into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_electrical_ner_dataset["train"],
    eval_dataset=tokenized_electrical_ner_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Starting model training. This step optimizes the model's weights to fit the dataset.

In [12]:
# Run the training loop configured above; the returned TrainOutput is shown as
# the cell result.
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdisham[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.174512,0.875821,0.916937,0.895907,0.953219
2,No log,0.132473,0.896673,0.928634,0.912373,0.960535
3,0.349900,0.118794,0.90959,0.929503,0.919439,0.963977
4,0.349900,0.113333,0.913159,0.924129,0.918611,0.963952
5,0.349900,0.11147,0.913242,0.930925,0.921999,0.964742


TrainOutput(global_step=945, training_loss=0.24046884365182705, metrics={'train_runtime': 41.2198, 'train_samples_per_second': 1464.829, 'train_steps_per_second': 22.926, 'total_flos': 591032701542888.0, 'train_loss': 0.24046884365182705, 'epoch': 5.0})

### Saving the training results

Extracting and saving the evaluation metrics logged during training (precision, recall, F1 score, accuracy, and evaluation runtime/throughput) for later analysis.

In [14]:
# Columns of the Trainer log history that are kept in the exported table.
eval_columns = [
    'epoch',
    'eval_precision',
    'eval_recall',
    'eval_f1',
    'eval_accuracy',
    'eval_runtime',
    'eval_samples_per_second',
    'eval_steps_per_second',
]

# Log-history rows that lack any of these values come out as NaN and are
# dropped. The chain builds a fresh frame on every run (no in-place mutation),
# so the cell can be re-executed safely.
results = (
    pd.DataFrame(trainer.state.log_history)[eval_columns]
    .dropna()
    .reset_index(drop=True)
)

# Saving evaluation results in a CSV format for easy visualization and comparison.
# The target directory is created first so the export does not fail on a fresh checkout.
os.makedirs("logs", exist_ok=True)
# Single quotes inside the f-string: reusing the outer quote character is a
# SyntaxError on Python versions before 3.12.
results.to_csv(f"logs/{OUTPUT_MODEL.split('/')[-1]}-results.csv", index=False)

### Saving the Model

Saving the trained model and tokenizer for future inference and deployment.

In [13]:
# Write the fine-tuned model into OUTPUT_MODEL.
model.save_pretrained(OUTPUT_MODEL)
# Write the tokenizer files next to it; the returned tuple of written paths is
# shown as the cell result.
tokenizer.save_pretrained(OUTPUT_MODEL)

('models/electrical-ner-distilbert-base-uncased/tokenizer_config.json',
 'models/electrical-ner-distilbert-base-uncased/special_tokens_map.json',
 'models/electrical-ner-distilbert-base-uncased/vocab.txt',
 'models/electrical-ner-distilbert-base-uncased/added_tokens.json',
 'models/electrical-ner-distilbert-base-uncased/tokenizer.json')

Creating mappings between label indices and label names for model configuration.

In [15]:
# id -> tag name. Keys are strings because they end up as JSON object keys in
# config.json.
id2label = {str(idx): label for idx, label in enumerate(label_list)}

# tag name -> id. The ids stay integers: JSON preserves value types, so string
# values here would come back as string ids when the config is read again
# (NOTE(review): transformers appears to cast only id2label keys to int on load —
# confirm against the installed version).
label2id = {label: idx for idx, label in enumerate(label_list)}

In [16]:
# Read config.json back from the saved model directory. The context manager
# closes the file handle as soon as parsing is done instead of leaving it to
# the garbage collector.
with open(f"{OUTPUT_MODEL}/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

Adding the label mappings to the model configuration for seamless inference.

In [17]:
# Attach both label maps to the loaded configuration in one step.
config.update({"id2label": id2label, "label2id": label2id})

In [18]:
# Write the patched configuration back. The context manager guarantees the file
# is flushed and closed before anything else reads config.json.
with open(f"{OUTPUT_MODEL}/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

Next Steps:

- Use the trained model to evaluate its performance on unseen test data.
- Deploy the model for inference and integrate it into an application.