## Model Building - Fine Tuning

### Changing to the main directory

In [1]:
# Move the kernel's working directory up one level; later cells use paths relative to it
# (e.g. "logs/..."), and presumably the local `utilities` import relies on it too.
# NOTE(review): not idempotent - re-running this cell climbs one more directory each time.
%cd ..

/home/isham/Desktop/machine-learning-projects/misc_ai_projects/elec-device-feedback-classification


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import os 
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import evaluate
import json

import pandas as pd 

from utilities import EVAL_STRATEGY, LEARNING_RATE, PER_DEVICE_TRAIN_BATCH_SIZE, PER_DEVICE_EVAL_BATCH_SIZE, NUM_TRAIN_EPOCHS, WEIGHT_DECAY
from utilities import MODEL_ID, MODEL_PATH, OUTPUT_DIR, OUTPUT_MODEL, OUTPUT_DATASET_PATH

# Restrict this process to the GPU with index 1 via the CUDA_VISIBLE_DEVICES variable.
# NOTE(review): the device index is hard-coded for the author's machine, and the setting
# presumably has to be in place before CUDA is first initialised - confirm on other hosts.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

### Loading the Tokenized Dataset

In [3]:
# Read the dataset previously saved to OUTPUT_DATASET_PATH and print its
# split / feature summary as a quick sanity check.
tokenized_electrical_classification_dataset = load_from_disk(
    dataset_path=OUTPUT_DATASET_PATH
)
print(tokenized_electrical_classification_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11552
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1352
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1352
    })
})


In [4]:
# Display the class names stored on the `label` feature of the train split.
tokenized_electrical_classification_dataset["train"].features["label"].names

['negative', 'positive', 'mixed', 'neutral']

In [5]:
# Take the class names from the train split's `label` feature; their count sizes
# the classification head created in the next cell.
label_feature = tokenized_electrical_classification_dataset["train"].features["label"]
label_list = label_feature.names
num_labels = len(label_list)

print(f"Labels: {label_list}")
print(f"Number of labels: {num_labels}")

Labels: ['negative', 'positive', 'mixed', 'neutral']
Number of labels: 4


In [6]:
# Tokenizer and sequence-classification model are both loaded from MODEL_ID;
# the model is created with one output per entry in `label_list`.
# No explicit `.to(device)` here - the model object is handed to `Trainer` as-is.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Hyper-parameters all come from `utilities` so they can be tuned in one place.
# `eval_strategy` is the current name of the argument formerly called
# `evaluation_strategy` (deprecated in recent transformers releases).
training_args = TrainingArguments(
    output_dir=MODEL_PATH,  # checkpoints written during training go here
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    eval_strategy=EVAL_STRATEGY,
    disable_tqdm=False,  # keep the progress bar / metrics table visible in the notebook
)
     



In [8]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the class axis is taken to be the last one).

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where F1 uses ``average='weighted'``.
    """
    y_true = pred.label_ids
    # The arg-max over the last axis is the predicted class id.
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }


In [9]:
# Wire model, arguments, data splits and metrics into a Trainer.
# The validation split is used for the per-epoch evaluation; the test split is
# left untouched by training.
# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer.__init__.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_electrical_classification_dataset['train'],
    eval_dataset=tokenized_electrical_classification_dataset['validation'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
     

  trainer = Trainer(model=model, args=training_args,


[2025-01-05 19:24:43,678] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/isham/anaconda3/envs/image_classification/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsi

In [10]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdisham[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.434108,0.856509,0.835328
2,No log,0.364496,0.869822,0.862051
3,0.343200,0.351232,0.868343,0.867117
4,0.343200,0.339185,0.877959,0.876423
5,0.343200,0.334324,0.879438,0.878077


TrainOutput(global_step=905, training_loss=0.2527716199337448, metrics={'train_runtime': 109.4871, 'train_samples_per_second': 527.551, 'train_steps_per_second': 8.266, 'total_flos': 2988902284032000.0, 'train_loss': 0.2527716199337448, 'epoch': 5.0})

### Saving the training results

In [11]:
# Columns that describe an evaluation pass in the trainer's log history.
eval_columns = [
    'epoch', 'eval_f1', 'eval_accuracy', 'eval_runtime',
    'eval_samples_per_second', 'eval_steps_per_second',
]

# Keep only log entries that carry every evaluation metric; entries without them
# (e.g. training-loss records) show up as NaN rows and are dropped.
# Built as a single chain without in-place mutation so the cell can be re-run safely.
results = (
    pd.DataFrame(trainer.state.log_history)[eval_columns]
    .dropna()
    .reset_index(drop=True)
)

# Saving evaluation results in a CSV format for easy visualization and comparison.
# The directory is created on demand, and the file name is computed outside the
# f-string so the cell also parses on Python versions older than 3.12 (which reject
# reusing the same quote character inside an f-string).
os.makedirs("logs", exist_ok=True)
model_name = os.path.basename(os.path.normpath(OUTPUT_MODEL))
results.to_csv(f"logs/{model_name}-results.csv", index=False)

### Saving the Model

In [12]:
# Write the fine-tuned model and then the tokenizer files into the same
# OUTPUT_MODEL directory; the tokenizer call's return value (the written file
# paths) is shown as the cell output.
model.save_pretrained(save_directory=OUTPUT_MODEL)
tokenizer.save_pretrained(save_directory=OUTPUT_MODEL)

('models/electrical-classification-distilbert-base-uncased/tokenizer_config.json',
 'models/electrical-classification-distilbert-base-uncased/special_tokens_map.json',
 'models/electrical-classification-distilbert-base-uncased/vocab.txt',
 'models/electrical-classification-distilbert-base-uncased/added_tokens.json',
 'models/electrical-classification-distilbert-base-uncased/tokenizer.json')

In [13]:
# Show the class names again; their order defines the class ids used in the mappings below.
label_list

['negative', 'positive', 'mixed', 'neutral']

In [14]:
# Class-id <-> class-name mappings for the model config.
# Ids are kept as integers: Hugging Face configs expect `id2label` as {int: str}
# and `label2id` as {str: int}. JSON serialisation turns the `id2label` keys into
# strings automatically, whereas string *values* in `label2id` would be written
# verbatim (e.g. "0" instead of 0) and are not cast back when the config is loaded.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

In [15]:
# Patch the saved config.json so it carries the human-readable label mappings.
# Context managers guarantee both handles are closed (and the write is flushed)
# even if parsing or serialisation raises.
config_path = f"{OUTPUT_MODEL}/config.json"

with open(config_path, "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

# indent=2 keeps the file human-readable instead of a single long line.
with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, indent=2)