In [1]:
import re
import torch
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.nn.functional as F
import nlpaug.augmenter.word as naw
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

  # NOTE(review): a stray fragment (`"class": algorithms.Blowfish,`) was removed here.
  # It was an indented dict entry outside any expression and referenced an
  # undefined `algorithms` name, so it could not execute and is unrelated to
  # the rest of this notebook.


In [None]:
# Load the previously fine-tuned checkpoint from the network share.
# The share is a UNC location, so the literal needs four leading backslashes
# (two at runtime); this matches the path the notebook passes to
# `save_pretrained` and `pd.read_excel`. With only two, the string started
# with a single backslash and did not point at the share.
# NOTE(review): `num_labels` is only assigned in a later cell (from
# `label_encoder.classes_`); on a fresh kernel this cell raises NameError
# unless that cell has been run first.
model = BertForSequenceClassification.from_pretrained(
    '\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_incident',
    num_labels=num_labels
)

In [None]:
# Persist the in-memory model to the shared checkpoint directory (UNC path).
checkpoint_dir = '\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_incident'
model.save_pretrained(checkpoint_dir)


In [None]:
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    For every sample the loss is ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``,
    where ``p_t`` is the softmax probability of the target class.

    Args:
        alpha: Optional weighting. ``None`` disables weighting, a Python
            scalar (or 0-dim tensor) scales every sample equally, and a 1-D
            tensor / list / NumPy array is indexed by the target class id to
            give a per-class weight.
        gamma: Exponent applied to ``(1 - p_t)``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        # Sequences / arrays of per-class weights are normalised to a tensor so
        # they take the per-class indexing path in forward() instead of being
        # mistaken for a scalar.
        if alpha is not None and not isinstance(alpha, (int, float, torch.Tensor)):
            alpha = torch.as_tensor(alpha, dtype=torch.float)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss.

        Args:
            inputs: Logits whose last dimension enumerates the classes.
            targets: Integer class ids with the shape of ``inputs`` minus its
                last dimension.
        """
        log_prob = F.log_softmax(inputs, dim=-1)
        prob = torch.exp(log_prob)

        # Pick the (log-)probability of the target class. Indexing on the last
        # axis (rather than axis 1) keeps this correct for logits with extra
        # leading dimensions as well as the usual (batch, classes) case.
        index = targets.unsqueeze(-1)
        log_prob = log_prob.gather(dim=-1, index=index).squeeze(-1)
        prob = prob.gather(dim=-1, index=index).squeeze(-1)

        focal_weight = (1 - prob) ** self.gamma

        if self.alpha is not None:
            alpha_t = self.alpha
            if isinstance(alpha_t, torch.Tensor):
                # This module is not moved along with the model, so the weight
                # tensor can still be on the CPU while the batch is on the GPU;
                # indexing across devices would raise.
                alpha_t = alpha_t.to(log_prob.device)
                if alpha_t.dim() > 0:
                    alpha_t = alpha_t[targets]
            focal_weight = focal_weight * alpha_t

        loss = -focal_weight * log_prob
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

class FocalTrainer(Trainer):
    """Trainer that scores batches with ``FocalLoss`` on the model's logits."""

    def __init__(self, alpha=None, gamma=2.0, *args, **kwargs):
        # `alpha` and `gamma` configure the loss; everything else is passed
        # straight through to the base Trainer.
        super().__init__(*args, **kwargs)
        self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and apply the focal loss to its logits."""
        labels = inputs.get("labels")

        # Forward every batch entry except the labels.
        model_inputs = {}
        for key, value in inputs.items():
            if key != "labels":
                model_inputs[key] = value
        outputs = model(**model_inputs)

        loss = self.loss_fn(outputs.get("logits"), labels)
        if return_outputs:
            return (loss, outputs)
        return loss

In [None]:
# 1) Load the historical data and the newly arrived data, then stack them.
old_data = pd.read_excel('\\\\vi240c060002.woc.prod\\e$\\datasets\\WCMLDataset12_23.xlsx')
new_data = pd.read_csv('new_source_incident_data.csv')       # newly arrived data
combined_data = pd.concat([old_data, new_data], ignore_index=True)

# 2) Clean the text.
# NOTE(review): `clean_text` is not defined in the cells shown here; it must
# already exist in the kernel before this cell runs.
combined_data['Combined_Text'] = combined_data['Combined_Text'].apply(clean_text)

# 3) Encode the target labels.
# If the original encoder was saved, prefer loading it instead, e.g.:
# import joblib
# label_encoder = joblib.load("source_incident_label_encoder.pkl")
# NOTE(review): re-fitting on the combined data only reproduces the original
# class ids if the set of label values is unchanged — confirm before reusing
# the old checkpoint's classification head.
label_encoder = LabelEncoder()
combined_data['Source of Incident Desc_Encoded'] = label_encoder.fit_transform(
    combined_data['Source of Incident Desc']
)

# 4) Load the tokenizer stored with the old checkpoint.
# The checkpoint lives on a UNC share, so the literal needs four leading
# backslashes (two at runtime), as in the `read_excel` path above.
tokenizer = BertTokenizer.from_pretrained('\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_incident')

# 5) Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    combined_data['Combined_Text'],
    combined_data['Source of Incident Desc_Encoded'],
    test_size=0.2,
    random_state=42,
    stratify=combined_data['Source of Incident Desc_Encoded']
)

# 6) Tokenize both splits (truncated to 512 tokens, padded per split).
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
val_encodings   = tokenizer(list(X_val), truncation=True, padding=True, max_length=512)

# 7) Create dataset class as before
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is a sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of this sample to a tensor, then attach
        # the label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

train_dataset = Dataset(train_encodings, list(y_train))
val_dataset   = Dataset(val_encodings, list(y_val))

# 8) Load the existing checkpoint with a head sized to the encoded classes.
# The checkpoint lives on a UNC share, so the literal needs four leading
# backslashes (two at runtime).
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    '\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_incident',
    num_labels=num_labels
)

# 9) FocalLoss and FocalTrainer are defined in an earlier cell of this
# notebook; that cell must have been run before this one.

# 10) Training arguments for the incremental run
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./incremental_results',
    learning_rate=3e-5,
    num_train_epochs=3,        # fewer epochs if it’s just incremental
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./incremental_logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# 11) Inverse-frequency class weights from the training labels.
# (numpy / torch are already imported in the notebook's first cell, so the
# duplicate mid-cell imports were dropped.)
# `minlength` guarantees one weight per class even if the highest class ids
# are missing from y_train; the `np.maximum(..., 1)` guard avoids dividing by
# zero for a class with no training samples (such a weight is never selected).
train_class_counts = np.bincount(y_train, minlength=num_labels)
total_samples = len(y_train)
inc_class_weights = torch.tensor(
    total_samples / (num_labels * np.maximum(train_class_counts, 1).astype(float)),
    dtype=torch.float
)

# 12) Instantiate the FocalTrainer
# NOTE(review): `compute_metrics` is not defined in the cells shown here; it
# must already exist in the kernel.
trainer = FocalTrainer(
    alpha=inc_class_weights,  # per-class weights indexed by label id
    gamma=2.0,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 13) Train incrementally
trainer.train()

# 14) Evaluate
eval_results = trainer.evaluate(val_dataset)
print("Incremental Eval Results:", eval_results)

# 15) Save the model, plus the tokenizer next to it. The Trainer was not given
# the tokenizer, so save_model() alone would leave the directory without
# tokenizer files and it could not be reloaded the way the old checkpoint is.
trainer.save_model('./fine_tuned_source_incident_incremental')
tokenizer.save_pretrained('./fine_tuned_source_incident_incremental')

# 16) (Optional) Save updated label encoder if needed
# import joblib
# joblib.dump(label_encoder, "source_incident_label_encoder_updated.pkl")
