In [1]:
import re
import torch
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.nn.functional as F
import nlpaug.augmenter.word as naw
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

  "class": algorithms.Blowfish,


In [2]:
# Report whether PyTorch can see a CUDA runtime and how many devices it exposes.
cuda_ok = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print(f"Is CUDA available?  {cuda_ok}")
print(f"Number of available GPUs: {gpu_count}")

Is CUDA available?  False
Number of available GPUs: 0


In [3]:
# Print the active CUDA device (index and name), or a hint when no GPU is visible.
if not torch.cuda.is_available():
    print("CUDA is not available. Check your driver/environment setup.")
else:
    print(f"Current GPU: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA is not available. Check your driver/environment setup.


In [4]:
# Pick the compute device once; later cells move the model onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Fetch the WordNet corpora used by WordNetLemmatizer (and by the
# wordnet-backed synonym augmenter later in the notebook).
nltk.download('wordnet')
# Open Multilingual WordNet data; presumably required alongside 'wordnet'
# by recent NLTK releases -- TODO confirm for the pinned NLTK version.
nltk.download('omw-1.4')

In [None]:
# ---------------------------------------------
# Text Cleaning and Lemmatization
# ---------------------------------------------
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a free-text field and lemmatize its whitespace-separated tokens.

    Steps, in order: lowercase; drop every character that is not a-z, 0-9,
    one of ``. , ! ? ' -`` or whitespace; collapse whitespace runs to a single
    space and trim; collapse runs of ``!`` / ``?`` to a single character;
    lemmatize each token with the module-level ``lemmatizer``.

    Args:
        text: The raw value of a text cell. Strings are processed as-is.
            ``None``, ``pd.NA`` and float NaN are treated as empty text; any
            other non-string value (e.g. a numeric Excel cell) is converted
            with ``str()`` first instead of raising ``AttributeError``.

    Returns:
        str: The cleaned, lemmatized text (``""`` for missing input).
    """
    # Missing values: Excel columns read by pandas can contain None/NaN/pd.NA.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ""
    # Mixed-type columns may hold numbers or timestamps; coerce rather than crash.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()
    # Remove non-informative chars (everything outside the allowed set)
    text = re.sub(r"[^a-z0-9.,!?'\s-]", '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()
    # Normalize excessive punctuation
    text = re.sub(r"!+", "!", text)
    text = re.sub(r"\?+", "?", text)
    # Lemmatize tokens. Tokens keep any attached punctuation because the split
    # is on whitespace only, so e.g. "dogs," is passed to the lemmatizer as-is.
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [None]:
# ---------------------------------------------
# Data Loading and Preprocessing
# ---------------------------------------------
# Read the raw workbook and work on a copy so `data` stays untouched.
data = pd.read_excel('\\\\vi240c060002.woc.prod\\e$\\datasets\\Fields\\Ad Hoc Use\\Food_and_Beverage.xlsx')
example_data = data.copy()

# Free-text columns that feed the classifier, in concatenation order.
# Currently disabled candidates: 'Activity Engaged in During Accident',
# 'General HS Comments'.
text_fields = ['Incident Description', 'CM Observation', 'Injury Description']

# Blank out missing cells, then clean every text column element-wise.
example_data[text_fields] = example_data[text_fields].fillna('')
for column in text_fields:
    example_data[column] = example_data[column].map(clean_text)

# Join the cleaned columns with single spaces and trim the ends (fields that
# cleaned to '' would otherwise leave leading/trailing spaces).
combined_text = example_data['Incident Description']
for column in ('CM Observation', 'Injury Description'):
    combined_text = combined_text + ' ' + example_data[column]
example_data['Combined_Text'] = combined_text.str.strip()

In [None]:
# ---------------------------------------------
# Label Encoding
# ---------------------------------------------
# Target columns to encode. Other candidates, currently disabled:
# 'Event of Injury Desc', 'Source of Injury Desc', 'Event of Incident Desc',
# 'Source of Incident Desc', 'EDI Cause Desc'.
targets = ['Allergy Code']

# Fit one LabelEncoder per target; keep it so predictions can be decoded later,
# and store the integer codes in a sibling '<target>_Encoded' column.
label_encoders = {}
for target in targets:
    encoder = LabelEncoder()
    example_data[f'{target}_Encoded'] = encoder.fit_transform(example_data[target])
    label_encoders[target] = encoder

In [None]:
# ---------------------------------------------
# Rare Class Identification and Augmentation
# ---------------------------------------------
# Choose target
# Target column to model, and the name of its integer-encoded counterpart.
focus_target = 'Allergy Code'
focus_target_encoded = focus_target + '_Encoded'

class_counts = example_data[focus_target].value_counts()

# Classes with fewer than `rare_threshold` rows are treated as rare.
rare_threshold = 50
rare_classes_list = class_counts[class_counts < rare_threshold].index.tolist()

# NOTE(review): this cell reassigns `example_data`, so re-running it stacks
# another round of augmented rows; no random seed is set for the augmenter.
# NOTE(review): augmentation runs before the train/test split in the next
# cell, so near-duplicates of a row can land on both sides of the split.
if rare_classes_list:
    # WordNet synonym replacement: 1-3 words per text, 10% of tokens targeted.
    syn_aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=3, aug_p=0.1)

    def augment_text(text, augmenter=syn_aug):
        """Return one augmented variant of ``text`` as a plain string.

        Recent nlpaug releases return a list from ``augment()`` even for a
        single input; storing that list in ``Combined_Text`` would later be
        stringified as ``"['...']"``. Unwrap it, and fall back to the original
        text when the augmenter yields nothing (e.g. for empty input).
        """
        augmented = augmenter.augment(text)
        if isinstance(augmented, (list, tuple)):
            augmented = augmented[0] if augmented else text
        return augmented

    rare_class_filter = example_data[focus_target].isin(rare_classes_list)
    rare_class_data = example_data[rare_class_filter]

    # Augment once per rare sample: copy the rare rows (preserving dtypes) and
    # replace only their combined text with the augmented variant.
    augmented_df = rare_class_data.copy()
    augmented_df['Combined_Text'] = augmented_df['Combined_Text'].apply(augment_text)

    example_data = pd.concat([example_data, augmented_df], ignore_index=True)

In [None]:
# ---------------------------------------------
# Tokenization and Data Split
# ---------------------------------------------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 80/20 split with a fixed seed.
# NOTE(review): the split is not stratified, so a rare class can be absent
# from one side; it also happens after augmentation -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    example_data['Combined_Text'],
    example_data[focus_target_encoded],
    test_size=0.2,
    random_state=42,
)

# The tokenizer expects plain Python lists of strings.
X_train, X_test = (texts.astype(str).tolist() for texts in (X_train, X_test))

# Truncate to BERT's 512-token limit; pad each split to its own longest sequence.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (X_train, X_test)
)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is any mapping whose values are indexable per sample (e.g.
    the tokenizer output); ``labels`` is an indexable sequence of class ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split; labels are materialized as plain lists so positional
# indexing in Dataset.__getitem__ does not depend on the Series index.
train_dataset, test_dataset = (
    Dataset(split_encodings, list(split_labels))
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (test_encodings, y_test),
    )
)

In [None]:
# ---------------------------------------------
# Compute Class Weights and Implement Focal Loss
# ---------------------------------------------
num_labels = len(label_encoders[focus_target].classes_)

# Per-class sample counts in the training split. `minlength` guarantees one
# slot per encoded class even when the highest-numbered classes are missing
# from the split; without it the weight vector is too short and indexing it
# with a target id raises IndexError inside the loss.
class_counts_train = np.bincount(y_train, minlength=num_labels)
total_samples = len(y_train)

# Balanced weighting: n_samples / (n_classes * count). Classes with no
# training rows would divide by zero (inf weight), so they get a neutral
# weight of 1.0 instead.
class_weights = np.where(
    class_counts_train > 0,
    total_samples / (num_labels * np.maximum(class_counts_train, 1).astype(float)),
    1.0,
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    Per sample: ``-alpha_t * (1 - p_t) ** gamma * log(p_t)``, where ``p_t`` is
    the softmax probability of the true class.

    Args:
        alpha: Optional weighting. A tensor is indexed by target id (one weight
            per class); any other non-None value is applied as a scalar factor.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha=None, gamma=2.5, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (logits) against ``targets`` (class ids)."""
        per_class_alpha = isinstance(self.alpha, torch.Tensor)

        # Keep the class-weight tensor on the same device as the logits.
        if per_class_alpha:
            self.alpha = self.alpha.to(inputs.device)

        # Log-probability and probability of the true class for every sample.
        target_index = targets.unsqueeze(1)
        log_pt = F.log_softmax(inputs, dim=-1).gather(dim=-1, index=target_index).squeeze(1)
        pt = torch.exp(log_pt)

        # Down-weight well-classified samples; optionally rescale by alpha.
        modulator = (1 - pt) ** self.gamma
        if self.alpha is not None:
            if per_class_alpha:
                modulator = modulator * self.alpha[targets]
            else:
                modulator = modulator * self.alpha

        per_sample = -modulator * log_pt
        if self.reduction == 'mean':
            return per_sample.mean()
        if self.reduction == 'sum':
            return per_sample.sum()
        return per_sample

class FocalTrainer(Trainer):
    """Trainer variant that scores batches with FocalLoss instead of the model's own loss.

    ``alpha`` and ``gamma`` are forwarded to FocalLoss; every other argument
    goes to ``Trainer.__init__`` unchanged.
    """

    def __init__(self, alpha=None, gamma=2.5, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = FocalLoss(alpha=alpha, gamma=gamma)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and apply the focal loss to its logits.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just the loss. Extra keyword arguments are accepted and ignored.
        """
        labels = inputs.get("labels")
        # Withhold the labels so the model does not compute its built-in loss.
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        loss = self.loss_fn(outputs.get("logits"), labels)
        if return_outputs:
            return loss, outputs
        return loss

In [None]:
# ---------------------------------------------
# Compute Metrics Function
# ---------------------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy plus support-weighted precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict: Keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    # zero_division=0 scores classes with no predicted samples as 0.
    weighted = precision_recall_fscore_support(
        labels, predicted, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

In [None]:
# ---------------------------------------------
# Training Arguments and Trainer Initialization
# ---------------------------------------------
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule.
    num_train_epochs=50,
    learning_rate=2e-5,
    warmup_steps=1250,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # False only means CUDA is not disabled; it does not force GPU use.
    no_cuda=False,
)

# Pretrained BERT with a classification head sized to the number of classes.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)

# Place the model on the device selected earlier (CPU when CUDA is unavailable).
model.to(device)

# NOTE(review): eval_dataset is the test split, and load_best_model_at_end
# selects checkpoints on it -- confirm no separate validation split is wanted.
trainer = FocalTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    alpha=class_weights,  # per-class weights used as focal-loss alpha
    gamma=2.5,
)

In [None]:
# ---------------------------------------------
# Training
# ---------------------------------------------
# Fine-tune the model using the trainer configured in the previous cell.
trainer.train()
# Persist the fine-tuned weights and config to the shared drive.
# NOTE(review): only the model is saved here; the tokenizer and the fitted
# label encoders are not written alongside it.
model.save_pretrained('\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\food_and_beverage_allergy')

In [None]:
# ---------------------------------------------
# Evaluation
# ---------------------------------------------
# Aggregate metrics (loss + compute_metrics output) on the held-out split.
eval_results = trainer.evaluate(test_dataset)
print("Evaluation Results:", eval_results)

# Predict on test set for confusion matrix and classification report
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Map integer class ids back to the original label values.
decoded_preds = label_encoders[focus_target].inverse_transform(preds)
decoded_labels = label_encoders[focus_target].inverse_transform(labels)

# Pin rows/columns to the encoder's full class list. Without `labels=`, the
# matrix only covers classes that occur in the test labels or predictions, so
# its shape can differ from the `classes_` tick labels used when plotting it.
cm = confusion_matrix(
    decoded_labels,
    decoded_preds,
    labels=label_encoders[focus_target].classes_,
)
print("Confusion Matrix:")
print(cm)

report = classification_report(decoded_labels, decoded_preds, zero_division=0)
print("Classification Report:")
print(report)

In [None]:
# Render the confusion matrix as an annotated heatmap with class names on both axes.
fig, ax = plt.subplots(figsize=(12, 10))
class_names = label_encoders[focus_target].classes_
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Inference example: reload the fine-tuned model from the shared drive.
# This rebinds `model`, replacing the in-memory instance used for training.
# NOTE(review): decoding predictions still relies on `label_encoders` and
# `tokenizer` from earlier cells in the same kernel session.
model = BertForSequenceClassification.from_pretrained('\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\food_and_beverage_allergy')

In [None]:
# Sample narrative to classify.
# NOTE(review): this text appears to contain a person's name; consider
# redacting it before the notebook is shared.
new_text = ["Nicole Lithos Clark claimed that she used the female restrooms in American Adventure and upon leaving the stall bumped the right side of her head on the coat hanger that was on the inside of the door. Felt dizzy upon walking and shopped and sat down near Japan."]

# Apply the same cleaning used for the training text, then tokenize to tensors.
new_text = list(map(clean_text, new_text))
new_encodings = tokenizer(
    new_text,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
# Switch to inference mode (disables dropout).
model.eval()

# Run the forward pass without building the autograd graph, and make sure the
# inputs live on the same device as the model.
with torch.no_grad():
    model_inputs = {name: tensor.to(model.device) for name, tensor in new_encodings.items()}
    outputs = model(**model_inputs)

# Highest-scoring class id for the single input text.
predicted_class = torch.argmax(outputs.logits, dim=1).item()
# Decode with the encoder of the target the model was trained on.
decoded_class = label_encoders[focus_target].inverse_transform([predicted_class])
print(f"Predicted Allergy Cause: {decoded_class[0]}")

In [None]:
# One row per test sample: cleaned text, true label and predicted label.
# `labels` and `preds` come from the evaluation cell and follow the order of
# `X_test`, which is the order the test dataset was built in.
df_results = pd.DataFrame({
    'Text': X_test,
    'True_Label': label_encoders[focus_target].inverse_transform(labels),
    'Predicted_Label': label_encoders[focus_target].inverse_transform(preds)
})

# Filter for misclassified samples
df_misclassified = df_results[df_results['True_Label'] != df_results['Predicted_Label']]

# Export to Excel, and report the path that was actually written (the message
# previously named a different file).
misclassified_path = '\\\\vi240c060002.woc.prod\\e$\\data\\AllergyCode_misclassified_samples.xlsx'
df_misclassified.to_excel(misclassified_path, index=False)
print(f"Misclassified samples exported to {misclassified_path}")

In [None]:
# The trainer's log history is a list of dicts: training records carry a
# "loss" key, evaluation records carry "eval_loss"; both read here carry "epoch".
log_history = trainer.state.log_history

train_epochs = []
train_losses = []
eval_epochs = []  # was misspelled `eval_epoch`, causing a NameError below
eval_losses = []

for record in log_history:
    if "loss" in record:
        train_epochs.append(record["epoch"])
        train_losses.append(record["loss"])
    if "eval_loss" in record:
        eval_epochs.append(record["epoch"])
        eval_losses.append(record["eval_loss"])

# Plot both curves against the (fractional) epoch at which they were logged.
plt.figure(figsize=(8,6))
plt.plot(train_epochs, train_losses, label='Training Loss')
plt.plot(eval_epochs, eval_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training & Validation Loss Over Steps')
plt.legend()
plt.show()
