In [None]:
# %pip install nltk

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import numpy as np
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK 'wordnet' and 'omw-1.4' resources.
# NOTE(review): presumably these are the corpora WordNetLemmatizer reads — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Step 1: Data Preprocessing

In [None]:
# Location of the source workbook on the network share.
DATASET_PATH = '\\\\vi240c060002.woc.prod\\e$\\datasets\\WCMLDataset12_17.xlsx'

# Keep the frame exactly as read in `data`; all later edits go to the
# `example_data` copy so the raw rows stay available.
data = pd.read_excel(DATASET_PATH)
example_data = data.copy()

In [None]:
# Single shared lemmatizer instance; clean_text() looks it up as a module-level name.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(text):
    """Normalize one free-text value before it is tokenized.

    Steps, in order: lowercase; drop every character other than ``a-z``,
    ``0-9``, whitespace and the punctuation ``. , ! ? ' -``; collapse runs of
    whitespace to a single space and trim; collapse repeated ``!`` / ``?``;
    lemmatize each whitespace-separated token with the module-level
    ``lemmatizer``.

    Args:
        text: The raw value. Strings are cleaned as-is. Missing scalars
            (``None``, NaN, ``pd.NA``) yield ``""``. Any other non-string
            value (e.g. a number read from a spreadsheet cell) is converted
            with ``str()`` first.

    Returns:
        str: The cleaned text; ``""`` when nothing survives cleaning.
    """
    if not isinstance(text, str):
        # Spreadsheet columns can hold blanks or numbers; calling .lower()
        # on those would raise AttributeError.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # Lowercase
    text = text.lower()

    # Keep letters, digits, whitespace and basic punctuation; drop everything
    # else. Dropped characters are removed outright (not replaced by a space),
    # so "a/b" becomes "ab".
    text = re.sub(r"[^a-z0-9.,!?'\s-]", '', text)

    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text).strip()

    # Normalize excessive punctuation
    text = re.sub(r"!+", "!", text)
    text = re.sub(r"\?+", "?", text)

    # Tokenize by whitespace and lemmatize each token. lemmatize() is called
    # without a POS argument (the original author's note: it then defaults to
    # nouns; POS tagging could be added for better results).
    # NOTE(review): tokens keep any attached punctuation (e.g. "legs,"), which
    # presumably stops the lemmatizer from matching them — confirm.
    tokens = [lemmatizer.lemmatize(token) for token in text.split()]

    # Rejoin after lemmatization
    return " ".join(tokens)

In [None]:
# Free-text columns that are cleaned and later concatenated into the model input.
text_fields = [
    'Incident Description',
    'Activity Engaged in During Accident',
    'General HS Comments',
    'Injury Description',
]

In [None]:
# Missing text becomes '' so every text column holds a value for the cleaning step.
filled_text_columns = example_data[text_fields].fillna('')
example_data[text_fields] = filled_text_columns

In [None]:
# Run clean_text over every value of each free-text column, one column at a time.
for column_name in text_fields:
    cleaned_column = example_data[column_name].apply(clean_text)
    example_data[column_name] = cleaned_column

In [None]:
# Build the single model-input column: the four cleaned text columns joined
# with one space between them, then trimmed of leading/trailing whitespace.
combined_text = example_data['Incident Description']
for column_name in (
    'Activity Engaged in During Accident',
    'General HS Comments',
    'Injury Description',
):
    combined_text = combined_text + ' ' + example_data[column_name]

example_data['Combined_Text'] = combined_text.str.strip()


In [None]:
# Label columns to encode; each one gets its own LabelEncoder.
targets = [
    'Event of Injury Desc',
    'Source of Injury Desc',
    'Event of Incident Desc',
    'Source of Incident Desc',
    'EDI Cause Desc',
]
# TODO(review): the original note here read "#Need" and was left unfinished —
# confirm what was still required.

In [None]:
# Fit one encoder per target column. The integer codes are stored in a
# "<column>_Encoded" column and the fitted encoder is kept so predictions can
# be mapped back to the original labels.
label_encoders = {}
for target_column in targets:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(example_data[target_column])
    example_data[target_column + '_Encoded'] = encoded_values
    label_encoders[target_column] = encoder

In [None]:
# Initialize the BERT tokenizer from the same 'bert-base-uncased' checkpoint
# name that the classification model is loaded from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
# Only the "Source of Injury Desc" target is used in this example.
split_text = example_data['Combined_Text']
split_labels = example_data['Source of Injury Desc_Encoded']

X_train, X_test, y_train, y_test = train_test_split(
    split_text,
    split_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenize both splits with the same options so they are encoded consistently.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(list(X_train), **tokenizer_options)
test_encodings = tokenizer(list(X_test), **tokenizer_options)

In [None]:
# Balanced class weights: total_samples / (num_classes * samples_in_class).
#
# The vector is sized from the fitted label encoder, not from y_train alone.
# np.bincount(y_train) only spans 0..max(y_train), so if the highest class ids
# are missing from the training split the vector comes out shorter than the
# model's num_labels, and CrossEntropyLoss needs exactly one weight per class.
num_classes = len(label_encoders['Source of Injury Desc'].classes_)
class_counts = np.bincount(y_train, minlength=num_classes)  # samples per class in y_train
total_samples = len(y_train)

# A class with no training samples would divide by zero and get an infinite
# weight. Give such classes a neutral weight of 1.0 so evaluation on them
# still produces a finite loss.
seen_in_train = class_counts > 0
balanced_weights = np.ones(num_classes, dtype=float)
balanced_weights[seen_in_train] = total_samples / (
    num_classes * class_counts[seen_in_train].astype(float)
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class Weights:", class_weights)

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy on the model's logits.

    Args:
        class_weights: 1-D float tensor with one weight per class, passed to
            ``nn.CrossEntropyLoss(weight=...)``.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and compute the weighted loss.

        Args:
            model: The model to call.
            inputs: Mapping of model inputs; its ``"labels"`` entry is used
                as the loss target and is not passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted and ignored, so extra keyword arguments from
                the caller do not raise.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``
            is true.
        """
        labels = inputs.get("labels")
        # Forward pass without labels, so the model does not compute its own
        # (unweighted) loss.
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")

        # The weight tensor is created on CPU, but logits live on whatever
        # device the model is on. Move the loss module (and its weight
        # buffer) there, otherwise the loss raises a device-mismatch error
        # on GPU. This is a no-op once the module is already on that device.
        self.loss_fn.to(logits.device)

        loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-sample sequence of values.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: a tensor per encoding field plus the label tensor.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Classification head size = number of distinct labels the encoder was fitted on.
num_labels = len(label_encoders['Source of Injury Desc'].classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Wrap the tokenized splits; labels are passed as plain lists so they can be
# indexed by position.
train_dataset = Dataset(encodings=train_encodings, labels=list(y_train))
test_dataset = Dataset(encodings=test_encodings, labels=list(y_test))

In [None]:
# Settings for the fine-tuning run, gathered in one mapping so they are easy
# to scan and tweak before being handed to TrainingArguments.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 4,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'eval_strategy': "epoch",
    'save_strategy': "epoch",
    'load_best_model_at_end': True,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Wire the model, run settings, both datasets and the class weights into the
# weighted-loss trainer. Note that the held-out split doubles as the
# evaluation set during training.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    class_weights=class_weights,
)

In [None]:
# Run fine-tuning; as the last expression in the cell, the return value is displayed.
trainer.train()

In [None]:
# Persist the fine-tuned model to the network share.
# NOTE(review): only `model` is saved here; the tokenizer and the fitted
# label_encoders are not written out, so a fresh session cannot decode
# predictions from this directory alone.
model.save_pretrained('\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_injury')

In [None]:
# Inference example
# Reload the model from the directory written by save_pretrained above; this
# rebinds `model` to the reloaded copy.
model = BertForSequenceClassification.from_pretrained('\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\fine_tuned_source_of_injury')

In [None]:
# One sample narrative to classify, held in a list so it can be cleaned and tokenized as a batch.
new_text = ["Guest was exiting the Gran Fiesta Tour ride in Mexico Pavilion and cut her leg, Guest received a laceration on her left shin that was bleeding after exiting the Gran Fiesta Tour. Stated that she fell while trying to exit and got the laceration as she was trying to stand up. Guest stated she fell getting out of the boat at Gran Fiesta Tour at the unload platform and got a cut on her left leg. When she was walking up the exit ramp, there was BBP on the ground and she was bleeding from the cut pretty significantly as it was also on her shoes and pants."]
new_text = [clean_text(t) for t in new_text]  # Clean the new text before prediction
# Same tokenizer options as for the training data, but returning PyTorch
# tensors so the result can be passed straight to the model.
new_encodings = tokenizer(new_text, truncation=True, padding=True, max_length=512, return_tensors='pt')

In [None]:
# Switch to evaluation mode, then run the forward pass without autograd:
# no gradients are needed for prediction, so building the graph only costs
# memory and time.
model.eval()
with torch.no_grad():
    outputs = model(**new_encodings)

# Highest-scoring class index for the single input, mapped back to its
# original label with the encoder fitted for this target.
predicted_class = torch.argmax(outputs.logits, dim=1).item()
decoded_class = label_encoders['Source of Injury Desc'].inverse_transform([predicted_class])
print(f"Predicted Source of Injury: {decoded_class[0]}")