In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
data = pd.read_excel('WCMLDataset.xlsx')

In [3]:
example_data = data

In [4]:
text_fields = [
    'Incident Description',
    'Activity Engaged in During Accident',
    'General HS Comments',
    'Injury Description'
]
example_data[text_fields] = example_data[text_fields].fillna('')

In [5]:
# Step 1: Data Cleaning and Preprocessing
def preprocess_text(text):
    """Basic text preprocessing: lowercasing and stripping whitespace."""
    if not isinstance(text, str):
        text = str(text)  # Convert to string if not already
    return text.lower().strip()

# Combine relevant input fields into a single text column
example_data['Combined_Text'] = (
    example_data['Incident Description'] + ' ' +
    example_data['Activity Engaged in During Accident'] + ' ' +
    example_data['General HS Comments'] + ' ' +
    example_data['Injury Description']
).apply(preprocess_text)

# Encode categorical output labels for modeling
from sklearn.preprocessing import LabelEncoder

output_fields = [
    'Source of Incident Desc',
    'Source of Injury Desc',
    'Event of Incident Desc',
    'Event of Injury Desc',
    'EDI Cause Desc'
]

label_encoders = {}
for field in output_fields:
    le = LabelEncoder()
    example_data[field + '_Encoded'] = le.fit_transform(example_data[field])
    label_encoders[field] = le  # Save encoders for inverse transformation later

example_data[['Combined_Text'] + [f"{field}_Encoded" for field in output_fields]].head()


Unnamed: 0,Combined_Text,Source of Incident Desc_Encoded,Source of Injury Desc_Encoded,Event of Incident Desc_Encoded,Event of Injury Desc_Encoded,EDI Cause Desc_Encoded
0,a guest grabbed her wrist and pulled as she co...,133,134,0,0,9
1,he was kicked in the knee when a male guest go...,133,134,0,0,9
2,while performing housekeep duties she felt a b...,1,1,2,2,9
3,while feeding treats to a red river hog from t...,2,2,2,2,9
4,while eating the lettuce the turtle bit down o...,2,2,2,2,9


In [17]:
# Step 2: Initialize and apply the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1500)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(example_data['Combined_Text'])


print("TF-IDF matrix shape:", X_tfidf.shape)

print("Sample TF-IDF feature names:", tfidf_vectorizer.get_feature_names_out()[:10])

TF-IDF matrix shape: (8189, 1500)
Sample TF-IDF feature names: ['00068470' '00138038' '00141918' '00214808' '00255032' '00338210'
 '00339378' '00465407' '00575844' '00585371']


In [19]:
unique_classes = sorted(set(y_test))

report = classification_report(
    y_test, 
    y_pred, 
    labels=unique_classes,  
    target_names=[label_encoders['Source of Incident Desc'].inverse_transform([cls])[0] for cls in unique_classes]
)

print(report)

                                                                            precision    recall  f1-score   support

                           Animal/Insect - Displayed (Petting Farm, Caged)       0.00      0.00      0.00         1
                                            Animal/Insect - Domestic - Dog       0.00      0.00      0.00         1
                                          Animal/Insect - Domestic - Horse       1.00      1.00      1.00         3
                                                    Animal/Insect - Insect       0.82      0.88      0.85        51
                                                      Animal/Insect - Wild       0.00      0.00      0.00         1
                                                                 Appliance       0.33      0.50      0.40         4
                                                  Appliance - Refrigerator       0.14      0.50      0.22         2
                                                         Appliance - St

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Step 3.1: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, example_data['Source of Incident Desc_Encoded'], test_size=0.2, random_state=42
)

# Step 3.2: Initialize Logistic Regression with Class Weights
logistic_model = LogisticRegression(class_weight='balanced', random_state=42)

# Step 3.3: Train the Model
logistic_model.fit(X_train, y_train)

# Step 3.4: Evaluate Performance
from sklearn.metrics import classification_report

y_pred = logistic_model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=label_encoders['Source of Incident Desc'].classes_)

print(report)


ValueError: Number of classes, 155, does not match size of target_names, 176. Try specifying the labels parameter

In [None]:
#################################################################################

In [None]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Function to process a batch of text data and get embeddings
def get_bert_embeddings(text_batch):
    """
    Tokenize the input text and generate embeddings using BERT.
    Args:
        text_batch: A list of strings (text samples).
    Returns:
        numpy.ndarray: Array of embeddings for each text sample.
    """
    inputs = tokenizer(
        text_batch,
        return_tensors="pt",
        padding=True,         # Pad sentences to the same length
        truncation=True,      # Truncate sentences longer than max_length
        max_length=512        # Max token length for BERT
    )
    with torch.no_grad():  # Turn off gradients for inference
        outputs = model(**inputs)
    # Extract [CLS] token embeddings (first token in BERT output)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Process embeddings in batches
batch_size = 4  # Adjust batch size based on your data and memory
embeddings = []

for i in range(0, len(df), batch_size):
    batch_texts = df['Combined_Text'][i:i + batch_size].tolist()
    batch_embeddings = get_bert_embeddings(batch_texts)
    embeddings.extend(batch_embeddings)

# Add embeddings as a new column to the DataFrame
df['BERT_Embedding'] = embeddings

# Output DataFrame with embeddings
print(df[['Combined_Text', 'BERT_Embedding']].head())


In [None]:
print(df['BERT_Embedding'].iloc[0].shape)  # Should output (768,) for BERT-base
print(df[['Combined_Text', 'BERT_Embedding']])

In [None]:
print(df['BERT_Embedding'].head())

In [None]:
# Encode the target labels
label_encoder = LabelEncoder()
df['Source_Encoded'] = label_encoder.fit_transform(df['Source of Incident Desc'])

In [None]:
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_map)

In [None]:
class_counts = df['Source of Incident Desc'].value_counts()

In [None]:
print("Class Distribution:")
print(class_counts)

# Plot the class distribution
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
class_counts.plot(kind='bar')
plt.title("Class Distribution of 'Source of Incident'")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.xticks(rotation=90)
plt.show()

In [None]:
data

In [None]:
#df.to_excel('J:/Data/RMSA Analysis/0 - Team Working Files/10 - Nick/WCMLDataset.xlsx', index=False)