In [0]:
%pip install pandas>=1.3.0 numpy>=1.19.0 scikit-learn>=0.24.0 datasets>=2.0.0 openpyxl>=3.0.0 accelerate>=0.26.0

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Remove flash-attn which causes symbol errors
%pip uninstall -y flash-attn flash-attn-2

# Install compatible versions
%pip install torch==2.1.2 transformers==4.40.0 datasets scikit-learn

# Restart the Python kernel after this


Found existing installation: flash-attn 2.5.9.post1
Not uninstalling flash-attn at /databricks/python3/lib/python3.11/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-570bdbe4-7f27-443b-9a32-d875f6b8425f
Can't uninstall 'flash-attn'. No files were found to uninstall.
[0m[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Collecting torch==2.1.2
  Obtaining dependency information for torch==2.1.2 from https://files.pythonhosted.org/packages/da/6a/7fb9d82db4568834ff6d4df2fe3b143de4ed65a3f8f93e7daed703626cb6/torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata
  Downloading torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.40.0
  Obtaining dependency information for transformers==4.40.0 from https://files.pythonhosted.org/packages/09/c8/844d5518a6aeb4ffdc0cf0cae65ae13dbe5838306728c5c640b5a6e2a0c9/transformers-4.40.0-py3-none-any.whl.metada

In [0]:
%restart_python

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import os
 
# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

2025-05-12 19:59:28.370843: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using device: cuda


In [0]:
# 1. Read Data
print("\n1. Reading Data...")
input_path = '/dbfs/FileStore/tables/NearMissReport_NewDataset.xlsx'  # Databricks path
df = pd.read_excel(input_path, engine='openpyxl')
print("Initial DataFrame shape:", df.shape)
print("Columns in DataFrame:", df.columns.tolist())
 
relevant_columns = [
    'Event Title', 'Report', 'Preliminary Cause', 'Resolution Action',
    'Classification', 'Activity performed (type of service)',
    'Type of location', 'Sub Location', 'Cause'
]
 
# Check if all columns exist
missing_cols = set(relevant_columns) - set(df.columns)
if missing_cols:
    print("WARNING: Missing columns in input data:", missing_cols)
    raise ValueError(f"Missing required columns: {missing_cols}")
 
df = df[relevant_columns]
print("DataFrame shape after selecting relevant columns:", df.shape)


1. Reading Data...
Initial DataFrame shape: (132387, 105)
Columns in DataFrame: ['period', 'Year', 'Unit', 'Mainloc', 'account', 'Main Location', '60 code', 'DM', '50 code', 'Regional VP', '40 code', 'Area VP', 'division code', 'division', 'Main', 'Country Site ID', 'Legal Entity Concerned', 'Site Manager Name', 'Company name, if not Sodexo', 'CS \\ Reported by', 'CS \\ Address', 'Address2', 'Zip Code', 'City', 'CS \\ Phone #', 'CS \\ Phone #2', 'Event Title', 'Type', 'Event Date', 'Safety Action Type', 'Safe Action', 'Criticality', 'Critical', 'SAFE', 'Safety Category', 'Safe Category', 'Used stop work authority', 'Reported by employee (ID_firstname_lastname)', 'email prefix', 'email', 'Preliminary Cause', 'Event ID', 'Local \\ Date', 'TimeZone', 'Local \\ Reported Date', 'Classification', 'Duration', 'Duration Type', 'Inspection \\ Status', 'Gravest Severity', 'LSC ?', 'Unauthorised work task / activity ?', 'Activity performed (type of service)', 'Type of location', 'Specific Area',

In [0]:
# 2. Separate 'other' and 'unknown'
# print("\n2. Separating 'other' and 'unknown'...")
# mask_other_unknown = df['Cause'].str.lower().isin(['other', 'unknown'])
# df_other_unknown = df[mask_other_unknown].copy()
# df_main = df[~mask_other_unknown].copy()
# print("Shape of df_main (without 'other'/'unknown'):", df_main.shape)
# print("Shape of df_other_unknown:", df_other_unknown.shape)
# print("Unique causes in df_main:", df_main['Cause'].unique())
df_main = df.copy()

In [0]:
 
# 3. Check class balance and balance if needed
print("\n3. Checking class balance...")
if len(df_main) > 0:
    class_counts = df_main['Cause'].value_counts()
    print("Class distribution before balancing:")
    print(class_counts)
   
    max_count = class_counts.max()
    df_list = []
    for label in class_counts.index:
        df_label = df_main[df_main['Cause'] == label]
        df_label_upsampled = resample(df_label, replace=True, n_samples=max_count, random_state=42)
        df_list.append(df_label_upsampled)
    df_balanced = pd.concat(df_list)
    print("\nClass distribution after balancing:")
    print(df_balanced['Cause'].value_counts())
else:
    print("WARNING: No data left after filtering 'other' and 'unknown'")
    df_balanced = df_main
 
print("Shape of balanced DataFrame:", df_balanced.shape)


3. Checking class balance...
Class distribution before balancing:
Cause
Food Related Concern                                            10057
Sharp items / surfaces                                          10000
PPE: not used, incorrect use, defective…                        10000
Manual handling / Lifting  / Carrying  / Pulling / Pushing      10000
Ingestion / absorption / Contact with of hazardous substance    10000
Hand tools                                                      10000
Hit by / Struck against / Trapped / Crushed                     10000
Slip / Trip / Fall                                              10000
Fall from height/elevation                                       9999
Exposure to hot objects / surfaces / temperatures                9984
Fire / Explosion                                                 7965
Contact with electricity                                         6860
Vehicle incident / accident                                      5587
Inhalation of gas

In [0]:
# 4. Data processing
print("\n4. Processing data...")
# Check for nulls in each column
print("\nNull values in each column:")
print(df_balanced.isnull().sum())
 
# Fill nulls with empty string for text columns
text_columns = [
    'Event Title', 'Report', 'Preliminary Cause', 'Resolution Action',
    'Classification', 'Activity performed (type of service)',
    'Type of location', 'Sub Location'
]
 
# Fill nulls in text columns with empty string
for col in text_columns:
    df_balanced[col] = df_balanced[col].fillna('')
 
print("\nShape after filling nulls:", df_balanced.shape)
 
# Create text column
text_series = df_balanced[text_columns].astype(str).apply(lambda row: ' '.join([str(x) for x in row]), axis=1)
print("Type and shape of text_series:", type(text_series), text_series.shape)
df_balanced['text'] = text_series
 
# Remove any rows where text is empty or just whitespace
df_balanced = df_balanced[df_balanced['text'].str.strip().str.len() > 0]
print("Shape after removing empty texts:", df_balanced.shape)
 
# 5. Train-test split
print("\n5. Splitting data...")
X = df_balanced['text'].values
y = df_balanced['Cause'].values
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Unique classes in y:", np.unique(y))
 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))


4. Processing data...

Null values in each column:
Event Title                                  0
Report                                      78
Preliminary Cause                         6385
Resolution Action                       128723
Classification                               0
Activity performed (type of service)    208559
Type of location                          4087
Sub Location                             71123
Cause                                        0
dtype: int64

Shape after filling nulls: (211197, 9)
Type and shape of text_series: <class 'pandas.core.series.Series'> (211197,)
Shape after removing empty texts: (211197, 10)

5. Splitting data...
X shape: (211197,)
y shape: (211197,)
Unique classes in y: ['Asphyxiation' 'Caught in, between or under' 'Contact with electricity'
 'Exposure' 'Exposure to cold objects / surfaces / temperatures'
 'Exposure to hot objects / surfaces / temperatures'
 'Fall from height/elevation' 'Fire / Explosion' 'Food Related Concern'
 'Ha

In [0]:
import numpy as np
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch

# Make sure you're using the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming X_train, X_test, y_train, y_test are already defined
print("\n6. Preparing datasets for transformer...")
train_dataset = Dataset.from_dict({'text': X_train, 'label': y_train})
test_dataset = Dataset.from_dict({'text': X_test, 'label': y_test})

# Encode labels
le = LabelEncoder()
le.fit(y_train)
train_dataset = train_dataset.map(lambda x: {'label': le.transform([x['label']])[0]})
test_dataset = test_dataset.map(lambda x: {'label': le.transform([x['label']])[0]})

# Load tokenizer
model_name = 'bert-base-uncased'
print(f"Loading tokenizer: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)

print("Tokenizing datasets...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Load model
num_labels = len(le.classes_)
print(f"Number of unique labels: {num_labels}")
print(f"Loading model: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir='/dbfs/FileStore/tables/results',
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Set to 16 for safer memory usage on CPU/GPU
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='/dbfs/FileStore/tables/logs',
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    fp16=torch.cuda.is_available(),  # Only use fp16 if GPU is available
    gradient_accumulation_steps=2,
    max_steps=2000,
    report_to="none",  # Avoid errors with TensorBoard if not configured
)

# Metric computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

# Train the model
print("\n7. Training model...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



6. Preparing datasets for transformer...


Map:   0%|          | 0/168957 [00:00<?, ? examples/s]

Map:   0%|          | 0/42240 [00:00<?, ? examples/s]

Loading tokenizer: bert-base-uncased




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizing datasets...


Map:   0%|          | 0/168957 [00:00<?, ? examples/s]

Map:   0%|          | 0/42240 [00:00<?, ? examples/s]

Number of unique labels: 21
Loading model: bert-base-uncased


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs



7. Training model...




Step,Training Loss
50,3.0532
100,2.8955
150,2.3716
200,1.9072
250,1.7965
300,1.6497
350,1.5958
400,1.5891
450,1.5398
500,1.479


TrainOutput(global_step=2000, training_loss=1.3858954334259033, metrics={'train_runtime': 713.5422, 'train_samples_per_second': 89.693, 'train_steps_per_second': 2.803, 'total_flos': 8420990040145920.0, 'train_loss': 1.3858954334259033, 'epoch': 0.3787878787878788})

In [0]:

# 7. Evaluate
print("\n8. Evaluating model...")
results = trainer.evaluate()
print('Test Accuracy:', results['eval_accuracy'])
print('Test F1:', results['eval_f1'])


8. Evaluating model...


Test Accuracy: 0.7082623106060606
Test F1: 0.7009090676954755


In [0]:
# print("\n1. Reading Data...")
predict_data_path = '/dbfs/FileStore/tables/OtherUnknownDataset.xlsx'  # Databricks path
df_other_unknown = pd.read_excel(predict_data_path, engine='openpyxl')
print("Initial DataFrame shape:",  df_other_unknown.shape)
print("Columns in DataFrame:",  df_other_unknown.columns.tolist())

Initial DataFrame shape: (538470, 105)
Columns in DataFrame: ['period', 'Year', 'Unit', 'Mainloc', 'account', 'Main Location', '60 code', 'DM', '50 code', 'Regional VP', '40 code', 'Area VP', 'division code', 'division', 'Main', 'Country Site ID', 'Legal Entity Concerned', 'Site Manager Name', 'Company name, if not Sodexo', 'CS \\ Reported by', 'CS \\ Address', 'Address2', 'Zip Code', 'City', 'CS \\ Phone #', 'CS \\ Phone #2', 'Event Title', 'Type', 'Event Date', 'Safety Action Type', 'Safe Action', 'Criticality', 'Critical', 'SAFE', 'Safety Category', 'Safe Category', 'Used stop work authority', 'Reported by employee (ID_firstname_lastname)', 'email prefix', 'email', 'Preliminary Cause', 'Event ID', 'Local \\ Date', 'TimeZone', 'Local \\ Reported Date', 'Classification', 'Duration', 'Duration Type', 'Inspection \\ Status', 'Gravest Severity', 'LSC ?', 'Unauthorised work task / activity ?', 'Activity performed (type of service)', 'Type of location', 'Specific Area', 'Main location2', '

In [0]:
df_other_unknown = df_other_unknown[relevant_columns]

In [0]:
# Fill nulls in text columns
for col in text_columns:
        df_other_unknown[col] = df_other_unknown[col].fillna('')

In [0]:
df_other_unknown['text'] = df_other_unknown[text_columns].apply(
    lambda row: ' '.join([f"{col}: {str(x).strip()}" for col, x in zip(text_columns, row) if str(x).strip()]),
    axis=1
)

In [0]:
df_other_unknown = df_other_unknown[df_other_unknown['text'].str.strip().str.len() > 0]

In [0]:
# 8. Predict on 'other' and 'unknown'
print("\n9. Predicting on 'other' and 'unknown'...")
 
texts_to_predict = df_other_unknown['text'].tolist()
 
# Process in smaller batches to avoid memory issues
batch_size = 32
all_predictions = []
all_confidences = []
 
for i in range(0, len(texts_to_predict), batch_size):
    batch_texts = texts_to_predict[i:i + batch_size]
    print(f"Processing batch {i//batch_size + 1}/{(len(texts_to_predict) + batch_size - 1)//batch_size}")
   
    # Tokenize and move to the same device as the model
    inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=256, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
   
    with torch.no_grad():
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()
       
    all_predictions.extend(preds)
    all_confidences.extend(np.max(probabilities, axis=1))
 
# Convert predictions to labels and add to dataframe
predicted_labels = le.inverse_transform(all_predictions)
df_other_unknown['Predicted_Cause'] = predicted_labels
df_other_unknown['Confidence_Score'] = all_confidences
 
# Create category mapping and add category numbers
print("\nCreating category mapping...")
category_mapping = {label: idx for idx, label in enumerate(le.classes_)}
print("Category mapping:", category_mapping)
 
# Verify and create Category_Number column
try:
    df_other_unknown['Category_Number'] = df_other_unknown['Predicted_Cause'].map(category_mapping)
    print("\nVerifying Category_Number creation:")
    print("Unique categories in Predicted_Cause:", df_other_unknown['Predicted_Cause'].unique())
    print("Unique numbers in Category_Number:", df_other_unknown['Category_Number'].unique())
    print("Number of null values in Category_Number:", df_other_unknown['Category_Number'].isnull().sum())
except Exception as e:
    print(f"Error creating Category_Number: {str(e)}")
    # Fallback: create Category_Number directly from predictions
    df_other_unknown['Category_Number'] = all_predictions
    print("Created Category_Number directly from predictions")
 
# Create prediction summary
print("\nPrediction Summary by Category:")
summary = df_other_unknown.groupby('Predicted_Cause').agg({
    'Category_Number': 'first',
    'Event Title': 'count',
    'Confidence_Score': ['mean', 'min', 'max']
}).round(4)
 
summary.columns = ['Category_Number', 'Count', 'Avg_Confidence', 'Min_Confidence', 'Max_Confidence']
summary = summary.sort_values('Count', ascending=False)
print("\nCategory-wise Prediction Summary:")
print(summary)



9. Predicting on 'other' and 'unknown'...
Processing batch 1/16828
Processing batch 2/16828
Processing batch 3/16828
Processing batch 4/16828
Processing batch 5/16828
Processing batch 6/16828
Processing batch 7/16828
Processing batch 8/16828
Processing batch 9/16828
Processing batch 10/16828
Processing batch 11/16828
Processing batch 12/16828
Processing batch 13/16828
Processing batch 14/16828
Processing batch 15/16828
Processing batch 16/16828
Processing batch 17/16828
Processing batch 18/16828
Processing batch 19/16828
Processing batch 20/16828
Processing batch 21/16828
Processing batch 22/16828
Processing batch 23/16828
Processing batch 24/16828
Processing batch 25/16828
Processing batch 26/16828
Processing batch 27/16828
Processing batch 28/16828
Processing batch 29/16828
Processing batch 30/16828
Processing batch 31/16828
Processing batch 32/16828
Processing batch 33/16828
Processing batch 34/16828
Processing batch 35/16828
Processing batch 36/16828
Processing batch 37/16828
Proc

In [0]:
 
# 9. Save predictions to Excel
print("\n10. Saving predictions...")
# First save to a temporary location
temp_output_path = '/tmp/predicted_other_unknown.xlsx'
final_output_path = '/dbfs/FileStore/predicted_other_unknown.xlsx'
 
print(f"Number of rows to save: {len(df_other_unknown)}")
print("Columns in saved file:", df_other_unknown.columns.tolist())
 
# Verify columns exist before printing
required_columns = ['Event Title', 'Cause', 'Predicted_Cause', 'Category_Number', 'Confidence_Score']
missing_columns = [col for col in required_columns if col not in df_other_unknown.columns]
if missing_columns:
    print(f"Warning: Missing columns: {missing_columns}")
    # Print available columns instead
    print("\nSample of predictions (available columns):")
    print(df_other_unknown.head())
else:
    print("\nSample of predictions with category numbers:")
    print(df_other_unknown[required_columns].head())
 
try:
    # Save to temporary location first
    df_other_unknown.to_excel(temp_output_path, index=False)
    print(f'Temporary file saved to: {temp_output_path}')
   
    # Copy to FileStore
    import shutil
    shutil.copy2(temp_output_path, final_output_path)
    print(f'File copied to FileStore: {final_output_path}')
   
    # Clean up temporary file
    os.remove(temp_output_path)
    print("Temporary file cleaned up")
   
    # Verify the final file
    if os.path.exists(final_output_path):
        print(f"File successfully saved. Size: {os.path.getsize(final_output_path) / 1024:.2f} KB")
    else:
        print("Warning: File was not saved successfully!")
except Exception as e:
    print(f"Error saving file: {str(e)}")



10. Saving predictions...
Number of rows to save: 538470
Columns in saved file: ['Event Title', 'Report', 'Preliminary Cause', 'Resolution Action', 'Classification', 'Activity performed (type of service)', 'Type of location', 'Sub Location', 'Cause', 'text', 'Predicted_Cause', 'Confidence_Score', 'Category_Number']

Sample of predictions with category numbers:
                                       Event Title  ... Confidence_Score
0                                East HS 3/30/2022  ...         0.979531
1  APP-1646111517516-Cooler cold storage lighting.  ...         0.727688
2                  APP-1646112639063-PEMS Employee  ...         0.990776
3                 APP-1646112568836-Propping doors  ...         0.988860
4                     APP-1646112739299-Lock doors  ...         0.987224

[5 rows x 5 columns]
Temporary file saved to: /tmp/predicted_other_unknown.xlsx
File copied to FileStore: /dbfs/FileStore/predicted_other_unknown.xlsx
Temporary file cleaned up
File successfully sa