In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Load the raw dataset from the Excel workbook (path is relative to the
# notebook's working directory).
data = pd.read_excel('WCMLDataset.xlsx')

In [3]:
# `df` is an alias of `data` (same object, not a copy).
df = data
claim_numbers = df['Claim Number']

# Missing identifiers come in two flavours: real NaNs and whitespace-only strings.
nan_count = claim_numbers.isna().sum()
blank_count = (claim_numbers.str.strip() == '').sum()

# Combined tally of unusable claim numbers.
total_blank_or_nan = nan_count + blank_count

print(f"Number of rows with NaN in 'Claim Number': {nan_count}")
print(f"Number of rows with blank strings in 'Claim Number': {blank_count}")
print(f"Total rows with blank or NaN 'Claim Number': {total_blank_or_nan}")

Number of rows with NaN in 'Claim Number': 0
Number of rows with blank strings in 'Claim Number': 0
Total rows with blank or NaN 'Claim Number': 0


In [4]:
# Flag rows that repeat an earlier row across every column, then keep just those.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]
print(f"Number of duplicate rows: {len(duplicate_rows)}")

Number of duplicate rows: 0


In [5]:
# Summary statistics for the frame; the recorded output below shows which
# statistics pandas reported for these columns.
df.describe()

  df.describe()


Unnamed: 0,Claim Number,Date of Incident/Loss,Event of Incident Desc,Source of Incident Desc,Event of Injury Desc,Source of Injury Desc,EDI Cause Desc,Incident Description,Activity Engaged in During Accident,General HS Comments,Injury Description,Changes?
count,8189,8189,8189,8189,8189,8187,8182,8189,8189,7029,8159,8069
unique,8189,731,51,176,50,177,10,8147,7738,7026,3709,280
top,WDWW2021220675,2022-10-07 00:00:00,Struck/Struck By,Objects,Struck/Struck By,Objects,Strain or Injury By,She was in contact with an infected coworker.,She was cleaning a guest room.,On December 4th 2022 at 06:45 I was just turni...,"Back - Lower, Sprain/Strain;",OK
freq,1,31,1785,1038,3260,954,3176,7,16,2,347,7504
first,,2021-01-01 00:00:00,,,,,,,,,,
last,,2023-01-01 00:00:00,,,,,,,,,,


In [6]:
# Column dtypes as read from the workbook.
df.dtypes

Claim Number                                   object
Date of Incident/Loss                  datetime64[ns]
Event of Incident Desc                         object
Source of Incident Desc                        object
Event of Injury Desc                           object
Source of Injury Desc                          object
EDI Cause Desc                                 object
Incident Description                           object
Activity Engaged in During Accident            object
General HS Comments                            object
Injury Description                             object
Changes?                                       object
dtype: object

In [7]:
# (number of rows, number of columns)
df.shape

(8189, 12)

In [8]:
# Per-column count of missing cells; display only the columns that have any.
Missing = df.isna().sum()
Missing.loc[Missing.gt(0)]

Source of Injury Desc       2
EDI Cause Desc              7
General HS Comments      1160
Injury Description         30
Changes?                  120
dtype: int64

In [9]:
# Missing cells per column as a percentage of the total row count.
row_total = len(data)
data_missing = df.isna().sum().div(row_total).mul(100)
data_missing

Claim Number                            0.000000
Date of Incident/Loss                   0.000000
Event of Incident Desc                  0.000000
Source of Incident Desc                 0.000000
Event of Injury Desc                    0.000000
Source of Injury Desc                   0.024423
EDI Cause Desc                          0.085481
Incident Description                    0.000000
Activity Engaged in During Accident     0.000000
General HS Comments                    14.165344
Injury Description                      0.366345
Changes?                                1.465380
dtype: float64

In [10]:
# Names of the columns that have no missing values at all.
data_missing.loc[data_missing.eq(0)].index

Index(['Claim Number', 'Date of Incident/Loss', 'Event of Incident Desc',
       'Source of Incident Desc', 'Event of Injury Desc',
       'Incident Description', 'Activity Engaged in During Accident'],
      dtype='object')

In [11]:
# Remove the fully populated columns so only columns with gaps remain.
fully_populated = data_missing.loc[data_missing.eq(0)].index
data_missing = data_missing.drop(labels=fully_populated)
data_missing

Source of Injury Desc     0.024423
EDI Cause Desc            0.085481
General HS Comments      14.165344
Injury Description        0.366345
Changes?                  1.465380
dtype: float64

In [12]:
# Order the columns from the highest to the lowest missing percentage.
data_missing = data_missing.sort_values(ascending=False)
data_missing

General HS Comments      14.165344
Changes?                  1.465380
Injury Description        0.366345
EDI Cause Desc            0.085481
Source of Injury Desc     0.024423
dtype: float64

In [13]:
# Columns whose missing percentage is below 51 are kept and imputed later.
to_be_cleaned_column_names = data_missing.loc[data_missing.lt(51)].index
to_be_cleaned_column_names

Index(['General HS Comments', 'Changes?', 'Injury Description',
       'EDI Cause Desc', 'Source of Injury Desc'],
      dtype='object')

In [14]:
# Columns with a missing percentage of 51 or more are treated as unusable.
bad_column_names = data_missing.loc[data_missing.ge(51)].index
bad_column_names

Index([], dtype='object')

In [15]:
# Working copy of the dataset without the unusable columns.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
data_new = data.drop(columns=bad_column_names)

In [None]:
# Impute the missing values of every column selected for cleaning, writing the
# result into the working copy `data_new`.
#
# Fix: the text branch used to assign into `data[c]`, which left `data_new`
# with all of its NaNs (so the null check that follows could never reach 0)
# and silently modified the raw frame instead.
for c in to_be_cleaned_column_names:
    v = data_new[c]          # values in this column
    v_valid = v.dropna()     # observed (non-missing) values only
    if v_valid.empty:
        # No observed value to derive a fill from; leave the column untouched
        # rather than failing on an empty value_counts()/median().
        continue
    if v.dtype == np.dtype('O'):
        # Non-numeric column: use the most frequent category.
        fill_value = v_valid.value_counts().index[0]
    else:
        # Numeric column: use the median of the observed values.
        fill_value = v_valid.median()
    data_new[c] = v.fillna(fill_value)

# Later cells read `df`. It previously received the text fills only because it
# aliases `data`; point it at the cleaned frame so those cells keep seeing
# imputed text while the raw `data` stays unmodified.
df = data_new

In [None]:
# Total number of missing cells left in `data_new`, summed over all columns.
data_new.isnull().sum().sum()

In [None]:
#################################################################################

In [None]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Combine text fields into one input column.
# String concatenation propagates NaN, so a single missing field would turn the
# whole combined text into NaN and the tokenizer would later reject it (it only
# accepts strings). Missing fields are therefore replaced by an empty string and
# every value is coerced to str before joining.
text_columns = [
    'Injury Description',
    'Activity Engaged in During Accident',
    'Incident Description',
    'General HS Comments',
]
text_parts = df[text_columns].fillna('').astype(str)

combined_text = text_parts[text_columns[0]]
for column in text_columns[1:]:
    combined_text = combined_text + " " + text_parts[column]
df['Combined_Text'] = combined_text

# Function to process a batch of text data and get embeddings
def get_bert_embeddings(text_batch):
    """
    Tokenize the input text and generate embeddings using BERT.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text_batch: A list of strings (text samples).
    Returns:
        numpy.ndarray: 2-D array with one row per text sample, holding the
        hidden state of the first token of each sequence.
    """
    inputs = tokenizer(
        text_batch,
        return_tensors="pt",
        padding=True,         # Pad sentences to the same length
        truncation=True,      # Truncate sentences longer than max_length
        max_length=512        # Max token length for BERT
    ).to(model.device)        # Keep inputs on the same device as the model
    with torch.no_grad():  # Turn off gradients for inference
        outputs = model(**inputs)
    # Extract [CLS] token embeddings (first token in BERT output)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    # .numpy() only works on CPU tensors; .cpu() is a no-op when already there.
    return cls_embeddings.cpu().numpy()

# Embed the combined text a few rows at a time to bound memory use.
batch_size = 2  # Adjust batch size based on your data and memory
all_texts = df['Combined_Text'].tolist()
embeddings = []

for start in range(0, len(all_texts), batch_size):
    # `batch_texts` stays bound after the loop; a later cell reports its size.
    batch_texts = all_texts[start:start + batch_size]
    embeddings.extend(get_bert_embeddings(batch_texts))

# Store one embedding vector per row.
df['BERT_Embedding'] = embeddings

# Preview the text next to its embedding.
print(df[['Combined_Text', 'BERT_Embedding']].head())


In [None]:
# Sanity check on the objects left over from the embedding cell.
print(f"Tokenizer type: {type(tokenizer)}")  # Expect a BERT tokenizer class (AutoTokenizer usually returns BertTokenizerFast) - TODO confirm
# `batch_texts` is the last batch handled by the loop, so this is the size of
# the final batch only.
print(f"Batch size: {len(batch_texts)}")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd

# Reinitialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Small hand-written example frame.
# NOTE(review): this rebinds `data` and `df`, replacing the dataset frame that
# earlier cells built under the same names.
data = {
    'Injury Description': ["Cut on hand", "Bruise on leg", "Burn on face"],
    'Activity Engaged In During Accident': ["Cutting wood", "Running", "Cooking"],
    'Incident Description': ["Fell while running", "Cut hand with knife", "Burnt hand while cooking"],
    'General HS Comments': ["Accident in kitchen", "Accident on trail", "Hot oil burn"],
}
df = pd.DataFrame(data)

# Join the four text fields into one space-separated string per row.
df['Combined_Text'] = df['Injury Description'].str.cat(
    [
        df['Activity Engaged In During Accident'],
        df['Incident Description'],
        df['General HS Comments'],
    ],
    sep=" ",
)

# Batch processing function
def get_bert_embeddings(text_batch):
    """
    Tokenize the input text and generate embeddings using BERT.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text_batch: A list of strings (text samples).
    Returns:
        numpy.ndarray: 2-D array with one row per text sample, holding the
        hidden state of the first token of each sequence.
    """
    inputs = tokenizer(
        text_batch,
        return_tensors="pt",
        padding=True,         # Pad sentences to the same length
        truncation=True,      # Truncate sentences longer than max_length
        max_length=512        # Max token length for BERT
    ).to(model.device)        # Keep inputs on the same device as the model
    with torch.no_grad():  # Turn off gradients for inference
        outputs = model(**inputs)
    # First-token ([CLS]) hidden state of every sequence in the batch.
    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    # .numpy() only works on CPU tensors; .cpu() is a no-op when already there.
    return cls_embeddings.cpu().numpy()

# Embed the example texts batch by batch.
batch_size = 2
all_texts = df['Combined_Text'].tolist()
embeddings = []

for start in range(0, len(all_texts), batch_size):
    # `batch_texts` stays bound after the loop; a later cell reports its size.
    batch_texts = all_texts[start:start + batch_size]
    embeddings.extend(get_bert_embeddings(batch_texts))

# Store one embedding vector per row.
df['BERT_Embedding'] = embeddings

print("BERT embeddings successfully generated.")


In [None]:
# Sanity check on the objects left over from the example embedding cell.
print(f"Tokenizer type: {type(tokenizer)}")  # Expect a BERT tokenizer class (AutoTokenizer usually returns BertTokenizerFast) - TODO confirm
# `batch_texts` is the last batch handled by the loop, so this is the size of
# the final batch only.
print(f"Batch size: {len(batch_texts)}")

In [None]:
#df.to_excel('J:/Data/RMSA Analysis/0 - Team Working Files/10 - Nick/WCMLDataset.xlsx', index=False)