In [2]:
#Import necessary libraries, pandas, numpy and mlxtend
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [7]:
# --- Step 1: Load the Excel files ---
# Crash records. skiprows=4 skips the first four rows of the sheet
# (presumably a preamble above the header row; confirm against the workbook).
crashes = pd.read_excel(
    io='bitre_fatal_crashes_dec2024.xlsx',
    skiprows=4,
    sheet_name='BITRE_Fatal_Crash',
)

# Fatality records, read with the same row offset.
fatalities = pd.read_excel(
    io='bitre_fatalities_dec2024.xlsx',
    skiprows=4,
    sheet_name='BITRE_Fatality',
)

In [8]:
# Preview the first rows of the crash table to check that it loaded as expected.
crashes.head()

Unnamed: 0,Crash ID,State,Month,Year,Dayweek,Time,Crash Type,Number Fatalities,Bus \nInvolvement,Heavy Rigid Truck Involvement,Articulated Truck Involvement,Speed Limit,National Remoteness Areas,SA4 Name 2021,National LGA Name 2021,National Road Type,Christmas Period,Easter Period,Day of week,Time of Day
0,20241115,NSW,12,2024,Friday,04:00:00,Single,1,No,No,No,100,Inner Regional Australia,Riverina,Wagga Wagga,Arterial Road,Yes,No,Weekday,Night
1,20241125,NSW,12,2024,Friday,06:15:00,Single,1,No,No,No,80,Inner Regional Australia,Sydney - Baulkham Hills and Hawkesbury,Hawkesbury,Local Road,No,No,Weekday,Day
2,20246013,Tas,12,2024,Friday,09:43:00,Multiple,1,No,No,No,50,Inner Regional Australia,Launceston and North East,Northern Midlands,Local Road,Yes,No,Weekday,Day
3,20241002,NSW,12,2024,Friday,10:35:00,Multiple,1,No,No,No,100,Outer Regional Australia,New England and North West,Armidale Regional,National or State Highway,No,No,Weekday,Day
4,20242261,Vic,12,2024,Friday,11:30:00,Multiple,1,-9,-9,-9,-9,Unknown,,,Undetermined,No,No,Weekday,Day


In [19]:
# --- Step 2: Merge datasets on 'Crash ID' ---
# Cast the join key to string in both frames so the key dtypes agree.
for frame in (crashes, fatalities):
    frame['Crash ID'] = frame['Crash ID'].astype(str)

# Inner join on the key; columns present in both frames receive the
# '_fatal' (left) / '_crash' (right) suffix.
merged_df = fatalities.merge(
    crashes,
    how='inner',
    on='Crash ID',
    suffixes=('_fatal', '_crash'),
)

In [25]:
# Dimensions of the merged frame as (rows, columns).
merged_df.shape

(56874, 42)

In [21]:
# List the column labels of the merged frame.
merged_df.columns

Index(['Crash ID', 'State_fatal', 'Month_fatal', 'Year_fatal', 'Dayweek_fatal',
       'Time_fatal', 'Crash Type_fatal', 'Bus Involvement',
       'Heavy Rigid Truck Involvement_fatal',
       'Articulated Truck Involvement_fatal', 'Speed Limit_fatal', 'Road User',
       'Gender', 'Age', 'National Remoteness Areas_fatal',
       'SA4 Name 2021_fatal', 'National LGA Name 2021_fatal',
       'National Road Type_fatal', 'Christmas Period_fatal',
       'Easter Period_fatal', 'Age Group', 'Day of week_fatal', 'Time of day',
       'State_crash', 'Month_crash', 'Year_crash', 'Dayweek_crash',
       'Time_crash', 'Crash Type_crash', 'Number Fatalities',
       'Bus \nInvolvement', 'Heavy Rigid Truck Involvement_crash',
       'Articulated Truck Involvement_crash', 'Speed Limit_crash',
       'National Remoteness Areas_crash', 'SA4 Name 2021_crash',
       'National LGA Name 2021_crash', 'National Road Type_crash',
       'Christmas Period_crash', 'Easter Period_crash', 'Day of week_crash',


In [50]:
# --- Step 3: Data Cleaning ---
# -9 is treated as the missing-value code and replaced with NaN.
# np.nan is used instead of pd.NA so numeric columns stay numeric rather than
# being converted to object dtype, and the frame is rebound instead of being
# mutated in place.
merged_df = merged_df.replace(-9, np.nan)

# Clean the speed limit values: the string '<40' is mapped to '40', rows with
# no speed limit are dropped, and the column is then cast to int.
speed_limit_clean = merged_df['Speed Limit_fatal'].replace('<40', '40')
merged_df = (
    merged_df
    .assign(**{'Speed Limit_fatal': speed_limit_clean})
    .dropna(subset=['Speed Limit_fatal'])
    .astype({'Speed Limit_fatal': int})
)



In [82]:
# --- Step 4: Select columns for association rule mining ---
# Attributes that will become "attribute=value" items, in column order.
selected_cols = [
    'Gender',
    'Age Group',
    'Crash Type_fatal',
    'Bus Involvement',
    'Articulated Truck Involvement_fatal',
    'Speed Limit_fatal',
    'Christmas Period_fatal',
    'Easter Period_fatal',
    'Day of week_fatal',
    'Time of day',
    'Road User',
]

In [83]:
# Keep only the columns chosen for rule mining (all rows).
arm_data = merged_df.loc[:, selected_cols]

In [84]:
# Count missing values per selected column.
arm_data.isna().sum()

Gender                                 33
Age Group                              94
Crash Type_fatal                        0
Bus Involvement                        52
Articulated Truck Involvement_fatal    46
Speed Limit_fatal                       0
Christmas Period_fatal                  0
Easter Period_fatal                     0
Day of week_fatal                       0
Time of day                             0
Road User                               0
dtype: int64

In [85]:
# Discard every row that has a missing value in any of the selected columns
arm_data = arm_data.dropna(axis=0, how='any')

# Confirm that no missing values remain
arm_data.isna().sum()

Gender                                 0
Age Group                              0
Crash Type_fatal                       0
Bus Involvement                        0
Articulated Truck Involvement_fatal    0
Speed Limit_fatal                      0
Christmas Period_fatal                 0
Easter Period_fatal                    0
Day of week_fatal                      0
Time of day                            0
Road User                              0
dtype: int64

In [86]:
# Inspect the dtype of each selected column.
arm_data.dtypes

Gender                                 object
Age Group                              object
Crash Type_fatal                       object
Bus Involvement                        object
Articulated Truck Involvement_fatal    object
Speed Limit_fatal                       int64
Christmas Period_fatal                 object
Easter Period_fatal                    object
Day of week_fatal                      object
Time of day                            object
Road User                              object
dtype: object

In [87]:
# --- Step 5: Discretize Speed Limit into categories ---
# Guarded so the cell can be re-run after 'Speed Limit_fatal' has already been
# replaced by 'Speed Bin' (the unguarded version raised KeyError on re-run).
if 'Speed Limit_fatal' in arm_data.columns:
    # pd.cut uses right-closed intervals by default:
    # (0, 50], (50, 80], (80, 100], (100, 130], (130, 200].
    speed_bin = pd.cut(
        arm_data['Speed Limit_fatal'].astype(float),
        bins=[0, 50, 80, 100, 130, 200],
        labels=['0-50', '51-80', '81-100', '101-130', '131+'],
    )

    # Drop original speed and use Speed Bin instead
    arm_data = (
        arm_data
        .drop(columns=['Speed Limit_fatal'])
        .assign(**{'Speed Bin': speed_bin})
        # Speeds outside (0, 200] get NaN from pd.cut; drop those rows so that
        # astype(str) cannot turn them into a bogus 'Speed Bin=nan' item.
        .dropna(subset=['Speed Bin'])
        .astype({'Speed Bin': str})
    )

In [88]:
# Show the first 30 rows of the prepared frame.
arm_data.head(30)

Unnamed: 0,Gender,Age Group,Crash Type_fatal,Bus Involvement,Articulated Truck Involvement_fatal,Christmas Period_fatal,Easter Period_fatal,Day of week_fatal,Time of day,Road User,Speed Bin
0,Male,65_to_74,Single,No,No,Yes,No,Weekday,Night,Driver,81-100
1,Female,17_to_25,Single,No,No,No,No,Weekday,Day,Driver,51-80
2,Female,26_to_39,Multiple,No,No,Yes,No,Weekday,Day,Driver,0-50
3,Female,26_to_39,Multiple,No,No,No,No,Weekday,Day,Driver,81-100
5,Female,40_to_64,Multiple,No,No,No,No,Weekday,Day,Passenger,81-100
6,Male,40_to_64,Single,No,No,Yes,No,Weekday,Day,Motorcycle rider,81-100
7,Male,65_to_74,Single,No,No,No,No,Weekday,Day,Driver,81-100
8,Female,26_to_39,Multiple,No,No,No,No,Weekend,Night,Passenger,51-80
9,Male,17_to_25,Multiple,No,No,No,No,Weekend,Night,Motorcycle rider,0-50
10,Female,40_to_64,Multiple,No,No,No,No,Weekend,Night,Driver,81-100


In [89]:
# --- Step 6: Convert each row into a transaction of "attribute=value" ---
# itertuples(index=False, name=None) yields one plain tuple of cell values per
# row. This avoids iterrows, which builds a Series for every row (slow on
# large frames) and upcasts mixed dtypes to a common type.
item_columns = list(arm_data.columns)
transactions = [
    [f"{col}={value}" for col, value in zip(item_columns, values)]
    for values in arm_data.itertuples(index=False, name=None)
]

In [90]:
# --- Step 7: Encode transactions using TransactionEncoder ---
# Fit the encoder on the transactions and transform them in a single call,
# then label the encoded array with the item names the encoder learned.
te = TransactionEncoder()
te_array = te.fit_transform(transactions)
arm_df = pd.DataFrame(data=te_array, columns=te.columns_)


In [91]:
# --- Step 8: Run Apriori algorithm ---
# Mine itemsets with a minimum support of 0.05; use_colnames=True reports
# items by their column names.
frequent_itemsets = apriori(
    arm_df,
    use_colnames=True,
    min_support=0.05,
)

In [92]:
# --- Step 9: Generate association rules ---
# Derive rules from the frequent itemsets, thresholding on confidence at 0.5.
rules = association_rules(
    frequent_itemsets,
    min_threshold=0.5,
    metric="confidence",
)


In [93]:
# --- Step 10: Filter rules with "Road User" on the RHS (consequent) ---
def _has_road_user_item(itemset):
    """Return True when any item of the itemset contains 'Road User='."""
    for item in itemset:
        if 'Road User=' in str(item):
            return True
    return False


# A rule is kept when its consequent holds a Road User item, even if the
# consequent also contains other items.
rules_with_road_user = rules[rules['consequents'].apply(_has_road_user_item)]


In [94]:
# Order the filtered rules by lift, then confidence, both descending
top_rules = rules_with_road_user.sort_values(
    by=['lift', 'confidence'],
    ascending=[False, False],
)


In [95]:
# --- Step 11: Display top k rules ---
# Number of rules to show.
k = 5
# Restrict to the reporting columns, then take the first k rows of the
# already-sorted frame.
top_k_rules = top_rules[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift']
].head(k)
print("\nTop Rules with 'Road User' on RHS:\n")
print(top_k_rules.to_string(index=False))


Top Rules with 'Road User' on RHS:

                                                                   antecedents                                                                               consequents  support  confidence     lift
                 (Articulated Truck Involvement_fatal=Yes, Bus Involvement=No)                                             (Crash Type_fatal=Multiple, Road User=Driver) 0.050128    0.503090 2.321781
             (Articulated Truck Involvement_fatal=Yes, Easter Period_fatal=No)                                             (Crash Type_fatal=Multiple, Road User=Driver) 0.050363    0.500991 2.312095
                                     (Articulated Truck Involvement_fatal=Yes)                                             (Crash Type_fatal=Multiple, Road User=Driver) 0.050508    0.500898 2.311666
                        (Articulated Truck Involvement_fatal=Yes, Gender=Male) (Road User=Driver, Bus Involvement=No, Christmas Period_fatal=No, Easter Period_fatal=No