In [1]:
import pandas as pd
import numpy as np

# Read the crash workbook into the DataFrame that the rest of the notebook works on
DATA_FILE = 'fatal_crashes.xlsx'
df = pd.read_excel(DATA_FILE)

In [2]:
# The value -9 is treated as the "missing / unknown" sentinel: swap every
# occurrence for NaN so pandas handles it as missing data
df = df.replace(to_replace=-9, value=np.nan)

In [8]:
# Flag every row that exactly repeats an earlier row, then report how many were flagged
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print("Number of duplicate rows:", duplicates)

Number of duplicate rows: 0


In [14]:
# Columns left out of the frequency report
exclude_columns = ['Crash ID', 'Time', 'SA4 Name 2021', 'National LGA Name 2021']

# Every remaining column, kept in the DataFrame's own order
report_columns = [name for name in df.columns if name not in exclude_columns]

# One section per column: a "=== name ===" header, the value counts
# (NaN included as its own row), then blank lines as a separator
with open('statistics_output.txt', 'w', encoding='utf-8') as report:
    for name in report_columns:
        frequency_table = df[name].value_counts(dropna=False).to_string()
        report.write(f"=== {name} ===\n{frequency_table}\n\n\n")

print("Statistics output has been saved to 'statistics_output.txt'.")

Statistics output has been saved to 'statistics_output.txt'.


In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

# Build a 4 x 2 grid of axes: seven panels are drawn below, and the eighth
# (unused) slot is hidden so the saved image has no empty frame.
fig, axs = plt.subplots(4, 2, figsize=(15, 20))

# Calendar order for the weekday panel; groupby sorts labels alphabetically otherwise.
# NOTE(review): assumes "Day of week" holds full English day names - confirm against
# the data. Labels not in this list are kept (appended after the known ones).
WEEKDAY_ORDER = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]


def summarize_fatalities(frame, by):
    """Aggregate "Number Fatalities" per group of ``by``.

    Returns a DataFrame indexed by the group label with two columns:
    ``Accident_Count`` (count of non-missing "Number Fatalities" values in the
    group) and ``Total_Fatalities`` (their sum).
    """
    stats = frame.groupby(by)["Number Fatalities"].agg(["count", "sum"])
    stats.columns = ["Accident_Count", "Total_Fatalities"]
    return stats


def order_index(stats, preferred):
    """Reorder the rows of ``stats`` so labels listed in ``preferred`` come first.

    Labels found in ``preferred`` are placed in that order; any other labels
    follow in their existing relative order, so no row is ever dropped.
    """
    listed = [label for label in preferred if label in stats.index]
    unlisted = [label for label in stats.index if label not in preferred]
    return stats.loc[listed + unlisted]


# 1. Accidents by Month
monthly_stats = summarize_fatalities(df, "Month")

# 2. Accidents by Day of Week (calendar order rather than alphabetical)
weekday_stats = order_index(summarize_fatalities(df, "Day of week"), WEEKDAY_ORDER)

# 3. Accidents by Time of Day
timeofday_stats = summarize_fatalities(df, "Time of Day")

# 4. Accidents by LGA (Top 10 LGAs by accident count)
lga_stats = (
    summarize_fatalities(df, "National LGA Name 2021")
    .sort_values(by="Accident_Count", ascending=False)
    .head(10)
)

# 5. Accidents by Road Type
roadtype_stats = summarize_fatalities(df, "National Road Type")

# 6. Accidents by Speed Limit
speed_stats = summarize_fatalities(df, "Speed Limit")

# 7. Accidents by Vehicle Involvement: reshape the three involvement flags to
# long form, keep only rows flagged "Yes", then aggregate per vehicle type.
involvement_columns = ["Bus \nInvolvement", "Heavy Rigid Truck Involvement", "Articulated Truck Involvement"]
vehicle_long = df[involvement_columns + ["Number Fatalities"]].melt(
    id_vars=["Number Fatalities"], var_name="Vehicle Type", value_name="Involved"
)
vehicle_stats = summarize_fatalities(vehicle_long[vehicle_long["Involved"] == "Yes"], "Vehicle Type")

# (stats, plot kind, title, x-axis label) for each panel, in row-major grid order
panels = [
    (monthly_stats, "bar", "Accidents by Month", "Month"),
    (weekday_stats, "bar", "Accidents by Day of the Week", "Day of Week"),
    (timeofday_stats, "bar", "Accidents by Time of Day", "Time of Day"),
    (lga_stats, "bar", "Top 10 LGAs with the Most Accidents", "LGA"),
    (roadtype_stats, "bar", "Accidents by Road Type", "Road Type"),
    (speed_stats, "line", "Accidents by Speed Limit", "Speed Limit"),
    (vehicle_stats, "bar", "Accidents Involving Buses and Trucks", "Vehicle Type"),
]

all_axes = list(axs.flat)
for ax, (stats, kind, title, xlabel) in zip(all_axes, panels):
    stats.plot(kind=kind, ax=ax, title=title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Accident Count")

# Hide any grid slot that did not receive a panel (the eighth one here)
for ax in all_axes[len(panels):]:
    ax.set_visible(False)

# Adjust layout to avoid overlap
fig.tight_layout()

# Save the figure with all the plots into a single file, then release it
fig.savefig("all_accidents_statistics_with_vehicle_involvement.png")
plt.close(fig)


In [15]:
# Per-LGA totals of "Number Fatalities" (count and sum), ranked by count;
# only the ten highest-ranked areas are kept
lga_stats = (
    df.groupby("National LGA Name 2021")["Number Fatalities"]
    .agg(Accident_Count="count", Total_Fatalities="sum")
    .sort_values(by="Accident_Count", ascending=False)
    .head(10)
)
print("\n===== Top 10 LGAs with the Most Accidents =====", lga_stats, sep="\n")


===== Top 10 LGAs with the Most Accidents =====
                        Accident_Count  Total_Fatalities
National LGA Name 2021                                  
Unknown                           1538              1676
Brisbane                           198               205
Moreton Bay                        129               138
Gold Coast                         127               138
Central Coast                      114               122
Sunshine Coast                     108               112
Logan                              105               112
Bundaberg                           82                96
Toowoomba                           81                91
Mid-Coast                           79                86
