In [None]:
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pre-processed crash records from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source (presumably an earlier processing step — confirm).
crash_data = pd.read_pickle('crash_processed_data.pkl')

In [None]:
# Show which columns are available for the analysis below.
crash_data.columns

In [None]:
# Frequency of each distinct TOTAL_NO_OCCUPANTS value.
crash_data['TOTAL_NO_OCCUPANTS'].value_counts()

**Distribution of Accident Severity Across Different Features: Analyzing the Impact of Speed Zones, Road Conditions, and More**

In [None]:
# Count the records in each severity level (most frequent first) and label
# every bar with its share of all records.
# NOTE(review): countplot counts rows of crash_data, not distinct ACCIDENT_NO
# values; confirm each row is one accident or adjust the title/axis label.
severity_order = crash_data['SEVERITY'].value_counts().index  # value_counts() sorts descending

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=crash_data, x='SEVERITY', palette='viridis', order=severity_order, ax=ax)
ax.set_title('Number of Unique ACCIDENT_NO for Each Severity (Ordered)')
ax.set_xlabel('Severity')
ax.set_ylabel('Count of Unique ACCIDENT_NO')
plt.xticks(rotation=45)

# Calculate and display percentage distribution
total_records = len(crash_data)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total_records)
    # Anchor on the bar's true centre; ha='center' already centres the text,
    # so no manual horizontal offset is needed.
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + p.get_height()
    ax.annotate(percentage, (x, y), fontsize=10, ha='center', va='bottom')

plt.show()


In [None]:

# Share of each severity level within every speed zone (each row sums to 100%).
percentage_by_speed_zone = crash_data.groupby(['SPEED_ZONE', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_speed_zone = percentage_by_speed_zone.div(percentage_by_speed_zone.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_speed_zone.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=90)
ax.set_title('Percentage Distribution of Severity for Each Speed Zone (Ordered)')
ax.set_xlabel('SPEED_ZONE')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()


As the speed zone gets faster, the distribution of severity becomes more skewed towards levels 1 and 2. Speed is definitely a telling factor for severity.

In [None]:

# Create age bands in 10-year increments: [0, 10), [10, 20), ..., [100, 110).
# Ages outside that range (and missing ages) get no band.
crash_data['AGE_BAND'] = pd.cut(crash_data['AGE'], bins=range(0, 120, 10), right=False)

# Share of each severity level within every age band (each row sums to 100%).
# observed=False keeps empty bands on the axis and states explicitly what used
# to be the implicit default for categorical group keys.
percentage_by_age_band = crash_data.groupby(['AGE_BAND', 'SEVERITY'], observed=False).size().unstack(fill_value=0)
percentage_by_age_band = percentage_by_age_band.div(percentage_by_age_band.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_age_band.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=45)
ax.set_title('Percentage Distribution of Severity for Each Age Band')
ax.set_xlabel('Age Band')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments, e.g. bands with no records
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()


There seems to be a positive association between age and crash severity: older people tend to be involved in more severe crashes.

In [None]:
# Share of each severity level within every light condition (each row sums to 100%).
percentage_by_light_condition = crash_data.groupby(['Light Condition Desc', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_light_condition = percentage_by_light_condition.div(percentage_by_light_condition.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_light_condition.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=45)
ax.set_title('Percentage Distribution of Severity for Each Light Condition')
ax.set_xlabel('Light Condition Desc')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()


Dark conditions with no street lights are the most dangerous, followed by dark conditions with street lights off.

In [None]:
# Bucket TOTAL_NO_OCCUPANTS into labelled ranges.
# The labels describe right-closed intervals: (-1, 0] -> '0', (0, 1] -> '1', ...,
# (4, 5] -> '5', (5, 10] -> '6-10', (10, inf) -> '10+'. pd.cut must therefore use
# its default right=True; with right=False every value lands one label too high
# (0 occupants would be labelled '1', 5 occupants '6-10').
bins = [-1, 0, 1, 2, 3, 4, 5, 10, float('inf')]
labels = ['0', '1', '2', '3', '4', '5', '6-10', '10+']
crash_data['OCCUPANTS_RANGE'] = pd.cut(crash_data['TOTAL_NO_OCCUPANTS'], bins=bins, labels=labels)

# Share of each severity level within every occupants range (each row sums to 100%).
# observed=False keeps empty ranges on the axis and states explicitly what used
# to be the implicit default for categorical group keys.
percentage_by_occupants_range = crash_data.groupby(['OCCUPANTS_RANGE', 'SEVERITY'], observed=False).size().unstack(fill_value=0)
percentage_by_occupants_range = percentage_by_occupants_range.div(percentage_by_occupants_range.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_occupants_range.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=0)
ax.set_title('Percentage Distribution of Severity for Each Occupants Range')
ax.set_xlabel('Occupants Range')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments, e.g. ranges with no records
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()


More occupants seem to be linked to more severe crashes (a higher share of severity 1).

In [None]:
# Share of each severity level within every road geometry category (each row sums to 100%).
percentage_by_road_geometry = crash_data.groupby(['Road Geometry Desc', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_road_geometry = percentage_by_road_geometry.div(percentage_by_road_geometry.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_road_geometry.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=45)
ax.set_title('Percentage Distribution of Severity for Each Road Geometry Desc')
ax.set_xlabel('Road Geometry Desc')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()

In [None]:
# Number of records in each 'Road Geometry Desc' category.
# Author's note: only 5 private property incidents and 8 road closures.
crash_data['Road Geometry Desc'].value_counts()

It is unclear whether road geometry is a major factor in crash severity.

In [None]:
# Share of each severity level within every 'Surface Cond Desc' category (each row sums to 100%).
percentage_by_road_condition = crash_data.groupby(['Surface Cond Desc', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_road_condition = percentage_by_road_condition.div(percentage_by_road_condition.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_road_condition.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=45)
ax.set_title('Percentage Distribution of Severity for Each Road Surface Condition')
ax.set_xlabel('Road Condition Desc')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()

In [None]:
# Number of records in each 'Surface Cond Desc' category.
crash_data['Surface Cond Desc'].value_counts()

Surprisingly, it looks like surface condition has little impact on the severity of a crash.

In [None]:
# Frequency of each distinct VEHICLE_YEARS_OLD value.
crash_data['VEHICLE_YEARS_OLD'].value_counts()

In [None]:

# Bucket VEHICLE_YEARS_OLD into left-closed ranges: [0, 5) -> '0-5', [5, 10) -> '5-10', ...,
# [75, inf) -> '75+'. Negative or missing ages get no range.
bins = [0, 5, 10, 15, 20, 25, 50, 75, float('inf')]
labels = ['0-5', '5-10', '10-15', '15-20', '20-25', '25-50', '50-75', '75+']
crash_data['VEHICLE_AGE_RANGE'] = pd.cut(crash_data['VEHICLE_YEARS_OLD'], bins=bins, labels=labels, right=False)

# Share of each severity level within every vehicle age range (each row sums to 100%).
# observed=False keeps empty ranges on the axis and states explicitly what used
# to be the implicit default for categorical group keys.
percentage_by_vehicle_age = crash_data.groupby(['VEHICLE_AGE_RANGE', 'SEVERITY'], observed=False).size().unstack(fill_value=0)
percentage_by_vehicle_age = percentage_by_vehicle_age.div(percentage_by_vehicle_age.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
fig, ax = plt.subplots(figsize=(12, 8))
percentage_by_vehicle_age.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=0)
ax.set_title('Percentage Distribution of Severity for Each Vehicle Age Range')
ax.set_xlabel('Vehicle Age Range')
ax.set_ylabel('Percentage')
ax.legend(title='Severity', loc='upper right', bbox_to_anchor=(1.15, 1))

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments, e.g. ranges with no records
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')
plt.show()


Older vehicles tend to have more severe crashes

In [None]:
# Share of each severity level within every SEX value (each row sums to 100%).
percentage_by_sex = crash_data.groupby(['SEX', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_sex = percentage_by_sex.div(percentage_by_sex.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
# legend=False reproduces the deliberate choice to show this chart without a legend.
fig, ax = plt.subplots(figsize=(8, 6))
percentage_by_sex.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=0, legend=False)
ax.set_title('Percentage Distribution of Severity by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Percentage')

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()

Males tend to be in more severe accidents than females

In [None]:
# Share of each severity level within every LAMPS value (each row sums to 100%).
percentage_by_lamps = crash_data.groupby(['LAMPS', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_lamps = percentage_by_lamps.div(percentage_by_lamps.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
# legend=False reproduces the deliberate choice to show this chart without a legend.
fig, ax = plt.subplots(figsize=(8, 6))
percentage_by_lamps.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=0, legend=False)
ax.set_title('Percentage Distribution of Severity by LAMPS')
ax.set_xlabel('LAMPS')
ax.set_ylabel('Percentage')

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()

This may be useful, or it may be junk data (for example the value of 9 lamps), but 1 lamp seems more dangerous than 2. It is not clear what 0 lamps means; possibly no working headlights.

In [None]:
# Frequency of each distinct LAMPS value.
crash_data['LAMPS'].value_counts()

In [None]:
# Share of each severity level within every HELMET_BELT_WORN value (each row sums to 100%).
percentage_by_helmet_belt = crash_data.groupby(['HELMET_BELT_WORN', 'SEVERITY']).size().unstack(fill_value=0)
percentage_by_helmet_belt = percentage_by_helmet_belt.div(percentage_by_helmet_belt.sum(axis=1), axis=0) * 100

# Draw on an explicit Axes: DataFrame.plot() opens its own figure when no `ax`
# is passed, which would leave a separate plt.figure() call as an empty figure.
# legend=False reproduces the deliberate choice to show this chart without a legend.
fig, ax = plt.subplots(figsize=(8, 6))
percentage_by_helmet_belt.plot(kind='bar', stacked=True, cmap='viridis', ax=ax, rot=0, legend=False)
ax.set_title('Percentage Distribution of Severity by Helmet/Belt Usage')
ax.set_xlabel('Helmet/Belt Usage')
ax.set_ylabel('Percentage')

# Label each stacked segment at its own centre. A segment starts at p.get_y(),
# so its height alone is not a valid y position for the label.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:  # skip empty (0 or NaN) segments
        continue
    x = p.get_x() + p.get_width() / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.1f}%', (x, y), fontsize=10, ha='center', va='center')

plt.show()

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
numerical_columns = crash_data.select_dtypes(include=['number'])
numerical_corr = numerical_columns.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    numerical_corr,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix for Numerical Variables')
plt.show()

In [None]:
# Number of missing values in each column.
crash_data.isna().sum()

In [None]:
# Keep only the rows that have a TOTAL_NO_OCCUPANTS value.
crash_data = crash_data.dropna(subset=['TOTAL_NO_OCCUPANTS'])

In [None]:
# Persist the filtered frame to the working directory.
# NOTE(review): crash_data also carries the band columns added for plotting
# (AGE_BAND, OCCUPANTS_RANGE, VEHICLE_AGE_RANGE) — confirm they belong in the saved file.
crash_data.to_pickle('clean_crash_data.pkl')
