In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the accident records from the CSV export into the notebook-wide DataFrame.
df = pd.read_csv(filepath_or_buffer="output_0_9999_showcase.csv")

In [None]:
# List the column labels so later cells can be checked against the schema.
print(f"Columns in dataset: {df.columns}")

In [None]:
# Force vehicle_mass to a numeric dtype; entries that cannot be parsed become NaN.
df["vehicle_mass"] = pd.to_numeric(arg=df["vehicle_mass"], errors="coerce")

In [None]:
# Human-readable class names, one per interval between consecutive bin edges.
labels = [
    "Light (<2t)",
    "Medium Light (2–5t)",
    "Medium Heavy (5-20t)",
    "Heavy (>20t)",
]
# Bin edges for vehicle_mass (presumably kilograms, given the tonne-based labels).
# pd.cut uses right-closed intervals by default, so a mass of exactly 0 falls
# outside the first interval and is left as NaN.
bins = [0, 2000, 5000, 20000, 1000000]

df["vehicle_mass_cat"] = pd.cut(x=df["vehicle_mass"], bins=bins, labels=labels)

In [None]:
# Render the DataFrame as the cell output for a visual check of the columns
# added so far (including vehicle_mass_cat).
df

In [None]:
# Bar chart of how many accident rows fall on each day_of_week value,
# ordered by the day value rather than by frequency.
fig, ax = plt.subplots(figsize=(8, 5))
day_counts = df["day_of_week"].value_counts().sort_index()
day_counts.plot(kind="bar", color="skyblue", ax=ax)
ax.set(
    title="Accidents by Day of Week",
    xlabel="Day of Week",
    ylabel="Accident Count",
)
plt.show()

In [None]:
# Overall frequency of each Seriousness value (most frequent first).
fig, ax = plt.subplots(figsize=(6, 5))
seriousness_counts = df["Seriousness"].value_counts()
seriousness_counts.plot(kind="bar", color="orange", ax=ax)
ax.set(
    title="Distribution of Accident Seriousness",
    xlabel="Seriousness",
    ylabel="Count",
)
plt.show()

# The same breakdown split per year, one bar group per year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="year", hue="Seriousness", ax=ax)
ax.set_title("Accident Seriousness by Year")
# Tilt the year labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Share of accidents per vehicle-mass class.
#
# px.pie adds up the `values` column for rows that share the same `names`
# entry, so passing the raw frame with values="id" would size each slice by
# the SUM of record ids instead of by the number of accidents. Count the rows
# per category first and plot those counts.
mass_counts = (
    df.groupby("vehicle_mass_cat", observed=True)
    .size()
    .reset_index(name="accident_count")
)
# Plain strings keep the slice names independent of the categorical dtype.
mass_counts["vehicle_mass_cat"] = mass_counts["vehicle_mass_cat"].astype(str)

fig1 = px.pie(
    mass_counts,
    values="accident_count",
    names="vehicle_mass_cat",
    title="Distribution of Vehicle Mass in Accidents",
    labels={
        "vehicle_mass_cat": "Vehicle Mass Category",
        "accident_count": "Accident Count",
    },
)
fig1.show()

In [None]:
# Distribution of vehicle_mass within each Seriousness level.
fig2 = px.violin(
    data_frame=df,
    x="Seriousness",
    y="vehicle_mass",
    title="Vehicle Mass by Accident Seriousness",
)
fig2.show()

In [None]:
# NOTE(review): these repeat the imports made in the first cell; they are kept
# here so this cell can still be run on its own.
import matplotlib.pyplot as plt
import seaborn as sns


# Rows without a mass category cannot be stacked, so drop them, and turn the
# category into plain strings before cross-tabulating.
df_bar = df.dropna(subset=["vehicle_mass_cat"]).assign(
    vehicle_mass_cat=lambda frame: frame["vehicle_mass_cat"].astype(str)
)

# Rows: day of week; columns: mass category; cells: number of accident rows.
crosstab = pd.crosstab(index=df_bar["day_of_week"], columns=df_bar["vehicle_mass_cat"])

ax = crosstab.plot(kind="bar", stacked=True, figsize=(10, 6), colormap="tab20")
ax.set_title("Accidents by Day of Week and Vehicle Mass Category (Stacked)")
ax.set_xlabel("Day of Week")
ax.set_ylabel("Number of Accidents")
ax.legend(title="Vehicle Mass Category")
plt.tight_layout()
plt.show()

In [None]:
# Number of accident rows per year, as a two-column frame for plotting.
yearly_sizes = df.groupby("year").size()
accidents_by_year = yearly_sizes.reset_index(name="accident_count")

fig4 = px.line(
    data_frame=accidents_by_year,
    x="year",
    y="accident_count",
    title="Yearly Accident Trends",
)
fig4.show()

In [None]:
# Map of accident locations in Finland, coloured by vehicle-mass category.

# Read world shapefile and select Finland
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
finland = world[world['ADMIN'] == 'Finland']

# Set target CRS (ETRS89 / TM35FIN, EPSG:3067) so both layers share one
# projected coordinate system and the axes are in metres.
target_crs = 'EPSG:3067'
finland = finland.to_crs(target_crs)

# Prepare data: drop missing vehicle_mass_cat, lat, lon
df_map = df.dropna(subset=["vehicle_mass_cat", "lat", "lon"]).copy()
df_map["vehicle_mass_cat"] = df_map["vehicle_mass_cat"].astype(str)

# Create GeoDataFrame for accident points. points_from_xy builds all point
# geometries in one vectorized call instead of one Point(...) per row.
# The lon/lat columns are declared as EPSG:4326 and then reprojected.
gdf = gpd.GeoDataFrame(
    df_map,
    geometry=gpd.points_from_xy(df_map["lon"], df_map["lat"]),
    crs='EPSG:4326',
)
gdf = gdf.to_crs(target_crs)

# Assign a color to each vehicle_mass_cat (evenly spaced samples of tab20)
categories = gdf["vehicle_mass_cat"].unique()
colors = plt.cm.tab20(np.linspace(0, 1, len(categories)))
color_dict = dict(zip(categories, colors))

# Plot: country outline first, then one point layer per category so each
# category gets its own legend entry.
fig, ax = plt.subplots(1, 1, figsize=(30, 30))
finland.plot(ax=ax, color='lightgray', edgecolor='black')

for cat in categories:
    subset = gdf[gdf["vehicle_mass_cat"] == cat]
    subset.plot(ax=ax, marker='o', color=color_dict[cat], markersize=10, alpha=0.5, label=cat, zorder=5)

ax.set_title("Coordinates on a Map of Finland by Vehicle Mass Category")
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")
ax.legend(title="Vehicle Mass Category")
plt.show()

In [None]:
# Map of all accident locations in Finland.

# Base layer: country polygons, filtered to Finland.
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
finland = world[world['ADMIN'] == 'Finland']

# Reproject to EPSG:3067 so the axes are eastings/northings in metres.
target_crs = 'EPSG:3067'

finland = finland.to_crs(target_crs)

# Rows without both coordinates cannot be placed on the map; drop them up
# front (same treatment as the per-category map) instead of creating points
# with NaN coordinates.
df_points = df.dropna(subset=["lat", "lon"])

# Build all point geometries in one vectorized call; lon/lat are declared as
# EPSG:4326 and then reprojected to the target CRS.
gdf = gpd.GeoDataFrame(
    df_points,
    geometry=gpd.points_from_xy(df_points["lon"], df_points["lat"]),
    crs='EPSG:4326',
)

gdf = gdf.to_crs(target_crs)

fig, ax = plt.subplots(1, 1, figsize=(30, 30))

finland.plot(ax=ax, color='lightgray', edgecolor='black')

gdf.plot(ax=ax, marker='x', color='blue', markersize=2, zorder=5, alpha=0.5)

# NOTE: the previous version added an empty-string text label at every point.
# That drew nothing but created one Text artist per accident, so it has been
# removed.

ax.set_title("Coordinates on a Map of Finland")
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")

plt.show()

With Snowfall

In [None]:
# Per-accident snowfall columns (presumably five readings per accident, in cm
# judging by the plot labels used later — TODO confirm against the data source).
snow_cols = ["snowfall_1", "snowfall_2", "snowfall_3", "snowfall_4", "snowfall_5"]

# Coerce all snowfall columns to numeric in one pass; unparseable entries
# become NaN.
df[snow_cols] = df[snow_cols].apply(pd.to_numeric, errors="coerce")

# Create aggregated snowfall features (NaN readings are skipped row-wise).
snow_values = df[snow_cols]
df["snowfall_mean"] = snow_values.mean(axis=1, skipna=True)
df["snowfall_max"] = snow_values.max(axis=1, skipna=True)
# min_count=1 makes the sum NaN when a row has no valid reading at all.
# Without it the sum would be 0 for such rows, which is indistinguishable from
# "no snowfall" and inconsistent with mean/max (both NaN in that case).
df["snowfall_sum"] = snow_values.sum(axis=1, skipna=True, min_count=1)
# 1 if any reading is above zero, else 0. Rows with no valid reading compare
# False and therefore also get 0.
df["snowfall_binary"] = (df["snowfall_max"] > 0).astype(int)

In [None]:
# Histogram of the per-accident mean snowfall, using 40 equal-width bins.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["snowfall_mean"], bins=40, color="skyblue", edgecolor="black", ax=ax)
ax.set(
    title="Distribution of Snowfall on Accident Days",
    xlabel="Snowfall (cm)",
    ylabel="Frequency",
)
plt.show()

In [None]:
# Bucket the mean snowfall into intensity classes and count accidents per class.
# The label list is defined once and reused for both the binning and the bar
# order so the two cannot drift apart.
snowfall_labels = ["0 cm", "0–2 cm", "2–5 cm", "5–10 cm", "10+ cm"]

# Intervals are right-closed: (-0.1, 0] captures exactly zero, and the last
# class is open-ended (np.inf). With a finite upper edge, values above it would
# silently become NaN and vanish from the "10+ cm" bar.
df["snowfall_bin"] = pd.cut(
    df["snowfall_mean"],
    bins=[-0.1, 0, 2, 5, 10, np.inf],
    labels=snowfall_labels,
)

plt.figure(figsize=(8,5))
sns.countplot(data=df, x="snowfall_bin", order=snowfall_labels, color="steelblue")
plt.title("Accidents by Snowfall Intensity")
plt.xlabel("Snowfall Category")
plt.ylabel("Number of Accidents")
plt.show()

In [None]:
# Distribution of mean snowfall within each Seriousness level, with an
# embedded box plot inside every violin.
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=df, x="Seriousness", y="snowfall_mean", inner="box", ax=ax)
ax.set_title("Snowfall vs Accident Seriousness")
ax.set_ylabel("Snowfall (cm)")
# Tilt the category labels so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=30)
plt.show()

In [None]:
# Scatter of mean snowfall against vehicle mass, one colour per Seriousness value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="snowfall_mean",
    y="vehicle_mass",
    hue="Seriousness",
    alpha=0.6,
    ax=ax,
)
ax.set(
    title="Snowfall vs Vehicle Mass (Colored by Seriousness)",
    xlabel="Snowfall (cm)",
    ylabel="Vehicle Mass (kg)",
)
plt.show()

In [None]:
# Average of the per-accident mean snowfall for every month value.
snow_month = df.groupby("month", as_index=False)["snowfall_mean"].mean()

fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(data=snow_month, x="month", y="snowfall_mean", marker="o", ax=ax)
ax.set(
    title="Average Snowfall per Accident by Month",
    xlabel="Month",
    ylabel="Mean Snowfall (cm)",
)
plt.show()

In [None]:
# Raw lon/lat scatter of the accidents, shaded by mean snowfall.
# The plot is skipped entirely when either coordinate column is absent.
if {"lat", "lon"}.issubset(df.columns):
    fig, ax = plt.subplots(figsize=(8, 6))
    points = ax.scatter(
        df["lon"],
        df["lat"],
        c=df["snowfall_mean"],
        cmap="Blues",
        alpha=0.5,
        s=15,
    )
    fig.colorbar(points, ax=ax, label="Snowfall (cm)")
    ax.set(
        title="Geographical Distribution of Accidents with Snowfall",
        xlabel="Longitude",
        ylabel="Latitude",
    )
    plt.show()

In [None]:
# Pairwise correlation between vehicle mass and the aggregated snowfall
# features, shown as an annotated heatmap.
num_cols = [
    "vehicle_mass",
    "snowfall_mean",
    "snowfall_max",
    "snowfall_sum",
]
corr = df.loc[:, num_cols].corr()

fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation of Snowfall with Vehicle Mass")
plt.show()

In [None]:
# Accident counts per day of week and snowfall level.
# Bins are right-closed, so rows with snowfall_max of exactly 0 (or above 50)
# fall outside every interval and are not counted here.
snow_level = pd.cut(df["snowfall_max"], bins=[0, 1, 5, 10, 20, 50])

# fill_value=0 turns day/level combinations with no accidents into an explicit
# zero. Otherwise they can surface as NaN, which makes the table float-typed
# and breaks the integer annotation format (fmt="d") below.
# observed=False is spelled out so the behaviour does not depend on the pandas
# version's default for categorical groupers.
pivot = df.pivot_table(
    values="id",
    index="day_of_week",
    columns=snow_level,
    aggfunc="count",
    fill_value=0,
    observed=False,
)
# Counts are whole numbers; make the dtype match the "d" format.
pivot = pivot.astype(int)

plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=True, fmt="d", cmap="Blues")
plt.title("Accident Count by Day of Week and Snowfall Level")
plt.xlabel("Snowfall (cm bins)")
plt.ylabel("Day of Week")
plt.show()

In [None]:
# --- Load Finland base map ---
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
finland = world[world['ADMIN'] == 'Finland']

# Reproject to Finnish CRS (EPSG:3067) so the axes are in metres
target_crs = 'EPSG:3067'
finland = finland.to_crs(target_crs)

# --- Prepare accident rows ---
# Keep only rows that can actually be drawn: both coordinates present and at
# least one valid snowfall reading. Filtering BEFORE plotting guarantees that
# the marker-size array below has exactly one entry per plotted point.
snow_depth_cols = ["snowfall_1", "snowfall_2", "snowfall_3", "snowfall_4", "snowfall_5"]
df_snow = df.dropna(subset=["lat", "lon"]).copy()

# --- Use max snowfall ---
# Coercion is repeated here so the cell does not depend on the feature cell
# having been run first; unparseable entries become NaN.
df_snow["snowfall_max"] = (
    df_snow[snow_depth_cols]
    .apply(pd.to_numeric, errors="coerce")
    .max(axis=1, skipna=True)
)
df_snow = df_snow.dropna(subset=["snowfall_max"])

# --- Convert accident dataset to GeoDataFrame ---
# Vectorized point construction; lon/lat are declared as EPSG:4326 and then
# reprojected to the target CRS.
gdf = gpd.GeoDataFrame(
    df_snow,
    geometry=gpd.points_from_xy(df_snow["lon"], df_snow["lat"]),
    crs='EPSG:4326',
)
gdf = gdf.to_crs(target_crs)

# Marker area scales with snowfall (x5). Negative values are clipped to zero
# because a marker size cannot be negative. Note that zero-snowfall accidents
# therefore get size 0 and are effectively invisible on this map.
marker_sizes = gdf["snowfall_max"].clip(lower=0).to_numpy() * 5

# --- Plot ---
fig, ax = plt.subplots(1, 1, figsize=(14, 18))

finland.plot(ax=ax, color='lightgray', edgecolor='black')

# Plot accidents with snowfall depth as color & size.
# (GeoDataFrame.plot returns the Axes; the name `scatter` is kept as-is.)
scatter = gdf.plot(
    ax=ax,
    column="snowfall_max",
    cmap="Blues",
    markersize=marker_sizes,
    alpha=0.6,
    legend=True,
    legend_kwds={'label': "Snowfall Depth (cm)", 'orientation': "vertical"}
)

# Title & labels
ax.set_title("Accidents in Finland with Snowfall Depth", fontsize=16)
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")

plt.show()