In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the accident table; the path is relative to the notebook's working directory.
ACCIDENTS_CSV = "data-2014-2021-wo-missing.csv"
df = pd.read_csv(ACCIDENTS_CSV)

In [None]:
# List the column labels so later cells can be checked against the real schema.
column_index = df.columns
print("Columns in dataset:", column_index)

In [None]:
# Force vehicle_mass to a numeric dtype; values that cannot be parsed become NaN
# (errors="coerce") instead of raising.
mass_as_number = pd.to_numeric(df["vehicle_mass"], errors="coerce")
df["vehicle_mass"] = mass_as_number

In [None]:
# Bucket vehicle_mass into four weight classes.
# NOTE(review): edges are presumably kilograms (the labels are in tonnes) — confirm.
# pd.cut uses right-closed intervals by default, so the classes are
# (0, 2000], (2000, 5000], (5000, 20000] and (20000, inf]; masses <= 0 and NaN
# masses stay uncategorised (NaN).
# The top edge is np.inf so that every mass above 20 t lands in "Heavy"; a finite
# cap would silently turn anything above it into NaN.
bins = [0, 2000, 5000, 20000, np.inf]
labels = ["Light (<2t)", "Medium Light (2–5t)", "Medium Heavy (5-20t)", "Heavy (>20t)"]
df['vehicle_mass_cat'] = pd.cut(df['vehicle_mass'], bins=bins, labels=labels)

In [None]:
# Show the DataFrame as the cell's rich output (it now includes vehicle_mass_cat).
df

In [None]:
# Bar chart: number of rows per day_of_week value, bars ordered by the sorted index.
accidents_per_day = df["day_of_week"].value_counts().sort_index()

day_fig, day_ax = plt.subplots(figsize=(8, 5))
accidents_per_day.plot.bar(ax=day_ax, color="skyblue")
day_ax.set_title("Accidents by Day of Week")
day_ax.set_xlabel("Day of Week")
day_ax.set_ylabel("Accident Count")
plt.show()

In [None]:
# Figure 1: how many rows carry each Seriousness value (most frequent first).
seriousness_counts = df["Seriousness"].value_counts()

sev_fig, sev_ax = plt.subplots(figsize=(6, 5))
seriousness_counts.plot.bar(ax=sev_ax, color="orange")
sev_ax.set_title("Distribution of Accident Seriousness")
sev_ax.set_xlabel("Seriousness")
sev_ax.set_ylabel("Count")
plt.show()

# Figure 2: the same breakdown per year, one bar group per year coloured by Seriousness.
year_fig, year_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="year", hue="Seriousness", ax=year_ax)
year_ax.set_title("Accident Seriousness by Year")
plt.xticks(rotation=45)  # acts on the current axes, i.e. year_ax
plt.show()


In [None]:
# Pie chart: share of accident rows in each vehicle mass category.
# Slices must be sized by the number of rows per category. Passing values="id"
# makes Plotly sum the identifier column within each category, which is not a count.
# groupby drops rows whose category is NaN; observed=True keeps only categories
# that actually occur.
mass_cat_counts = (
    df.groupby("vehicle_mass_cat", observed=True)
    .size()
    .reset_index(name="accident_count")
)
# Plain string labels, matching how the other plotting cells treat this column.
mass_cat_counts["vehicle_mass_cat"] = mass_cat_counts["vehicle_mass_cat"].astype(str)

fig1 = px.pie(
    mass_cat_counts,
    values="accident_count",
    names="vehicle_mass_cat",
    title="Distribution of Vehicle Mass in Accidents",
    labels={
        "vehicle_mass_cat": "Vehicle Mass Category",
        "accident_count": "Accidents",
    },
)
fig1.show()

In [None]:
# Violin plot: distribution of vehicle_mass (y) for each Seriousness value (x).
violin_spec = dict(
    x="Seriousness",
    y="vehicle_mass",
    title="Vehicle Mass by Accident Seriousness",
)
fig2 = px.violin(df, **violin_spec)
fig2.show()

In [None]:
# Stacked bar chart: accidents per day_of_week, split by vehicle mass category.
# matplotlib (plt) and seaborn are imported once in the notebook's first cell, so
# they are not re-imported here.

# Keep only rows that have a mass category and turn the labels into plain strings.
df_bar = df.dropna(subset=["vehicle_mass_cat"]).copy()
df_bar["vehicle_mass_cat"] = df_bar["vehicle_mass_cat"].astype(str)

# Rows = day_of_week, columns = mass category, cells = number of accident rows.
crosstab = pd.crosstab(df_bar["day_of_week"], df_bar["vehicle_mass_cat"])

fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
crosstab.plot(kind="bar", stacked=True, ax=ax_bar, colormap="tab20")
ax_bar.set_title("Accidents by Day of Week and Vehicle Mass Category (Stacked)")
ax_bar.set_xlabel("Day of Week")
ax_bar.set_ylabel("Number of Accidents")
ax_bar.legend(title="Vehicle Mass Category")
fig_bar.tight_layout()
plt.show()

In [None]:
# Yearly trend: one row per year holding the number of accident records in that year.
accidents_by_year = (
    df.groupby("year")
    .size()
    .rename("accident_count")
    .reset_index()
)

fig4 = px.line(accidents_by_year, x="year", y="accident_count", title="Yearly Accident Trends")
fig4.show()

In [None]:
# Map of Finland with accident locations coloured by vehicle mass category.

# Read the country polygons shapefile and keep the Finland row only.
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
finland = world[world['ADMIN'] == 'Finland']

# Project to ETRS89 / TM35FIN (EPSG:3067); the axes below are labelled in metres.
target_crs = 'EPSG:3067'
finland = finland.to_crs(target_crs)

# Keep rows that have a category and both coordinates; use plain string labels.
df_map = df.dropna(subset=["vehicle_mass_cat", "lat", "lon"]).copy()
df_map["vehicle_mass_cat"] = df_map["vehicle_mass_cat"].astype(str)

# Build all point geometries in one vectorised call (x = lon, y = lat) rather than
# constructing a shapely Point per row in Python. The coordinates are declared as
# EPSG:4326 and then reprojected to the target CRS.
geometry = gpd.points_from_xy(df_map['lon'], df_map['lat'])
gdf = gpd.GeoDataFrame(df_map, geometry=geometry, crs='EPSG:4326')
gdf = gdf.to_crs(target_crs)

# One tab20 colour per category, in order of first appearance in the data.
categories = gdf["vehicle_mass_cat"].unique()
colors = plt.cm.tab20(np.linspace(0, 1, len(categories)))
color_dict = dict(zip(categories, colors))

# Base map first, then one scatter layer per category so each gets a legend entry.
fig, ax = plt.subplots(1, 1, figsize=(30, 30))
finland.plot(ax=ax, color='lightgray', edgecolor='black')

for cat in categories:
    subset = gdf[gdf["vehicle_mass_cat"] == cat]
    subset.plot(ax=ax, marker='o', color=color_dict[cat], markersize=10, alpha=0.5, label=cat, zorder=5)

ax.set_title("Coordinates on a Map of Finland by Vehicle Mass Category")
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")
ax.legend(title="Vehicle Mass Category")
plt.show()

In [None]:
# Map of Finland with every accident location drawn as a small blue cross.

# Read the country polygons shapefile and keep the Finland row only.
world = gpd.read_file('ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp')
finland = world[world['ADMIN'] == 'Finland']

# Project to ETRS89 / TM35FIN (EPSG:3067); the axes below are labelled in metres.
target_crs = 'EPSG:3067'
finland = finland.to_crs(target_crs)

# Drop rows without coordinates (as the category map cell does) so no invalid
# points are created, and work on a copy so `df` itself is left untouched.
df_points = df.dropna(subset=["lat", "lon"]).copy()

# Vectorised point construction (x = lon, y = lat), declared as EPSG:4326 and
# then reprojected to the target CRS.
geometry = gpd.points_from_xy(df_points['lon'], df_points['lat'])
gdf = gpd.GeoDataFrame(df_points, geometry=geometry, crs='EPSG:4326')
gdf = gdf.to_crs(target_crs)

fig, ax = plt.subplots(1, 1, figsize=(30, 30))
finland.plot(ax=ax, color='lightgray', edgecolor='black')
gdf.plot(ax=ax, marker='x', color='blue', markersize=2, zorder=5, alpha=0.5)

# The earlier per-point ax.text(x, y, "") loop was removed: it added one empty
# Text artist per accident, which draws nothing but makes rendering very slow.

ax.set_title("Coordinates on a Map of Finland")
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")

plt.show()