<a href="https://colab.research.google.com/github/exp0nent/Kaggle_Practice/blob/main/Predicting_Road_Accident_Risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===========================
# 📦 Standard / Core Libraries
# ===========================
import warnings
import numpy as np
import pandas as pd

# ===========================
# 📊 Visualization
# ===========================
import matplotlib.pyplot as plt
import seaborn as sns

# ===========================
# 🧠 scikit-learn (modeling / preprocessing)
# ===========================
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ===========================
# 🚀 Boosting Libraries
# ===========================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# ===========================
# 🔧 Settings / Warnings / Notebook
# ===========================
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the libraries above are hidden too.
warnings.filterwarnings("ignore")
# Render all DataFrame columns instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Plot styling is applied in two steps: seaborn's "whitegrid" theme first,
# then matplotlib's "fivethirtyeight" style sheet on top of it.
# NOTE(review): the second call presumably overrides any settings both define — keep this order.
sns.set(style="whitegrid")
plt.style.use("fivethirtyeight")
%matplotlib inline

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used when loading the data point below this mount.
# NOTE(review): google.colab is only importable inside Google Colab, and this import sits outside the top import cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the source CSV files on the mounted Google Drive
train_path = "/content/drive/MyDrive/Predicting Road Accident Risk Data/train.csv"
test_path = "/content/drive/MyDrive/Predicting Road Accident Risk Data/test.csv"
original_path = "/content/drive/MyDrive/Predicting Road Accident Risk Data/synthetic_road_accidents_100k.csv"

# Read each CSV and tag its rows with a 'dataset' label in the same step
df_train = pd.read_csv(train_path).assign(dataset='train')
df_test = pd.read_csv(test_path).assign(dataset='test')
# NOTE(review): df_original is labelled 'train' but is not part of the concat below — confirm this is intended
df_original = pd.read_csv(original_path).assign(dataset='train')

# Stack train on top of test and renumber rows 0..n-1 so both go through the same preprocessing
df = pd.concat([df_train, df_test], ignore_index=True)

# Report the combined dimensions, then show the frame as the cell output
print("Dataset shape:", df.shape)
df


Dataset shape: (690339, 15)


Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk,dataset
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13,train
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35,train
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.30,train
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21,train
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690334,690334,rural,2,0.01,45,dim,rainy,False,False,afternoon,True,True,2,,test
690335,690335,rural,1,0.74,70,daylight,foggy,False,True,afternoon,False,False,2,,test
690336,690336,urban,2,0.14,70,dim,clear,False,False,evening,True,True,1,,test
690337,690337,urban,1,0.09,45,daylight,foggy,True,True,morning,False,True,0,,test


In [None]:
# Display the (rows, columns) dimensions of the combined train + test frame.
df.shape

(690339, 15)

In [None]:
# 📋 Check column types and non-null counts
# A non-null count below the total row count marks a column with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690339 entries, 0 to 690338
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      690339 non-null  int64  
 1   road_type               690339 non-null  object 
 2   num_lanes               690339 non-null  int64  
 3   curvature               690339 non-null  float64
 4   speed_limit             690339 non-null  int64  
 5   lighting                690339 non-null  object 
 6   weather                 690339 non-null  object 
 7   road_signs_present      690339 non-null  bool   
 8   public_road             690339 non-null  bool   
 9   time_of_day             690339 non-null  object 
 10  holiday                 690339 non-null  bool   
 11  school_season           690339 non-null  bool   
 12  num_reported_accidents  690339 non-null  int64  
 13  accident_risk           517754 non-null  float64
 14  dataset                 690339 non-null  object 

In [None]:
# ✅ Partition the column names by dtype: float64/int64 are treated as numerical, object/bool as categorical
numerical_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(df.select_dtypes(include=['object', 'bool']).columns)

# Echo both lists so the split can be checked by eye
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['id', 'num_lanes', 'curvature', 'speed_limit', 'num_reported_accidents', 'accident_risk']
Categorical Columns: ['road_type', 'lighting', 'weather', 'road_signs_present', 'public_road', 'time_of_day', 'holiday', 'school_season', 'dataset']


In [None]:
# 🔍 Summarise missing data: null count per column and its share of all rows (in percent)
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

# Keep only the columns that actually contain at least one missing value
missing_df = (
    pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
    .loc[lambda summary: summary['Missing Values'] > 0]
)
missing_df

Unnamed: 0,Missing Values,Percentage
accident_risk,172585,25.000036


In [None]:
# 📊 Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.loc[:, numerical_cols].describe()

Unnamed: 0,id,num_lanes,curvature,speed_limit,num_reported_accidents,accident_risk
count,690339.0,690339.0,690339.0,690339.0,690339.0,517754.0
mean,345169.0,2.492145,0.488355,46.110121,1.187492,0.352377
std,199283.848077,1.120113,0.272509,15.788149,0.896261,0.166417
min,0.0,1.0,0.0,25.0,0.0,0.0
25%,172584.5,1.0,0.26,35.0,1.0,0.23
50%,345169.0,2.0,0.51,45.0,1.0,0.34
75%,517753.5,3.0,0.71,60.0,2.0,0.46
max,690338.0,4.0,1.0,70.0,7.0,1.0


In [None]:
# 🔢 Print the frequency of every distinct value, one categorical column at a time
for col in categorical_cols:
    # Header and counts go out in a single call, separated by a newline
    print(f"\nUnique values in '{col}':", df[col].value_counts(), sep="\n")


Unique values in 'road_type':
road_type
highway    231752
rural      230128
urban      228459
Name: count, dtype: int64

Unique values in 'lighting':
lighting
dim         244969
daylight    237412
night       207958
Name: count, dtype: int64

Unique values in 'weather':
weather
foggy    241699
clear    239288
rainy    209352
Name: count, dtype: int64

Unique values in 'road_signs_present':
road_signs_present
False    346382
True     343957
Name: count, dtype: int64

Unique values in 'public_road':
public_road
True     346718
False    343621
Name: count, dtype: int64

Unique values in 'time_of_day':
time_of_day
morning      231157
evening      230466
afternoon    228716
Name: count, dtype: int64

Unique values in 'holiday':
holiday
True     347813
False    342526
Name: count, dtype: int64

Unique values in 'school_season':
school_season
False    346710
True     343629
Name: count, dtype: int64

Unique values in 'dataset':
dataset
train    517754
test     172585
Name: count, dtype: int64

Exploratory Data Analysis (EDA)