### Import Libraries

In [401]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from xgboost import XGBRegressor

In [402]:

# Project root used to build output paths. Defaults to the original local
# checkout; set the ML_SHOWCASE_ROOT environment variable to run the notebook
# from a different machine or directory without editing this cell.
root_dir = os.environ.get("ML_SHOWCASE_ROOT", r"F:\PROJECT\ml-streamlit-showcase")

### Import Dataset

In [403]:
# Load the raw motorcycle risk data from the workbook next to the notebook.
dataset = pd.read_excel(io='motorcycle_risk_factor.xlsx')

In [404]:
# Preview the first five rows to check the column layout.
dataset.head(n=5)

Unnamed: 0,rider_age,rider_experience_years,speed,helmet_used,alcohol_detected,recent_violations_count,road_surface,light_condition,brake_condition,weather_condition,road_type,risk_factor
0,60,19,51.398327,1,0,0,Mud,Dusk,0.151612,Windy,Highway,0.725473
1,51,18,39.634335,1,0,0,Mud,Dawn,0.711764,Windy,Urban,0.214512
2,79,4,38.527286,1,0,1,Wet,Dawn,0.015046,Snowy,Rural,0.869034
3,49,33,59.817724,1,0,1,Mud,Daylight,0.190475,Snowy,Suburban,0.709622
4,56,21,34.131623,1,0,0,Gravel,Dusk,0.822539,Foggy,Highway,0.19012


In [405]:
# Keep two decimal places for the continuous columns; all other columns are
# left exactly as loaded.
dataset = dataset.round({'speed': 2, 'brake_condition': 2, 'risk_factor': 2})

### Data Pre-processing

In [406]:
# Every column except the last is a feature; the last column (risk_factor)
# is the regression target.
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [407]:
# Show the first ten feature rows under a heading.
print("First 10 rows of X:", X[:10], sep="\n")

First 10 rows of X:
[[60 19 51.4 1 0 0 'Mud' 'Dusk' 0.15 'Windy' 'Highway']
 [51 18 39.63 1 0 0 'Mud' 'Dawn' 0.71 'Windy' 'Urban']
 [79 4 38.53 1 0 1 'Wet' 'Dawn' 0.02 'Snowy' 'Rural']
 [49 33 59.82 1 0 1 'Mud' 'Daylight' 0.19 'Snowy' 'Suburban']
 [56 21 34.13 1 0 0 'Gravel' 'Dusk' 0.82 'Foggy' 'Highway']
 [67 34 63.32 0 0 0 'Mud' 'Dusk' 0.25 'Foggy' 'Urban']
 [24 3 58.41 1 0 2 'Mud' 'Daylight' 0.66 'Rainy' 'Rural']
 [14 23 50.79 1 0 0 'Mud' 'Dawn' 0.23 'Clear' 'Urban']
 [13 29 60.28 1 0 1 'Wet' 'Dusk' 0.08 'Foggy' 'Urban']
 [32 14 45.15 0 0 0 'Mud' 'Daylight' 0.01 'Clear' 'Rural']]


In [408]:
# One-hot encode the categorical columns (positions 6, 7, 9 and 10 of X:
# road_surface, light_condition, weather_condition, road_type) and pass the
# remaining numeric columns through unchanged.
#
# sparse_threshold=0 forces a dense result. OneHotEncoder emits sparse output
# by default, and the ColumnTransformer returns a sparse matrix whenever the
# stacked output is sparse enough; np.array() on a SciPy sparse matrix yields
# a 0-d object array rather than a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [6, 7, 9, 10])
    ],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [409]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [410]:
# Standardise the features: the scaler learns its statistics from the training
# split only and the same statistics are then applied to the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Linear Regression

In [411]:
# Linear-regression baseline trained on the scaled training split.
linear_regressor = LinearRegression()
model_fit = linear_regressor.fit(X=X_train, y=y_train)

In [412]:
# Predict risk for the held-out rows with the linear baseline.
y_pred = linear_regressor.predict(X=X_test)

In [413]:
# Score the linear-regression predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_test, y_pred)
print(
    f'Mean Absolute Error: {mae}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Absolute Error: 0.12197642593822661
Mean Squared Error: 0.021475017375242295
Root Mean Squared Error: 0.14654356818107814
R^2 Score: 0.679246424875071


In [414]:
# Tabulate true targets next to the linear-regression predictions.
comparison_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
print(comparison_df)

      Actual  Predicted
0       0.91   0.940007
1       0.77   0.598584
2       0.43   0.466565
3       0.85   0.724090
4       0.85   0.571108
...      ...        ...
1995    0.97   0.765759
1996    0.75   0.842938
1997    0.25   0.169331
1998    0.74   0.910849
1999    0.73   0.819400

[2000 rows x 2 columns]


### Random Forest

In [415]:
# Random forest of 100 trees with a fixed seed, trained on the scaled training split.
randomforest_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf_model_fit = randomforest_regressor.fit(X=X_train, y=y_train)

In [416]:
# Predict risk for the held-out rows with the fitted random forest.
y_pred = rf_model_fit.predict(X=X_test)

In [417]:
# Score the random-forest predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # derived from the MSE above
print(
    f'Mean Absolute Error: {mae}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Absolute Error: 0.02210220000000001
Mean Squared Error: 0.0012110139900000005
Root Mean Squared Error: 0.03479962629109687
R^2 Score: 0.9819121419074325


In [418]:
# Tabulate true targets next to the random-forest predictions.
comparison_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
print(comparison_df)

      Actual  Predicted
0       0.91     0.9010
1       0.77     0.7794
2       0.43     0.4288
3       0.85     0.7749
4       0.85     0.6137
...      ...        ...
1995    0.97     0.8723
1996    0.75     0.7841
1997    0.25     0.2516
1998    0.74     0.7812
1999    0.73     0.7745

[2000 rows x 2 columns]


### XGBoost

In [419]:
# Gradient-boosted trees (100 boosting rounds, fixed seed) trained on the
# scaled training split.
xgb = XGBRegressor(
    n_estimators=100,
    random_state=42,
)
xgb_model_fit = xgb.fit(X_train, y_train)

In [420]:
# Predict risk for the held-out rows with the fitted XGBoost model.
y_pred = xgb_model_fit.predict(X=X_test)

In [421]:
# Score the XGBoost predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # derived from the MSE above
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(
    f'Mean Absolute Error: {mae}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Absolute Error: 0.019743113104701046
Mean Squared Error: 0.0012977366086757063
Root Mean Squared Error: 0.03602411149044076
R^2 Score: 0.980616841908444


In [422]:
# Tabulate true targets next to the XGBoost predictions.
comparison_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
print(comparison_df)

      Actual  Predicted
0       0.91   0.892893
1       0.77   0.774059
2       0.43   0.429869
3       0.85   0.773145
4       0.85   0.568028
...      ...        ...
1995    0.97   0.940508
1996    0.75   0.794220
1997    0.25   0.246803
1998    0.74   0.772275
1999    0.73   0.753151

[2000 rows x 2 columns]


### Neural Network

In [423]:
# Feed-forward regressor: two hidden ReLU layers (64 and 32 units) followed by
# a single-unit output layer.
# The input width is declared with an explicit Input object instead of passing
# `input_shape=` to the first Dense layer, which Keras warns against for
# Sequential models.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),  # no activation: unbounded output for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [424]:
# Optimise mean squared error with Adam and track mean absolute error alongside it.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae'],
)

In [425]:
# Train for 50 epochs in mini-batches of 32, holding out 20% of the training
# split for per-epoch validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1381 - mae: 0.2726 - val_loss: 0.0318 - val_mae: 0.1418
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0263 - mae: 0.1298 - val_loss: 0.0222 - val_mae: 0.1187
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0181 - mae: 0.1070 - val_loss: 0.0179 - val_mae: 0.1044
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0153 - mae: 0.0981 - val_loss: 0.0164 - val_mae: 0.1012
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0131 - mae: 0.0898 - val_loss: 0.0138 - val_mae: 0.0913
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0112 - mae: 0.0826 - val_loss: 0.0131 - val_mae: 0.0879
Epoch 7/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - 

In [426]:
# evaluate() returns the compiled loss (MSE) followed by the compiled metric (MAE).
# Note: this rebinds `mae`, which earlier cells used for the tree/linear models.
loss, mae = model.evaluate(x=X_test, y=y_test)
print(f"Test MAE: {mae}", f"Test Loss: {loss}", sep="\n")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 966us/step - loss: 0.0052 - mae: 0.0484
Test MAE: 0.04806695133447647
Test Loss: 0.004823956172913313


In [427]:
# Predict risk for the held-out rows with the trained network.
y_pred = model.predict(x=X_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 884us/step


In [428]:
# Collapse the network output to a 1-D array (as an independent copy) so it can
# sit next to y_test as a DataFrame column.
y_pred = y_pred.reshape(-1).copy()

In [429]:
# Tabulate true targets next to the neural-network predictions.
comparison_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
print(comparison_df)

      Actual  Predicted
0       0.91   0.918034
1       0.77   0.717261
2       0.43   0.448526
3       0.85   0.835018
4       0.85   0.437358
...      ...        ...
1995    0.97   0.903863
1996    0.75   0.756298
1997    0.25   0.204022
1998    0.74   0.702000
1999    0.73   0.719910

[2000 rows x 2 columns]


### Choosing the model and Exporting

- Based on the evaluation metrics, XGBoost is preferred: it has the lowest MAE (≈0.020), and its RMSE and R² are on par with Random Forest.

In [430]:
# Build the export path from separate components so os.path.join supplies the
# separator. A literal such as "models\m..." depends on "\m" being left alone,
# which Python reports as an invalid escape sequence.
model_path = os.path.join(root_dir, "models", "motorcycle-risk-predictor.pkl")
model_path

'F:\\PROJECT\\ml-streamlit-showcase\\models\\motorcycle-risk-predictor.pkl'

In [431]:
# Rebuild features/target as a DataFrame/Series so the exported pipeline can
# select columns by name and accept raw, un-encoded input rows.
X = dataset.drop("risk_factor", axis=1)
y = dataset["risk_factor"]

# Column names
numeric_features = ['rider_age', 'rider_experience_years', 'speed','helmet_used','alcohol_detected','recent_violations_count', 'brake_condition']
categorical_features = ['road_surface', 'light_condition', 'weather_condition', 'road_type']

# Preprocessing pipeline: scale the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps prediction from failing on a
# category value that was not present during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Full pipeline (preprocessing + model). The regressor uses the same
# hyperparameters and seed as the XGBoost model evaluated earlier in the
# notebook, so the exported model matches the one the metrics were reported for.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=100, random_state=42))
])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the full pipeline
model_pipeline.fit(X_train, y_train)

In [432]:
# Save the fitted pipeline (preprocessing + model) as a single pickle file.
# open(..., 'wb') does not create missing parent directories, so make sure the
# target folder exists first.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(model_pipeline, f)

### Loading the model and testing

In [433]:
# Read the pickled pipeline back from disk to confirm the export round-trips.
with open(model_path, mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

In [434]:
# One hand-written scenario, with a single value for every feature column.
input_dict = dict(
    rider_age=[19],
    rider_experience_years=[1],
    speed=[100.0],
    helmet_used=[1],
    alcohol_detected=[0],
    recent_violations_count=[0],
    road_surface=['Gravel'],
    light_condition=['Dusk'],
    brake_condition=[0.1],
    weather_condition=['Clear'],
    road_type=['Urban'],
)

# One-row frame whose column names match those the pipeline was fitted on.
input_df = pd.DataFrame(data=input_dict)

# Score the scenario with the reloaded pipeline and print the single prediction.
prediction = loaded_model.predict(input_df)
print("Predicted risk score:", prediction[0])

Predicted risk score: 0.81853455
