In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler, OrdinalEncoder, LabelEncoder

In [2]:
df = pd.read_csv("../artifacts/train.csv")

df.head(2)

Unnamed: 0,weekday,driver_age,driver_sex,educational_level,driver_relation,driving_exp,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause,accident_severity
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right,Slight Injury


In [3]:
df.shape

(8621, 31)

In [4]:
numerical_columns = df.select_dtypes(exclude="object").columns

numerical_columns

Index(['num_of_vehicles', 'casualty'], dtype='object')

In [5]:
categorical_columns = df.select_dtypes(include="object").columns

categorical_columns = list(df.drop(columns=["accident_severity", "accident_area"], axis=1))

categorical_columns

['weekday',
 'driver_age',
 'driver_sex',
 'educational_level',
 'driver_relation',
 'driving_exp',
 'vehicle_type',
 'vehicle_owner',
 'service_year',
 'vehicle_defect',
 'lanes',
 'road_alignment',
 'junction_type',
 'road_type',
 'road_conditions',
 'light_condition',
 'weather_condition',
 'collision_type',
 'num_of_vehicles',
 'casualty',
 'vehicle_movement',
 'casualty_class',
 'casualty_sex',
 'casualty_age',
 'casualty_severity',
 'casualty_work',
 'casualty_fitness',
 'pedestrian_movement',
 'accident_cause']

In [6]:
target_column_name = "accident_severity"

In [7]:
num_pipeline = Pipeline(
steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline(
steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinalencoder", OrdinalEncoder()),
    ("scaler", StandardScaler())
])

In [8]:
preprocessor = ColumnTransformer(
[
    ("num_pipeline", num_pipeline, numerical_columns),
    ("cat_pipeline", cat_pipeline, categorical_columns)
])

In [9]:
#target_column_name = "accident_severity"

In [10]:
numerical_columns

Index(['num_of_vehicles', 'casualty'], dtype='object')

In [11]:
train_df = pd.read_csv("../artifacts/train.csv")

In [12]:
test_df = pd.read_csv("../artifacts/test.csv")

In [13]:
input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df=train_df[target_column_name]

In [14]:
input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df=test_df[target_column_name]

In [15]:
input_feature_train_arr=preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessor.transform(input_feature_test_df)

In [16]:
input_feature_train_arr

array([[-5.61503344e-02,  4.55650649e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01,  1.00056879e+00],
       [-5.61503344e-02,  4.55650649e-01, -4.71299815e-01, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [-5.61503344e-02,  4.55650649e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.28830845e+00,  6.07293500e-01],
       ...,
       [-5.61503344e-02, -5.44391165e-01,  1.39594356e-02, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [ 1.40189796e+00,  1.45569246e+00,  1.46973719e+00, ...,
         9.55695443e-04,  1.28830845e+00, -1.35908294e+00],
       [-5.61503344e-02,  4.55650649e-01, -1.44181832e+00, ...,
         9.55695443e-04,  1.86035014e-01,  1.00056879e+00]])

In [17]:
input_feature_test_arr

array([[-5.61503344e-02, -5.44391165e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [-5.61503344e-02, -5.44391165e-01, -9.56559065e-01, ...,
         9.55695443e-04, -4.22305874e+00,  1.00056879e+00],
       [-1.51419863e+00, -5.44391165e-01, -4.71299815e-01, ...,
         9.55695443e-04,  1.86035014e-01, -3.75894719e-01],
       ...,
       [-5.61503344e-02, -5.44391165e-01, -9.56559065e-01, ...,
         9.55695443e-04,  1.86035014e-01, -7.69170006e-01],
       [ 2.85994625e+00, -5.44391165e-01,  4.99218686e-01, ...,
         9.55695443e-04,  1.86035014e-01, -7.69170006e-01],
       [-5.61503344e-02, -5.44391165e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00]])

In [18]:
input_feature_train_arr.shape

(8621, 31)

In [19]:
target_feature_train_df

0        Slight Injury
1        Slight Injury
2       Serious Injury
3       Serious Injury
4        Slight Injury
             ...      
8616     Slight Injury
8617     Slight Injury
8618     Slight Injury
8619     Slight Injury
8620     Slight Injury
Name: accident_severity, Length: 8621, dtype: object

In [20]:
le = LabelEncoder()

le.fit_transform(target_feature_train_df)

array([2, 2, 1, ..., 2, 2, 2])

In [21]:
le.transform(target_feature_test_df)

array([2, 1, 1, ..., 2, 2, 2])

## Feature Selection

In [49]:
data = pd.read_csv("../artifacts/train.csv")

In [50]:
X = data.drop("accident_severity", axis=1)
y = data["accident_severity"]

In [51]:
X

Unnamed: 0,weekday,driver_age,driver_sex,educational_level,driver_relation,driving_exp,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,casualty,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,2,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,2,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right
2,Wednesday,18-30,Male,Junior high school,Employee,No Licence,Other,Owner,1-2yr,No defect,...,2,Other,Pedestrian,Male,31-50,3,Driver,Normal,Unknown or other,No distancing
3,Monday,18-30,Male,Elementary school,Owner,2-5yr,Turbo,Owner,,No defect,...,2,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Changing lane to the left
4,Friday,Under 18,Male,Junior high school,Employee,2-5yr,Automobile,Owner,Unknown,No defect,...,2,Going straight,Driver or rider,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,Wednesday,31-50,Male,Junior high school,Employee,2-5yr,Public (12 seats),Owner,Unknown,No defect,...,1,Going straight,Passenger,Female,18-30,3,Driver,Normal,Not a Pedestrian,No priority to pedestrian
8617,Saturday,Over 51,Male,Junior high school,Employee,1-2yr,Stationwagen,Owner,,No defect,...,3,Going straight,Driver or rider,Female,18-30,3,,Normal,Not a Pedestrian,No priority to pedestrian
8618,Sunday,18-30,Male,High school,Employee,2-5yr,Public (> 45 seats),Owner,Unknown,,...,1,Going straight,na,na,na,na,Driver,Normal,Not a Pedestrian,Changing lane to the right
8619,Wednesday,Under 18,Male,Junior high school,Owner,2-5yr,Automobile,Owner,Above 10yr,No defect,...,3,Going straight,Pedestrian,Male,31-50,3,,,Unknown or other,Changing lane to the left


In [52]:
y

0        Slight Injury
1        Slight Injury
2       Serious Injury
3       Serious Injury
4        Slight Injury
             ...      
8616     Slight Injury
8617     Slight Injury
8618     Slight Injury
8619     Slight Injury
8620     Slight Injury
Name: accident_severity, Length: 8621, dtype: object

In [54]:
# Encoding categorical features
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

In [55]:
X

Unnamed: 0,weekday,driver_age,driver_sex,educational_level,driver_relation,driving_exp,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,casualty,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause
0,6,2,1,6,0,4,13,3,5,2,...,2,7,1,0,1,2,0,2,5,12
1,2,3,1,4,0,3,0,3,6,2,...,2,2,0,1,0,2,7,5,5,1
2,6,0,1,4,0,5,7,3,0,2,...,2,4,2,1,1,2,0,2,6,10
3,1,0,1,1,2,1,16,3,6,2,...,2,2,3,2,5,3,0,2,5,0
4,0,3,1,4,0,1,0,3,5,2,...,2,2,0,0,1,2,0,2,5,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,6,1,1,4,0,1,9,3,5,2,...,1,2,1,0,0,2,0,2,5,11
8617,2,2,1,4,0,0,14,3,6,2,...,3,2,0,0,0,2,7,2,5,11
8618,3,0,1,2,0,1,11,3,5,3,...,1,2,3,2,5,3,0,2,5,1
8619,6,3,1,4,2,1,0,3,3,2,...,3,2,2,1,1,2,7,5,6,0


In [56]:
y = le.fit_transform(y)

In [57]:
y

array([2, 2, 1, ..., 2, 2, 2])

In [58]:
# Feature selection using mutual information
mi = SelectKBest(mutual_info_classif, k=10)
X_mi = mi.fit_transform(X, y)

In [59]:
X_mi

array([[6, 4, 7, ..., 1, 2, 2],
       [2, 3, 3, ..., 0, 2, 5],
       [6, 5, 1, ..., 1, 2, 2],
       ...,
       [3, 1, 1, ..., 5, 3, 2],
       [6, 1, 8, ..., 1, 2, 5],
       [0, 0, 0, ..., 5, 3, 5]], dtype=int64)

## Best features for Model building

In [60]:
# Print the selected features
selected_features = X.columns[mi.get_support()]
print(selected_features)

Index(['weekday', 'driving_exp', 'junction_type', 'road_conditions',
       'num_of_vehicles', 'casualty', 'casualty_sex', 'casualty_age',
       'casualty_severity', 'casualty_fitness'],
      dtype='object')


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../artifacts/train.csv")

df.head(3)

Unnamed: 0,weekday,driver_age,driver_sex,educational_level,driver_relation,driving_exp,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause,accident_severity
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right,Slight Injury
2,Wednesday,18-30,Male,Junior high school,Employee,No Licence,Other,Owner,1-2yr,No defect,...,Other,Pedestrian,Male,31-50,3,Driver,Normal,Unknown or other,No distancing,Serious Injury


In [3]:
df['weekday'].value_counts()

weekday
Friday       1451
Thursday     1303
Wednesday    1272
Tuesday      1240
Monday       1177
Saturday     1140
Sunday       1038
Name: count, dtype: int64

In [4]:
df['driving_exp'].value_counts()

driving_exp
5-10yr        2347
2-5yr         1824
Above 10yr    1614
1-2yr         1226
Below 1yr      920
No Licence      95
unknown         20
Name: count, dtype: int64

In [5]:
['weekday', 'driving_exp', 'junction_type', 'road_conditions',
       'num_of_vehicles', 'casualty', 'casualty_sex', 'casualty_age',
       'casualty_severity', 'casualty_fitness']

['weekday',
 'driving_exp',
 'junction_type',
 'road_conditions',
 'num_of_vehicles',
 'casualty',
 'casualty_sex',
 'casualty_age',
 'casualty_severity',
 'casualty_fitness']

In [6]:
df['junction_type'].value_counts()

junction_type
Y Shape        3173
No junction    2671
Crossing       1516
Other           316
Unknown         144
O Shape         127
T Shape          42
X Shape           7
Name: count, dtype: int64

In [7]:
df['road_conditions'].value_counts()

road_conditions
Dry                     6545
Wet or damp             2026
Snow                      49
Flood over 3cm. deep       1
Name: count, dtype: int64

In [8]:
df['num_of_vehicles'].value_counts()

num_of_vehicles
2    5864
1    1394
3    1071
4     258
6      31
7       3
Name: count, dtype: int64

In [9]:
df['casualty_sex'].value_counts()

casualty_sex
Male      3649
na        3135
Female    1837
Name: count, dtype: int64

In [10]:
df['casualty_age'].value_counts()

casualty_age
na          3135
18-30       2199
31-50       1717
Under 18     707
Over 51      689
5            174
Name: count, dtype: int64

In [11]:
df['casualty_severity'].value_counts()

casualty_severity
3     4937
na    3135
2      534
1       15
Name: count, dtype: int64

In [12]:
df['casualty_fitness'].value_counts()

casualty_fitness
Normal          6749
NormalNormal      14
Blind             13
Deaf              13
Other             12
Name: count, dtype: int64

In [13]:
df['casualty'].value_counts()

casualty
1    5888
2    1587
3     655
4     275
5     139
6      54
7      16
8       7
Name: count, dtype: int64