In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler, OrdinalEncoder, LabelEncoder

In [52]:
df = pd.read_csv("../artifacts/train.csv")

df.head(2)

Unnamed: 0,weekday,driver_age,driver_sex,educational_level,driver_relation,driving_exp,vehicle_type,vehicle_owner,service_year,vehicle_defect,...,vehicle_movement,casualty_class,casualty_sex,casualty_age,casualty_severity,casualty_work,casualty_fitness,pedestrian_movement,accident_cause,accident_severity
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right,Slight Injury


In [67]:
df.shape

(8621, 31)

In [53]:
numerical_columns = df.select_dtypes(exclude="object").columns

numerical_columns

Index(['num_of_vehicles', 'casualty'], dtype='object')

In [54]:
categorical_columns = df.select_dtypes(include="object").columns

categorical_columns = list(df.drop(columns=["accident_severity", "accident_area"], axis=1))

categorical_columns

['weekday',
 'driver_age',
 'driver_sex',
 'educational_level',
 'driver_relation',
 'driving_exp',
 'vehicle_type',
 'vehicle_owner',
 'service_year',
 'vehicle_defect',
 'lanes',
 'road_alignment',
 'junction_type',
 'road_type',
 'road_conditions',
 'light_condition',
 'weather_condition',
 'collision_type',
 'num_of_vehicles',
 'casualty',
 'vehicle_movement',
 'casualty_class',
 'casualty_sex',
 'casualty_age',
 'casualty_severity',
 'casualty_work',
 'casualty_fitness',
 'pedestrian_movement',
 'accident_cause']

In [55]:
target_column_name = "accident_severity"

In [56]:
num_pipeline = Pipeline(
steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline(
steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinalencoder", OrdinalEncoder()),
    ("scaler", StandardScaler())
])

In [57]:
preprocessor = ColumnTransformer(
[
    ("num_pipeline", num_pipeline, numerical_columns),
    ("cat_pipeline", cat_pipeline, categorical_columns)
])

In [30]:
#target_column_name = "accident_severity"

In [58]:
numerical_columns

Index(['num_of_vehicles', 'casualty'], dtype='object')

In [59]:
train_df = pd.read_csv("../artifacts/train.csv")

In [60]:
test_df = pd.read_csv("../artifacts/test.csv")

In [61]:
input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df=train_df[target_column_name]

In [62]:
input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df=test_df[target_column_name]

In [63]:
input_feature_train_arr=preprocessor.fit_transform(input_feature_train_df)
input_feature_test_arr=preprocessor.transform(input_feature_test_df)

In [64]:
input_feature_train_arr

array([[-5.61503344e-02,  4.55650649e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01,  1.00056879e+00],
       [-5.61503344e-02,  4.55650649e-01, -4.71299815e-01, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [-5.61503344e-02,  4.55650649e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.28830845e+00,  6.07293500e-01],
       ...,
       [-5.61503344e-02, -5.44391165e-01,  1.39594356e-02, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [ 1.40189796e+00,  1.45569246e+00,  1.46973719e+00, ...,
         9.55695443e-04,  1.28830845e+00, -1.35908294e+00],
       [-5.61503344e-02,  4.55650649e-01, -1.44181832e+00, ...,
         9.55695443e-04,  1.86035014e-01,  1.00056879e+00]])

In [65]:
input_feature_test_arr

array([[-5.61503344e-02, -5.44391165e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00],
       [-5.61503344e-02, -5.44391165e-01, -9.56559065e-01, ...,
         9.55695443e-04, -4.22305874e+00,  1.00056879e+00],
       [-1.51419863e+00, -5.44391165e-01, -4.71299815e-01, ...,
         9.55695443e-04,  1.86035014e-01, -3.75894719e-01],
       ...,
       [-5.61503344e-02, -5.44391165e-01, -9.56559065e-01, ...,
         9.55695443e-04,  1.86035014e-01, -7.69170006e-01],
       [ 2.85994625e+00, -5.44391165e-01,  4.99218686e-01, ...,
         9.55695443e-04,  1.86035014e-01, -7.69170006e-01],
       [-5.61503344e-02, -5.44391165e-01,  1.46973719e+00, ...,
         9.55695443e-04,  1.86035014e-01, -1.16244529e+00]])

In [66]:
input_feature_train_arr.shape

(8621, 31)

In [68]:
target_feature_train_df

0        Slight Injury
1        Slight Injury
2       Serious Injury
3       Serious Injury
4        Slight Injury
             ...      
8616     Slight Injury
8617     Slight Injury
8618     Slight Injury
8619     Slight Injury
8620     Slight Injury
Name: accident_severity, Length: 8621, dtype: object

In [69]:
le = LabelEncoder()

le.fit_transform(target_feature_train_df)

array([2, 2, 1, ..., 2, 2, 2])

In [70]:
le.transform(target_feature_test_df)

array([2, 1, 1, ..., 2, 2, 2])