In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("../notebook/data/rta_data.csv")

data.head()

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,17:02:00,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,17:02:00,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury
2,17:02:00,Monday,18-30,Male,Junior high school,Employee,1-2yr,Lorry (41?100Q),Owner,,...,Going straight,Driver or rider,Male,31-50,3,Driver,,Not a Pedestrian,Changing lane to the left,Serious Injury
3,1:06:00,Sunday,18-30,Male,Junior high school,Employee,5-10yr,Public (> 45 seats),Governmental,,...,Going straight,Pedestrian,Female,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
4,1:06:00,Sunday,18-30,Male,Junior high school,Employee,2-5yr,,Owner,5-10yrs,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [3]:
df = data.drop(columns=['Time'], axis=1)

df.head(2)

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,Monday,18-30,Male,Above high school,Employee,1-2yr,Automobile,Owner,Above 10yr,No defect,...,Going straight,na,na,na,na,,,Not a Pedestrian,Moving Backward,Slight Injury
1,Monday,31-50,Male,Junior high school,Employee,Above 10yr,Public (> 45 seats),Owner,5-10yrs,No defect,...,Going straight,na,na,na,na,,,Not a Pedestrian,Overtaking,Slight Injury


In [4]:
numerical_features = df.select_dtypes(exclude="object").columns

numerical_features

Index(['Number_of_vehicles_involved', 'Number_of_casualties'], dtype='object')

In [5]:
categorical_features = df.select_dtypes(include="object").columns

categorical_features

Index(['Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle',
       'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement',
       'Cause_of_accident', 'Accident_severity'],
      dtype='object')

In [6]:
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder", OrdinalEncoder()),
        ("scaler", StandardScaler(with_mean=False))
    ]
)

In [7]:
preprocessor = ColumnTransformer(
    [
        ("num_pipeline", num_pipeline,numerical_features),
        ("cat_pipeline", cat_pipeline, categorical_features)
        
    ]
)

In [9]:
train_df = pd.read_csv("../artifacts/train.csv")

train_df.head(2)

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle,Slight Injury
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right,Slight Injury


In [10]:
test_df = pd.read_csv("../artifacts/test.csv")

test_df.head(2)

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
0,Wednesday,31-50,Male,Junior high school,Employee,5-10yr,Public (13?45 seats),Owner,1-2yr,No defect,...,Going straight,Driver or rider,Male,18-30,3,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury
1,Monday,Under 18,Male,Elementary school,Employee,2-5yr,,Owner,Unknown,No defect,...,Moving Backward,Pedestrian,Female,31-50,3,Driver,Normal,Crossing from nearside - masked by parked or s...,No priority to vehicle,Serious Injury


In [13]:
numerical_features = ['Number_of_vehicles_involved', 'Number_of_casualties']

categorical_features = ['Day_of_week', 'Age_band_of_driver', 'Sex_of_driver',
       'Educational_level', 'Vehicle_driver_relation', 'Driving_experience',
       'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle',
       'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians',
       'Road_allignment', 'Types_of_Junction', 'Road_surface_type',
       'Road_surface_conditions', 'Light_conditions', 'Weather_conditions',
       'Type_of_collision', 'Vehicle_movement', 'Casualty_class',
       'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity',
       'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement',
       'Cause_of_accident']

In [14]:
target_column_name = "Accident_severity"

In [15]:
input_feature_train_df=train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df=train_df[target_column_name]

In [18]:
input_feature_train_df.head(2)

Unnamed: 0,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,Defect_of_vehicle,...,Number_of_casualties,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident
0,Wednesday,Over 51,Male,Writing & reading,Employee,Below 1yr,Special vehicle,Owner,Unknown,No defect,...,2,Reversing,Passenger,Female,31-50,3,Driver,Normal,Not a Pedestrian,No priority to vehicle
1,Saturday,Under 18,Male,Junior high school,Employee,Above 10yr,Automobile,Owner,,No defect,...,2,Going straight,Driver or rider,Male,18-30,3,,,Not a Pedestrian,Changing lane to the right


In [16]:
target_feature_train_df

0        Slight Injury
1        Slight Injury
2       Serious Injury
3       Serious Injury
4        Slight Injury
             ...      
8616     Slight Injury
8617     Slight Injury
8618     Slight Injury
8619     Slight Injury
8620     Slight Injury
Name: Accident_severity, Length: 8621, dtype: object

In [19]:
input_feature_test_df=test_df.drop(columns=[target_column_name],axis=1)
target_feature_test_df=test_df[target_column_name]

In [21]:
input_feature_train_arr=preprocessor.fit_transform(input_feature_train_df)

ValueError: A given column is not a column of the dataframe