##### `Automated Preprocessing with sklearn pipeline`

##### `Problem Statement: Estimate the Weight (column) of a car based on the other factors (columns)`

In [9]:
# Ignore warnings: install a blanket "ignore" filter so that no warning
# messages are shown in the cell outputs below.
# NOTE(review): this hides every warning category, including ones that may
# point at real problems — consider narrowing the filter.
from warnings import filterwarnings
filterwarnings("ignore")

In [None]:
# Step 1: Data ingestion
import pandas as pd

# Only blank cells and the literal string "NA" are read as missing values;
# pandas' built-in list of NA markers is switched off.
df = pd.read_csv(
    "Cars93.csv",
    na_values=["", "NA"],
    keep_default_na=False,
)
df.head(1)

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra


In [None]:
# Step 2: Data sanity checks
#   - duplicate removal (this cell)
#   - missing-value replacement
#   - removal of categorical columns with too many unique values
#
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the duplicate counts are captured and reported explicitly
# instead of being left as stand-alone expressions.
n_duplicates = int(df.duplicated().sum())
df = df.drop_duplicates(keep='first')
assert not df.duplicated().any(), "duplicate rows remain after drop_duplicates"
print(f"Duplicate rows removed: {n_duplicates}")

# Missing-value count per column; only columns with at least one are shown.
m = df.isna().sum()
m[m>0]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [None]:
# If missing values exist, replace them as follows:
    # Categorical values with the most frequently occurring value (mode)
    # Numeric values with the mean or median (the replacer below uses the mean)
# Create a replacer function that performs these replacements

def replacer(df):
    """Fill missing values in ``df`` in place.

    Object-dtype (categorical) columns are filled with their most frequent
    value; numeric columns are filled with their mean. Columns of any other
    dtype are left untouched.

    A categorical column with no non-missing values has no mode, so it is
    skipped rather than raising. A numeric column with no non-missing values
    has a NaN mean and therefore also stays unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    The fill values are computed from every row of the frame passed in.
    """
    cat_cols = df.select_dtypes(include='object').columns
    num_cols = df.select_dtypes(include='number').columns

    for col in cat_cols:
        modes = df[col].mode()
        if modes.empty:
            # All values are missing: there is nothing to impute with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    for col in num_cols:
        df[col] = df[col].fillna(df[col].mean())

replacer(df)

# Re-count missing values per column to confirm nothing is left to fill.
m = df.isnull().sum()
m[m.gt(0)]


Series([], dtype: int64)

In [34]:
# Note: categorical columns with a very large number of unique values are
# dropped from the dataframe, because one-hot encoding them would create a
# large number of columns, which is not ideal for prediction.
# A column is dropped when more than 90% of the rows hold a distinct value.
cat_uniq_cols = df.select_dtypes(include='object').nunique()
unique_ratio = cat_uniq_cols.div(len(df))
high_cat_uniq_cols = cat_uniq_cols[unique_ratio.gt(0.9)].index
print(high_cat_uniq_cols)
df = df.drop(columns=high_cat_uniq_cols)

Index([], dtype='object')


In [None]:
# Step 3: Separate the target (Y) from the features (X)
# 'Weight' is the value to estimate; 'id' is dropped from the features as well
# (presumably a row identifier with no predictive meaning — confirm).
Y = df['Weight']
X = df.drop(columns=['id', 'Weight'])

In [48]:
# Step 4: Train/test split — 20% of the rows are held out for testing,
# with a fixed random_state so the split is the same on every run.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=21,
)
xtrain.shape, xtest.shape

((74, 24), (19, 24))

In [None]:
#Step-5: Apply Preprocessing on X
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Build the column lists from the feature frame X, not from df: df still
# contains 'id' and 'Weight', which were dropped from X and are therefore
# absent from xtrain, so a ColumnTransformer selecting them by name fails at
# fit time (and 'Weight' is the target, not a feature).
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

In [None]:
#Create number and categoric pipelines

# Numeric columns: fill missing values with the median, then standardise.
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode (first level dropped, dense output).
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
)

# Create preprocessing column transformer.
# ColumnTransformer expects ONE argument: a list of
# (name, transformer, columns) tuples. Passing the tuples as separate
# positional arguments raises a TypeError, because every parameter after
# `transformers` is keyword-only.
pre = ColumnTransformer(
    [
        ("numeric", num_pipe, num_cols),
        ("categoric", cat_pipe, cat_cols),
    ]
).set_output(transform='pandas')

pre.fit(xtrain)