##### `Automated Preprocessing with sklearn pipeline`

###### <b>`Problem Statement: Estimate Weight (Column) of Car based on other Factors (Columns)`</b>

In [41]:
# Ignore warnings: install a global "ignore" filter so no warning is shown in cell outputs.
# NOTE(review): this hides every warning category, including deprecation notices
# raised by the libraries used in later cells.
from warnings import filterwarnings
filterwarnings("ignore")

In [42]:
# Step 1: Data ingestion
# ----------------------
import pandas as pd

# Only empty cells and the literal "NA" are parsed as missing; pandas' default
# NA markers are switched off, so any other token is kept as an ordinary value.
df = pd.read_csv(
    "Cars93.csv",
    keep_default_na=False,
    na_values=["", "NA"],
)
print(f'Actual Data Size: {df.shape}')

Actual Data Size: (94, 28)


In [43]:
# Step 2: Data sanity checks
# --------------------------

# Drop exact duplicate rows (the first occurrence is kept) and renumber the index.
print('Duplicates Count Before Removal:', df.duplicated().sum())
df = df.drop_duplicates(keep='first', ignore_index=True)
print('Duplicates Count After Removal:', df.duplicated().sum())

# Count the columns (not the cells) that contain at least one missing value.
m_before = df.isna().sum()
print("Missing Column Values Before Replacing: ", (m_before > 0).sum())

# If missing values exist: fill categorical columns with their most frequent value (mode) and numeric columns with their mean.

# Define a helper function that performs the replacement described above.
def replacer(df):
    """Fill the missing values of ``df`` in place.

    Object-dtype (categorical) columns are filled with their most frequent
    value and numeric columns with their mean. Columns of any other dtype are
    left untouched.

    A categorical column without a single non-missing value has no mode to
    fill with, so it is skipped instead of raising on the empty ``mode()``
    result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; its columns are reassigned in place.

    Returns
    -------
    None
    """
    cat_cols = df.select_dtypes(include='object').columns
    num_cols = df.select_dtypes(include='number').columns

    for col in cat_cols:
        modes = df[col].mode()
        # mode() ignores missing entries, so it is empty when the whole column is missing.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

    for col in num_cols:
        # An all-missing column has a NaN mean; filling with NaN leaves it unchanged.
        df[col] = df[col].fillna(df[col].mean())

# Impute the whole frame in place.
# NOTE(review): this runs before the train/test split, so the fill values are
# computed from rows that later land in the test set - confirm this is intended.
replacer(df)

# Re-count the columns that still contain missing values and report the final size.
m_after = df.isna().sum()
print("Missing Column Values After Replacing:", (m_after > 0).sum())
print('After Sanity Changes Data Size:', df.shape)


Duplicates Count Before Removal: 1
Duplicates Count After Removal: 0
Missing Column Values Before Replacing:  3
Missing Column Values After Replacing: 0
After Sanity Changes Data Size: (93, 28)


In [44]:
# Note: categorical columns with a very large number of unique values are dropped,
# because one-hot encoding them would create a large number of columns, which is
# not ideal for prediction.
# `card` is the share of distinct values per object column (unique count / row count);
# columns at or above 0.9 are removed.
card = df.select_dtypes(include="object").nunique() / len(df)
df = df.drop(columns=card.loc[card >= 0.9].index)

In [45]:
# Step 3: Separate X and Y
# ------------------------

# Features: every remaining column except `id` (presumably a row identifier -
# TODO confirm) and the target.
X = df.drop(columns=["id", "Weight"])
# Target: the Weight column to be estimated.
Y = df["Weight"]
X.shape

(93, 24)

In [46]:
# Step 4: Train/test split
# ------------------------
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [47]:
# Step 5: Apply preprocessing on X
# --------------------------------
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Route columns by dtype: numeric ones go to the scaling branch, object ones
# to the encoding branch.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

# Numeric branch: median imputation followed by standardisation.
num_pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not
# seen during fit is encoded as all zeros, i.e. exactly like the dropped first
# category - confirm this is acceptable.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="first"),
)

# Combine both branches; set_output makes transform() return DataFrames.
pre = ColumnTransformer(
    [
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
).set_output(transform='pandas')

# Learn the imputation, scaling and encoding parameters from the training split
# only, then apply them to both splits.
pre.fit(xtrain)

xtrain_pre = pre.transform(xtrain)
xtest_pre = pre.transform(xtest)

In [48]:
# Step 6: Build model
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the preprocessed training features.
model = LinearRegression()
model.fit(xtrain_pre, ytrain)

# Report the model's score() on each split.
print('Linear Regression Model Train Score', model.score(xtrain_pre , ytrain))
print('Linear Regression Model Test Score', model.score(xtest_pre , ytest))

Linear Regression Model Train Score 0.9962358571591687
Linear Regression Model Test Score 0.8210643566752944


In [49]:
#Evaluate Model
from sklearn.metrics import (
    root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
)


def evaluate_model(model, x, y):
    """Print RMSE, MAE, MAPE and R2 of ``model``'s predictions on ``x`` against ``y``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    x : features passed to ``model.predict``
    y : true target values the predictions are compared with

    Returns
    -------
    None
        The metrics are only printed, on a single line.
    """
    predictions = model.predict(x)
    print(
        'RMSE:', root_mean_squared_error(y, predictions),
        '|MAE:', mean_absolute_error(y, predictions),
        '|MAPE:', mean_absolute_percentage_error(y, predictions),
        '|R2_SCORE:', r2_score(y, predictions),
    )

# Report the same metrics on both splits so the train/test gap is visible.
print("Train Data Evaluation...")
evaluate_model(model, xtrain_pre, ytrain)

print("\nTest Data Evaluation...")
evaluate_model(model, xtest_pre, ytest)


Train Data Evaluation...
RMSE: 36.08050182544436 |MAE: 25.49587482404905 |MAPE: 0.008165164065198582 |R2_SCORE: 0.9962358571591687

Test Data Evaluation...
RMSE: 243.07145364259554 |MAE: 194.30143429442538 |MAPE: 0.06591645161419468 |R2_SCORE: 0.8210643566752944


In [50]:
# Model inference (out-of-sample data)
# Missing values are parsed with the same settings as the training file.
xnew = pd.read_csv(
    "sample.csv",
    keep_default_na=False,
    na_values=["", "NA"],
)
# Reuse the already-fitted preprocessor; it is only applied here, not refitted.
xnew_pre = pre.transform(xnew)

In [51]:
# Predict on the preprocessed new rows and store the predictions, rounded to
# two decimals, as a new column on the raw input frame.
preds = model.predict(xnew_pre)
xnew['Weight_Predicted'] = preds.round(2)
# Display the first row as the cell output.
xnew.head(1)
# Uncomment to write the inputs plus predictions to disk:
# xnew.to_csv('results.csv', index=False)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_Predicted
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3481.81


In [52]:
#Step-7: Save & Load Model
import joblib
#Save
# joblib.dump(pre, "pre.joblib")
# joblib.dump(model, "weight_model.joblib")

#Load
# p = joblib.load("pre.joblib")