##### `Manual Preprocessing`

In [35]:
# Filter Warnings
from warnings import filterwarnings
filterwarnings("ignore")

In [1]:
# Step-1: Data Ingestion
import pandas as pd
df = pd.read_csv("Cars93.csv", keep_default_na=False, na_values=["", "NA"])
df.head(3)

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90


In [2]:
# Problem Statement: Estimate Weight (Column) of Car based on other Factors (Columns)

In [3]:
# Step-2: Perform Data Sanity Checks


In [4]:
#Check duplicates
df.duplicated().sum()
# Drop Duplicates
df = df.drop_duplicates(keep="first")
#Check duplicates
df.duplicated().sum()

np.int64(0)

In [5]:
print("Missing Values Before Replacing")
# Check Missing Values in File
m_before = df.isna().sum()
print(m_before[m_before > 0])


Missing Values Before Replacing
AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64


In [6]:
# If Exist Replace as below
    # Categorical Values with most occured value
    # Numeric value with mean or median
# Create a replacer method to do the above replacing values

def replacer(df):
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(include='number').columns
    for i in df.columns:
        if i in cat_cols:
            mode = df[i].mode()[0]
            df[i] = df[i].fillna(mode)
        elif i in num_cols:
            mean = df[i].mean()
            df[i] = df[i].fillna(mean)

replacer(df)
print("Missing Values After Replacing")
m_after = df.isna().sum()
print(m_after[m_after > 0])


Missing Values After Replacing
Series([], dtype: int64)


In [7]:
# Check Data Size
df.shape

(93, 28)

In [8]:
# Note: Categoric values which has huge unique values needs to be discarded/dropped from dataframe as 
# One Hot Encoding will create large amoun of columns which is not ideal for prediction
cat_uniq_cols = df.select_dtypes(include="object").nunique()
high_cat_uniq_cols = cat_uniq_cols[cat_uniq_cols/len(df) > 0.9].index
df = df.drop(columns = high_cat_uniq_cols )

In [9]:
# Step-3 Separate X and Y
X = df.drop(columns=['id', 'Weight'])
Y = df["Weight"]

In [10]:
# Apply Preprocessing on X
# Get categorical columns 
# Apply OHE(OneHotEncoding)
X_Cat = X.select_dtypes(include='object')

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(
    handle_unknown='ignore', sparse_output=False, drop='first'
).set_output(transform='pandas')

ohe.fit(X_Cat)
X_Cat_Pre = ohe.transform(X_Cat)
X_Cat_Pre.head(1)

Unnamed: 0,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,Manufacturer_Chevrolet,Manufacturer_Chrylser,Manufacturer_Chrysler,Manufacturer_Dodge,Manufacturer_Eagle,Manufacturer_Ford,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [11]:
# Get numeric columns 
# Apply scaling
X_Num = X.select_dtypes(include='number')

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().set_output(transform='pandas')
scaler.fit(X_Num)
X_Num_Pre = scaler.transform(X_Num)

In [12]:
# Join All columns as X_Pre
X_Pre = X_Num_Pre.join(X_Cat_Pre)
X_Pre.head(1)


Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain , ytest = train_test_split(X_Pre, Y, train_size=0.25, random_state=21)
xtrain.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
77,0.364897,0.956592,1.38557,-0.423219,-0.581941,-0.55026,-0.073484,1.212025,1.17003,0.409445,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [21]:
# Build Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)

model.score(xtrain , ytrain)

1.0

In [22]:

model.score(xtest , ytest)

0.8414224279367748

In [23]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

In [24]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    print(f'RMSE:{rmse:.4f}')
    mae = mean_absolute_error(y, ypred)
    print(f'MAE:{mae:.2f}')
    mape = mean_absolute_percentage_error(y, ypred)
    print(f'MAPE:{mape:.2%}')
    r2 = r2_score(y, ypred)
    print(f'R2_SCORE:{r2:.2%}')

print("Train Results...")
evaluate_model(model, xtrain, ytrain)
print("Test Results...")
evaluate_model(model, xtest, ytest)

Train Results...
RMSE:0.0000
MAE:0.00
MAPE:0.00%
R2_SCORE:100.00%
Test Results...
RMSE:238.2948
MAE:179.38
MAPE:5.99%
R2_SCORE:84.14%


In [None]:
# Model Inference
xnew = pd.read_csv("sample.csv", na_values = ["", "NA"], keep_default_na = False)
xnew.head(1)


Unnamed: 0,Manufacturer,Type,AirBags,DriveTrain,Cylinders,Man.trans.avail,Origin
0,Audi,Midsize,,Front,6,Yes,non-USA


In [30]:
# Data Sanity check for new file
replacer(xnew)

In [31]:
xnew_cat = xnew.select_dtypes(include = "object").drop(columns = high_cat_uniq_cols)
xnew_cat.head(1)

Unnamed: 0,Manufacturer,Type,AirBags,DriveTrain,Cylinders,Man.trans.avail,Origin
0,Audi,Midsize,,Front,6,Yes,non-USA


In [32]:
xnew_cat_pre = ohe.transform(xnew_cat)
xnew_cat_pre.head(1)

Unnamed: 0,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,Manufacturer_Chevrolet,Manufacturer_Chrylser,Manufacturer_Chrysler,Manufacturer_Dodge,Manufacturer_Eagle,Manufacturer_Ford,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [33]:
xnew_num = xnew.select_dtypes(include = "number")
xnew_num_pre = scaler.transform(xnew_num)
xnew_num_pre.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,0.884457,0.467905,0.302785,-1.16435,-0.610436,1.078172,1.111472


In [34]:
xnew_pre = xnew_num_pre.join(xnew_cat_pre)
xnew_pre.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [35]:
ypred = model.predict(xnew_pre)
ypred

array([3313.15815908, 2575.        , 2964.3217968 , 2888.40955834,
       2329.22542872])

In [36]:
xnew["Weight_pred"] = ypred.round(2)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3313.16
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2575.0
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,2964.32
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,27.625,14.0,non-USA,Mazda RX-7,2888.41
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2329.23
