##### `Manual Preprocessing`
###### <b>Problem Statement: Estimate Weight (Column) of Car based on other Factors (Columns)<b>

In [21]:
# Filter Warnings
from warnings import filterwarnings
filterwarnings("ignore")

In [23]:
# Step-1: Data Ingestion
import pandas as pd
df = pd.read_csv("Cars93.csv", keep_default_na=False, na_values=["", "NA"])
print(f'Actual Data Size: {df.shape}')

Actual Data Size: (94, 28)


In [24]:
# Step-2: Perform Data Sanity Checks
#Check duplicates
print('Duplicates Count Before Removal:', df.duplicated().sum())
# Drop Duplicates
df = df.drop_duplicates(keep="first").reset_index(drop=True)
#Check duplicates
print('Duplicates Count After Removal:', df.duplicated().sum())

Duplicates Count Before Removal: 1
Duplicates Count After Removal: 0


In [25]:
# Check Missing Values in File

m_before = df.isna().sum()
print("Missing Column Values Before Replacing: ", m_before[m_before > 0].size)

# If Exist Replace as below
    # Categorical Values with most occured value
    # Numeric value with mean or median

# Create a replacer method to do the above replacing values
def replacer(df):
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(include='number').columns
    for col in df.columns:
        if col in cat_cols:
            mode = df[col].mode()[0]
            df[col] = df[col].fillna(mode)
        elif col in num_cols:
            mean = df[col].mean()
            df[col] = df[col].fillna(mean)

replacer(df)

m_after = df.isna().sum()
print("Missing Column Values After Replacing:", m_after[m_after > 0].size)
# Check Data Size
print('After Sanity Changes Data Size:', df.shape)

Missing Column Values Before Replacing:  3
Missing Column Values After Replacing: 0
After Sanity Changes Data Size: (93, 28)


In [26]:
# Note: Categoric values which has huge unique values needs to be discarded/dropped from dataframe as 
# One Hot Encoding will create large amoun of columns which is not ideal for prediction
cat_uniq_cols = df.select_dtypes(include="object").nunique()
high_cat_uniq_cols = cat_uniq_cols[cat_uniq_cols/len(df) >= 0.9].index
df = df.drop(columns = high_cat_uniq_cols )

In [27]:
# Step-3 Separate X and Y
X = df.drop(columns=['id', 'Weight'])
Y = df["Weight"]

In [40]:
# Apply Preprocessing on X

# Apply OHE(OneHotEncoding) on categorical columns
X_Cat = X.select_dtypes(include='object')
print("Categorical Columns size:", X_Cat.columns.size)

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(
    handle_unknown='ignore', sparse_output=False, drop='first'
).set_output(transform='pandas')

ohe.fit(X_Cat)
X_Cat_Pre = ohe.transform(X_Cat)
print('Categorical Columns converted to Numeric:')
X_Cat_Pre.head(1)

Categorical Columns size: 7
Categorical Columns converted to Numeric:


Unnamed: 0,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,Manufacturer_Chevrolet,Manufacturer_Chrylser,Manufacturer_Chrysler,Manufacturer_Dodge,Manufacturer_Eagle,Manufacturer_Ford,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [41]:
# Get numeric columns 
# Apply scaling
X_Num = X.select_dtypes(include='number')
print("Numeric Columns size:", X_Num.columns.size)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().set_output(transform='pandas')
scaler.fit(X_Num)
X_Num_Pre = scaler.transform(X_Num)

Numeric Columns size: 17


In [42]:
# Join All columns as X_Pre
X_Pre = X_Num_Pre.join(X_Cat_Pre)
X_Pre.head(1)


Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [43]:
# Train Test Split
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain , ytest = train_test_split(X_Pre, Y, train_size=0.25, random_state=21)
xtrain.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
77,0.364897,0.956592,1.38557,-0.423219,-0.581941,-0.55026,-0.073484,1.212025,1.17003,0.409445,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [45]:
# Build Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)
print('Linear Regression Model Train Score', model.score(xtrain , ytrain))
print('Linear Regression Model Test Score', model.score(xtest , ytest))

Linear Regression Model Train Score 1.0
Linear Regression Model Test Score 0.8414224279367748


In [50]:
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print('RMSE:', rmse, '|MAE:', mae, '|MAPE:', mape, '|R2_SCORE:', r2)

print("Train Results...")
evaluate_model(model, xtrain, ytrain)
print("Test Results...")
evaluate_model(model, xtest, ytest)

Train Results...
RMSE: 1.282719818000544e-12 |MAE: 9.292663257245137e-13 |MAPE: 3.1620742452351e-16 |R2_SCORE: 1.0
Test Results...
RMSE: 238.29484074098394 |MAE: 179.37806365986836 |MAPE: 0.05989043270883522 |R2_SCORE: 0.8414224279367748


In [None]:
# Model Inference
xnew = pd.read_csv("sample.csv", na_values = ["", "NA"], keep_default_na = False)
xnew.head(1)


Unnamed: 0,Manufacturer,Type,AirBags,DriveTrain,Cylinders,Man.trans.avail,Origin
0,Audi,Midsize,,Front,6,Yes,non-USA


In [30]:
# Data Sanity check for new file
replacer(xnew)

In [31]:
xnew_cat = xnew.select_dtypes(include = "object").drop(columns = high_cat_uniq_cols)
xnew_cat.head(1)

Unnamed: 0,Manufacturer,Type,AirBags,DriveTrain,Cylinders,Man.trans.avail,Origin
0,Audi,Midsize,,Front,6,Yes,non-USA


In [32]:
xnew_cat_pre = ohe.transform(xnew_cat)
xnew_cat_pre.head(1)

Unnamed: 0,Manufacturer_Audi,Manufacturer_BMW,Manufacturer_Buick,Manufacturer_Cadillac,Manufacturer_Chevrolet,Manufacturer_Chrylser,Manufacturer_Chrysler,Manufacturer_Dodge,Manufacturer_Eagle,Manufacturer_Ford,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [33]:
xnew_num = xnew.select_dtypes(include = "number")
xnew_num_pre = scaler.transform(xnew_num)
xnew_num_pre.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,0.884457,0.467905,0.302785,-1.16435,-0.610436,1.078172,1.111472


In [34]:
xnew_pre = xnew_num_pre.join(xnew_cat_pre)
xnew_pre.head(1)

Unnamed: 0,Min.Price,Price,Max.Price,MPG.city,MPG.highway,EngineSize,Horsepower,RPM,Rev.per.mile,Fuel.tank.capacity,...,AirBags_None,DriveTrain_Front,DriveTrain_Rear,Cylinders_4,Cylinders_5,Cylinders_6,Cylinders_8,Cylinders_rotary,Man.trans.avail_Yes,Origin_non-USA
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [35]:
ypred = model.predict(xnew_pre)
ypred

array([3313.15815908, 2575.        , 2964.3217968 , 2888.40955834,
       2329.22542872])

In [36]:
xnew["Weight_pred"] = ypred.round(2)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3313.16
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2575.0
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,2964.32
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,27.625,14.0,non-USA,Mazda RX-7,2888.41
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2329.23
