##### `Manual Preprocessing`
Problem Statement: Estimate Weight(Col) of Car based on other Factors (Cols)

In [5]:
# Filter Warnings
from warnings import filterwarnings
filterwarnings("ignore")

#-----------------------
# Step-1: Data Ingestion
#-----------------------

import pandas as pd
df = pd.read_csv("Cars93.csv", keep_default_na=False, na_values=["", "NA"])
print(f'Actual Data Size: {df.shape}')

#-----------------------------------
# Step-2: Perform Data Sanity Checks
#-----------------------------------

#Check duplicates
print('Duplicates Count Before Removal:', df.duplicated().sum())
# Drop Duplicates
df = df.drop_duplicates(keep="first").reset_index(drop=True)
#Check duplicates
print('Duplicates Count After Removal:', df.duplicated().sum())

# Check Missing Values in File

m_before = df.isna().sum()
print("Missing Column Values Before Replacing: ", m_before[m_before > 0].size)

# If Exist Replace as below
    # Categorical Values with most occured value
    # Numeric value with mean or median

# Create a replacer method to do the above replacing values
def replacer(df):
    cat_cols = df.select_dtypes(include="object").columns
    num_cols = df.select_dtypes(include='number').columns
    for col in df.columns:
        if col in cat_cols:
            mode = df[col].mode()[0]
            df[col] = df[col].fillna(mode)
        elif col in num_cols:
            mean = df[col].mean()
            df[col] = df[col].fillna(mean)

replacer(df)

m_after = df.isna().sum()
print("Missing Column Values After Replacing:", m_after[m_after > 0].size)
# Check Data Size
print('After Sanity Changes Data Size:', df.shape)

# Note: Categoric values which has huge unique values needs to be discarded/dropped from dataframe as 
# One Hot Encoding will create large amoun of columns which is not ideal for prediction
cat_uniq_cols = df.select_dtypes(include="object").nunique()
high_cat_uniq_cols = cat_uniq_cols[cat_uniq_cols/len(df) >= 0.9].index
df = df.drop(columns = high_cat_uniq_cols )

#------------------------
# Step-3 Separate X and Y
#------------------------

X = df.drop(columns=['id', 'Weight'])
Y = df["Weight"]

#--------------------------------
#Step-4: Apply Preprocessing on X
#--------------------------------

# Apply OHE(OneHotEncoding) on categorical columns
X_Cat = X.select_dtypes(include='object')


from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(
    handle_unknown='ignore', sparse_output=False, drop='first'
).set_output(transform='pandas')

ohe.fit(X_Cat)
X_Cat_Pre = ohe.transform(X_Cat)

# Apply scaling on numeric columns
X_Num = X.select_dtypes(include='number')

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().set_output(transform='pandas')
scaler.fit(X_Num)
X_Num_Pre = scaler.transform(X_Num)

# Join All columns as X_Pre
X_Pre = X_Num_Pre.join(X_Cat_Pre)
X_Pre.head(1)

#------------------------
#Step-5: Train Test Split
#------------------------

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain , ytest = train_test_split(X_Pre, Y, train_size=0.25, random_state=21)

#-------------------
#Step-6: Build Model
#-------------------

from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)
print('Linear Regression Model Train Score', model.score(xtrain , ytrain))
print('Linear Regression Model Test Score', model.score(xtest , ytest))

#----------------------
#Step-6: Evaluate Model
#----------------------
from sklearn.metrics import (
    root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
)

def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred); mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred); r2 = r2_score(y, ypred)
    return f"RMSE:{rmse:.2f} |MAE: {mae:.2f} |MAPE: {mape:.2%} |R2_SCORE: {r2:.2%}"

print("Train Results:", evaluate_model(model, xtrain, ytrain))
print("Test Results:", evaluate_model(model, xtest, ytest))

#------------------------
# Step-7: Model Inference
#------------------------

xnew = pd.read_csv("sample.csv", na_values = ["", "NA"], keep_default_na = False)
# Data Sanity check for new file
replacer(xnew)

xnew_cat = xnew.select_dtypes(include = "object").drop(columns = high_cat_uniq_cols)
xnew_cat_pre = ohe.transform(xnew_cat)

xnew_num = xnew.select_dtypes(include = "number")
xnew_num_pre = scaler.transform(xnew_num)

xnew_pre = xnew_num_pre.join(xnew_cat_pre)

ypred = model.predict(xnew_pre)

xnew["Weight_pred"] = ypred.round(2)

xnew.head(2)

Actual Data Size: (94, 28)
Duplicates Count Before Removal: 1
Duplicates Count After Removal: 0
Missing Column Values Before Replacing:  3
Missing Column Values After Replacing: 0
After Sanity Changes Data Size: (93, 28)
Linear Regression Model Train Score 1.0
Linear Regression Model Test Score 0.8414224279367748
Train Results: RMSE:0.00 |MAE: 0.00 |MAPE: 0.00% |R2_SCORE: 100.00%
Test Results: RMSE:238.29 |MAE: 179.38 |MAPE: 5.99% |R2_SCORE: 84.14%


Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight_pred
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3313.16
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2575.0
