##### `Forward Feature Selection`

In [1]:
# Ignore Warnings
# Installs a blanket "ignore" filter: no category or module is given, so the
# filter applies to every warning raised after this cell runs.
from warnings import filterwarnings
filterwarnings('ignore')

In [8]:
#Step-1: Data Ingestion
import pandas as pd

# Only empty fields and the literal "NA" count as missing; pandas' built-in
# list of NA tokens is switched off, so any other string is kept as-is.
df = pd.read_csv(
    'Cars93.csv',
    na_values=["", "NA"],
    keep_default_na=False,
)
df.head(1)

Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra


In [3]:
# Problem Statement: Estimate the Weight column from the other columns in the file

In [None]:
#Step-2: Data Sanity - Duplicate removals


In [9]:
# Duplicate Data
# Count rows that are an exact copy (all columns equal) of an earlier row.
duplicate_count = df.duplicated().sum()

if duplicate_count > 0:
    print(f'Duplicates Found: {duplicate_count}')
    print('Removing Duplicates...')
    # Keep the first occurrence of each row and rebuild a contiguous index.
    df = df.drop_duplicates(keep = "first").reset_index(drop = True)
    print('Removed Duplicates...')
    # A bare `df.shape` inside an if-branch is evaluated and thrown away (only
    # a cell's last top-level expression is echoed), so print it explicitly.
    print(f'Shape After Removal: {df.shape}')
else:
    print('No Duplicates Found')

Duplicates Found: 1
Removing Duplicates...
Removed Duplicates...


In [15]:
#Step-3: Separate X and Y
# Target: the column the model has to estimate.
Y = df["Weight"]
# Predictors: every column except `id` and the target itself.
X = df.drop(columns = ["id", "Weight"])
X.head(1)

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra


In [16]:
#Step-4: Remove High Unique Cat Cols
# Share of distinct values per text column. Measured on X (the frame being
# pruned) rather than df, so every flagged column is guaranteed to exist in X
# and the drop below cannot raise KeyError.
card = X.select_dtypes(include = "object").nunique() / len(X)
# Flag text columns where at least 90% of the rows hold a distinct value.
high_card = card[card >= 0.9]
X = X.drop(columns = high_card.index)

In [17]:
#Step-5: Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [18]:
#Step-6: Apply Preprocessing on X
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [19]:
# Split the predictors by dtype so each group gets its own treatment.
num_cols = X.select_dtypes(include = "number").columns
cat_cols = X.select_dtypes(include = "object").columns

# Numeric columns: fill gaps with the median, then standardise.
num_pipe = make_pipeline(SimpleImputer(strategy = "median"), StandardScaler())

# Text columns: fill gaps with the most frequent value, map categories to
# integer codes (categories unseen during fit become -1), then standardise.
cat_pipe = make_pipeline(
    SimpleImputer(strategy = "most_frequent"),
    OrdinalEncoder(unknown_value = -1, handle_unknown = "use_encoded_value"),
    StandardScaler(),
)

pre = ColumnTransformer(
    transformers = [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
# Ask the transformer to hand back DataFrames instead of arrays.
pre.set_output(transform = "pandas")

# Fit on the training split only, then apply the fitted transformer to both splits.
pre.fit(xtrain)
xtrain_pre, xtest_pre = pre.transform(xtrain), pre.transform(xtest)

In [23]:
#Step-7: Forward Feature Selection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Forward sequential search scored by R2, wrapped around a plain linear regression.
base_model = LinearRegression()
sel = SequentialFeatureSelector(
    estimator=base_model,
    direction="forward",
    scoring="r2",
    n_features_to_select="auto",
)
sel.fit(xtrain_pre, ytrain)

sel_cols = sel.get_feature_names_out()
print(f'Selected Features Count: {len(sel_cols)}')

# Reduce both splits to the selected columns.
xtrain_pre_sel = sel.transform(xtrain_pre)
xtest_pre_sel = sel.transform(xtest_pre)

Selected Features Count: 12


In [29]:
#Step-8: Build Model
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the selected, preprocessed training features.
model_1 = LinearRegression().fit(xtrain_pre_sel, ytrain)

# Report the estimator's own score on each split: label first, value on the next line.
print('Train Score...', model_1.score(xtrain_pre_sel, ytrain), sep='\n')
print('Test Score...', model_1.score(xtest_pre_sel, ytest), sep='\n')

Train Score...
0.9672855150369575
Test Score...
0.9329345515373539


In [35]:
#Step-9: Evaluate Model
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

def evaluate_metrics(model, x, y):
    """Print RMSE, MAE, MAPE and R2 for ``model``'s predictions on ``x``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Features passed to ``model.predict``.
        y: True target values the predictions are compared against.

    Returns:
        None. The four metrics are written to stdout, one per line.
    """
    predicted = model.predict(x)
    # (label, value, format spec): errors as plain numbers, MAPE and R2 as percentages.
    report = (
        ("RMSE", root_mean_squared_error(y, predicted), ".2f"),
        ("MAE", mean_absolute_error(y, predicted), ".2f"),
        ("MAPE", mean_absolute_percentage_error(y, predicted), ".2%"),
        ("R2", r2_score(y, predicted), ".2%"),
    )
    for label, value, spec in report:
        print(f"{label} : {value:{spec}}")

# Same metric report for each split.
for heading, features, target in (
    ("Train Results : ", xtrain_pre_sel, ytrain),
    ("\nTest Results : ", xtest_pre_sel, ytest),
):
    print(heading)
    evaluate_metrics(model_1, features, target)

# Peek at the first two test-set predictions.
ypred = model_1.predict(xtest_pre_sel)
print('\nPredicted Weights...')
print(ypred[:2])

Train Results : 
RMSE : 106.37
MAE : 82.26
MAPE : 2.68%
R2 : 96.73%

Test Results : 
RMSE : 148.81
MAE : 118.79
MAPE : 4.20%
R2 : 93.29%

Predicted Weights...
[2936.47928708 2358.44312523]


In [36]:
#Step-10: Model Inference (Out of Sample Prediction)
# Read the new rows with the same NA rules used for the training file.
xnew = pd.read_csv("sample.csv", na_values = ["", "NA"], keep_default_na = False)
# Drop the high-cardinality columns that were removed from X in Step-4.
# errors="ignore" lets a sample file that does not contain those columns pass
# through instead of raising KeyError.
xnew = xnew.drop(columns = high_card.index, errors = "ignore")
# Reuse the already-fitted preprocessor and selector; nothing is re-fitted here.
xnew_pre = pre.transform(xnew)
xnew_pre_sel = sel.transform(xnew_pre)
preds = model_1.predict(xnew_pre_sel)
print('\nOut of Sample: Predicted Weights...')
print(preds[0:2])


Out of Sample: Predicted Weights...
[3309.72705212 2652.99494296]


In [37]:
#Step-11: Save & Load Model
import joblib

# Persistence is currently disabled: the calls below are commented out, so this
# cell only imports joblib. Step-10 inference uses all three fitted objects
# (preprocessor, feature selector, model), so they are saved as a set.
# joblib.dump(pre, "pre.joblib")
# joblib.dump(sel, "sel.joblib")
# joblib.dump(model_1, "car_model.joblib")

# NOTE(review): joblib files are pickle-based; only load artifacts from a trusted source.
# joblib.load("car_model.joblib")