##### `Ridge & Lasso`
1. Ridge (L2) penalty is applied to the squares of the coefficients.
2. Lasso (L1) penalty is applied to the absolute values of the coefficients.

Problem Statement: Estimate the `Weight` column from the other columns in the file.

In [None]:
# Install a blanket "ignore" filter so warnings are not shown from here on.
from warnings import filterwarnings
filterwarnings("ignore")

#----------------------
#Step-1: Data Ingestion
#----------------------
# Only the empty string and the literal "NA" are read as missing values;
# pandas' built-in NA markers are switched off (keep_default_na = False).

import pandas as pd
df = pd.read_csv(
    "Cars93.csv",
    na_values = ["", "NA"],
    keep_default_na = False,
)

#----------------------------------------
#Step-2: Data Sanity - Duplicates Removal
#----------------------------------------

# Count of rows that exactly repeat an earlier row.
duplicate_count = df.duplicated().sum()

if duplicate_count == 0:
    print('No Duplicates Found')
else:
    print(f'Duplicates Found: {duplicate_count}', 'Removing Duplicates...')
    # Keep the first occurrence of each row and renumber the index from 0.
    df = df.drop_duplicates(keep = "first").reset_index(drop = True)
    print('Removed Duplicates...')

#Step-3: Separate X and Y
# Features are every column except "id" and the target; the target is "Weight".
X = df.drop(columns = ["id", "Weight"])
Y = df["Weight"]

#-----------------------------------
#Step-4: Remove High Unique Cat Cols
#-----------------------------------

# Cardinality ratio = distinct values / number of rows, per text column.
# It is measured on X rather than df so that every flagged column is
# guaranteed to still exist in X: df also contains "id" and "Weight", which
# were removed above, and flagging either of them would make the drop below
# raise KeyError.
card = X.select_dtypes(include = "object").nunique() / len(X)
# Text columns whose values are distinct in at least 90% of the rows.
high_card = card[card >= 0.9]
X = X.drop(columns = high_card.index)

#------------------------
#Step-5: Train Test Split
#------------------------
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

#--------------------------------
#Step-6: Apply Preprocessing on X
#--------------------------------

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Column groups, selected by dtype.
num_cols = X.select_dtypes(include = "number").columns
cat_cols = X.select_dtypes(include = "object").columns

# Numeric columns: fill missing values with the median, then standardise.
num_pipe = make_pipeline(
    SimpleImputer(strategy = "median"),
    StandardScaler(),
)

# Text columns: fill missing values with the most frequent value, encode each
# category as an integer (categories not seen during fit are encoded as -1),
# then standardise the resulting codes.
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    StandardScaler(),
)

pre = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])
# Ask the transformer to return pandas DataFrames from transform().
pre.set_output(transform = "pandas")

# Fit on the training rows only ...
pre.fit(xtrain)

# ... then apply the fitted transformer to both splits.
xtrain_pre = pre.transform(xtrain)
xtest_pre = pre.transform(xtest)

#-------------------
#Step-7: Build Model
#-------------------

##### `Ridge (Penalty applied on square of coefficient)`
#------------------------------------------------------

from sklearn.linear_model import Ridge
# Baseline Ridge fit with alpha fixed at 1. The name `ridge_model` is rebound
# to an unfitted estimator in Step-8, so this fit does not feed the search.
ridge_model = Ridge(alpha=1)
ridge_model.fit(xtrain_pre, ytrain)

#-----------------------------
#Step-8: Hyperparameter tuning
#-----------------------------
from sklearn.model_selection import GridSearchCV
# Penalty strengths to compare.
params = {"alpha": [0.1, 0.2, 1, 10, 50, 100, 200, 500]}
ridge_model = Ridge()

# 5-fold cross-validated search over alpha, scored by R2.
gscv_ridge = GridSearchCV(
    estimator=ridge_model,
    param_grid=params,
    scoring="r2",
    cv=5,
)
gscv_ridge.fit(xtrain_pre, ytrain)

best_ridge = gscv_ridge.best_estimator_

# Predictions of the selected model on both splits (previewed further down).
ytrain_pred = best_ridge.predict(xtrain_pre)
ytest_pred = best_ridge.predict(xtest_pre)

# Scores of the selected model on the train and test splits.
print(f'[Ridge Model]--Best Ridge Train Score: {best_ridge.score(xtrain_pre, ytrain)}')
print(f'[Ridge Model]--Best Ridge Test Score: {best_ridge.score(xtest_pre, ytest)}')

# Outcome of the grid search itself.
print(f'[GridSearchCV Ridge]--Best parameters: {gscv_ridge.best_params_}')
print(f'[GridSearchCV Ridge]--Best Score: {gscv_ridge.best_score_}')
print(f'[GridSearchCV Ridge]--Best Estimator: {best_ridge}')

# First two predictions from each split.
print(f'[Ridge Model]--ytrain pred with [Best Ridge Model]: {ytrain_pred[0:2]}')
print(f'[Ridge Model]--ytest pred with [Best Ridge Model]: {ytest_pred[0:2]}')

#----------------------
#Step-9: Evaluate Model
#----------------------

from sklearn.metrics import (root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score)

def evaluate_metrics(model, x, y):
    """Summarise a fitted model's regression errors on one data split.

    Args:
        model: Object exposing ``predict(x)``.
        x: Features passed to ``model.predict``.
        y: True target values compared against the predictions.

    Returns:
        A single-line string holding RMSE and MAE (two decimals) followed by
        MAPE and R2 (formatted as percentages with two decimals).
    """
    predicted = model.predict(x)

    # Each metric is computed as metric(true values, predictions).
    rmse = root_mean_squared_error(y, predicted)
    mae = mean_absolute_error(y, predicted)
    mape = mean_absolute_percentage_error(y, predicted)
    r2 = r2_score(y, predicted)

    return f"[RMSE : {rmse:.2f}| MAE : {mae:.2f}| MAPE : {mape:.2%}| R2 : {r2:.2%}]"

# Error metrics of the tuned Ridge model on each split.
print(
    "[Ridge Model]--Train Results : ",
    evaluate_metrics(best_ridge, xtrain_pre, ytrain),
)
print(
    "[Ridge Model]--Test Results : ",
    evaluate_metrics(best_ridge, xtest_pre, ytest),
)

#---------------------------------------------------------------
#Step-10: Model Inference (Out of Sample Prediction) Using Ridge
#---------------------------------------------------------------
# Read the new rows with the same missing-value rules as the training file,
# then remove the high-cardinality text columns that were removed from X.
xnew = pd.read_csv(
    "sample.csv",
    na_values = ["", "NA"],
    keep_default_na = False,
).drop(columns = high_card.index)

# Reuse the already-fitted preprocessor; it is not refitted on the new rows.
xnew_pre = pre.transform(xnew)
preds = best_ridge.predict(xnew_pre)
print(f'[Ridge Model]--Out of Sample: Predicted Weights (First 2) : {preds[0:2].round(2)}')
print('\n')

#-------------------------------------------------------------------
##### `Lasso - L1 (Penalty applied on absolute value of coefficients)`
#--------------------------------------------------------------------

from sklearn.linear_model import Lasso
# Baseline Lasso fit with alpha fixed at 0.1. The name `lasso_model` is
# rebound to an unfitted estimator just below, so this fit does not feed the
# grid search.
lasso_model = Lasso(alpha= 0.1)
lasso_model.fit(xtrain_pre, ytrain)

# Penalty strengths to compare.
params = {"alpha": [0.1, 1, 10, 100, 200, 500, 1000]}
lasso_model = Lasso()

# 5-fold cross-validated search over alpha, scored by R2.
gscv_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=params,
    cv=5,
    scoring="r2",
)
gscv_lasso.fit(xtrain_pre, ytrain)

best_lasso = gscv_lasso.best_estimator_

# Predictions of the selected model on both splits (previewed below).
ytrain_pred = best_lasso.predict(xtrain_pre)
ytest_pred = best_lasso.predict(xtest_pre)

# Outcome of the grid search.
print(f'[GridSearchCV Lasso]--Best Parameters: {gscv_lasso.best_params_}')
print(f'[GridSearchCV Lasso]--Best Score: {gscv_lasso.best_score_}')
print(f'[GridSearchCV Lasso]--Best Estimator: {best_lasso}')

# Scores of the selected model on the train and test splits.
print(f'[Lasso Model]--Best Lasso Train Score: {best_lasso.score(xtrain_pre, ytrain)}')
print(f'[Lasso Model]--Best Lasso Test Score: {best_lasso.score(xtest_pre, ytest)}')

# First two predictions from each split.
print(f'[Lasso Model]--ytrain pred with [Best Lasso Model]: {ytrain_pred[0:2]}')
print(f'[Lasso Model]--ytest pred with [Best Lasso Model]: {ytest_pred[0:2]}')

# Evaluate Lasso for Train & Test
print(
    "[Lasso Model]--Train Results:",
    evaluate_metrics(best_lasso, xtrain_pre, ytrain),
)
print(
    "[Lasso Model]--Test Results :",
    evaluate_metrics(best_lasso, xtest_pre, ytest),
)

#Model Inference (Out of sample) for Lasso
# Read the new rows with the same missing-value rules as the training file.
xnew = pd.read_csv("sample.csv", na_values = ["", "NA"], keep_default_na=False)
# Remove the high-cardinality text columns, matching the Ridge inference step.
xnew = xnew.drop(columns = high_card.index)
# Bind the transformed rows to `xnew_pre`, the name predict() reads on the next
# line. The earlier code stored them under a different name, so the prediction
# silently depended on a leftover `xnew_pre` from the Ridge step.
xnew_pre = pre.transform(xnew)
preds = best_lasso.predict(xnew_pre)
print(f'[Lasso Model]--Out of Sample: Predicted Weights (First 2) : {preds[0:2].round(2)}')

#--------------------------
#Step-11: Save & Load Model
#--------------------------
import joblib

#Save Models
# Write the artifacts fitted in this run before reading them back. With the
# dumps disabled, the loads below fail with FileNotFoundError on a fresh
# checkout, or silently return stale artifacts left by an earlier run.
joblib.dump(pre, "pre.joblib")
joblib.dump(best_ridge, "ridge_model.joblib")
joblib.dump(best_lasso, "lasso_model.joblib")

# Load Models
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that you produced yourself or otherwise trust.
p = joblib.load("pre.joblib")
best_ridge_model = joblib.load("ridge_model.joblib")
best_lasso_model = joblib.load("lasso_model.joblib")


Duplicates Found: 1 Removing Duplicates...
Removed Duplicates...
[Ridge Model]--Best Ridge Train Score: 0.9632962078121209
[Ridge Model]--Best Ridge Test Score: 0.9331374344753898
[GridSearchCV Ridge]--Best parameters: {'alpha': 10}
[GridSearchCV Ridge]--Best Score: 0.928904369376218
[GridSearchCV Ridge]--Best Estimator: Ridge(alpha=10)
[Ridge Model]--ytrain pred with [Best Ridge Model]: [3810.53246426 3699.62408947]
[Ridge Model]--ytest pred with [Best Ridge Model]: [2942.38293133 2317.9397172 ]
[Ridge Model]--Train Results :  [RMSE : 112.67| MAE : 88.04| MAPE : 2.89%| R2 : 96.33%]
[Ridge Model]--Test Results :  [RMSE : 148.59| MAE : 113.32| MAPE : 3.96%| R2 : 93.31%]
[Ridge Model]--Out of Sample: Predicted Weights (First 2) : [3281.88 2679.53]


[GridSearchCV Lasso]--Best Parameters: {'alpha': 1}
[GridSearchCV Lasso]--Best Score: 0.929679262747895
[GridSearchCV Lasso]--Best Estimator: Lasso(alpha=1)
[Lasso Model]--Best Lasso Train Score: 0.9701324385598128
[Lasso Model]--Best Lasso T