In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from sklearn.kernel_ridge import KernelRidge
%matplotlib inline
from sklearn.metrics import mean_squared_log_error
from sklearn.neighbors import KNeighborsRegressor

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load the preprocessed customer table (dummy variables were already created
# with get_dummies during the preprocessing phase).
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels_with_dummies.csv")

# Behavioural predictors measured over Day 1-15.
behaviour_columns = [
    'MemberSince',
    'Recency (# Days ago from last trans. As of final day up to Day 15)',
    'Frequency (# Trans. from Day 1-15)',
    'Monetary Value (Sum of Trans. from Day 1-15)',
    '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
    '# of Marketing offers that were viewed from Day 1-15',
    '# of Marketing offers that were successfully completed from Day 1-15',
    'Marketing Offer View Rate from Day 1-15',
    'Marketing Offer Response Rate from Day 1-15',
]

# Demographic dummies. The Age_Unknown, Gender_Unknown and HH Income_Unknown
# levels are deliberately left out to reduce multicollinearity.
age_columns = ['Age_18-34', 'Age_35-50', 'Age_51-67', 'Age_68-84', 'Age_85-101']
gender_columns = ['Gender_F', 'Gender_M', 'Gender_O']
income_columns = ['HH Income_100k-120k', 'HH Income_30k-50k',
                  'HH Income_50k-75k', 'HH Income_75k-100k']

# Feature matrix: the column order here is the order the models are fitted on.
feature_columns = behaviour_columns + age_columns + gender_columns + income_columns
X = imputed_data[feature_columns]

# Target: total transaction amount over Day 16-30.
target_column = 'Sum(Trans. Amt from Day 16-30)'
y = imputed_data[target_column]

# One seed shared by every stochastic step in this section (the train/test
# split, the CV shuffle and the randomised learners), so that
# Restart Kernel -> Run All reproduces the same results table.
RANDOM_STATE = 42

# Define the models (library defaults, plus a fixed seed where the estimator
# is randomised).
# NOTE(review): SVR, MLP, KNN, Kernel Ridge and the penalised linear models
# are sensitive to feature scale and are fitted on unscaled features here;
# consider wrapping them in a scaling pipeline before comparing them.
mlr = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elastic = ElasticNet()
dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
rf = RandomForestRegressor(random_state=RANDOM_STATE)
xgb = XGBRegressor(random_state=RANDOM_STATE)
svr = SVR()
lgbm = LGBMRegressor(random_state=RANDOM_STATE)
mlp = MLPRegressor(random_state=RANDOM_STATE)
kr = KernelRidge()
knn = KNeighborsRegressor()

# Base estimators of the stacked ensemble. These are fresh instances, kept
# separate from the stand-alone rf / xgb / dt models above.
base_estimators = [('rf', RandomForestRegressor(random_state=RANDOM_STATE)),
                   ('xgb', XGBRegressor(random_state=RANDOM_STATE)),
                   ('dt', DecisionTreeRegressor(random_state=RANDOM_STATE))]

# Meta-learner that combines the base estimators' predictions. It is itself a
# small stack: RidgeCV and LassoCV are blended by StackingRegressor's default
# final estimator.
final_estimator = StackingRegressor(estimators=[('ridge', RidgeCV()),
                                                 ('lasso', LassoCV())])

# Two-level stacked ensemble: base estimators feeding the meta-learner.
stacked = StackingRegressor(estimators=base_estimators, final_estimator=final_estimator)

# Hold out 30% of the rows as a test set; the seed keeps the split stable
# across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# 5-fold shuffled cross-validation; seeded so every model and every run sees
# the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Registry of models to evaluate. Insertion order is the row order of the
# results table.
models = {
    'MLR': mlr,
    'Ridge': ridge,
    'Lasso': lasso,
    'Elastic Net': elastic,
    'Decision Tree': dt,
    'Random Forest': rf,
    'XGBoost': xgb,
    'SVR': svr,
    'Stacked Ensemble': stacked,
    'LGBM': lgbm,
    'MLP': mlp,
    'Kernel Ridge': kr,
    'KNN': knn,
}

# Track the best model and its MAE. The winner is chosen on the mean
# cross-validated MAE of the training split, NOT on the test MAE: picking the
# model on the held-out set would use that set for model selection and make
# its reported error optimistically biased. The test MAE is still computed
# and reported for every model.
best_model_mae = None
best_mae = np.inf

metrics_dict = {}
for name, model in tqdm(models.items(), desc="Processing models"):
    # Per-fold MAE from cross-validation on the training split. The scorer
    # returns negated MAE, hence the sign flip.
    mae_train = -cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_absolute_error')
    mae_cv_mean = mae_train.mean()

    # Refit on the full training split and score once on the held-out test set.
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)

    # Row layout: [mean CV MAE on the training split, test MAE].
    metrics_dict[name] = [mae_cv_mean, mae_test]

    # Keep the model with the lowest cross-validated MAE (first one wins ties).
    if mae_cv_mean < best_mae:
        best_model_mae = name
        best_mae = mae_cv_mean

# Create a DataFrame and print the results.
# 'MAE Train' holds the mean cross-validated MAE on the training split;
# 'MAE Test' holds the MAE on the held-out test set.
metrics_df = pd.DataFrame(metrics_dict, index=['MAE Train', 'MAE Test']).T
for column in ('MAE Train', 'MAE Test'):
    # Format as dollar amounts; the columns hold strings from here on.
    metrics_df[column] = metrics_df[column].apply(lambda x: f'${x:.2f}')
metrics_df.index.name = 'Model'
print(metrics_df.to_string())

# Print the best model for MAE (selected on cross-validated MAE)
print(f"\nBest Model (based on MAE): {best_model_mae}")


Processing models: 100%|███████████████████████████████████████████████████████████████████| 13/13 [05:59<00:00, 27.64s/it]

                 MAE Train MAE Test
Model                              
MLR                 $39.11   $40.65
Ridge               $39.08   $40.65
Lasso               $39.89   $41.37
Elastic Net         $42.08   $43.57
Decision Tree       $54.45   $53.03
Random Forest       $38.77   $39.51
XGBoost             $39.31   $40.48
SVR                 $45.03   $46.28
Stacked Ensemble    $38.12   $39.03
LGBM                $36.30   $37.02
MLP                 $37.44   $40.74
Kernel Ridge        $39.48   $40.98
KNN                 $40.27   $42.06

Best Model (based on MAE): LGBM





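# The winning model is LGBM: it has the lowest MAE both in cross-validation on the training split and on the held-out test set.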