### Includes RandomSearch, Train Test Split, KFold CV=5, shuffle=True 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels_95percent.csv")

In [2]:
imputed_data=imputed_data.drop(['Unnamed: 0','Unnamed: 0.1','Customer ID','CLV_Quantile'],axis=1)
imputed_data.head()

Unnamed: 0,Age,Gender,HH Income,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30)
0,18-34,M,50k-75k,2017,5,1,22.16,2.0,1.0,0.0,0.5,0.0,105.44
1,Unknown,Unknown,Unknown,2018,2,1,0.7,1.0,1.0,0.0,1.0,0.0,3.39
2,35-50,O,50k-75k,2018,4,2,25.42,3.0,3.0,1.0,1.0,0.33,54.04
3,51-67,F,75k-100k,2016,8,4,98.33,3.0,1.0,1.0,0.33,0.33,98.53
4,18-34,F,50k-75k,2016,2,5,62.86,2.0,2.0,1.0,1.0,0.5,91.19


In [3]:
imputed_data = pd.get_dummies(imputed_data)

imputed_data

Unnamed: 0,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30),...,Age_Unknown,Gender_F,Gender_M,Gender_O,Gender_Unknown,HH Income_100k-120k,HH Income_30k-50k,HH Income_50k-75k,HH Income_75k-100k,HH Income_Unknown
0,2017,5,1,22.160,2.0,1.0,0.0,0.50,0.00,105.440,...,0,0,1,0,0,0,0,1,0,0
1,2018,2,1,0.700,1.0,1.0,0.0,1.00,0.00,3.390,...,1,0,0,0,1,0,0,0,0,1
2,2018,4,2,25.420,3.0,3.0,1.0,1.00,0.33,54.040,...,0,0,0,1,0,0,0,1,0,0
3,2016,8,4,98.330,3.0,1.0,1.0,0.33,0.33,98.530,...,0,1,0,0,0,0,0,0,1,0
4,2016,2,5,62.860,2.0,2.0,1.0,1.00,0.50,91.190,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15037,2015,1,8,142.379,3.0,2.0,2.0,0.67,0.67,171.239,...,0,1,0,0,0,0,0,0,1,0
15038,2017,3,3,16.050,2.0,2.0,1.0,1.00,0.50,13.890,...,0,0,1,0,0,0,0,1,0,0
15039,2018,14,1,0.640,0.0,0.0,0.0,0.00,0.00,11.510,...,0,0,1,0,0,0,0,1,0,0
15040,2017,1,7,42.500,2.0,2.0,1.0,1.00,0.50,46.330,...,0,0,1,0,0,0,1,0,0,0


In [4]:
# After calling get dummies in the preprocessing phase,
# Drop Gender_Unknown, Age_Unknown, HH Income_Unknown to reduce multicollinearity
# Prepare the data
X = imputed_data[['MemberSince',
                  'Recency (# Days ago from last trans. As of final day up to Day 15)',
                  'Frequency (# Trans. from Day 1-15)',
                  'Monetary Value (Sum of Trans. from Day 1-15)',
                  '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
                  '# of Marketing offers that were viewed from Day 1-15',
                  '# of Marketing offers that were successfully completed from Day 1-15',
                  'Marketing Offer View Rate from Day 1-15',
                  'Marketing Offer Response Rate from Day 1-15',
                  'Age_18-34', 'Age_35-50', 'Age_51-67',
                  'Age_68-84', 'Age_85-101', 'Gender_F', 'Gender_M',  # 'Age_Unknown',
                  'Gender_O', 'HH Income_100k-120k',  # 'Gender_Unknown','HH Income_Unknown'
                  'HH Income_30k-50k', 'HH Income_50k-75k', 'HH Income_75k-100k']]

y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, -1],
    'num_leaves': [31, 63, 127],
}

# Perform hyperparameter tuning using randomized search and cross-validation
model = LGBMRegressor()
random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=KFold(n_splits=5, shuffle=True),
                                   scoring='neg_mean_absolute_error')
random_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

In [5]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]

# Calculate actual and predicted CLV quantiles on the test set
actual_clv_quantiles = np.quantile(y_test, quantiles)
predicted_clv_quantiles = np.quantile(best_model.predict(X_test), quantiles)

# Calculate evaluation metrics for each quantile
mae = []
mape = []

# Add tqdm progress bar
with tqdm(total=len(quantiles), desc="Calculating Metrics") as pbar:
    for i in range(len(quantiles)):
        # Select the data points within the quantile range
        if i == len(quantiles) - 1:
            quantile_mask = (y_test >= actual_clv_quantiles[i])
        else:
            quantile_mask = (y_test >= actual_clv_quantiles[i]) & (y_test <= actual_clv_quantiles[i + 1])
        quantile_X = X_test[quantile_mask]
        quantile_y = y_test[quantile_mask]

        # Calculate evaluation metrics for the quantile
        mae.append(mean_absolute_error(quantile_y, best_model.predict(quantile_X)))
        mape.append(np.mean(np.abs((quantile_y - best_model.predict(quantile_X)) / quantile_y)) * 100)
        
        # Update the tqdm progress bar
        pbar.update(1)

# Create a DataFrame with the results
results = pd.DataFrame({
    'Quantile': ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'],
    'Avg. Actual CLV': actual_clv_quantiles.round(2),
    'Avg. Predicted CLV': predicted_clv_quantiles.round(2),
    'MAE': mae,
    'MAPE': mape
})

# Format the columns
results['Avg. Actual CLV'] = '$' + results['Avg. Actual CLV'].astype(str)
results['Avg. Predicted CLV'] = '$' + results['Avg. Predicted CLV'].astype(str)
results['MAE'] = results['MAE'].round(2)
results['MAPE'] = results['MAPE'].round(1).astype(str) + '%'

print("Best Model Parameters:")
print(best_params)

# Display the results DataFrame
results

Calculating Metrics: 100%|███████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 60.07it/s]

Best Model Parameters:
{'num_leaves': 31, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}





Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,$10.81,$15.51,18.71,105.8%
1,20-40%,$26.0,$36.77,27.14,71.7%
2,40-60%,$54.95,$72.06,19.88,27.7%
3,60-80%,$99.23,$93.89,30.8,26.2%
4,80-90%,$135.07,$102.38,59.3,39.1%
5,90-95%,$169.55,$109.17,78.48,45.8%
6,95-99%,$171.24,$118.5,78.67,45.9%
