### Utilizes Train Test Split, KFold CV=5, shuffle=True, and RandomSearch

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels_95percent_log_transform.csv")

In [2]:
imputed_data=imputed_data.drop(['Unnamed: 0','Unnamed: 0.1','Customer ID','CLV_Quantile'],axis=1)
imputed_data.head()

Unnamed: 0,Age,Gender,HH Income,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30)
0,18-34,M,50k-75k,2017,5,1,3.098289,2.0,1.0,0.0,0.5,0.0,4.658142
1,Unknown,Unknown,Unknown,2018,2,1,-0.356675,1.0,1.0,0.0,1.0,0.0,1.22083
2,35-50,O,50k-75k,2018,4,2,3.235536,3.0,3.0,1.0,1.0,0.33,3.989725
3,51-67,F,75k-100k,2016,8,4,4.588329,3.0,1.0,1.0,0.33,0.33,4.590361
4,18-34,F,50k-75k,2016,2,5,4.14091,2.0,2.0,1.0,1.0,0.5,4.512945


In [3]:
imputed_data = pd.get_dummies(imputed_data)
imputed_data

Unnamed: 0,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30),...,Age_Unknown,Gender_F,Gender_M,Gender_O,Gender_Unknown,HH Income_100k-120k,HH Income_30k-50k,HH Income_50k-75k,HH Income_75k-100k,HH Income_Unknown
0,2017,5,1,3.098289,2.0,1.0,0.0,0.50,0.00,4.658142,...,0,0,1,0,0,0,0,1,0,0
1,2018,2,1,-0.356675,1.0,1.0,0.0,1.00,0.00,1.220830,...,1,0,0,0,1,0,0,0,0,1
2,2018,4,2,3.235536,3.0,3.0,1.0,1.00,0.33,3.989725,...,0,0,0,1,0,0,0,1,0,0
3,2016,8,4,4.588329,3.0,1.0,1.0,0.33,0.33,4.590361,...,0,1,0,0,0,0,0,0,1,0
4,2016,2,5,4.140910,2.0,2.0,1.0,1.00,0.50,4.512945,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15037,2015,1,8,4.958493,3.0,2.0,2.0,0.67,0.67,5.143060,...,0,1,0,0,0,0,0,0,1,0
15038,2017,3,3,2.775709,2.0,2.0,1.0,1.00,0.50,2.631169,...,0,0,1,0,0,0,0,1,0,0
15039,2018,14,1,-0.446287,0.0,0.0,0.0,0.00,0.00,2.443216,...,0,0,1,0,0,0,0,1,0,0
15040,2017,1,7,3.749504,2.0,2.0,1.0,1.00,0.50,3.835790,...,0,0,1,0,0,0,1,0,0,0


In [4]:
# After calling get dummies in the preprocessing phase,
# Drop Gender_Unknown, Age_Unknown, HH Income_Unknown to reduce multicollinearity
# Prepare the data
X = imputed_data[['MemberSince',
                  'Recency (# Days ago from last trans. As of final day up to Day 15)',
                  'Frequency (# Trans. from Day 1-15)',
                  'Monetary Value (Sum of Trans. from Day 1-15)',
                  '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
                  '# of Marketing offers that were viewed from Day 1-15',
                  '# of Marketing offers that were successfully completed from Day 1-15',
                  'Marketing Offer View Rate from Day 1-15',
                  'Marketing Offer Response Rate from Day 1-15',
                  'Age_18-34', 'Age_35-50', 'Age_51-67',
                  'Age_68-84', 'Age_85-101', 'Gender_F', 'Gender_M',  # 'Age_Unknown',
                  'Gender_O', 'HH Income_100k-120k',  # 'Gender_Unknown','HH Income_Unknown'
                  'HH Income_30k-50k', 'HH Income_50k-75k', 'HH Income_75k-100k']]

y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, -1],
    'num_leaves': [31, 63, 127],
}

# Perform hyperparameter tuning using randomized search and cross-validation
model = LGBMRegressor()
random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=KFold(n_splits=5, shuffle=True),
                                   scoring='neg_mean_absolute_error')
random_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

print("Best Model Parameters:")
print(best_params)

Best Model Parameters:
{'num_leaves': 127, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01}


In [5]:
X_test

Unnamed: 0,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Age_18-34,...,Age_51-67,Age_68-84,Age_85-101,Gender_F,Gender_M,Gender_O,HH Income_100k-120k,HH Income_30k-50k,HH Income_50k-75k,HH Income_75k-100k
2894,2016,4,5,2.510412,3.0,3.0,0.0,1.00,0.00,0,...,0,0,0,0,0,0,0,0,0,0
3820,2017,2,12,3.597860,3.0,3.0,0.0,1.00,0.00,0,...,0,0,0,0,1,0,0,1,0,0
14440,2018,0,6,4.350665,2.0,2.0,1.0,1.00,0.50,1,...,0,0,0,1,0,0,0,1,0,0
9902,2018,0,7,4.079400,3.0,3.0,2.0,1.00,0.67,1,...,0,0,0,0,1,0,0,1,0,0
13739,2017,9,2,2.109000,3.0,2.0,0.0,0.67,0.00,0,...,1,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7497,2017,6,1,0.587787,2.0,2.0,0.0,1.00,0.00,0,...,0,0,0,0,1,0,0,1,0,0
12603,2018,3,2,2.337952,1.0,1.0,0.0,1.00,0.00,0,...,1,0,0,0,1,0,0,0,1,0
2685,2015,5,1,0.928219,2.0,1.0,0.0,0.50,0.00,0,...,0,0,0,0,0,0,0,0,0,0
9414,2018,6,1,1.217876,3.0,1.0,0.0,0.33,0.00,0,...,0,0,0,0,1,0,0,0,1,0


In [6]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]

# Calculate actual and predicted CLV quantiles on the test set
actual_clv_quantiles = np.quantile(y_test, quantiles)
predicted_clv_quantiles = np.quantile(best_model.predict(X_test), quantiles)

predicted_clv_quantiles

array([2.18666622, 3.05185627, 3.62960417, 4.09299324, 4.21589531,
       4.27499406, 4.32791187])

In [7]:
actual_clv_quantiles = np.exp(actual_clv_quantiles)
predicted_clv_quantiles = np.exp(predicted_clv_quantiles)
# Create a DataFrame with the results
results = pd.DataFrame({
    'Quantile': ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'],
    'Avg. Actual CLV': actual_clv_quantiles,
    'Avg. Predicted CLV': predicted_clv_quantiles.round(2)
})
predicted_clv_quantiles

array([ 8.90547471, 21.15457663, 37.69789158, 59.9189754 , 67.75480056,
       71.87971231, 75.78587057])

In [8]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]
quantile_labels = ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%']

# Create a DataFrame to store MAE and MAPE by record in each quantile
clv_by_quantile = pd.DataFrame(columns=['Quantile', 'MAE', 'MAPE'])

# Iterate over each quantile
for i in range(len(quantiles)):
    # Select the data points within the quantile range
    if i == len(quantiles) - 1:
        quantile_mask = (y_test >= np.quantile(y_test, quantiles[i]))
    else:
        quantile_mask = (y_test >= np.quantile(y_test, quantiles[i])) & (y_test <= np.quantile(y_test, quantiles[i + 1]))
    quantile_X = X_test[quantile_mask]
    quantile_y = y_test[quantile_mask]
    quantile_pred = best_model.predict(quantile_X)

    # Ensure quantile_y and quantile_pred have the same length
    min_len = min(len(quantile_y), len(quantile_pred))
    quantile_y = quantile_y[:min_len]
    quantile_pred = quantile_pred[:min_len]

    # Convert actual and predicted CLV from logged scale to original scale
    quantile_y = np.exp(quantile_y)
    quantile_pred = np.exp(quantile_pred)

    # Calculate MAE and MAPE for each record
    mae = abs(quantile_y - quantile_pred)
    mape = abs((quantile_y - quantile_pred) / quantile_y) * 100

    # Create a DataFrame for the quantile records
    quantile_df = pd.DataFrame({
        'Quantile': [quantile_labels[i]] * min_len,
        'MAE': mae,
        'MAPE': mape
    })

    # Append the quantile DataFrame to the overall CLV by quantile DataFrame
    clv_by_quantile = clv_by_quantile.append(quantile_df, ignore_index=True)

# Calculate the average for each quantile
average_by_quantile = clv_by_quantile.groupby('Quantile').mean()

# Round MAE to two decimal places and MAPE to one decimal place
average_by_quantile['MAE'] = average_by_quantile['MAE'].round(2)
average_by_quantile['MAPE'] = average_by_quantile['MAPE'].round(1)

# Sort quantiles based on MAE in ascending order, with 'Bottom 20%' first
average_by_quantile = average_by_quantile.reindex(index=['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'])

# Join the results DataFrame and average by quantile DataFrame on 'Quantile' column
joined_df = pd.merge(results, average_by_quantile, on='Quantile')

# Display the joined DataFrame
joined_df

Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,10.81,8.91,10.75,61.8
1,20-40%,26.0,21.15,17.39,45.2
2,40-60%,54.952,37.7,28.96,37.5
3,60-80%,99.23399,59.92,63.45,54.5
4,80-90%,135.069994,67.75,94.92,62.9
5,90-95%,169.549857,71.88,115.48,67.4
6,95-99%,171.239,75.79,115.39,67.4


In [9]:
# Round MAE to two decimal places and MAPE to one decimal place
average_by_quantile['MAE'] = average_by_quantile['MAE'].round(2).apply(lambda x: '{:,.2f}'.format(x))
average_by_quantile['MAPE'] = average_by_quantile['MAPE'].round(1).apply(lambda x: '{:.1f}%'.format(x))

# Sort quantiles based on MAE in ascending order, with 'Bottom 20%' first
average_by_quantile = average_by_quantile.reindex(index=['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'])

# Join the results DataFrame and average by quantile DataFrame on 'Quantile' column
joined_df = pd.merge(results, average_by_quantile, on='Quantile')

# Add dollar sign to Avg. Actual CLV and Avg. Predicted CLV columns
joined_df['Avg. Actual CLV'] = joined_df['Avg. Actual CLV'].apply(lambda x: '${:,.2f}'.format(x))
joined_df['Avg. Predicted CLV'] = joined_df['Avg. Predicted CLV'].apply(lambda x: '${:,.2f}'.format(x))

# Display the modified joined DataFrame
joined_df

Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,$10.81,$8.91,10.75,61.8%
1,20-40%,$26.00,$21.15,17.39,45.2%
2,40-60%,$54.95,$37.70,28.96,37.5%
3,60-80%,$99.23,$59.92,63.45,54.5%
4,80-90%,$135.07,$67.75,94.92,62.9%
5,90-95%,$169.55,$71.88,115.48,67.4%
6,95-99%,$171.24,$75.79,115.39,67.4%
