### Utilizes Train Test Split, KFold CV=5, shuffle=True, and RandomSearch

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from lightgbm import LGBMRegressor
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels_with_dummies.csv")

In [2]:
imputed_data=imputed_data.drop(['Unnamed: 0'],axis=1)
imputed_data.head()
# Log transform "Sum of Trans. from Day 1-15"
imputed_data['Monetary Value (Sum of Trans. from Day 1-15)'] = np.log(imputed_data['Monetary Value (Sum of Trans. from Day 1-15)'])

# Log transform "Sum(Trans. Amt from Day 16-30)"
imputed_data['Sum(Trans. Amt from Day 16-30)'] = np.log(imputed_data['Sum(Trans. Amt from Day 16-30)'])

In [3]:
threshold = 0.001  # Define your threshold value here

# Filter and drop rows with values close to zero or negative in "Sum of Trans. from Day 1-15" column
imputed_data = imputed_data[imputed_data['Monetary Value (Sum of Trans. from Day 1-15)'] > threshold]

# Filter and drop rows with values close to zero or negative in "Sum(Trans. Amt from Day 16-30)" column
imputed_data = imputed_data[imputed_data['Sum(Trans. Amt from Day 16-30)'] > threshold]

In [4]:
imputed_data = pd.get_dummies(imputed_data)

In [5]:
# After calling get dummies in the preprocessing phase,
# Drop Gender_Unknown, Age_Unknown, HH Income_Unknown to reduce multicollinearity
# Prepare the data
X = imputed_data[['MemberSince',
                  'Recency (# Days ago from last trans. As of final day up to Day 15)',
                  'Frequency (# Trans. from Day 1-15)',
                  'Monetary Value (Sum of Trans. from Day 1-15)',
                  '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
                  '# of Marketing offers that were viewed from Day 1-15',
                  '# of Marketing offers that were successfully completed from Day 1-15',
                  'Marketing Offer View Rate from Day 1-15',
                  'Marketing Offer Response Rate from Day 1-15',
                  'Age_18-34', 'Age_35-50', 'Age_51-67',
                  'Age_68-84', 'Age_85-101', 'Gender_F', 'Gender_M',  # 'Age_Unknown',
                  'Gender_O', 'HH Income_100k-120k',  # 'Gender_Unknown','HH Income_Unknown'
                  'HH Income_30k-50k', 'HH Income_50k-75k', 'HH Income_75k-100k']]

y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, -1],
    'num_leaves': [31, 63, 127],
}

# Perform hyperparameter tuning using randomized search and cross-validation
model = LGBMRegressor()
random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=KFold(n_splits=5, shuffle=True),
                                   scoring='neg_mean_absolute_error')
random_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

print("Best Model Parameters:")
print(best_params)

Best Model Parameters:
{'num_leaves': 63, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.01}


In [6]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]

# Calculate actual and predicted CLV quantiles on the test set
actual_clv_quantiles = np.quantile(y_test, quantiles)
predicted_clv_quantiles = np.quantile(best_model.predict(X_test), quantiles)

actual_clv_quantiles=[]

In [7]:
actual_clv_quantiles = [10.81,26.00, 54.95, 99.23, 135.07, 169.55,435.80]
predicted_clv_quantiles = np.exp(predicted_clv_quantiles)

In [8]:
actual_clv_quantiles
predicted_clv_quantiles

array([ 11.20503663,  30.78521506,  62.64078759,  85.54311423,
        94.59935125,  99.94300934, 110.88686861])

In [9]:
# Create a DataFrame with the results
results = pd.DataFrame({
    'Quantile': ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'],
    'Avg. Actual CLV': actual_clv_quantiles,
    'Avg. Predicted CLV': predicted_clv_quantiles.round(2)
})
predicted_clv_quantiles

array([ 11.20503663,  30.78521506,  62.64078759,  85.54311423,
        94.59935125,  99.94300934, 110.88686861])

In [10]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]
quantile_labels = ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%']

# Create a DataFrame to store MAE and MAPE by record in each quantile
clv_by_quantile = pd.DataFrame(columns=['Quantile', 'MAE', 'MAPE'])

# Iterate over each quantile
for i in range(len(quantiles)):
    # Select the data points within the quantile range
    if i == len(quantiles) - 1:
        quantile_mask = (y_test >= np.quantile(y_test, quantiles[i]))
    else:
        quantile_mask = (y_test >= np.quantile(y_test, quantiles[i])) & (y_test <= np.quantile(y_test, quantiles[i + 1]))
    quantile_X = X_test[quantile_mask]
    quantile_y = y_test[quantile_mask]
    quantile_pred = best_model.predict(quantile_X)

    # Ensure quantile_y and quantile_pred have the same length
    min_len = min(len(quantile_y), len(quantile_pred))
    quantile_y = quantile_y[:min_len]
    quantile_pred = quantile_pred[:min_len]

    # Convert actual and predicted CLV from logged scale to original scale
    quantile_y = np.exp(quantile_y)
    quantile_pred = np.exp(quantile_pred)

    # Calculate MAE and MAPE for each record
    mae = abs(quantile_y - quantile_pred)
    mape = abs((quantile_y - quantile_pred) / quantile_y) * 100

    # Create a DataFrame for the quantile records
    quantile_df = pd.DataFrame({
        'Quantile': [quantile_labels[i]] * min_len,
        'MAE': mae,
        'MAPE': mape
    })

    # Append the quantile DataFrame to the overall CLV by quantile DataFrame
    clv_by_quantile = clv_by_quantile.append(quantile_df, ignore_index=True)

# Calculate the average for each quantile
average_by_quantile = clv_by_quantile.groupby('Quantile').mean()

# Round MAE to two decimal places and MAPE to one decimal place
average_by_quantile['MAE'] = average_by_quantile['MAE'].round(2)
average_by_quantile['MAPE'] = average_by_quantile['MAPE'].round(1)

# Sort quantiles based on MAE in ascending order, with 'Bottom 20%' first
average_by_quantile = average_by_quantile.reindex(index=['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'])

# Join the results DataFrame and average by quantile DataFrame on 'Quantile' column
joined_df = pd.merge(results, average_by_quantile, on='Quantile')

# Display the joined DataFrame
joined_df

Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,10.81,11.21,16.94,88.6
1,20-40%,26.0,30.79,22.33,55.5
2,40-60%,54.95,62.64,20.89,26.8
3,60-80%,99.23,85.54,41.41,34.0
4,80-90%,135.07,94.6,74.27,47.1
5,90-95%,169.55,99.94,151.27,62.0
6,95-99%,435.8,110.89,657.88,88.9


In [11]:
# Round MAE to two decimal places and MAPE to one decimal place
average_by_quantile['MAE'] = average_by_quantile['MAE'].round(2).apply(lambda x: '{:,.2f}'.format(x))
average_by_quantile['MAPE'] = average_by_quantile['MAPE'].round(1).apply(lambda x: '{:.1f}%'.format(x))

# Sort quantiles based on MAE in ascending order, with 'Bottom 20%' first
average_by_quantile = average_by_quantile.reindex(index=['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'])

# Join the results DataFrame and average by quantile DataFrame on 'Quantile' column
joined_df = pd.merge(results, average_by_quantile, on='Quantile')

# Add dollar sign to Avg. Actual CLV and Avg. Predicted CLV columns
joined_df['Avg. Actual CLV'] = joined_df['Avg. Actual CLV'].apply(lambda x: '${:,.2f}'.format(x))
joined_df['Avg. Predicted CLV'] = joined_df['Avg. Predicted CLV'].apply(lambda x: '${:,.2f}'.format(x))

# Display the modified joined DataFrame
joined_df

Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,$10.81,$11.21,16.94,88.6%
1,20-40%,$26.00,$30.79,22.33,55.5%
2,40-60%,$54.95,$62.64,20.89,26.8%
3,60-80%,$99.23,$85.54,41.41,34.0%
4,80-90%,$135.07,$94.60,74.27,47.1%
5,90-95%,$169.55,$99.94,151.27,62.0%
6,95-99%,$435.80,$110.89,657.88,88.9%
