In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from tqdm import tqdm
import lightgbm as lgb
import matplotlib.pyplot as plt


# Load the data
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels_with_dummies.csv")

# Prepare the data
# After calling get dummies in the preprocessing phase,
# Drop Gender_Unknown, Age_Unknown, HH Income_Unknown to reduce multicollinearity
# Prepare the data
X = imputed_data[['MemberSince',
                  'Recency (# Days ago from last trans. As of final day up to Day 15)',
                  'Frequency (# Trans. from Day 1-15)',
                  'Monetary Value (Sum of Trans. from Day 1-15)',
                  '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
                  '# of Marketing offers that were viewed from Day 1-15',
                  '# of Marketing offers that were successfully completed from Day 1-15',
                  'Marketing Offer View Rate from Day 1-15',
                  'Marketing Offer Response Rate from Day 1-15',
                  'Age_18-34', 'Age_35-50', 'Age_51-67',
                  'Age_68-84', 'Age_85-101', 'Gender_F', 'Gender_M',  # 'Age_Unknown',
                  'Gender_O', 'HH Income_100k-120k',  # 'Gender_Unknown','HH Income_Unknown'
                  'HH Income_30k-50k', 'HH Income_50k-75k', 'HH Income_75k-100k']]

y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, -1],
    'num_leaves': [31, 63, 127],
}

# Perform hyperparameter tuning using randomized search and cross-validation
model = LGBMRegressor()
random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=KFold(n_splits=5, shuffle=True),
                                   scoring='neg_mean_absolute_error')
random_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Train the best model on the entire training set
best_model.fit(X_train, y_train)

In [2]:
# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]

# Calculate actual and predicted CLV quantiles on the test set
actual_clv_quantiles = np.quantile(y_test, quantiles)
predicted_clv_quantiles = np.quantile(best_model.predict(X_test), quantiles)

# Calculate evaluation metrics for each quantile
mae = []
mape = []

# Add tqdm progress bar
with tqdm(total=len(quantiles), desc="Calculating Metrics") as pbar:
    for i in range(len(quantiles)):
        # Select the data points within the quantile range
        if i == len(quantiles) - 1:
            quantile_mask = (y_test >= actual_clv_quantiles[i])
        else:
            quantile_mask = (y_test >= actual_clv_quantiles[i]) & (y_test <= actual_clv_quantiles[i + 1])
        quantile_X = X_test[quantile_mask]
        quantile_y = y_test[quantile_mask]

        # Calculate evaluation metrics for the quantile
        mae.append(mean_absolute_error(quantile_y, best_model.predict(quantile_X)))
        mape.append(np.mean(np.abs((quantile_y - best_model.predict(quantile_X)) / quantile_y)) * 100)
        
        # Update the tqdm progress bar
        pbar.update(1)

# Create a DataFrame with the results
results = pd.DataFrame({
    'Quantile': ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'],
    'Avg. Actual CLV': actual_clv_quantiles.round(2),
    'Avg. Predicted CLV': predicted_clv_quantiles.round(2),
    'MAE': mae,
    'MAPE': mape
})

# Format the columns
results['Avg. Actual CLV'] = '$' + results['Avg. Actual CLV'].astype(str)
results['Avg. Predicted CLV'] = '$' + results['Avg. Predicted CLV'].astype(str)
results['MAE'] = results['MAE'].round(2)
results['MAPE'] = results['MAPE'].round(1).astype(str) + '%'

print("Best Model Parameters:")
print(best_params)

# Display the results DataFrame
results


Calculating Metrics: 100%|████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 52.28it/s]

Best Model Parameters:
{'num_leaves': 127, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1}





Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,$10.81,$18.01,22.83,129.8%
1,20-40%,$26.0,$40.38,33.33,87.4%
2,40-60%,$54.95,$82.1,26.57,37.2%
3,60-80%,$99.23,$107.49,24.76,21.2%
4,80-90%,$135.07,$117.79,46.28,30.5%
5,90-95%,$169.55,$126.67,109.94,47.7%
6,95-99%,$435.8,$144.45,631.54,85.7%


In [8]:
# Get the users in the top quantile
top_quantile_mask = (y_test >= actual_clv_quantiles[-2])
top_quantile_users = X_test[top_quantile_mask].index

# Get the top 10 predicted CLV and actual CLV
top_10_predicted = best_model.predict(X_test.loc[top_quantile_users])[:10]
top_10_actual = y_test.loc[top_quantile_users][:10].values

# Print the top 10 predicted CLV vs Actual CLV
print("Top 10 Predicted CLV vs Actual CLV:")
for i in range(10):
    print("User", i+1)
    print("Predicted CLV:", top_10_predicted[i])
    print("Actual CLV:", top_10_actual[i])
    print()


Top 10 Predicted CLV vs Actual CLV:
User 1
Predicted CLV: 114.33274089893115
Actual CLV: 170.93

User 2
Predicted CLV: 122.75285032720522
Actual CLV: 201.84

User 3
Predicted CLV: 127.28211358160402
Actual CLV: 195.37

User 4
Predicted CLV: 107.97890699353651
Actual CLV: 219.99

User 5
Predicted CLV: 78.38702566081653
Actual CLV: 350.43

User 6
Predicted CLV: 104.73412764700849
Actual CLV: 171.51

User 7
Predicted CLV: 107.24691956879025
Actual CLV: 436.77999999999986

User 8
Predicted CLV: 120.03444273433334
Actual CLV: 183.9

User 9
Predicted CLV: 115.78451515621349
Actual CLV: 173.68

User 10
Predicted CLV: 143.80690594791253
Actual CLV: 172.54000000000005



In [13]:
# Create a DataFrame to store the top 10 predicted CLV vs actual CLV
top_10_results = pd.DataFrame(columns=['User', 'Predicted CLV', 'Actual CLV', 'Difference'])

# Get the users in the top quantile
top_quantile_mask = (y_test >= actual_clv_quantiles[-2])
top_quantile_users = X_test[top_quantile_mask].index

# Get the top 10 predicted CLV and actual CLV
top_10_predicted = best_model.predict(X_test.loc[top_quantile_users])[:10]
top_10_actual = y_test.loc[top_quantile_users][:10].values

# Add the results to the DataFrame
for i in range(10):
    user = "User " + str(i+1)
    predicted_clv = round(top_10_predicted[i], 2)  # Round predicted CLV to two decimal places
    actual_clv = round(top_10_actual[i], 2)  # Round actual CLV to two decimal places
    difference = round(predicted_clv - actual_clv, 2)
    
    top_10_results = top_10_results.append({'User': user, 'Predicted CLV': '$' + str(predicted_clv), 
                                            'Actual CLV': '$' + str(actual_clv), 'Difference': str(difference)},
                                           ignore_index=True)

# Print the top 10 predicted CLV vs actual CLV DataFrame
print("Top 10 Predicted CLV vs Actual CLV:")
top_10_results


Top 10 Predicted CLV vs Actual CLV:


Unnamed: 0,User,Predicted CLV,Actual CLV,Difference
0,User 1,$114.33,$170.93,-56.6
1,User 2,$122.75,$201.84,-79.09
2,User 3,$127.28,$195.37,-68.09
3,User 4,$107.98,$219.99,-112.01
4,User 5,$78.39,$350.43,-272.04
5,User 6,$104.73,$171.51,-66.78
6,User 7,$107.25,$436.78,-329.53
7,User 8,$120.03,$183.9,-63.87
8,User 9,$115.78,$173.68,-57.9
9,User 10,$143.81,$172.54,-28.73
