### Utilizes Train Test Split, KFold CV=5, shuffle=True, and RandomSearch

In [39]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

# Load the data
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels.csv")

# Create X and y
X = imputed_data[['Monetary Value (Sum of Trans. from Day 1-15)']]
y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Calculate quantiles
quantiles = [0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99]

# Calculate actual and predicted CLV quantiles on the test set
actual_clv_quantiles = np.quantile(y_test, quantiles)
predicted_clv_quantiles = np.quantile(X_test, quantiles)

# Calculate evaluation metrics for each quantile
mae = []
mape = []

# Add tqdm progress bar
with tqdm(total=len(quantiles), desc="Calculating Metrics") as pbar:
    for i in range(len(quantiles)):
        # Select the data points within the quantile range
        if i == len(quantiles) - 1:
            quantile_mask = (y_test >= actual_clv_quantiles[i])
        else:
            quantile_mask = (y_test >= actual_clv_quantiles[i]) & (y_test < actual_clv_quantiles[i + 1])
        quantile_X = X_test[quantile_mask]
        quantile_y = y_test[quantile_mask]

        # Calculate evaluation metrics for the quantile
        mae.append(mean_absolute_error(quantile_y, quantile_X))
        mape.append(np.mean(np.abs((quantile_y - quantile_X.squeeze()) / quantile_y)) * 100)
        
        # Update the tqdm progress bar
        pbar.update(1)

# Create a DataFrame with the results
results = pd.DataFrame({
    'Quantile': ['Bottom 20%', '20-40%', '40-60%', '60-80%', '80-90%', '90-95%', '95-99%'],
    'Avg. Actual CLV': actual_clv_quantiles.round(2),
    'Avg. Predicted CLV': predicted_clv_quantiles.round(2),
    'MAE': mae,
    'MAPE': mape
})

# Format the columns
results['Avg. Actual CLV'] = '$' + results['Avg. Actual CLV'].astype(str)
results['Avg. Predicted CLV'] = '$' + results['Avg. Predicted CLV'].astype(str)
results['MAE'] = np.round(results['MAE'], 2)
results['MAPE'] = np.round(results['MAPE'], 1).astype(str) + '%'

results

Calculating Metrics: 100%|██████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 274.40it/s]


Unnamed: 0,Quantile,Avg. Actual CLV,Avg. Predicted CLV,MAE,MAPE
0,Bottom 20%,$10.81,$8.65,17.67,99.9%
1,20-40%,$26.0,$21.07,33.33,87.0%
2,40-60%,$54.95,$44.57,41.75,55.4%
3,60-80%,$99.23,$78.63,60.64,51.9%
4,80-90%,$135.07,$110.27,81.53,54.0%
5,90-95%,$169.55,$141.66,131.01,58.2%
6,95-99%,$435.8,$245.66,653.16,89.2%
