In [1]:
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_absolute_error
import numpy as np
import pandas as pd

In [2]:
imputed_data = pd.read_csv("sbux_clv_drop_AOV_and_completed_offers_across_channels.csv")
imputed_data=imputed_data.drop(['Unnamed: 0','Customer ID','CLV_Quantile'],axis=1)
imputed_data.head()

Unnamed: 0,Age,Gender,HH Income,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30)
0,18-34,M,50k-75k,2017,5,1,22.16,2.0,1.0,0.0,0.5,0.0,105.44
1,Unknown,Unknown,Unknown,2018,2,1,0.7,1.0,1.0,0.0,1.0,0.0,3.39
2,35-50,O,50k-75k,2018,4,2,25.42,3.0,3.0,1.0,1.0,0.33,54.04
3,51-67,F,75k-100k,2016,8,4,98.33,3.0,1.0,1.0,0.33,0.33,98.53
4,18-34,F,50k-75k,2016,2,5,62.86,2.0,2.0,1.0,1.0,0.5,91.19


In [3]:
# Convert categorical variables into dummies
imputed_data = pd.get_dummies(imputed_data)

# Display all the columns in the DataFrame
print(imputed_data.columns)

# Display the modified DataFrame
imputed_data.head()

Index(['MemberSince',
       'Recency (# Days ago from last trans. As of final day up to Day 15)',
       'Frequency (# Trans. from Day 1-15)',
       'Monetary Value (Sum of Trans. from Day 1-15)',
       '# of Marketing offers that Starbucks sent to each customer from Day 1-15',
       '# of Marketing offers that were viewed from Day 1-15',
       '# of Marketing offers that were successfully completed from Day 1-15',
       'Marketing Offer View Rate from Day 1-15',
       'Marketing Offer Response Rate from Day 1-15',
       'Sum(Trans. Amt from Day 16-30)', 'Age_18-34', 'Age_35-50', 'Age_51-67',
       'Age_68-84', 'Age_85-101', 'Age_Unknown', 'Gender_F', 'Gender_M',
       'Gender_O', 'Gender_Unknown', 'HH Income_100k-120k',
       'HH Income_30k-50k', 'HH Income_50k-75k', 'HH Income_75k-100k',
       'HH Income_Unknown'],
      dtype='object')


Unnamed: 0,MemberSince,Recency (# Days ago from last trans. As of final day up to Day 15),Frequency (# Trans. from Day 1-15),Monetary Value (Sum of Trans. from Day 1-15),# of Marketing offers that Starbucks sent to each customer from Day 1-15,# of Marketing offers that were viewed from Day 1-15,# of Marketing offers that were successfully completed from Day 1-15,Marketing Offer View Rate from Day 1-15,Marketing Offer Response Rate from Day 1-15,Sum(Trans. Amt from Day 16-30),...,Age_Unknown,Gender_F,Gender_M,Gender_O,Gender_Unknown,HH Income_100k-120k,HH Income_30k-50k,HH Income_50k-75k,HH Income_75k-100k,HH Income_Unknown
0,2017,5,1,22.16,2.0,1.0,0.0,0.5,0.0,105.44,...,0,0,1,0,0,0,0,1,0,0
1,2018,2,1,0.7,1.0,1.0,0.0,1.0,0.0,3.39,...,1,0,0,0,1,0,0,0,0,1
2,2018,4,2,25.42,3.0,3.0,1.0,1.0,0.33,54.04,...,0,0,0,1,0,0,0,1,0,0
3,2016,8,4,98.33,3.0,1.0,1.0,0.33,0.33,98.53,...,0,1,0,0,0,0,0,0,1,0
4,2016,2,5,62.86,2.0,2.0,1.0,1.0,0.5,91.19,...,0,1,0,0,0,0,0,1,0,0


In [4]:
imputed_data.to_csv('sbux_clv_drop_AOV_and_completed_offers_across_channels_with_dummies.csv')

In [5]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, KFold

X = imputed_data.drop('Sum(Trans. Amt from Day 16-30)', axis=1)
y = imputed_data['Sum(Trans. Amt from Day 16-30)']

# Create a list of all feature names
all_features = X.columns.tolist()

# Set the desired number of features
desired_num_features = 20

# Initialize the best feature set and the best error
best_features = all_features
best_error = float('inf')

# Perform backward stepwise selection
while len(best_features) > desired_num_features:
    # Initialize the feature to remove and the error for this iteration
    feature_to_remove = None
    iteration_error = float('inf')

    # Iterate over the remaining features
    for feature in best_features:
        # Create a copy of the current feature set without the feature to be removed
        features_subset = best_features.copy()
        features_subset.remove(feature)

        # Select the corresponding columns from X
        X_subset = X[features_subset]

        # Split the data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_subset, y, test_size=0.3)

        # Initialize the KFold cross-validation
        kf = KFold(n_splits=5, shuffle=True)

        errors = []

        # Perform cross-validation
        for train_index, val_index in kf.split(X_train):
            X_train_kf, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_kf, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

            # Initialize and train the KNNRegressor model
            model = KNeighborsRegressor()
            model.fit(X_train_kf, y_train_kf)

            # Predict the target variable for the validation data
            y_pred_val = model.predict(X_val)

            # Calculate the root mean squared logarithmic error (RMSLE) for validation data
            rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_pred_val))
            errors.append(rmsle_val)

        # Calculate the average error across all folds
        avg_error = np.mean(errors)

        # Check if this iteration's error is better than the previous best
        if avg_error < iteration_error:
            iteration_error = avg_error
            feature_to_remove = feature

    # Remove the feature with the highest error from the best feature set
    best_features.remove(feature_to_remove)

    # Update the best error
    best_error = iteration_error

# Print the selected features and best error
print("Selected features:")
for feature in best_features:
    print(feature)

Selected features:
MemberSince
Recency (# Days ago from last trans. As of final day up to Day 15)
Frequency (# Trans. from Day 1-15)
Monetary Value (Sum of Trans. from Day 1-15)
# of Marketing offers that were viewed from Day 1-15
# of Marketing offers that were successfully completed from Day 1-15
Marketing Offer View Rate from Day 1-15
Age_18-34
Age_35-50
Age_51-67
Age_68-84
Gender_F
Gender_M
Gender_O
Gender_Unknown
HH Income_100k-120k
HH Income_30k-50k
HH Income_50k-75k
HH Income_75k-100k
HH Income_Unknown
