In [210]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.ensemble import StackingRegressor, StackingClassifier, RandomForestRegressor

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasRegressor, KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings("ignore")

In [201]:
data = pd.read_excel('HW3.xlsx')
data.drop('sequence_number', axis=1, inplace=True)
data.head()

Unnamed: 0,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,source_r,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
0,1,0,0,1,0,0,0,0,0,0,...,0,0,2,3662,3662,1,0,1,1,127.87
1,1,0,0,0,0,1,0,0,0,0,...,0,0,0,2900,2900,1,1,0,0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,2,3883,3914,0,0,0,1,127.48
3,1,0,1,0,0,0,0,0,0,0,...,0,0,1,829,829,0,1,0,0,0.0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,1,869,869,0,0,0,0,0.0


# Exploratory Data 

In [160]:
n_samples, n_features = data.shape
print('The dimensions of the data set are', n_samples, 'by', n_features)

The dimensions of the data set are 2000 by 24


In [90]:
data.describe()

Unnamed: 0,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,source_r,...,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Purchase,Spending
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.8245,0.1265,0.056,0.06,0.0415,0.151,0.0165,0.0335,0.0525,0.0685,...,0.018,0.1375,1.417,2155.101,2435.6015,0.426,0.5245,0.221,0.5,102.560745
std,0.380489,0.332495,0.229979,0.237546,0.199493,0.358138,0.12742,0.179983,0.223089,0.252665,...,0.132984,0.344461,1.405738,1141.302846,1077.872233,0.494617,0.499524,0.415024,0.500125,186.749816
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1133.0,1671.25,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2280.0,2721.0,0.0,1.0,0.0,0.5,1.855
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,3139.25,3353.0,1.0,1.0,0.0,1.0,152.5325
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,15.0,4188.0,4188.0,1.0,1.0,1.0,1.0,1500.06


In [202]:
X = data.drop(['Purchase', 'Spending'], axis=1)
y = data[['Spending']]

In [203]:
# categorical
cate_col = ['US', 'source_a', 'source_c', 'source_b', 'source_d', 'source_e','source_m', 'source_o', 'source_h', 'source_r', 'source_s', 'source_t',
'source_u', 'source_p', 'source_x', 'source_w', 'Web order','Gender=male', 'Address_is_res']

# numeric
num_col = ["Freq", "last_update_days_ago", "1st_update_days_ago"]

In [204]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X[num_col])
X_normalized = pd.DataFrame(X_normalized, columns=num_col)

In [205]:
def MSE(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    return mse

def score():
    return make_scorer(MSE, greater_is_better=False)

# (a) All data

In [206]:
# Split training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [207]:
# Normalize
scaler = StandardScaler()
X_train[num_col] = scaler.fit_transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

In [208]:
# Nasted cross-validation
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

## linear regression

In [141]:
lr = ElasticNet(random_state = 42)
ratio = [0.1, 0.3, 0.5, 0.7, 0.9]
lr_grid = {'l1_ratio':ratio}
         
lr_clf = GridSearchCV(estimator=lr, param_grid=lr_grid, cv=inner_cv, scoring=score())
lr_pred = lr_clf.fit(X_train, y_train)

print('best params: ', lr_pred.best_params_)
print('rmse: ', np.sqrt(-lr_pred.best_score_))

best params:  {'l1_ratio': 0.9}
rmse:  129.70808513797073


## KNN

In [100]:
weights = ['uniform', 'distance']
k_values = list(range(1, 30))
k_grid = {'weights': weights, 'n_neighbors': k_values}

knn = KNeighborsRegressor()
knn_clf = GridSearchCV(knn, k_grid, cv=inner_cv, scoring=score())
knn_pred = knn_clf.fit(X_train, y_train)

print('best params: ', knn_pred.best_params_)
print('rmse: ', np.sqrt(-knn_pred.best_score_))

best params:  {'n_neighbors': 8, 'weights': 'uniform'}
rmse:  130.34266644849652


## Regression Tree

In [101]:
depth = list(range(1, 10))  # max_depth
split = list(range(2, 10))  # min_samples_split
rt_grid = {'max_depth': depth, 'min_samples_split': split}

rt = DecisionTreeRegressor()
rt_clf = GridSearchCV(rt, rt_grid, cv=inner_cv, scoring=score())
rt_pred = rt_clf.fit(X_train, y_train)

print('best params: ', rt_pred.best_params_)
print('rmse: ', np.sqrt(-rt_pred.best_score_))

best params:  {'max_depth': 6, 'min_samples_split': 3}
rmse:  136.17687782999906


## SVM regreesion

In [171]:
kernal = ['rbf']
gamma = ['scale', 'auto']
c = [10 ** i for i in range(-2, 4)] 
svr_grid = {'kernel': kernal, 'C' : c, 'gamma' : gamma }

svr = SVR()
svr_clf = RandomizedSearchCV(svr, param_distributions=svr_grid, n_iter=20, cv=inner_cv, scoring=score(), random_state=42)
svr_pred = svr_clf.fit(X_train, y_train)

print('best params: ', svr_pred.best_params_)
print('rmse: ', np.sqrt(-svr_pred.best_score_))

best params:  {'kernel': 'rbf', 'gamma': 'auto', 'C': 1000}
rmse:  125.77035848660468


## Neural Network

In [181]:
def create_model(nb_hidden, activation):
    model = Sequential()
    model.add(Dense(nb_hidden, input_dim=X_train.shape[1], activation=activation))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model

act = ['relu', 'tanh']
hidden = np.array([64, 128, 256])
epoch = list(range(3, 10))
NN_grid = {'activation': act, 'nb_hidden': hidden, 'epochs': epoch}

NN = KerasRegressor(build_fn=create_model, batch_size=256, verbose=0, nb_hidden=64, activation='relu')

In [182]:
NN_clf = GridSearchCV(estimator=NN, param_grid=NN_grid, scoring = score(), cv=5)
NN_pred=NN_clf.fit(X_train, y_train)

print('best params: ', NN_pred.best_params_)
print('rmse:' , np.sqrt(-NN_pred.best_score_))

best params:  {'activation': 'tanh', 'epochs': 9, 'nb_hidden': 256}
rmse: 211.34970411701931


## Ensembling

In [120]:
estimators = [
    ('linear regression', ElasticNet(l1_ratio=0.9)),
    ('regression tree', DecisionTreeRegressor(max_depth=6, min_samples_split=9)),
    ('knn', KNeighborsRegressor(n_neighbors=8, weights='uniform')),
    ('svr', SVR(C=100, gamma='auto', kernel='rbf'))
]

meta_model = RandomForestRegressor()

srlf = StackingRegressor(estimators=estimators, final_estimator=meta_model)

grid = {'final_estimator__max_depth': list(range(1, 10)), 'final_estimator__min_samples_split': list(range(2, 10))}

search = RandomizedSearchCV(srlf, grid, n_iter=20, cv=inner_cv, scoring=score(), random_state=42)
result = search.fit(X_train, y_train)

print('best params: ', result.best_params_)
print('rmse:', np.sqrt(-result.best_score_))

best params:  {'final_estimator__min_samples_split': 8, 'final_estimator__max_depth': 3}
rmse: 124.73319989419122


# Model Selection

"Stack" has the lowest RMSE (133.83), which suggests that the stacking ensemble model has the best overall performance among the models 

In [121]:
lr_score = cross_val_score(lr_clf, X=X_normalized, y=y, cv=outer_cv)
knn_score = cross_val_score(knn_clf, X=X_normalized, y=y, cv=outer_cv)
rt_score = cross_val_score(rt_clf , X=X_normalized, y=y, cv=outer_cv)
svm_score = cross_val_score(svr_clf, X=X_normalized, y=y, cv=outer_cv)
stack_score = cross_val_score(search, X=X_normalized, y=y, cv=outer_cv)

In [122]:
score = {}
score['linear regression'] = np.sqrt(-lr_score).mean()
score['KNN'] = np.sqrt(-knn_score).mean()
score['Regression tree'] = np.sqrt(-rt_score).mean()
score['SVR'] = np.sqrt(-svm_score).mean()
score['Stack'] = np.sqrt(-stack_score).mean()
score

{'linear regression': 135.2529179706996,
 'KNN': 135.86246033304639,
 'Regression tree': 140.4184178864601,
 'SVR': 139.6390532210261,
 'Stack': 133.83387523437838}

# (b): Purchase = 1

In [183]:
data_b = data[data['Purchase'] == 1]

In [184]:
X = data_b.drop(['Spending','Purchase'],axis=1)
y = data_b[['Spending']]

In [185]:
# categorical
cate_col = ['US', 'source_a', 'source_c', 'source_b', 'source_d', 'source_e','source_m', 'source_o', 'source_h', 'source_r', 'source_s', 'source_t',
'source_u', 'source_p', 'source_x', 'source_w', 'Web order','Gender=male', 'Address_is_res']

# numeric
num_col = ["Freq", "last_update_days_ago", "1st_update_days_ago"]

In [186]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X[num_col])
X_normalized = pd.DataFrame(X_normalized, columns=num_col)

In [187]:
# Split training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [188]:
# Normalize
scaler = StandardScaler()
X_train[num_col] = scaler.fit_transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

In [189]:
# Nasted cross-validation
inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

## Score

In [190]:
def MSE(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    return mse

def score():
    return make_scorer(MSE, greater_is_better=False)

## linear regression

In [191]:
lr = ElasticNet(random_state = 42)
ratio = [0.1, 0.3, 0.5, 0.7, 0.9]
lr_grid = {'l1_ratio':ratio}
         
lr_clf = GridSearchCV(estimator=lr, param_grid=lr_grid, cv=inner_cv, scoring=score())
lr_pred = lr_clf.fit(X_train, y_train)

print('best params: ', lr_pred.best_params_)
print('rmse: ', np.sqrt(-lr_pred.best_score_))

best params:  {'l1_ratio': 0.9}
rmse:  160.6944881454774


## KNN

In [192]:
weights = ['uniform', 'distance']
k_values = list(range(1, 30))
k_grid = {'weights': weights, 'n_neighbors': k_values}

knn = KNeighborsRegressor()
knn_clf = GridSearchCV(knn, k_grid, cv=inner_cv, scoring=score())
knn_pred = knn_clf.fit(X_train, y_train)

print('best params: ', knn_pred.best_params_)
print('rmse: ', np.sqrt(-knn_pred.best_score_))

best params:  {'n_neighbors': 9, 'weights': 'distance'}
rmse:  163.19243730160628


## Regression Tree

In [193]:
depth = list(range(1, 10))  # max_depth
split = list(range(2, 10))  # min_samples_split
rt_grid = {'max_depth': depth, 'min_samples_split': split}

rt = DecisionTreeRegressor()
rt_clf = GridSearchCV(rt, rt_grid, cv=inner_cv, scoring=score())
rt_pred = rt_clf.fit(X_train, y_train)

print('best params: ', rt_pred.best_params_)
print('rmse: ', np.sqrt(-rt_pred.best_score_))

best params:  {'max_depth': 3, 'min_samples_split': 2}
rmse:  169.76163685923171


## SVM regreesion

In [194]:
kernal = ['rbf']
gamma = ['scale', 'auto']
c = [10 ** i for i in range(-2, 4)] 
svr_grid = {'kernel': kernal, 'C' : c, 'gamma' : gamma }

svr = SVR()
svr_clf = RandomizedSearchCV(svr, param_distributions=svr_grid, n_iter=20, cv=inner_cv, scoring=score(), random_state=42)
svr_pred = svr_clf.fit(X_train, y_train)

print('best params: ', svr_pred.best_params_)
print('rmse: ', np.sqrt(-svr_pred.best_score_))

best params:  {'kernel': 'rbf', 'gamma': 'auto', 'C': 1000}
rmse:  154.29887358848418


## Ensembling

In [195]:
estimators = [
    ('linear regression', ElasticNet(l1_ratio=0.9)),
    ('regression tree', DecisionTreeRegressor(max_depth=6, min_samples_split=9)),
    ('knn', KNeighborsRegressor(n_neighbors=8, weights='uniform')),
    ('svr', SVR(C=100, gamma='auto', kernel='rbf'))
]

meta_model = RandomForestRegressor()

srlf = StackingRegressor(estimators=estimators, final_estimator=meta_model)

grid = {'final_estimator__max_depth': list(range(1, 10)), 'final_estimator__min_samples_split': list(range(2, 10))}

search = RandomizedSearchCV(srlf, grid, n_iter=20, cv=inner_cv, scoring=score(), random_state=42)
result = search.fit(X_train, y_train)

print('best params: ', result.best_params_)
print('rmse:', np.sqrt(-result.best_score_))

best params:  {'final_estimator__min_samples_split': 6, 'final_estimator__max_depth': 2}
rmse: 162.9214252528662


## Neural Network

In [196]:
def create_model(nb_hidden, activation):
    model = Sequential()
    model.add(Dense(nb_hidden, input_dim=X_train.shape[1], activation=activation))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
    return model

act = ['relu', 'tanh']
hidden = np.array([64, 128, 256])
epoch = list(range(3, 10))
NN_grid = {'activation': act, 'nb_hidden': hidden, 'epochs': epoch}

NN = KerasRegressor(build_fn=create_model, batch_size=256, verbose=0, nb_hidden=64, activation='relu')

In [197]:
NN_clf = GridSearchCV(estimator=NN, param_grid=NN_grid, scoring = score(), cv=5)
NN_pred=NN_clf.fit(X_train, y_train)

print('best params: ', NN_pred.best_params_)
print('rmse:' , np.sqrt(-NN_pred.best_score_))

best params:  {'activation': 'tanh', 'epochs': 9, 'nb_hidden': 256}
rmse: 286.0403727465075


# Model Selection

In [198]:
lr_score = cross_val_score(lr_clf, X=X_normalized, y=y, cv=outer_cv)
knn_score = cross_val_score(knn_clf, X=X_normalized, y=y, cv=outer_cv)
rt_score = cross_val_score(rt_clf , X=X_normalized, y=y, cv=outer_cv)
svm_score = cross_val_score(svr_clf, X=X_normalized, y=y, cv=outer_cv)
stack_score = cross_val_score(search, X=X_normalized, y=y, cv=outer_cv)

In [199]:
score = {}
score['linear regression'] = np.sqrt(-lr_score).mean()
score['KNN'] = np.sqrt(-knn_score).mean()
score['Regression tree'] = np.sqrt(-rt_score).mean()
score['SVR'] = np.sqrt(-svm_score).mean()
score['Stack'] = np.sqrt(-stack_score).mean()
score

{'linear regression': 167.67832131870085,
 'KNN': 169.90874057134783,
 'Regression tree': 183.30661319425926,
 'SVR': 170.76628550096305,
 'Stack': 167.49949605164755}

# for task (a) vs. task (b): which models exhibit better predictive performance?

- In the scenario where you have both purchase categories, the models tend to perform better, as they can capture general trends that apply to the entire dataset.
- When using only purchase=1 data, the models can struggle because they may overfit or fail to capture the broader distribution of spending.
- Stacking consistently performs well, as it can adapt to the specific nuances of the data while also considering the broader patterns.