# Consumer Spending Prediction

- **Goal:** Predict the amount of purchase for consumers
- **Data source:** please refer to the dataset in the repository
- **Model:** Ensemble Stacking/ Deep Neural Network/ Support Vector Regressor/ ElasticNet/ Regression Tree/ KNN Regressor

### Process

- Data Preprocessing
- Modeling (Including Hyperparameter tuning)
- Conclusion

Reference: GitHub user bgg11117

In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import StackingRegressor, StackingClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, roc_curve, auc, confusion_matrix, 
                            plot_confusion_matrix, classification_report

import tensorflow as tf 
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier

pd.set_option('display.max_columns', None)

In [28]:
import tensorflow as tf 
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier

In [152]:
# Load the raw data and drop the two columns this analysis does not use
df = pd.read_csv('HW3.csv').drop(columns=['sequence_number', 'Purchase'])
df.head(3)

Unnamed: 0,US,source_a,source_c,source_b,source_d,source_e,source_m,source_o,source_h,source_r,source_s,source_t,source_u,source_p,source_x,source_w,Freq,last_update_days_ago,1st_update_days_ago,Web order,Gender=male,Address_is_res,Spending
0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,3662,3662,1,0,1,127.87
1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2900,2900,1,1,0,0.0
2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,3883,3914,0,0,0,127.48


## Data Preprocessing

In [153]:
# Continuous columns (including the target) are stored as floats
_numeric_mask = df.columns.isin(['Freq', 'last_update_days_ago', '1st_update_days_ago', 'Spending'])
column1 = df.columns[_numeric_mask]
df = df.astype({name: 'float' for name in column1})

# Every remaining (dummy) column is stored as a string
column2 = df.columns[~_numeric_mask]
df = df.astype({name: 'str' for name in column2})

In [226]:
# Count the missing values in each column
df.isna().sum()

US                      0
source_a                0
source_c                0
source_b                0
source_d                0
source_e                0
source_m                0
source_o                0
source_h                0
source_r                0
source_s                0
source_t                0
source_u                0
source_p                0
source_x                0
source_w                0
Freq                    0
last_update_days_ago    0
1st_update_days_ago     0
Web order               0
Gender=male             0
Address_is_res          0
Spending                0
dtype: int64

In [161]:
# Hold out 20% of the rows as the test set; the first 22 columns are the
# features and column 22 is the target
X, y = df.iloc[:, :22], df.iloc[:, 22]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Shuffled 3-fold splitter used for the hyperparameter searches
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)


## Feature Engineering

In [227]:
def gridsearch(estimator, param_grid, cv=inner_cv, scoring='neg_mean_squared_error'):
    """Grid-search ``estimator`` inside a scaling pipeline and print the results.

    The columns "Freq", "last_update_days_ago" and "1st_update_days_ago" are
    standardized as a pipeline step; all other columns are passed through
    unchanged. The search is fitted on the notebook-level ``X_train`` /
    ``y_train`` and the refitted best pipeline is scored on ``X_test`` /
    ``y_test``.

    Parameters
    ----------
    estimator
        Regressor placed in the pipeline under the step name ``'estimator'``.
    param_grid
        Grid passed to ``GridSearchCV``; keys must address the pipeline step,
        i.e. start with ``estimator__``.
    cv
        Cross-validation strategy handed to ``GridSearchCV``. Defaults to the
        notebook-level ``inner_cv`` splitter.
    scoring
        Scoring name handed to ``GridSearchCV``.

    Returns
    -------
    None
        The best CV score, the best parameters and the test-set MSE are printed.
    """
    numeric_features = ["Freq", "last_update_days_ago", "1st_update_days_ago"]

    # remainder="passthrough" keeps the non-listed columns instead of dropping them
    scaler = ColumnTransformer(
        transformers=[('ss', StandardScaler(), numeric_features)],
        remainder="passthrough")

    pipe = Pipeline([('scaler', scaler), ('estimator', estimator)])

    # Use the caller-supplied `cv` (previously the global inner_cv was always
    # used, so the argument had no effect). error_score='raise' surfaces
    # fitting failures instead of silently scoring them as NaN.
    clf = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=-1, cv=cv,
                       scoring=scoring, error_score='raise')
    grid_result = clf.fit(X_train, y_train)

    # Best parameters found by the grid search
    print(f"Best score is {grid_result.best_score_} with best param {grid_result.best_params_}")

    # Evaluate the refitted best pipeline on the held-out test set
    y_pred = grid_result.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Prediction result -> MSE = {mse}") 
    
    

# Modeling

#### Create param_grid for all the hyperparameters


In [198]:
# ElasticNet (regularized linear regression): L1/L2 mixing ratio
elasticnet_paramGrid = {'estimator__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}

# KNN: number of neighbours, 2 through 14
k = [*range(2, 15)]
knn_paramGrid = {'estimator__n_neighbors': k}

# Regression tree: maximum depth, 2 through 5
depth = [*range(2, 6)]
dtr_paramGrid = {'estimator__max_depth': depth}

# SVR: RBF kernel with a small grid over gamma and C
svr_paramGrid = {
    'estimator__kernel': ['rbf'],
    'estimator__gamma': [1e-3, 1e-4],
    'estimator__C': [1, 10, 100, 1000],
}

# Neural network: activation, optimizer, hidden-layer width and epoch count
activations = ['relu', 'tanh']
optimizers = ['adam']
hiddens = [64, 128, 256]
epochs = [*range(3, 10)]
nn_paramGrid = {
    'estimator__activation': activations,
    'estimator__optimizer': optimizers,
    'estimator__hidden': hiddens,
    'estimator__epochs': epochs,
}

# Stacking ensemble: only the depth of the 'regrtree' base learner is tuned
stack_paramGrid = {'estimator__regrtree__max_depth': depth}



### Linear Regression (regularization: ElasticNet)

In [166]:
# Tune the L1/L2 mixing ratio of an ElasticNet regressor
elasticnet_paramGrid = {'estimator__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}
EN = ElasticNet()
gridsearch(estimator=EN, param_grid=elasticnet_paramGrid)

Best score is -16763.62882315244 with best param {'estimator__l1_ratio': 0.9}
Prediction result -> MSE = 17757.601219225937


### KNN

In [167]:
# Tune the number of neighbours of a KNN regressor
knn = KNeighborsRegressor()
gridsearch(estimator=knn, param_grid=knn_paramGrid)


Best score is -17521.96483054584 with best param {'estimator__n_neighbors': 10}
Prediction result -> MSE = 19217.8189857825


### Regression Tree

In [170]:
# Tune the maximum depth of a regression tree.
# The tree is seeded because scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise be resolved
# differently from run to run.
regr_tree = DecisionTreeRegressor(random_state=42)
gridsearch(regr_tree, dtr_paramGrid)

Best score is -18544.884478096414 with best param {'estimator__max_depth': 3}
Prediction result -> MSE = 19685.53248668529


### Support Vector Regressor

In [173]:
# Tune gamma and C of a support vector regressor
svr_rbf = SVR()
gridsearch(estimator=svr_rbf, param_grid=svr_paramGrid)

Best score is -18798.134845108973 with best param {'estimator__C': 1000, 'estimator__gamma': 0.001, 'estimator__kernel': 'rbf'}
Prediction result -> MSE = 20488.117916344934


### Deep Neural Network

In [223]:
def nn_construct(activation, optimizer, hidden):
    """Build and compile a fully connected regression network.

    The network has three hidden ``Dense`` layers of ``hidden`` units each,
    all using ``activation``, followed by a single-unit output layer with no
    activation. The input width is read from the notebook-level ``X_train``.

    Parameters
    ----------
    activation
        Activation used by every hidden layer.
    optimizer
        Optimizer passed to ``model.compile``.
    hidden
        Number of units in each hidden layer.

    Returns
    -------
    The compiled ``Sequential`` model (MSE loss, MSE metric).
    """
    n_features = X_train.shape[1]

    model = Sequential()
    # First hidden layer also declares the input dimension
    model.add(Dense(hidden, input_dim=n_features, activation=activation))
    # Two further hidden layers of the same width
    for _ in range(2):
        model.add(Dense(hidden, activation=activation))
    # Single linear output for the regression target
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])
    return model

In [224]:
# The neural network only accepts float input, so work on a copy of the data
# in which the string-typed dummy columns are cast to float
df_nn = df.astype({name: 'float' for name in column2})

# Redo the train/test split on the all-float frame (same test size and seed)
X, y = df_nn.iloc[:, :22], df_nn.iloc[:, 22]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Wrap the Keras model builder so it can be used as a pipeline step
NN = KerasRegressor(build_fn=nn_construct, verbose=0)

gridsearch(estimator=NN, param_grid=nn_paramGrid)


  NN = KerasRegressor(build_fn = nn_construct, verbose = 0)


Best score is -15418.869329424706 with best param {'estimator__activation': 'relu', 'estimator__epochs': 9, 'estimator__hidden': 256, 'estimator__optimizer': 'adam'}
Prediction result -> MSE = 15866.246731986128


### Ensemble Stacking

In [213]:
# Ensure the dummy columns are strings (a no-op if they already are)
for col in column2:
    df[col] = df[col].astype('str')

# Train-Test Split
X = df.iloc[:, :22]
y = df.iloc[:, 22]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base learners. ElasticNet, KNN and SVR use the best hyperparameters reported
# by their individual grid searches; the tree's depth is tuned again here.
# The tree and the random-forest final estimator are seeded so that repeated
# runs of this cell give the same result.
stacking_estimators = [('regrtree', DecisionTreeRegressor(random_state=42)),
                       ('elasticnet', ElasticNet(random_state=42, l1_ratio=0.9)),
                       ('knnreg', KNeighborsRegressor(n_neighbors=10)),
                       ('svr', SVR(C=1000, gamma=0.001, kernel='rbf'))]

ensemble_stacking = StackingRegressor(estimators=stacking_estimators,
                                      final_estimator=RandomForestRegressor(random_state=42))

gridsearch(ensemble_stacking, stack_paramGrid)

Best score is -17385.47402379638 with best param {'estimator__regrtree__max_depth': 5}
Prediction result -> MSE = 17618.37960483427


# Conclusion

In this project, I first wrote a function that combines feature standardization and grid search in a single pipeline. For each model, the function prints the best grid-search score, the best hyperparameters, and the MSE on the test set. The results show that the Deep Neural Network is the best model, with the lowest test-set MSE.