In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import sklearn.decomposition as skde

from sklearn import feature_selection as fs

from sklearn import metrics

import sklearn.decomposition as skde

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.neural_network import MLPClassifier

from sklearn import svm

%matplotlib inline

In [56]:
## Load dataset
# Read the bike-buyer labels first, then join them onto the prepared customer
# table using the shared CustomerID column (pandas' default inner join).
AW_BikeBuyer = pd.read_csv('./PreparedData/AW_BikeBuyer_Prepared.csv')
AdvWorksCusts = pd.merge(
    pd.read_csv('./PreparedData/AdvWorksCusts_Prepared.csv'),
    AW_BikeBuyer,
    on='CustomerID',
)
# Show the column types of the merged frame as the cell output.
AdvWorksCusts.dtypes

CustomerID               int64
Title                   object
FirstName               object
MiddleName              object
LastName                object
Suffix                  object
AddressLine1            object
AddressLine2            object
City                    object
StateProvinceName       object
CountryRegionName       object
PostalCode              object
PhoneNumber             object
BirthDate               object
Education               object
Occupation              object
Gender                  object
MaritalStatus           object
HomeOwnerFlag            int64
NumberCarsOwned          int64
NumberChildrenAtHome     int64
TotalChildren            int64
YearlyIncome             int64
BikeBuyer                int64
dtype: object

In [57]:
# Examine the class imbalance in the data:
# count the customers that fall into each BikeBuyer class (0 / 1).
AdvWorksCusts.groupby('BikeBuyer')[['CustomerID']].count()

Unnamed: 0_level_0,CustomerID
BikeBuyer,Unnamed: 1_level_1
0,10953
1,5451


In [58]:
# Load the prepared training features, training labels and test features,
# converting each CSV into a NumPy array in one pass.
Features, Labels, Features_test = (
    np.array(pd.read_csv(csv_path))
    for csv_path in (
        './PreparedData/BikeBuyerFeatures.csv',
        './PreparedData/BikeBuyerLabels.csv',
        './PreparedData/Features_test.csv',
    )
)
# Display the (rows, columns) shape of the training feature matrix.
Features.shape

(16404, 34)

In [59]:
def scoring_model(probs, threshold):
    """Turn per-class probabilities into hard 0/1 predictions.

    Parameters
    ----------
    probs : 2-D array
        Indexed as ``probs[:, 1]``; the second column is used as the score
        of the positive (1) class.
    threshold : float
        Cut-off; a row is labelled 1 only when its score is strictly
        greater than this value.

    Returns
    -------
    list of int
        One 0/1 label per row of ``probs``.
    """
    positive_scores = probs[:, 1]
    return np.where(positive_scores > threshold, 1, 0).tolist()

## Feature Selection


In [60]:
# Remove features with low variance (variance threshold set to .8 * (1 - .8)).
sel = fs.VarianceThreshold(threshold=(.8 * (1 - .8)))
# Learn which columns to keep from the training features only ...
Features_reduced = sel.fit_transform(Features)
# ... and apply that same column mask to the test features. Refitting the
# selector on the test set (fit_transform) could keep a different set of
# columns, so the train and test matrices would no longer line up.
Features_test_reduced = sel.transform(Features_test)
# Reshape the Label array from (n, 1) to the 1-D shape (n,) that sklearn expects
Labels = Labels.reshape(Labels.shape[0],)
# Set folds for nested cross validation.
# NOTE(review): shuffle=True without random_state means the folds differ
# between runs; pass random_state=<int> if reproducible folds are needed.
feature_folds = ms.KFold(n_splits=10, shuffle = True)

# Logistic Regression Model

In [61]:
# Logistic regression with class weights slightly favouring class 1.
logistic_mod = linear_model.LogisticRegression(
    C=0.1,
    class_weight={0: 0.45, 1: 0.55},
)

# Perform feature selection by CV with high variance features only:
# recursive feature elimination scored by ROC AUC over the shared folds.
selector = fs.RFECV(
    estimator=logistic_mod,
    cv=feature_folds,
    scoring='roc_auc',
)
selector.fit(Features_reduced, Labels)
Features_logistic = selector.transform(Features_reduced)
Features_test_logistic = selector.transform(Features_test_reduced)
print(Features_logistic.shape, Features_test_logistic.shape)

# Fit the model on the selected training columns, then score the test set.
probs = logistic_mod.fit(Features_logistic, Labels).predict_proba(Features_test_logistic)
result = scoring_model(probs, threshold=0.5)

(16404, 16) (500, 16)


In [62]:
# Persist the logistic-regression predictions as a single-column CSV
# (header row kept, row index dropped).
pd.DataFrame(result).to_csv(
    './PredictionResults/BikeBuyerPrediction_LogisticRegression.csv',
    index=False,
    header=True,
)

# Random Forest Model

In [63]:
# Random forest trained on the variance-filtered features.
rf_mod = RandomForestClassifier(min_samples_leaf=20, max_features=3)
# Fit on the training data and get class probabilities for the test set.
probs = rf_mod.fit(Features_reduced, Labels).predict_proba(Features_test_reduced)
result = scoring_model(probs, 0.5)
# Write the 0/1 predictions to disk.
pd.DataFrame(result).to_csv(
    './PredictionResults/BikeBuyerPrediction_RandomForest.csv',
    index=False,
    header=True,
)

# AdaBoost Model

In [64]:
# AdaBoost classifier trained on the variance-filtered features.
ab_mod = AdaBoostClassifier(learning_rate=1)
ab_mod.fit(Features_reduced, Labels)
# Score the test set with the AdaBoost model itself. Using rf_mod here would
# write the random-forest predictions into the AdaBoost results file.
probs = ab_mod.predict_proba(Features_test_reduced)
result = scoring_model(probs, 0.5)
# Write the 0/1 predictions to disk (header row kept, row index dropped).
pd.DataFrame(result).to_csv('./PredictionResults/BikeBuyerPrediction_AdaBoost.csv', 
                            index = False, header = True)

# Neural Network

In [65]:
# Multi-layer perceptron on the variance-filtered features; beta_1 / beta_2
# are passed explicitly, every other hyper-parameter is left at its default.
nn_mod = MLPClassifier(
    beta_1=0.9,
    beta_2=0.99,
)
# Fit on the training data and get class probabilities for the test set.
probs = nn_mod.fit(Features_reduced, Labels).predict_proba(Features_test_reduced)
result = scoring_model(probs, 0.5)
# Write the 0/1 predictions to disk.
pd.DataFrame(result).to_csv(
    './PredictionResults/BikeBuyerPrediction_NN.csv',
    index=False,
    header=True,
)



# SVM

In [66]:
# Support vector classifier with balanced class weights; probability=True is
# required so that predict_proba is available for thresholding below.
svm_mod = svm.SVC(C=100, gamma=0.02, class_weight='balanced', probability=True)
# Fit on the training data and get class probabilities for the test set.
probs = svm_mod.fit(Features_reduced, Labels).predict_proba(Features_test_reduced)
result = scoring_model(probs, 0.5)
# Write the 0/1 predictions to disk.
pd.DataFrame(result).to_csv(
    './PredictionResults/BikeBuyerPrediction_SVM.csv',
    index=False,
    header=True,
)