In [1]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as nr
import pandas as pd
import sklearn.metrics as sklm
import sklearn.model_selection as ms
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
from sklearn import svm, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
# Load the preprocessed table and separate the target column from the features.
bikesData = pd.read_csv('BikesProcessed.csv')
Labels = bikesData['BikeBuyer']

# Remove the first column by position, then the target column by name.
# NOTE(review): the first column is presumably the row index written out with
# the CSV -- confirm against the file.
bikesData = (
    bikesData
    .drop(columns=bikesData.columns[0])
    .drop(columns=['BikeBuyer'])
)

# Plain NumPy copy of the feature frame, used for positional row indexing later on.
Features = np.array(bikesData)
print(Features.shape)
bikesData.head()

(16404, 12)


Unnamed: 0,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,AreaCode,Sex,Married,JobType,EducationType,Country,Income,AgeBracket,ChildrenFlag
0,1,0,0,500,1,1,0,0,1,3,0,1
1,0,1,3,500,1,0,0,0,1,3,1,1
2,1,1,3,500,1,1,0,0,1,2,1,1
3,0,1,0,500,0,0,0,0,1,2,0,0
4,1,4,5,500,0,0,0,0,1,2,0,1


# Hold out 5000 randomly chosen rows for testing and train on the remainder.
# No random_state is passed, so the split is driven by the NumPy global seed set here.
nr.seed(1115)
indx = ms.train_test_split(range(Features.shape[0]), test_size=5000)
train_rows, test_rows = indx

X_train, X_test = Features[train_rows, :], Features[test_rows, :]
y_train = np.ravel(Labels[train_rows])
y_test = np.ravel(Labels[test_rows])

# Standardise the features: the scaler is fitted on the training rows only and
# the same transformation is then applied to both partitions.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
X_train[:5, :]

# Fit one SVC per (C, gamma, kernel) combination and print its accuracy on the
# training and test partitions, one row per model.
print("kernal\t C\t gamma\t Training Set %\t Test Set %")
c_values = [0.1, 1, 100]
gamma_values = [0.1, 1, 10]
kernel_names = ['linear', 'poly', 'rbf', 'sigmoid']

# product() varies its last argument fastest: kernel, then gamma, then C.
for c, g, k in itertools.product(c_values, gamma_values, kernel_names):
    svm_mod = svm.SVC(kernel=k, C=c, gamma=g)
    svm_mod.fit(X_train, y_train)
    train_acc = svm_mod.score(X_train, y_train)
    test_acc = svm_mod.score(X_test, y_test)
    print(k, "\t", c, "\t", g, "\t %f" % train_acc, "\t %f" % test_acc)

# Finer sweep restricted to the RBF kernel: six values of C crossed with six
# values of gamma, printing training and test accuracy for each fitted model.
print("C\t gamma\t Training Set %\t Test Set %")
sweep_values = [0.01, 0.1, 1, 10, 100, 1000]

# gamma varies fastest, C slowest.
for c, g in itertools.product(sweep_values, sweep_values):
    svm_mod = svm.SVC(kernel='rbf', C=c, gamma=g)
    svm_mod.fit(X_train, y_train)
    train_acc = svm_mod.score(X_train, y_train)
    test_acc = svm_mod.score(X_test, y_test)
    print(c, "\t", g, "\t %f" % train_acc, "\t %f" % test_acc)

# Build a tree and compute the feature importances.
# DecisionTreeClassifier is imported from sklearn.tree in the notebook's import
# cell; without that import this cell fails with a NameError.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
importances = tree.feature_importances_

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking. Positions are mapped back to column names through
# bikesData.columns, which the feature matrix was built from.
print("Feature ranking:")

for f, feature_pos in enumerate(indices):
    print("%d. feature %s (%f)" % (f + 1, bikesData.columns[int(feature_pos)], importances[feature_pos]))

print('\n')
print("accuracy on training set: %f" % tree.score(X_train, y_train))
print("accuracy on test set: %f" % tree.score(X_test, y_test))

Nested cross-validation
The code below is based on:
https://www.edx.org/course/principles-of-machine-learning-python-edition

# Nested cross-validation on the full feature set, followed by a final model
# fitted with the selected hyper-parameters.
#
# The fold generators shuffle, so each gets an explicit random_state to make the
# cell reproducible on a fresh kernel.
inside = ms.KFold(n_splits=2, shuffle=True, random_state=123)
outside = ms.KFold(n_splits=2, shuffle=True, random_state=321)

## Define the dictionary for the grid search and the model object to search on.
## Keys are prefixed with the pipeline step name ("svc") they apply to.
param_grid = {"svc__C": [0.1, 1, 100], "svc__gamma": [0.1, 1, 10]}

## Define the SVM model. The scaler is part of the estimator so that it is
## re-fitted on the training portion of every fold and C/gamma are tuned on
## standardised inputs -- the same scale the final model below is trained on.
## (Tuning on the raw features and then fitting on scaled ones would pair gamma
## with data of a different scale.)
svc_clf = Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("svc", svm.SVC()),  # class_weight = {0:0.33, 1:0.67}
])

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator=svc_clf, param_grid=param_grid,
                      cv=inside,  # Use the inside folds
                      scoring='roc_auc',
                      return_train_score=True)
clf.fit(Features, Labels)
best_C = clf.best_params_["svc__C"]
best_gamma = clf.best_params_["svc__gamma"]
print(best_C)
print(best_gamma)

# Outer loop: the whole grid search is re-run inside each outside fold. No
# scoring argument is given, so the grid search's own scorer (roc_auc) is used.
cv_estimate = ms.cross_val_score(clf, Features, Labels,
                                 cv=outside)  # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

## Randomly sample cases to create independent training and test data
indx = ms.train_test_split(range(Features.shape[0]), test_size=5000,
                           random_state=1115)
X_train = Features[indx[0], :]
y_train = np.ravel(Labels[indx[0]])
X_test = Features[indx[1], :]
y_test = np.ravel(Labels[indx[1]])

# Rescale numeric features (scaler fitted on the training rows only)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Final model with the tuned hyper-parameters; probability=True enables
# predict_proba, which the evaluation cell relies on.
svm_mod = svm.SVC(C=best_C,
                  gamma=best_gamma,
                  #class_weight = {0:0.33, 1:0.67},
                  probability=True,
                  random_state=1115)
svm_mod.fit(X_train, y_train)

def score_model(probs, threshold):
    """Convert per-class scores into hard 0/1 predictions.

    Args:
        probs: 2-D array-like; column 1 is compared against ``threshold``
            (callers in this notebook pass the output of ``predict_proba``).
        threshold: Cut-off value. A row is scored 1 only when its column-1
            value is strictly greater than this.

    Returns:
        1-D integer NumPy array with one 0/1 entry per row of ``probs``.
        An input with zero rows yields an empty integer array.
    """
    # Vectorised comparison instead of a Python-level loop over the rows.
    return (np.asarray(probs)[:, 1] > threshold).astype(int)

def print_metrics(labels, probs, threshold):
    """Print a confusion matrix and summary metrics for a binary classifier.

    Class 1 is treated as the positive class throughout, matching the AUC
    computation (which ranks by ``probs[:, 1]``) and ``score_model`` (which
    predicts 1 when ``probs[:, 1]`` exceeds the threshold).

    Args:
        labels: True 0/1 labels.
        probs: 2-D array of per-class scores; column 1 is the class-1 score.
        threshold: Cut-off passed to ``score_model`` to obtain hard predictions.

    Returns:
        None. Everything is written to stdout.
    """
    scores = score_model(probs, threshold)
    # Pin the label order so index 0 is always class 0 and index 1 is class 1,
    # and the matrix stays 2x2 even if one class is absent from the inputs.
    class_order = [0, 1]
    metrics = sklm.precision_recall_fscore_support(labels, scores, labels=class_order)
    conf = sklm.confusion_matrix(labels, scores, labels=class_order)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    # conf[i, j] counts cases of true class i predicted as class j, so the
    # positive (class 1) row/column is index 1, not index 0.
    print('Actual positive    %6d' % conf[1,1] + '             %5d' % conf[1,0])
    print('Actual negative    %6d' % conf[0,1] + '             %5d' % conf[0,0])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    # Macro averages: unweighted mean of the two per-class values.
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    # metrics rows are precision, recall, F1, support; the column index is the class.
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][1] + '        %6d' % metrics[3][0])
    print('Precision  %6.2f' % metrics[0][1] + '        %6.2f' % metrics[0][0])
    print('Recall     %6.2f' % metrics[1][1] + '        %6.2f' % metrics[1][0])
    print('F1         %6.2f' % metrics[2][1] + '        %6.2f' % metrics[2][0])
    
# Score the held-out rows with the fitted SVM and report metrics at a 0.5 cut-off.
probabilities = svm_mod.predict_proba(X_test)
print_metrics(labels=y_test, probs=probabilities, threshold=0.5)

Find the feature subsets that best predict the labels by fitting an SVM on every possible combination of the feature columns.

In [3]:
# Enumerate every non-empty subset of the feature columns, smallest subsets
# first. The largest subset size comes from the frame itself rather than a
# hard-coded count, so the enumeration stays complete if columns change.
feature_names = list(bikesData)
feature_combs = [
    comb
    for subset_size in range(1, len(feature_names) + 1)
    for comb in itertools.combinations(feature_names, subset_size)
]
print(len(feature_combs))

4095


In [4]:
# Fit a default SVC on every feature subset and record its accuracy on the
# training and test partitions. Only the header line is printed; per-subset
# results are collected in the lists below.
print("Training Set %\t Test Set %\t Features")
size = []
train_scores = []
test_scores = []
nr.seed(1115)

# The split uses a fixed random_state and depends only on the number of rows,
# so it is identical for every subset: compute it (and the labels) once instead
# of once per combination.
indx = ms.train_test_split(range(bikesData.shape[0]), test_size=5000, random_state=0)
y_train = np.ravel(Labels[indx[0]])
y_test = np.ravel(Labels[indx[1]])

for comb in feature_combs:
    comb_features = np.array(bikesData[list(comb)])
    X_train = comb_features[indx[0], :]
    X_test = comb_features[indx[1], :]

    # Rescale numeric features (scaler fitted on the training rows only)
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    svm_mod = svm.SVC(random_state=0)
    svm_mod.fit(X_train, y_train)
    train_scores.append(svm_mod.score(X_train, y_train))
    test_scores.append(svm_mod.score(X_test, y_test))
    size.append(len(comb))

Training Set %	 Test Set %	 Features




In [5]:
# One row per feature subset: subset size, training accuracy, test accuracy and
# the tuple of column names. The table is built once with named columns and
# written once; the earlier second build via
# pd.concat(..., ignore_index=True, axis=1) replaced the column names with
# 0..3 and overwrote SVM.csv with that unlabeled version.
midx = pd.DataFrame({
    'size': size,
    'train_scores': train_scores,
    'test_scores': test_scores,
    'combs': feature_combs,
})

# Single-column frames kept under their previous names for any cell that
# still refers to them.
midx_size = midx[['size']]
midx_train = midx[['train_scores']]
midx_test = midx[['test_scores']]
midx_combs = midx[['combs']]

midx.to_csv('SVM.csv')

# Horizontal bar chart of the ten feature subsets with the highest test accuracy.
scores_by_comb = pd.Series(test_scores, index=feature_combs)
top_ten = scores_by_comb.nlargest(10)
top_ten.plot(kind='barh')

Nested cross-validation
The code below is based on:
https://www.edx.org/course/principles-of-machine-learning-python-edition

# Nested cross-validation restricted to a hand-picked subset of the feature
# columns, followed by a final model fitted with the selected hyper-parameters.
# NOTE(review): 'OnePlusCarsFlag' does not appear among the columns listed in
# the bikesData.head() output near the top of this notebook; confirm it exists
# in BikesProcessed.csv, otherwise this selection raises a KeyError.
chosen_features = bikesData[['HomeOwnerFlag', 'Sex', 'Married', 'JobType', 'EducationType', 'Income', 'AgeBracket', 'ChildrenFlag', 'OnePlusCarsFlag']]
chosen_features = np.array(chosen_features)

# The fold generators shuffle, so each gets an explicit random_state to make the
# cell reproducible on a fresh kernel.
inside = ms.KFold(n_splits=2, shuffle=True, random_state=123)
outside = ms.KFold(n_splits=2, shuffle=True, random_state=321)

## Define the dictionary for the grid search and the model object to search on.
## Keys are prefixed with the pipeline step name ("svc") they apply to.
param_grid = {"svc__C": [0.1, 1, 10, 100], "svc__gamma": [0.1, 1, 10, 100]}

## Define the SVM model. The scaler is part of the estimator so that it is
## re-fitted on the training portion of every fold and C/gamma are tuned on
## standardised inputs -- the same scale the final model below is trained on.
## (Tuning on the raw features and then fitting on scaled ones would pair gamma
## with data of a different scale.)
svc_clf = Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("svc", svm.SVC()),  # class_weight = {0:0.33, 1:0.67}
])

## Perform the grid search over the parameters
clf = ms.GridSearchCV(estimator=svc_clf, param_grid=param_grid,
                      cv=inside,  # Use the inside folds
                      scoring='roc_auc',
                      return_train_score=True)
clf.fit(chosen_features, Labels)
best_C = clf.best_params_["svc__C"]
best_gamma = clf.best_params_["svc__gamma"]
print(best_C)
print(best_gamma)

# Outer loop: the whole grid search is re-run inside each outside fold. No
# scoring argument is given, so the grid search's own scorer (roc_auc) is used.
cv_estimate = ms.cross_val_score(clf, chosen_features, Labels,
                                 cv=outside)  # Use the outside folds

print('Mean performance metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

## Randomly sample cases to create independent training and test data
indx = ms.train_test_split(range(chosen_features.shape[0]), test_size=5000,
                           random_state=1115)
X_train = chosen_features[indx[0], :]
y_train = np.ravel(Labels[indx[0]])
X_test = chosen_features[indx[1], :]
y_test = np.ravel(Labels[indx[1]])

# Rescale numeric features (scaler fitted on the training rows only)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Final model with the tuned hyper-parameters; probability=True enables
# predict_proba, which the evaluation cell relies on.
svm_mod = svm.SVC(C=best_C,
                  gamma=best_gamma,
                  #class_weight = {0:0.33, 1:0.67},
                  probability=True,
                  random_state=1115)
svm_mod.fit(X_train, y_train)

def score_model(probs, threshold):
    """Convert per-class scores into hard 0/1 predictions.

    Args:
        probs: 2-D array-like; column 1 is compared against ``threshold``
            (callers in this notebook pass the output of ``predict_proba``).
        threshold: Cut-off value. A row is scored 1 only when its column-1
            value is strictly greater than this.

    Returns:
        1-D integer NumPy array with one 0/1 entry per row of ``probs``.
        An input with zero rows yields an empty integer array.
    """
    # Vectorised comparison instead of a Python-level loop over the rows.
    return (np.asarray(probs)[:, 1] > threshold).astype(int)

def print_metrics(labels, probs, threshold):
    """Print a confusion matrix and summary metrics for a binary classifier.

    Class 1 is treated as the positive class throughout, matching the AUC
    computation (which ranks by ``probs[:, 1]``) and ``score_model`` (which
    predicts 1 when ``probs[:, 1]`` exceeds the threshold).

    Args:
        labels: True 0/1 labels.
        probs: 2-D array of per-class scores; column 1 is the class-1 score.
        threshold: Cut-off passed to ``score_model`` to obtain hard predictions.

    Returns:
        None. Everything is written to stdout.
    """
    scores = score_model(probs, threshold)
    # Pin the label order so index 0 is always class 0 and index 1 is class 1,
    # and the matrix stays 2x2 even if one class is absent from the inputs.
    class_order = [0, 1]
    metrics = sklm.precision_recall_fscore_support(labels, scores, labels=class_order)
    conf = sklm.confusion_matrix(labels, scores, labels=class_order)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    # conf[i, j] counts cases of true class i predicted as class j, so the
    # positive (class 1) row/column is index 1, not index 0.
    print('Actual positive    %6d' % conf[1,1] + '             %5d' % conf[1,0])
    print('Actual negative    %6d' % conf[0,1] + '             %5d' % conf[0,0])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    # Macro averages: unweighted mean of the two per-class values.
    print('Macro precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    # metrics rows are precision, recall, F1, support; the column index is the class.
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][1] + '        %6d' % metrics[3][0])
    print('Precision  %6.2f' % metrics[0][1] + '        %6.2f' % metrics[0][0])
    print('Recall     %6.2f' % metrics[1][1] + '        %6.2f' % metrics[1][0])
    print('F1         %6.2f' % metrics[2][1] + '        %6.2f' % metrics[2][0])
    
# Score the held-out rows with the fitted SVM and report metrics at a 0.5 cut-off.
probabilities = svm_mod.predict_proba(X_test)
print_metrics(labels=y_test, probs=probabilities, threshold=0.5)