In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nose.tools import *

# Seed NumPy's global random number generator so that re-running the notebook
# from a fresh kernel draws the same random numbers each time.
np.random.seed(24680)

Write your imports in the cell below.

In [3]:
from sklearn import preprocessing, svm, metrics
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# SVMs and Random Forests
## Training and comparing different algorithms

The goal is to try and improve our predictions (if they can be improved at all) using different types of algorithms.
### 1. Read the data 
Read `bank` data. Separate the indicator variables. 
Read the dataset and save it in the variable `bank_data`. The target column is `y`. Use the variables `bank_attributes` and `bank_labels` to save the attributes (explanatory variables, features, predictors), and labels (`y`).

In [4]:
# Load the bank dataset, then separate the predictors from the target column `y`.
bank_data = pd.read_csv("data/bank.csv")  # comma is read_csv's default delimiter
bank_attributes = bank_data.drop(columns="y")  # every column except the target
bank_labels = bank_data.loc[:, ["y"]]  # kept as a one-column DataFrame; later cells index it with ['y']

In [5]:
# Show a bounded preview instead of dumping the whole frame into the notebook output.
bank_data.head(10)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,33,4789,11,220,1,339,4,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,59,0,5,226,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
5,35,747,23,141,2,176,3,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,36,307,14,341,1,330,2,0,0,0,...,0,1,0,0,0,0,1,0,0,0
7,39,147,6,151,2,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
8,41,221,14,57,2,-1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
9,43,-88,17,313,1,147,2,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# Sanity check: the full dataset, the attribute table and the label table
# requested by the exercise have all been assigned.
assert_is_not_none(bank_data)
assert_is_not_none(bank_attributes)
assert_is_not_none(bank_labels)

### 2. Normalize the data
SVMs are sensitive to non-scaled data (tree-based models such as random forests are not, but scaling does not hurt them), so we need to normalize our dataset first.

Rescale all columns in `bank_attributes` so they have mean = 0 and variance = 1. You can either look at the `sklearn` docs or do this yourself. When you're ready, overwrite the `bank_attributes` variable. Make sure that you don't lose the column names in the process.

In [7]:
# Standardise every attribute to zero mean and unit variance.
# StandardScaler.transform returns a bare NumPy array, so the result is wrapped
# back into a DataFrame that carries the original column names and row index
# (the exercise explicitly asks not to lose the column names).
std_scale = preprocessing.StandardScaler().fit(bank_attributes)
bank_attributes = pd.DataFrame(
    std_scale.transform(bank_attributes),
    columns=bank_attributes.columns,
    index=bank_attributes.index,
)

In [8]:
# Summary statistics of the scaled attributes: after standardisation every
# column's mean should be (numerically) 0 and its standard deviation about 1.
pd.DataFrame(bank_attributes).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,...,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,-1.352355e-16,4.4285520000000006e-17,-1.150742e-16,-4.6707460000000006e-17,2.915778e-16,2.864331e-16,1.93804e-16,-1.744531e-16,-4.9899870000000006e-17,4.308284e-16,...,3.106709e-16,-4.249654e-16,6.22717e-16,-6.269408000000001e-17,-1.74846e-17,9.111270000000001e-17,2.404378e-16,-3.288677e-16,4.675289e-16,1.709169e-17
std,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,...,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111,1.000111
min,-2.096455,-1.573671,-1.808625,-1.000513,-0.5768295,-0.4072183,-0.3204128,-0.3438445,-0.5144078,-0.1964537,...,-0.364805,-0.104676,-0.6690639,-0.3068278,-0.1342161,-0.107869,-0.3486516,-0.2134471,-0.1713814,-2.130831
25%,-0.7725828,-0.449824,-0.8385461,-0.6156433,-0.5768295,-0.4072183,-0.3204128,-0.3438445,-0.5144078,-0.1964537,...,-0.364805,-0.104676,-0.6690639,-0.3068278,-0.1342161,-0.107869,-0.3486516,-0.2134471,-0.1713814,0.4693005
50%,-0.2052091,-0.3252105,0.01027262,-0.3038984,-0.2552305,-0.4072183,-0.3204128,-0.3438445,-0.5144078,-0.1964537,...,-0.364805,-0.104676,-0.6690639,-0.3068278,-0.1342161,-0.107869,-0.3486516,-0.2134471,-0.1713814,0.4693005
75%,0.7404137,0.01905496,0.6165717,0.2503146,0.06636847,-0.4072183,-0.3204128,-0.3438445,-0.5144078,-0.1964537,...,-0.364805,-0.104676,1.494626,-0.3068278,-0.1342161,-0.107869,-0.3486516,-0.2134471,-0.1713814,0.4693005
max,4.33378,23.18321,1.82917,10.62641,15.18152,8.303196,14.443,2.908291,1.943983,5.090257,...,2.74119,9.553288,1.494626,3.259158,7.450671,9.270507,2.868193,4.685001,5.834939,0.4693005


In [9]:
# Sanity check: `bank_attributes` is still assigned after being overwritten with the scaled values.
assert_is_not_none(bank_attributes)

### 3. Split the data 
Use the standard 70% / 30% split. Since this is a classification problem, be sure to stratify the split according to the `bank_labels`.

In [10]:

# Stratified 70 % / 30 % hold-out split: stratifying on the labels keeps the
# class proportions the same in the training and testing partitions.
(
    bank_attributes_train,
    bank_attributes_test,
    bank_labels_train,
    bank_labels_test,
) = train_test_split(
    bank_attributes,
    bank_labels,
    train_size=0.7,
    test_size=0.3,
    stratify=bank_labels,
)


In [11]:
# Sanity check: the training partition (attributes and labels) exists ...
assert_is_not_none(bank_attributes_train)
assert_is_not_none(bank_labels_train)

# ... and so does the testing partition.
assert_is_not_none(bank_attributes_test)
assert_is_not_none(bank_labels_test)

In [12]:
# Convert both feature partitions to DataFrames in one step.
bank_attributes_train, bank_attributes_test = (
    pd.DataFrame(partition)
    for partition in (bank_attributes_train, bank_attributes_test)
)

### 4. Prepare the cross-validation folds 
Use a stratified k-fold cross-validation split, with $k = 5$. Fit it to the train data. Save the trained cross-validator to the variable `k_fold`.

The data should already be shuffled. There's no need to shuffle it again.

In [13]:
# Stratified 5-fold splitter without shuffling (train_test_split has already
# shuffled the rows). The grid-search cells below reuse it through `cv=cv`.
cv = StratifiedKFold(n_splits=5)

# Baseline model: an SVC with default hyper-parameters.
# It is deliberately NOT named `svm`, because that name is the `sklearn.svm`
# module imported at the top of the notebook and must not be shadowed.
baseline_svc = SVC()

# Fit and score the baseline once per fold, using the splitter defined above
# rather than a separate integer fold count. `k_fold` holds the five per-fold scores.
k_fold = cross_val_score(baseline_svc, bank_attributes_train, bank_labels_train['y'], cv=cv)

In [14]:
# Display the per-fold scores returned by cross_val_score in the previous cell.
k_fold

array([0.89099526, 0.90047393, 0.8878357 , 0.89257504, 0.88924051])

In [15]:
# Sanity check: the cross-validation cell produced a `k_fold` value.
assert_is_not_none(k_fold)

### 5. Decision Tree 
Use cross-validation to train and optimize the hyperparameters for a decision tree classifier.

Use grid search with the following grid:
* `max_depth`: 1, 5, 7, 15, 20
* `min_samples_leaf`: 2, 5, 10, 12
* `max_leaf_nodes`: 5, 10, 20

Use the most appropriate scoring metric (remember that accuracy doesn't work in this case because the data is highly imbalanced; we need something which combines precision and recall). Use the cross-validation splits you just created.

Save the grid results in `grid_search`. Save the best classifier in `tree_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

In [16]:
# Explore 5 * 4 * 3 = 60 combinations of decision-tree hyper-parameters.
# Each combination is trained once per cross-validation fold (5 folds),
# i.e. 60 * 5 = 300 training rounds.
param_grid = [
    {'max_depth': [1, 5, 7, 15, 20],
     'min_samples_leaf': [2, 5, 10, 12],
     'max_leaf_nodes': [5, 10, 20]}
]
decis_tree_classifier = DecisionTreeClassifier()

# F1 is used instead of accuracy because the classes are highly imbalanced
# and the score has to combine precision and recall.
grid_search = GridSearchCV(decis_tree_classifier, param_grid, cv=cv, scoring='f1')

# Pass the labels as the 1-D `y` column, exactly like the other grid searches
# in this notebook, rather than as a one-column DataFrame.
grid_search.fit(bank_attributes_train, bank_labels_train['y'])
tree_classifier = grid_search.best_estimator_

In [17]:

# Report the outcome of the decision-tree search. `grid_search` is re-assigned by
# every later search cell, so this must run right after the decision-tree cell.
print("Decision tree - best parameters:", grid_search.best_params_) 
print("Decision tree - best score:", grid_search.best_score_)


Decision tree - best parameters: {'max_depth': 15, 'max_leaf_nodes': 10, 'min_samples_leaf': 2}
Decision tree - best score: 0.5090009764263382


In [18]:
# Sanity check: the decision-tree grid search and its best estimator were stored.
assert_is_not_none(grid_search)
assert_is_not_none(tree_classifier)

### 5. Random Forest 
Use cross-validation to train and optimize the hyperparameters for a random forest classifier. Use the same technique as before.

Use the following grid:
* `n_estimators`: 100, 200, 300 
* `max_depth`: 20, 50, 100

Note that this grid is on the small side but this is mainly due to performance reasons. Also note that the training will take some time.

Save the grid results in `grid_search`. Save the best classifier in `forest_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

Due to the relatively slow training, we've chosen low values for the parameters. The performance of the random forest will be worse than the decision tree. This is not necessarily the case in general, it's due to the parameters we've chosen to try.

In [19]:
# Random-forest search space: 3 depths x 3 forest sizes = 9 candidates,
# each evaluated on the shared stratified folds with the F1 score.
param_grid = [dict(max_depth=[20, 50, 100], n_estimators=[100, 200, 300])]
rand_forest_classifier = RandomForestClassifier()
grid_search = GridSearchCV(
    estimator=rand_forest_classifier,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
)
grid_search.fit(X=bank_attributes_train, y=bank_labels_train['y'])

# Keep the best candidate found by the search.
forest_classifier = grid_search.best_estimator_

In [20]:
# Report the outcome of the random-forest search (`grid_search` now refers to
# the search fitted in the previous cell).
print("Random forest - best parameters:", grid_search.best_params_)
print("Random forest - best score:", grid_search.best_score_)

Random forest - best parameters: {'max_depth': 50, 'n_estimators': 200}
Random forest - best score: 0.39075133569332104


In [21]:
# Sanity check: the random-forest grid search and its best estimator were stored.
assert_is_not_none(grid_search)
assert_is_not_none(forest_classifier)

### 6. Linear SVM 
Use cross-validation to train and optimize the hyperparameters for a linear support vector machine. Use the same technique as before.

Use the following grid:
* `C`: 0.1, 0.5, 0.8, 1, 1.5, 2, 6, 10, 15, 20

Note that we're choosing relatively small values for `C`. This is allowed because our data is normalized.

Save the grid results in `grid_search`. Save the best classifier in `linear_svm_classifier`. There are many ways to create a linear SVM classifier. Look up the `sklearn` docs to choose the fastest one (in terms of performance).

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

In [22]:
# Candidate regularisation strengths for the linear SVM (ten values of C).
param_grid = [dict(C=[0.1, 0.5, 0.8, 1, 1.5, 2, 6, 10, 15, 20])]
linear_svc = LinearSVC()
grid_search = GridSearchCV(
    estimator=linear_svc,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
)
grid_search.fit(X=bank_attributes_train, y=bank_labels_train['y'])

# Keep the best candidate found by the search.
linear_svm_classifier = grid_search.best_estimator_

In [23]:
# Report the outcome of the linear-SVM search (`grid_search` now refers to
# the search fitted in the previous cell).
print("Linear SVM - best parameters:", grid_search.best_params_)
print("Linear SVM - best score:", grid_search.best_score_)

Linear SVM - best parameters: {'C': 15}
Linear SVM - best score: 0.421722108864764


In [24]:
# Sanity check: the linear-SVM grid search and its best estimator were stored.
assert_is_not_none(grid_search)
assert_is_not_none(linear_svm_classifier)

### 7. Gaussian SVM 
Use cross-validation to train and optimize the hyperparameters for an SVM with a Gaussian kernel. Use the same technique as before.

Use the following grid:
* `C`: 10, 15, 20, 50, 200
* `gamma`: 0.001, 0.01, 0.1, 0.2

Note that this time we give larger values of `C` because the governing parameter here is `gamma`.

Save the grid results in `grid_search`. Save the best classifier in `gaussian_svm_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

In [25]:
# Radial Basis Function (Gaussian) kernel. 'rbf' is already SVC's default
# kernel; it is spelled out here only to make the intent explicit.
rbf_svc = SVC(kernel='rbf')

# 5 values of C x 4 values of gamma = 20 candidates.
param_grid = [dict(C=[10, 15, 20, 50, 200], gamma=[0.001, 0.01, 0.1, 0.2])]

grid_search = GridSearchCV(
    estimator=rbf_svc,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
)
grid_search.fit(X=bank_attributes_train, y=bank_labels_train['y'])

# Keep the best candidate found by the search.
gaussian_svm_classifier = grid_search.best_estimator_

In [26]:
# Report the outcome of the Gaussian-SVM search (`grid_search` now refers to
# the search fitted in the previous cell).
print("Gaussian SVM - best parameters:", grid_search.best_params_)
print("Gaussian SVM - best score:", grid_search.best_score_)

Gaussian SVM - best parameters: {'C': 20, 'gamma': 0.01}
Gaussian SVM - best score: 0.4576158438127443


In [27]:
# Sanity check: the Gaussian-SVM grid search and its best estimator were stored.
assert_is_not_none(grid_search)
assert_is_not_none(gaussian_svm_classifier)

### 9. Compare performance on the testing data 
Now that you've trained all your models, you've got to select the best one. This should be done on the testing data.

Use the appropriate scoring metric to get the testing scores for all your models. Don't forget to pass the **testing**, not the training data. Save all scores.

Choose the best classifier, based on these scores (the one with the highest test score). Of course, this is not enough. We need to look at ROC curves, track performance through other measures, debug the sources of variance in testing results, try more hyperparameters, etc. However, this is enough for an introductory lab :).

Optionally, you can think of combining them into a boosted model but this is out of the scope of this lab.

In [41]:
# Predict the held-out test attributes with each tuned model ...
y_pred_tree, y_pred_forest, y_pred_svm, y_pred_gaussian_svm = (
    model.predict(bank_attributes_test)
    for model in (
        tree_classifier,
        forest_classifier,
        linear_svm_classifier,
        gaussian_svm_classifier,
    )
)

# ... then score every set of predictions with F1, the same metric that was
# used for model selection during cross-validation.
(
    tree_classifier_score,
    forest_classifier_score,
    linear_svm_classifier_score,
    gaussian_svm_classifier_score,
) = (
    metrics.f1_score(bank_labels_test, predicted)
    for predicted in (y_pred_tree, y_pred_forest, y_pred_svm, y_pred_gaussian_svm)
)

print("Testing scores:")
print("Decision tree:", tree_classifier_score)
print("Random forest:", forest_classifier_score)
print("Linear SVM:", linear_svm_classifier_score)
print("Gaussian SVM:", gaussian_svm_classifier_score)

Testing scores:
Decision tree: 0.46212121212121204
Random forest: 0.35023041474654376
Linear SVM: 0.41666666666666663
Gaussian SVM: 0.4386617100371747


In [30]:
# Choose the winner from the measured test scores instead of hard-coding it:
# the decision tree usually comes out on top, but not on every run.
test_scores = {
    "Decision Tree": tree_classifier_score,
    "Random Forest": forest_classifier_score,
    "Linear SVM": linear_svm_classifier_score,
    "Gaussian SVM": gaussian_svm_classifier_score,
}
# The highest test F1 wins; on a tie the first entry in the dict is kept.
best_classifier = max(test_scores, key=test_scores.get)

In [31]:
# Sanity check: a best-classifier name has been filled in (it is not the empty string).
assert_not_equal(best_classifier, "")