In [4]:
'''
In this chapter, we will be working with the Breast Cancer Wisconsin dataset, 
which contains 569 samples of malignant and benign tumor cells. 
The first two columns in the dataset store the unique ID numbers of the samples and the corresponding diagnosis (M=malignant, B=benign), 
respectively. The columns 3-32 contain 30 real-value features that have been computed from digitized images of the cell nuclei, 
which can be used to build a model to predict whether a tumor is benign or malignant. 
The Breast Cancer Wisconsin dataset has been deposited on the UCI machine learning repository and more detailed information 
about this dataset can be found at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic).
'''
import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)
from sklearn.preprocessing import LabelEncoder
X = df.loc[:, 2:].values
y =df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)

In [5]:
'''
we need to standardize the columns in the Breast Cancer Wisconsin dataset before we can feed them to a linear classifier,
such as logistic regression. Furthermore, let's assume that we want to compress our data from the initial 30 dimensions
onto a lower two-dimensional subspace via principal component analysis (PCA), 
a feature extraction technique for dimensionality reduction that we introduced in Chapter 5,
Compressing Data via Dimensionality Reduction. 
Instead of going through the fitting and transformation steps for the training and test dataset separately, we can chain the StandardScaler,
PCA, and LogisticRegression objects in a pipeline
'''
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
pipe_lr = Pipeline([('scl', StandardScaler()),('pca', PCA(n_components=2)),('clf', LogisticRegression(random_state=1))])
pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))



Test Accuracy: 0.947


In [7]:
'''
In k-fold cross-validation, we randomly split the training dataset into k folds without replacement, 
where k-1 folds are used for the model training and one fold is used for testing. 
This procedure is repeated k times so that we obtain k models and performance estimates.
'''
import numpy as np
from sklearn.cross_validation import StratifiedKFold
kfold = StratifiedKFold(y=y_train,n_folds=10,random_state=1)
scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
print np.mean(scores)

0.949565217391


In [8]:
'''
scikit-learn also implements a k-fold cross-validation
scorer, which allows us to evaluate our model using stratified k-fold
cross-validation more efficiently:
'''
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(estimator = pipe_lr, X=X_train,y=y_train,cv=10, n_jobs=1)
print 'CV accuracy scores: %s' % scores

CV accuracy scores: [ 0.89130435  0.97826087  0.97826087  0.91304348  0.93478261  0.97777778
  0.93333333  0.95555556  0.97777778  0.95555556]


In [9]:
'''
Diagnosing bias and variance problems with learning curves
'''
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve
pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(penalty='l2', random_state=0))])
train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr, X=X_train, y=y_train, train_sizes=np.linspace(0.1, 1.0, 10), cv=10, n_jobs=1)
#print train_scores, test_scores
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean,color='blue', marker='o',markersize=5,label='training accuracy')
plt.plot(train_sizes, test_mean,color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
plt.show()



In [10]:
'''
Addressing overfitting and underfitting with validation curves
Validation curves are a useful tool for improving the performance of a model by addressing issues such as overfitting or underfitting.
Validation curves are related to learning curves, but instead of plotting the training and test accuracies as functions of the sample size,
we vary the values of the model parameters, for example, the inverse regularization parameter C in logistic regression.
'''
from sklearn.learning_curve import validation_curve
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(estimator=pipe_lr, X=X_train, y=y_train, param_name='clf__C', param_range=param_range, cv=10)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean,color='blue', marker='o', markersize=5, label='training accuracy')
plt.plot(param_range, test_mean,color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.show()

In [14]:
'''
Tuning hyperparameters via grid search

The approach of grid search is quite simple, 
it's a brute-force exhaustive search paradigm where we specify a list of values for different hyperparameters, 
and the computer evaluates the model performance for each combination of those to obtain the optimal set
'''
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'clf__C': param_range, 'clf__kernel': ['linear']}, {'clf__C': param_range, 'clf__gamma': param_range, 'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print gs.best_score_
clf = gs.best_estimator_
clf.fit(X_train, y_train)
print 'Test accuracy: %.3f' % clf.score(X_test, y_test)

0.978021978022
Test accuracy: 0.965


In [20]:
'''
Looking at different performance evaluation metrics

we evaluated our models using the model accuracy, which is a useful metric to quantify the performance of a model in general.
However, there are several other performance metrics that can be used to measure a model's relevance, such as precision, recall,
and the F1-score
'''
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
print('auc: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred))

Precision: 0.976
Recall: 0.952
F1: 0.964
auc: 0.969
