In [1]:
# introduction to machine learning by muller
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import sklearn as sk

In [2]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
x, y = make_blobs(random_state = 0)

x_train, x_test, y_train,y_test = train_test_split(x,y, random_state = 0)

logreg = LogisticRegression().fit(x_train, y_train)

logreg.score(x_test, y_test)

0.88

# cross validation

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

In [5]:
iris = load_iris()
logreg = LogisticRegression()

In [6]:
scores = cross_val_score(logreg, iris.data, iris.target)
print ('cross_validation scores: ', scores)

cross_validation scores:  [ 0.96078431  0.92156863  0.95833333]


In [7]:
scores = cross_val_score(logreg, iris.data,iris.target, cv = 5)
scores

array([ 1.        ,  0.96666667,  0.93333333,  0.9       ,  1.        ])

In [8]:
scores.mean()

0.96000000000000019

# Benefits of cross_validation

In [26]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits = 5)

cross_val_score(logreg, iris.data, iris.target, cv = kfold)

array([ 1.        ,  0.93333333,  0.43333333,  0.96666667,  0.43333333])

In [28]:
kfolds = KFold(n_splits = 3)
cross_val_score(logreg, iris.data, iris.target, cv = kfold)

array([ 1.        ,  0.93333333,  0.43333333,  0.96666667,  0.43333333])

In [30]:
# shuffling the dataset by setting shuffle parameter to true and the random_state value
kfold = KFold(n_splits = 3, shuffle = True, random_state = 0)
cross_val_score(logreg, iris.data, iris.target, cv = kfold)

array([ 0.9 ,  0.96,  0.96])

In [19]:
# leave-one-out-cross-validation

from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()

scores = cross_val_score(logreg, iris.data, iris.target, cv = loo)
print ("Number of cv iteratons: ", len(scores))
print ("mean accuracy: ", scores.mean())

Number of cv iteratons:  150
mean accuracy:  0.953333333333


In [32]:
# shuffle-split-cross-validation

# the following code splits the dataset into 50% training set and 50% test set
# for 10 iterations

from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size = .5, train_size =.5, n_splits = 10)
cross_val_score(logreg, iris.data, iris.target, cv = shuffle_split)

array([ 0.92      ,  0.84      ,  0.92      ,  0.96      ,  0.96      ,
        0.93333333,  0.98666667,  0.93333333,  0.97333333,  0.92      ])

In [22]:
# label_kfold

from sklearn.model_selection import LabelKFold
# create synthetic dataset

x, y = make_blobs(n_samples = 12, random_state = 0)
# assume the first 3 samples belong to the same group, then the next 4 etc.

labels = [0,0,0,1,1,1,1,2,2,3,3,3]
cross_val_score(logreg, x, y, labels, cv = LabelKFold(n_folds = 3))

ImportError: cannot import name 'LabelKFold'

In [34]:
from sklearn.preprocessing import label

In [44]:
for k in sklearn.__all__:
    m = k.strip('')
    for j in sklearn.m.__all__:
        l = j.strip('')
        if j == 'LabelKFold':
            print ('found', m, l)
        else:
            print ('not found')

AttributeError: module 'sklearn' has no attribute 'm'

# Grid Search

In [50]:
from sklearn.svm import SVC
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state =0)
print ("Size of training set: %d size of test set: %d" % (x_train.shape[0], x_test.shape[0]))

best_test_score = []
best_score = 0
for gamma in [0.001, 0.01,0.1,1,10,100]:
    for C in [0.001, 0.01,0.1,10,100]:
        # for each combination of paramters
        # train as SVC
        svm = SVC(gamma = gamma, C = C)
        svm.fit(x_train, y_train)
        score = svm.score(x_train, y_train)
        y_predict = svm.predict(x_test)
       # score1 = svm.score(y_test, y_predict)
        #print (score1)
        score1 = sum(y_predict == y_test)/len(y_predict)
        best_test_score.append(score1)
        
        if score > best_score:
            best_score = score
            best_parameters = {'C':C, 'gamma':gamma}
            test_score_at_param = score1
            
print ("Best train score: ", best_score)
print ("best test_score: ", max(best_test_score))
print ("Best parameter: ", best_parameters)
print ("test score at paramter value is: ", test_score_at_param)

Size of training set: 112 size of test set: 38
Best train score:  1.0
best test_score:  0.973684210526
Best parameter:  {'C': 100, 'gamma': 1}
test score at paramter value is:  0.973684210526


# The danger of overfitting the parameters and the validatio set

In [52]:
from sklearn.svm import SVC
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state =0)
print ("Size of training set: %d size of test set: %d" % (x_train.shape[0], x_test.shape[0]))

best_test_score = []
best_score = 0
for gamma in [0.001, 0.01,0.1,1,10,100]:
    for C in [0.001, 0.01,0.1,1,10,100]:
        # for each combination of paramters
        # train as SVC
        svm = SVC(gamma = gamma, C = C)
        svm.fit(x_train, y_train)
        score = svm.score(x_train, y_train)
        y_predict = svm.predict(x_test)
       # score1 = svm.score(y_test, y_predict)
        #print (score1)
        score1 = sum(y_predict == y_test)/len(y_predict)
        best_test_score.append(score1)
        
        if score > best_score:
            best_score = score
            best_parameters = {'C':C, 'gamma':gamma}
            test_score_at_param = score1
            
print ("Best train score: ", best_score)
print ("best test_score: ", max(best_test_score))
print ("Best parameter: ", best_parameters)
print ("test score at paramter value is: ", test_score_at_param)

Size of training set: 112 size of test set: 38
Best train score:  1.0
best test_score:  0.973684210526
Best parameter:  {'C': 100, 'gamma': 1}
test score at paramter value is:  0.973684210526


In [55]:
from sklearn.svm import SVC
# split data into train+ validation set and test set
x_trainval, x_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state = 0)
# split trin+ validation set into train and validation set
x_train, x_valid, y_train, y_valid = train_test_split(x_trainval, y_trainval, random_state =1)

print ("size of training set: %d size of validation set: %d size of test set: %d "% (x_train.shape[0], x_valid.shape[0], x_test.shape[0]))
best_score  = 0

size of training set: 84 size of validation set: 28 size of test set: 38 


In [56]:
for gamma in [0.001, 0.01,0.1,1,10,100]:
    for C in [0.001, 0.01,0.1,1,10,100]:
        # for each combination of paramters
        # train as SVC
        svm = SVC(gamma = gamma, C = C)
        svm.fit(x_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(x_valid, y_valid)
        # if we got a better score, store the score and paramters
        if score > best_score:
            best_score = score
            best_paramers = {'C':C, 'gamma':gamma}

In [59]:
# rebuild a model on the combined traing and validation set, and evaluate it on the test
# set
svm = SVC(** best_parameters)
svm.fit(x_trainval, y_trainval)
test_score = svm.score(x_test, y_test)
print ("Best score on validation set: ", best_score)
print ("Best parameters: ", best_parameters)
print ("test set score with best paramters: ",test_score)

Best score on validation set:  0.964285714286
Best parameters:  {'C': 100, 'gamma': 1}
test set score with best paramters:  0.973684210526


In [64]:
# grid-search with cross-validation

best_score = 0
for gamma in [0.001, 0.01,0.1,1,10,100]:
    for C in [0.001, 0.01,0.1,1,10,100]:
        # for each combination of paramters
        # train as SVC
        svm = SVC(gamma = gamma, C = C)
        # perform cross-validation
        scores = cross_val_score(svm, x_trainval, y_trainval, cv = 5)
        # compute mean cross_validation accuracy
        score = np.mean(scores)
        # if we got a better score, store the score and paramters
        if score > best_score:
            best_score = score
            best_paramters = {"C":C, "gamma":gamma}
        
# rebuild a model on the combined training and validation set
svm = SVC(**best_paramters)
svm.fit(x_trainval, y_trainval)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [66]:
svm.get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [73]:
param_grid = {'gamma' :[0.001, 0.01,0.1,1,10,100], 'C' : [0.001, 0.01,0.1,1,10,100]}

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid , cv = 5)

In [74]:
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state =0)

In [76]:
grid_search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__C', 'estimator__cache_size', 'estimator__class_weight', 'estimator__coef0', 'estimator__decision_function_shape', 'estimator__degree', 'estimator__gamma', 'estimator__kernel', 'estimator__max_iter', 'estimator__probability', 'estimator__random_state', 'estimator__shrinking', 'estimator__tol', 'estimator__verbose', 'estimator', 'fit_params', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [77]:
grid_search.fit(x_train, y_train)
GridSearchCV(cv = 5, error_score = 'raise', estimator = SVC(C = 1.0, cache_size = 200, 
                                                           class_weight = None, coef0 = 0.0,
                                                           decision_function_shape = None,
                                                           degree = 3, gamma = 'auto', kernel = 'rbf',
                                                           max_iter = -1, probability = False, random_state = None, 
                                                           shrinking = True, tol = 0.001, verbose = False), 
            fit_params = {}, iid = True, n_jobs = 1, param_grid = {'C': [0.001,0.01,0.1,1,10,100],
                                                                  'gamma': [0.001,0.01,0.1,1,10,100]},
            pre_dispatch = '2 *n_jobs', refit = True, scoring = None, verbose = 0)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2 *n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [78]:
grid_search.score(x_test, y_test)

0.97368421052631582

In [79]:
print (grid_search.best_params_)
print (grid_search.best_score_)

{'C': 100, 'gamma': 0.01}
0.973214285714


In [80]:
grid_search.best_estimator_

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# Analyzing the result of cross_validation

In [82]:
import warnings
warnings.filterwarnings('ignore')
grid_search.grid_scores_

[mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 0.001},
 mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 0.01},
 mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 0.1},
 mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 1},
 mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 10},
 mean: 0.36607, std: 0.01137, params: {'C': 0.001, 'gamma': 100},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 0.001},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 0.01},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 0.1},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 1},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 10},
 mean: 0.36607, std: 0.01137, params: {'C': 0.01, 'gamma': 100},
 mean: 0.36607, std: 0.01137, params: {'C': 0.1, 'gamma': 0.001},
 mean: 0.69643, std: 0.01333, params: {'C': 0.1, 'gamma': 0.01},
 mean: 0.91964, std: 0.04442, params: {'C': 0.1, 'gamma': 0.1},
 mean: 0.95536, std

In [83]:
scores = [score.mean_validation_score for score in grid_search.grid_scores_]
scores = np.array(scores).reshape(6,6)

# Using different cross-validation strategies with grid search

# Nested cross-validation

In [90]:
score = cross_val_score(GridSearchCV(SVC(), param_grid, cv = 5), iris.data, iris.target, cv = 5)
print ("Cross-validation scores: ", scores)
print ("mean cross_validation score: ", scores.mean())

Cross-validation scores:  [[ 0.36607143  0.36607143  0.36607143  0.36607143  0.36607143  0.36607143]
 [ 0.36607143  0.36607143  0.36607143  0.36607143  0.36607143  0.36607143]
 [ 0.36607143  0.69642857  0.91964286  0.95535714  0.36607143  0.36607143]
 [ 0.69642857  0.92857143  0.96428571  0.94642857  0.91964286  0.50892857]
 [ 0.92857143  0.96428571  0.96428571  0.9375      0.91964286  0.5625    ]
 [ 0.96428571  0.97321429  0.95535714  0.94642857  0.91964286  0.5625    ]]
mean cross_validation score:  0.65625


In [95]:
def nested_cv(x, y , inner_cv, outer_cv, Classifier, parameter_grid):
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method return indices)
    for training_samples, test_samples in outer_cv.split(x,y):
        # find best parameter using inner cross-validation;
        best_param = []
        best_score = -np.inf
        # interate over parameters
        for parameters in parameter_grid:
            # accumnulate score over inner splits
            cv_scores = []
            # interate over inner cross_validation
            for inner_train, inner_test in inner_cv.split(x[training_samples], y[training_samples]):
                # build classifier given paramters and training data
                clf = Classifier(**parameters)
                clf.fit(x[inner_train],y[inner_train])
                # evaluate on inner test set
                score = clf.score(x[inner_test], y[inner_test])
                cv_scores.append(score)
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember parameters
                best_score = mean_score
                best_params = parameters
        #build classifier on best paramters using outer training set
        clf= Classifier(**best_params)
        clf.fit(x[training_samples], y[training_samples])
        # evaluate
        outer_scores.append(clf.score(x[test_samples], y[test_samples]))
    return outer_scores

In [96]:
from sklearn.model_selection import ParameterGrid, StratifiedKFold
nested_cv(iris.data, iris.target, StratifiedKFold(5), StratifiedKFold(5), SVC, ParameterGrid(param_grid))

[0.96666666666666667, 1.0, 0.96666666666666667, 0.96666666666666667, 1.0]

# Parallelizing cross-validation and grid-search

In [97]:
from sklearn.datasets import load_digits
digits = load_digits()
y = digits.target == 9

In [104]:
x_train, x_test, y_train, y_test = train_test_split(digits.data, y, random_state= 0)

In [107]:
from sklearn.dummy import DummyClassifier
dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(x_train, y_train)
pred_most_frequent = dummy_majority.predict(x_test)
print ("predicted labels: %s" % np.unique(pred_most_frequent))
print ("Score: %f" % dummy_majority.score(x_test, y_test))

predicted labels: [False]
Score: 0.895556


In [108]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth = 2).fit(x_train, y_train)
pred_tree = tree.predict(x_test)
tree.score(x_test, y_test)

0.9177777777777778

In [109]:
dummy_majority.get_params().keys()

dict_keys(['constant', 'random_state', 'strategy'])

In [111]:
from sklearn.linear_model import LogisticRegression

In [112]:
dummy = DummyClassifier().fit(x_train, y_train)
pred_dummy = dummy.predict(x_test)
print ('dummy score: %f' %dummy.score(x_test, y_test))

logreg = LogisticRegression(C = 0.1).fit(x_train,y_train)
pred_logreg = logreg.predict(x_test)
print ('logreg score: %f'% logreg.score(x_test,y_test))


dummy score: 0.800000
logreg score: 0.977778


# confusion matrices

In [113]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, pred_logreg)
print (confusion)

[[401   2]
 [  8  39]]


In [114]:
print ("most frequent class: ")
print (confusion_matrix(y_test, pred_most_frequent))
print ("\nDummy model:")
print (confusion_matrix(y_test, pred_dummy))
print ("\nDecision tree: ")
print (confusion_matrix(y_test, pred_tree))
print ("\nLogistic Regression")
print (confusion_matrix(y_test, pred_logreg))

most frequent class: 
[[403   0]
 [ 47   0]]

Dummy model:
[[354  49]
 [ 42   5]]

Decision tree: 
[[390  13]
 [ 24  23]]

Logistic Regression
[[401   2]
 [  8  39]]


In [117]:
#confusion_matrix
import pandas as pd
df = pd.DataFrame(confusion_matrix, index = ['true" not 4"', 'true "4"'], 
                  columns =['predicted "not 4"', 'predicted "4"'])

In [119]:
plt.figure(figsize = (2,2))
sns.heatmap(df,annot = True)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [121]:
help (sns.heatmap)

Help on function heatmap in module seaborn.matrix:

heatmap(data, vmin=None, vmax=None, cmap=None, center=None, robust=False, annot=None, fmt='.2g', annot_kws=None, linewidths=0, linecolor='white', cbar=True, cbar_kws=None, cbar_ax=None, square=False, ax=None, xticklabels=True, yticklabels=True, mask=None, **kwargs)
    Plot rectangular data as a color-encoded matrix.
    
    This function tries to infer a good colormap to use from the data, but
    this is not guaranteed to work, so take care to make sure the kind of
    colormap (sequential or diverging) and its limits are appropriate.
    
    This is an Axes-level function and will draw the heatmap into the
    currently-active Axes if none is provided to the ``ax`` argument.  Part of
    this Axes space will be taken and used to plot a colormap, unless ``cbar``
    is False or a separate Axes is provided to ``cbar_ax``.
    
    Parameters
    ----------
    data : rectangular dataset
        2D dataset that can be coerced into a

In [123]:
help (sns)

Help on package seaborn:

NAME
    seaborn - # Capture the original matplotlib rcParams

PACKAGE CONTENTS
    algorithms
    apionly
    axisgrid
    categorical
    crayons
    distributions
    external (package)
    linearmodels
    matrix
    miscplot
    palettes
    rcmod
    tests (package)
    timeseries
    utils
    widgets
    xkcd_rgb

DATA
    crayons = {'Almond': '#EFDECD', 'Antique Brass': '#CD9575', 'Apricot':...
    division = _Feature((2, 2, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 8192...
    xkcd_rgb = {'acid green': '#8ffe09', 'adobe': '#bd6c48', 'algae': '#54...

VERSION
    0.7.1

FILE
    c:\users\acer\anaconda3\lib\site-packages\seaborn\__init__.py




In [125]:
# precison and recall, accuracy on the binary classificaton datasets.

from sklearn.metrics import f1_score
print ("f1 score most frequent: %.2f" %f1_score(y_test, pred_most_frequent))
print ("f1 score dummy: %.2f" %f1_score(y_test, pred_dummy))
print ("f1 score tree: %.2f" % f1_score(y_test, pred_tree))
print ("f1 score logreg: %.2f" % f1_score(y_test, pred_logreg))

f1 score most frequent: 0.00
f1 score dummy: 0.10
f1 score tree: 0.55
f1 score logreg: 0.89


In [126]:
from sklearn.metrics import classification_report
print (classification_report(y_test,pred_most_frequent, target_names = ['not nine', 'nine']))

             precision    recall  f1-score   support

   not nine       0.90      1.00      0.94       403
       nine       0.00      0.00      0.00        47

avg / total       0.80      0.90      0.85       450



In [129]:
print (classification_report(y_test,pred_dummy, target_names = ['not nine', 'nine']))

             precision    recall  f1-score   support

   not nine       0.89      0.88      0.89       403
       nine       0.09      0.11      0.10        47

avg / total       0.81      0.80      0.80       450



In [130]:
print (classification_report(y_test,pred_logreg, target_names = ['not nine', 'nine']))

             precision    recall  f1-score   support

   not nine       0.98      1.00      0.99       403
       nine       0.95      0.83      0.89        47

avg / total       0.98      0.98      0.98       450



In [150]:
from sklearn.datasets import make_blobs
x, y = make_blobs(n_samples = 400, centers = (2,0), cluster_std = [7.0,2], random_state = 22)
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state = 0)
svc = SVC(gamma = .05).fit(x_train, y_train)
sklearn.plots.plot_decision_threshold()

ValueError: The number of classes has to be greater than one; got 1

In [152]:
!pip install mglearn

Collecting mglearn


  Retrying (Retry(total=4, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError('<pip._vendor.requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x00000000046D4DA0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',)': /simple/mglearn/
  Retrying (Retry(total=3, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError('<pip._vendor.requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x00000000046D4FD0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',)': /simple/mglearn/
  Retrying (Retry(total=2, connect=None, read=None, redirect=None)) after connection broken by 'NewConnectionError('<pip._vendor.requests.packages.urllib3.connection.VerifiedHTTPSConnection object at 0x00000000046D4A90>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',)': /simple/mglearn/
  Retrying (Retry(total=1, connect=None, read=None, redire

In [153]:
from sklearn.metrics import precision_recall_curve
precision, recall, threshold = precision_recall_curve(y_test, svc.decision_function(x_test))

NameError: name 'svc' is not defined

In [155]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100, random_state = 0, max_features = 100)
rf.fit(x_train, y_train)

# RandomForestClassifier has predict_proba, but not decison_function
precision_rf, recall_rf, threshold_rf = precision_recall_curve(y_test, rf.predict_proba(x_test[:,1]))
plt.plot(precision, recall,label = 'svc')
plt.plot(precision[close_zero], recall[close_zero], 'o',markersize = 10, label = 'threshold zero svc', fillstyle = 'none',
        c = 'k', mew = 2)
plt.plot(precision_rf, recall_rf, label = 'rf')

ValueError: max_features must be in (0, n_features]