In [164]:
# Chapter 6

In [165]:
# Classification with KNN

In [166]:
from sklearn.datasets import load_iris
import sklearn.model_selection as ms
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors

In [167]:
# Load the iris dataset provided by sklearn.datasets.
iris = load_iris()

In [168]:
# Feature matrix and class labels, taken straight from the loaded dataset.
X, Y = iris.data, iris.target

In [169]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
XTrain, XTest, YTrain, YTest = ms.train_test_split(
    X, Y, test_size=0.3, random_state=7
)

In [170]:
# With KNN we need to find an appropriate value of k, and we can do this
# with the help of GridSearchCV.

In [171]:
k_neighbours = list(range(1,21,2)) # odd values of k from 1 to 19 as candidates
n_grid = [{'n_neighbors':k_neighbours}]

In [172]:
# Base KNN estimator for the grid search; n_neighbors is the parameter varied by n_grid.
model = neighbors.KNeighborsClassifier()

In [173]:
# We perform 10-fold cross-validation on our training data

In [174]:
# Grid search over the candidate k values, each scored with 10-fold CV.
cv_knn = GridSearchCV(
    estimator=model,
    param_grid=n_grid,
    cv=ms.KFold(n_splits=10),
)

In [175]:
# Run the search on the training split only; the test split is not touched here.
cv_knn.fit(XTrain, YTrain)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [176]:
# Pull the selected number of neighbours out of the fitted search.
best_k = cv_knn.best_params_['n_neighbors']

In [177]:
# Report the k chosen by the grid search.
print('The best parameter is k = {0}'.format(best_k))

The best parameter is k = 11


In [178]:
# For visualisation purposes we will concentrate on only two features:
# petal length and petal width (columns 2 and 3 of the iris feature matrix)
# NOTE(review): best_k was selected by cv_knn using all four features, not
# this two-column subset -- confirm it is still a sensible choice here.

knnclf = neighbors.KNeighborsClassifier(n_neighbors=best_k)

knnclf.fit(XTrain[:, 2:4], YTrain) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [179]:
# Predict on the same two columns (2:4) the classifier was fitted on.
y_pred = knnclf.predict(XTest[:, 2:4])

In [180]:
from sklearn.metrics import confusion_matrix

In [181]:
# True labels are passed first, predictions second: rows of the result are
# the true classes and columns the predicted classes.
confusion_matrix(YTest, y_pred)

array([[12,  0,  0],
       [ 0, 14,  2],
       [ 0,  2, 15]])

In [182]:
# 4 instances have been misclassified (the off-diagonal entries of the confusion matrix).

In [183]:
from sklearn.metrics import classification_report

In [184]:
# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(YTest, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        12
          1       0.88      0.88      0.88        16
          2       0.88      0.88      0.88        17

avg / total       0.91      0.91      0.91        45



In [185]:
# Classification with Logistic Regression

In [186]:
import pandas as pd

In [187]:
# Load the breast-cancer data; later cells rely on the 'bare_nucleoli' and
# 'class' columns being present in this file.
bc = pd.read_csv('../data/breast-cancer-wisconsin.csv')

In [188]:
# Let's clean up wrongly formatted data

In [189]:
# errors='coerce' turns any entry that cannot be parsed as a number into NaN,
# so malformed values can be removed with dropna() afterwards.
bc['bare_nucleoli'] = pd.to_numeric(bc['bare_nucleoli'], errors='coerce')

In [190]:
# Remove Null values

In [191]:
# Drop every row that contains at least one missing value (including the
# NaNs produced by the numeric coercion of 'bare_nucleoli').
bc = bc.dropna()

In [192]:
# convert class to categorical value

In [193]:
# Store the label column as a pandas categorical, then summarise it.
bc['class'] = bc['class'].astype('category')
bc['class'].describe()

count     683
unique      2
top         2
freq      444
Name: class, dtype: int64

In [194]:
# Let's prepare our data by separating the labels from the rest of the dataset

In [195]:
# Every column except the label becomes a feature.
# NOTE(review): if the CSV carries a sample-id column it is kept as a feature
# here -- confirm against the file and drop it if so.
X = bc.drop(['class'], axis=1)

In [196]:
# Switch from a DataFrame to its underlying array of values.
X = X.values

In [197]:
# Raw (not yet encoded) class labels.
Y_raw = bc['class'].values

In [198]:
# Let's change the labels to be 0 or 1 using LabelEncoder

In [199]:
from sklearn import preprocessing

In [200]:
# Fit the encoder on the raw labels; the transform is applied in the next cell.
label_enc = preprocessing.LabelEncoder()
label_enc.fit(Y_raw)

LabelEncoder()

In [201]:
# Encoded labels produced by the fitted LabelEncoder.
Y = label_enc.transform(Y_raw)

In [202]:
import sklearn.model_selection as ms
import numpy as np

In [203]:
# 70/30 train/test split of the breast-cancer data with a fixed seed.
# Note: this rebinds the XTrain/XTest/YTrain/YTest names used for iris above.
XTrain, XTest, YTrain, YTest = ms.train_test_split(
    X, Y, test_size=0.3, random_state=1
)

In [204]:
from sklearn.linear_model import LogisticRegression

In [205]:
# Regularisation penalties to try in the grid search.
pen_val = ['l1', 'l2']

In [206]:
# Candidate C values: powers of two with exponents -5, -3, ..., 9.
C_val = np.power(2.0, np.arange(-5, 10, 2))

In [207]:
# Search grid: every combination of C value and penalty type.
grid_s = [{'C': C_val, 'penalty': pen_val}]

In [208]:
# The solver is pinned to liblinear because the search grid (grid_s) tries
# both 'l1' and 'l2' penalties. liblinear was the default when this notebook
# was recorded, but newer scikit-learn releases default to lbfgs, which does
# not accept the 'l1' penalty.
model = LogisticRegression(solver='liblinear')

In [209]:
# Grid search over grid_s, each candidate scored with 10-fold CV.
cv_logr = GridSearchCV(
    estimator=model,
    param_grid=grid_s,
    cv=ms.KFold(n_splits=10),
)

In [210]:
# Run the search on the training split only.
cv_logr.fit(XTrain, YTrain)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': array([  3.12500e-02,   1.25000e-01,   5.00000e-01,   2.00000e+00,
         8.00000e+00,   3.20000e+01,   1.28000e+02,   5.12000e+02])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [211]:
# Winning hyper-parameters from the logistic-regression grid search.
best_c = cv_logr.best_params_['C']
best_penalty = cv_logr.best_params_['penalty']

In [None]:
# Extra: compare how KNN, logistic regression, Naive Bayes and random forest perform on the same dataset?