# Supervised Learning, Pt. 1

### Preamble and Review

In [1]:
#Import the essentials
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

np.set_printoptions(precision=2)

In [2]:
fruits = pd.read_table('fruit_data_with_colors.txt')
fruits.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
#train test split 
feature_names_fruits = ['height','width','mass','color_score']
X_fruits = fruits[feature_names_fruits]
y_fruits = fruits['fruit_label']

target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']

X_fruits_2d = fruits[['height','width']]
y_fruits_2d = fruits['fruit_label']

X_train, X_test, y_train, y_test = train_test_split(X_fruits,y_fruits, random_state=0)
X_test.head()

Unnamed: 0,height,width,mass,color_score
26,9.2,9.6,362,0.74
35,7.9,7.1,150,0.75
43,10.3,7.2,194,0.7
28,7.1,6.7,140,0.72
11,7.6,7.1,172,0.92


In [4]:
#scale the train test data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[0.8 , 1.12, 1.02, 0.51],
       [0.6 , 0.38, 0.26, 0.54],
       [0.97, 0.41, 0.42, 0.41],
       [0.48, 0.26, 0.23, 0.46],
       [0.55, 0.38, 0.34, 1.  ],
       [0.49, 0.47, 0.36, 0.14],
       [0.58, 0.53, 0.24, 0.54],
       [0.95, 0.44, 0.5 , 0.43],
       [0.54, 0.38, 0.28, 0.62],
       [0.48, 0.44, 0.23, 0.86],
       [0.09, 0.06, 0.03, 0.65],
       [0.51, 0.32, 0.32, 1.03],
       [0.54, 0.38, 0.29, 0.65],
       [0.65, 0.53, 0.37, 0.65],
       [0.63, 0.5 , 0.41, 0.51]])

In [5]:
# fit knn, print accuracy
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train_scaled, y_train)

print('Accuracy of k-NN classifier on training set: {:.2f}'.format(knn.score(X_train_scaled,y_train)))
print('Accuracy of k-NN classifier on test set: {:.2f}'.format(knn.score(X_test_scaled,y_test)))

Accuracy of k-NN classifier on training set: 0.95
Accuracy of k-NN classifier on test set: 1.00


In [6]:
# make a prediction on an new example
example_fruit = [[5.5, 2.2, 10, 0.70]]
example_fruit_scaled = scaler.transform(example_fruit)
print('Predicted fruit type for ', example_fruit, ' is ', target_names_fruits[knn.predict(example_fruit_scaled)[0]-1])

Predicted fruit type for  [[5.5, 2.2, 10, 0.7]]  is  mandarin


### Datasets

In [7]:
# import 
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from adspy_shared_utilities import load_crime_dataset

cmap_bold = ListedColormap(['#FFFF00','#00FF00','#0000FF','#000000'])

In [8]:
# simple regression
from sklearn.datasets import make_regression
plt.figure()
plt.title('Sample regression problem with one input variable')
X_R1, y_R1 = make_regression(n_samples = 100, n_features = 1, n_informative = 1, bias = 150.0, noise = 30, random_state =0)
plt.scatter(X_R1, y_R1, marker = 'o', s=50)
plt.show()

<IPython.core.display.Javascript object>

In [9]:
# complex regression
from sklearn.datasets import make_friedman1
plt.figure()
plt.title('Complex regression problem with one input variable')
X_F1, y_F1 = make_friedman1(n_samples = 100, n_features = 7, random_state =0)
plt.scatter(X_F1[:,2], y_F1, marker = 'o', s=50)
plt.show()

<IPython.core.display.Javascript object>

In [10]:
# binary classification
plt.figure()
plt.title('Sample binary classification problem with two informative features')
X_C2, y_C2 = make_classification(n_samples = 100, n_features = 2, n_redundant=0, 
                                 n_informative=2, n_clusters_per_class=1,flip_y=0.1,class_sep=0.5,random_state=0)
plt.scatter(X_C2[:,0], X_C2[:,1],c=y_C2, marker='o',s=50)
plt.show()

<IPython.core.display.Javascript object>

In [11]:
# binary classifcation with 'non-linearly separable' classes 
X_D2, y_D2 = make_blobs(n_samples = 100, n_features =2, centers = 8, cluster_std = 1.3, random_state =4)
y_D2 = y_D2 % 2
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1],c=y_D2,marker='o',s=50)
plt.show()

<IPython.core.display.Javascript object>

In [12]:
# breast cancer dataset for classification
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)

In [13]:
# # Communities and Crime dataset (python version too old)
# (X_crime, y_crime) = load_crime_dataset()

### K-Nearest Neighbors

In [14]:
# this section is based on adspy_shared_utilities that contains code 
# that is no longer supported by the current version of python

In [15]:
# basically, the idea is that as the # of neiborghers increase from 1 to 11, the line between borders
# become more smooth and appears less overfit

### Regression

In [16]:
#split, fit(n=5), predict
from sklearn.neighbors import KNeighborsRegressor
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state = 0)
knnreg = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

print(knnreg.predict(X_test))
print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test)))


[231.71 148.36 150.59 150.59  72.15 166.51 141.91 235.57 208.26 102.1
 191.32 134.5  228.32 148.36 159.17 113.47 144.04 199.23 143.19 166.51
 231.71 208.26 128.02 123.14 141.91]
R-squared test score: 0.425


In [17]:
# comparing different # of neighbors and the prediction line over trained data

fig, subaxes = plt.subplots(1,2,figsize=(8,4))
X_predict_input = np.linspace(-3,3,50).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state=0)

for thisaxis, K in zip(subaxes, [1,3]):
    knnreg = KNeighborsRegressor(n_neighbors=K).fit(X_train, y_train)
    y_predict_output = knnreg.predict(X_predict_input)
    thisaxis.set_xlim([-2.5,0.75])
    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize=10,label ='Predicted',alpha=0.8)
    thisaxis.plot(X_train, y_train, 'o',label ='True Value',alpha=0.8)
    thisaxis.set_xlabel('Input Feature')
    thisaxis.set_ylabel('Target Value')
    thisaxis.set_title('KNN regression (K={})'.format(K))
    thisaxis.legend()
    
plt.tight_layout()

<IPython.core.display.Javascript object>

### Regression model complexity as a function of K

In [18]:
#tweaking the # of K to find the best fit in a given model
fig, subaxes = plt.subplots(5,1,figsize=(5,20))

X_predict_input = np.linspace(-3,3,500).reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

for thisaxis, K in zip(subaxes, [1,3,7,15,55]):
    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)

    y_predict_output = knnreg.predict(X_predict_input)
    train_score = knnreg.score(X_train, y_train)
    test_score = knnreg.score(X_test, y_test)
    
    thisaxis.plot(X_predict_input, y_predict_output)

    thisaxis.plot(X_train, y_train, 'o',label ='Train',alpha=0.9)

    thisaxis.plot(X_test, y_test, '^',label ='Test',alpha=0.9)

    thisaxis.set_xlabel('Input Feature')
    thisaxis.set_ylabel('Target Value')
    thisaxis.set_title('KNN regression (K={})\n\
Train $R^2 = {:.3f}$, Test $R^2={:.3f}$'.format(K, train_score, test_score))
    thisaxis.legend()
    plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

<IPython.core.display.Javascript object>

## Linear Models for Regression

## Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)
linreg = LinearRegression().fit(X_train, y_train)
print('Coefficient: '+ str(linreg.coef_))
print('Intercept: '+ str(linreg.intercept_))
print('R squared score training: '+ str(linreg.score(X_train, y_train)))
print('R squared score test: '+ str(linreg.score(X_test, y_test)))

Coefficient: [45.71]
Intercept: 148.44575345658873
R squared score training: 0.6785950771141656
R squared score test: 0.49159615934930306


### Linear regression: example plot

In [20]:
# plot the linear regression line 
plt.figure(figsize=(5,4))
plt.scatter(X_R1, y_R1, marker = 'o', s=50, alpha=0.8)
plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.show()

<IPython.core.display.Javascript object>

In [21]:
# the module uses crime data instead
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('Coefficient: '+ str(linreg.coef_))
print('Intercept: '+ str(linreg.intercept_))
print('R squared score training: '+ str(linreg.score(X_train, y_train)))
print('R squared score test: '+ str(linreg.score(X_test, y_test)))

Coefficient: [ 1.96e-01 -1.25e-02 -1.84e-02 -3.09e-04 -6.59e-01  4.67e+00 -1.41e+00
 -1.80e+00 -1.02e+00 -8.17e-01 -6.96e-01 -2.65e-02  6.53e-02  3.22e-04
 -1.16e+01 -4.44e-02  3.04e+00 -8.32e+00 -2.45e+00  1.97e+01 -1.79e-01
 -2.00e-04 -4.03e-03  1.18e-03 -4.05e-01 -1.15e-01 -4.12e-01 -9.11e-01
 -1.13e-01 -5.02e+00]
Intercept: 3.2643236829496214
R squared score training: 0.7824123695930645
R squared score test: 0.7291758706114035


### Lasso Regression

In [22]:
#import lasso
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [23]:
#train the model and lasso it
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)

print('Cancer dataset')
print('lasso intercept: ', linlasso.intercept_)
print('lasso coefficient: ', linlasso.coef_)
print('non-zero features: ', np.sum(linlasso.coef_ != 0))
print('R^2 score training: ', linlasso.score(X_train_scaled, y_train))
print('R^2 score test: ', linlasso.score(X_test_scaled, y_test))


Cancer dataset
lasso intercept:  0.6267605633802817
lasso coefficient:  [-0. -0. -0. -0. -0. -0. -0. -0. -0.  0. -0.  0. -0. -0.  0. -0. -0. -0.
  0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0. -0.]
non-zero features:  0
R^2 score training:  0.0
R^2 score test:  -2.9204963742035517e-05


### Lasso Regression with Regularizaiton Parameter: alpha

In [24]:
# shows that for crime dataset the test score is the highest when alpha = 3
for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, y_train)
    r2_train = linlasso.score(X_train_scaled, y_train)
    r2_test = linlasso.score(X_test_scaled, y_test)
    print('Alpha = {:.2f}\nFeatures kept:{}, r-squared training:{:.2f}, \
r-squared test: {:.2f}\n'
        .format(alpha, np.sum(linlasso.coef_ != 0),r2_train, r2_test))

Alpha = 0.50
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 1.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 2.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 3.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 5.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 10.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 20.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00

Alpha = 50.00
Features kept:0, r-squared training:0.00, r-squared test: -0.00



### Ridge Regression

In [25]:
# train test split and ridge fit
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Coefficient: '+ str(linridge.coef_))
print('Intercept: '+ str(linridge.intercept_))
print('R squared score training: '+ str(linridge.score(X_train, y_train)))
print('R squared score test: '+ str(linridge.score(X_test, y_test)))
print('sum non-zeo features: ' + str(np.sum(linridge.coef_ != 0)))

Coefficient: [ 8.95e-02  4.97e-03 -6.53e-03 -9.39e-06 -1.71e-02 -2.49e-02 -4.94e-02
 -2.77e-02 -3.07e-02 -4.70e-03 -4.65e-02 -1.10e-02 -1.67e-02 -3.45e-04
 -2.54e-03  2.62e-04 -1.17e-03 -2.55e-03 -5.31e-03 -7.79e-05 -1.61e-01
 -1.56e-02 -6.49e-03  1.01e-03 -3.16e-02 -8.47e-02 -1.41e-01 -5.22e-02
 -6.76e-02 -1.53e-02]
Intercept: 2.8876581523658835
R squared score training: 0.7178577584599565
R squared score test: 0.7009508798797448
sum non-zeo features: 30


### Ridge Regression with Feature Normalization

In [26]:
# train test split, scale, ridge fit, then print out the relevant numbers
# scale is needed when the input features are on different scales, 
# so much that they impact the accuracy of the L2 penalty from the ridge calculations
# b/c L2 penalty is a sum of squares of all the coefficients

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from sklearn.linear_model import Ridge


X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)
print('Coefficient: '+ str(linridge.coef_))
print('Intercept: '+ str(linridge.intercept_))

# score seems off on the cancer set
print(linridge.score(X_train, y_train))
print(linridge.score(X_test, y_test))


Coefficient: [-0.16 -0.14 -0.16 -0.11 -0.07 -0.08 -0.12 -0.19 -0.07  0.07 -0.08 -0.
 -0.06 -0.03  0.03  0.05  0.03 -0.03 -0.    0.05 -0.2  -0.17 -0.18 -0.12
 -0.14 -0.11 -0.14 -0.28 -0.13 -0.05]
Intercept: 1.4075287620294374
-298485.4184569747
-275529.6882368455


### Ridge Regression with Regularization Parameter: alpha

In [27]:
# with different set of alpha, seems alpha = 0 works the best
print('Ridge regression: effect of alpha regularization parameter')
for this_alpha in [0,1,10,20,50,100,1000]:
    linridge = Ridge(alpha=this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(abs(linridge.coef_)>1.0)
    print('alpha: ', this_alpha, ';coeff bigger:', num_coeff_bigger, ';r2 train: ', r2_train, ';r2 test', r2_test)

Ridge regression: effect of alpha regularization parameter
alpha:  0 ;coeff bigger: 8 ;r2 train:  0.7824123695930644 ;r2 test 0.7291758706114039
alpha:  1 ;coeff bigger: 0 ;r2 train:  0.7498105503859351 ;r2 test 0.7264216651801635
alpha:  10 ;coeff bigger: 0 ;r2 train:  0.7168693273714186 ;r2 test 0.7087921529227057
alpha:  20 ;coeff bigger: 0 ;r2 train:  0.697966651239101 ;r2 test 0.6964803857832611
alpha:  50 ;coeff bigger: 0 ;r2 train:  0.6462524931946894 ;r2 test 0.6530606421203689
alpha:  100 ;coeff bigger: 0 ;r2 train:  0.5670200212615004 ;r2 test 0.5780314052318248
alpha:  1000 ;coeff bigger: 0 ;r2 train:  0.15668714050732602 ;r2 test 0.16237688983468124


### Polynomial Regression

In [28]:
# standard procedure: split, fit, print key variables
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1, random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('Coefficient: '+ str(linreg.coef_))
print('Intercept: '+ str(linreg.intercept_))
print('R squared score training: '+ str(linreg.score(X_train, y_train)))
print('R squared score test: '+ str(linreg.score(X_test, y_test)))

Coefficient: [ 4.42  6.    0.53 10.24  6.55 -2.02 -0.32]
Intercept: 1.5425091975373135
R squared score training: 0.7223750207373034
R squared score test: 0.7221339576925416


In [29]:
#poly the train data, end up with better results in this specific case
poly = PolynomialFeatures(degree=2)
X_F1_poly = poly.fit_transform(X_F1)

X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('Poly Deg 2 Coefficient: '+ str(linreg.coef_))
print('Poly Deg 2 Intercept: '+ str(linreg.intercept_))
print('Poly Deg 2 R squared score training: '+ str(linreg.score(X_train, y_train)))
print('Poly Deg 2 R squared score test: '+ str(linreg.score(X_test, y_test)))

Poly Deg 2 Coefficient: [ 3.41e-12  1.66e+01  2.67e+01 -2.21e+01  1.24e+01  6.93e+00  1.05e+00
  3.71e+00 -1.34e+01 -5.73e+00  1.62e+00  3.66e+00  5.05e+00 -1.46e+00
  1.95e+00 -1.51e+01  4.87e+00 -2.97e+00 -7.78e+00  5.15e+00 -4.65e+00
  1.84e+01 -2.22e+00  2.17e+00 -1.28e+00  1.88e+00  1.53e-01  5.62e-01
 -8.92e-01 -2.18e+00  1.38e+00 -4.90e+00 -2.24e+00  1.38e+00 -5.52e-01
 -1.09e+00]
Poly Deg 2 Intercept: -3.205674398929167
Poly Deg 2 R squared score training: 0.9685996830172936
Poly Deg 2 R squared score test: 0.8046437550509971


In [30]:
# Poly Deg 2 + Ridge, a little increase in the test score
X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1, random_state = 0)
linreg = Ridge().fit(X_train, y_train)

print('Poly Deg 2 + Ridge Coefficient: '+ str(linreg.coef_))
print('Poly Deg 2 + Ridge Intercept: '+ str(linreg.intercept_))
print('Poly Deg 2 + Ridge R squared score training: '+ str(linreg.score(X_train, y_train)))
print('Poly Deg 2 + Ridge R squared score test: '+ str(linreg.score(X_test, y_test)))

Poly Deg 2 + Ridge Coefficient: [ 0.    2.23  4.73 -3.15  3.86  1.61 -0.77 -0.15 -1.75  1.6   1.37  2.52
  2.72  0.49 -1.94 -1.63  1.51  0.89  0.26  2.05 -1.93  3.62 -0.72  0.63
 -3.16  1.29  3.55  1.73  0.94 -0.51  1.7  -1.98  1.81 -0.22  2.88 -0.89]
Poly Deg 2 + Ridge Intercept: 5.418093049255203
Poly Deg 2 + Ridge R squared score training: 0.8257620613919463
Poly Deg 2 + Ridge R squared score test: 0.8251115209759516


# Linear Models for Classification

## Logistic Regression

### Logistic regression for binary classification on fruits dataset using height, width features(positive class: apple, negative class: others)

In [31]:
# binary problem: apple verses anything else
from sklearn.linear_model import LogisticRegression

# fig, subaxes = plt.subplots(1,1,figsize=(7,5))
y_fruits_apple = y_fruits_2d == 1
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, 
                                                    y_fruits_apple, random_state = 0)

clf = LogisticRegression(C=100).fit(X_train, y_train)
clf

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# predict a fruist with h = 6, w = 8
print(clf.predict([[6,8]])[0])

# predict a fruist with h = 10, w = 7
print(clf.predict([[10,7]])[0])

# predict a fruist with h = 3, w = 3
print(clf.predict([[3,3]])[0])

True
False
False


In [33]:
# train, test score
print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

train 0.7954545454545454 ; test 0.7333333333333333


### Logistic regression on simple synthetic dataset 

In [34]:
# non binary, not using C parameter
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_C2, 
                                                    y_C2, random_state = 0)
clf = LogisticRegression().fit(X_train, y_train)
print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

train 0.8133333333333334 ; test 0.84


### Logistic regression regularization: C parameter

In [35]:
# results with different values of C
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, 
                                                    y_fruits_apple, random_state = 0)
for this_C in [0.1, 1, 5, 10, 100]:
    clf = LogisticRegression(C=this_C).fit(X_train, y_train)
    print('C=',this_C, '; train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

C= 0.1 ; train 0.6590909090909091 ; test 0.6666666666666666
C= 1 ; train 0.75 ; test 0.6666666666666666
C= 5 ; train 0.7727272727272727 ; test 0.7333333333333333
C= 10 ; train 0.7954545454545454 ; test 0.7333333333333333
C= 100 ; train 0.7954545454545454 ; test 0.7333333333333333


### Application to real dataset

In [36]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = LogisticRegression().fit(X_train, y_train)
print('Cancer Dataset')
print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

Cancer Dataset
train 0.9577464788732394 ; test 0.958041958041958


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Support Vector Machines

### Linear Support Vector Machine

In [37]:
# standard procedure
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)
clf = SVC(kernel = 'linear', C=1.0).fit(X_train, y_train)
print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))
print('Coefficient: '+ str(clf.coef_))
print('Intercept: '+ str(clf.intercept_))

train 0.8 ; test 0.76
Coefficient: [[1.71 0.18]]
Intercept: [-0.05]


### Linear Support Vector Machine: C parameter

In [38]:
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)

for this_C in [0.00001, 100]:
    clf = LinearSVC(C=this_C).fit(X_train, y_train)
    print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))
    print()

train 0.76 ; test 0.76

train 0.8 ; test 0.76





### Application to Real Dataset

In [39]:
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = LinearSVC().fit(X_train, y_train)

print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))
print()

train 0.9225352112676056 ; test 0.9440559440559441





## Multi-class Classfication with Linear Models

### LinearSVC with M Classes Generates M One vs Rest Classifiers

In [40]:
from sklearn.svm import LinearSVC
X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)
clf = LinearSVC(C=5, random_state=67).fit(X_train, y_train)

print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))
print('Coefficient: '+ str(clf.coef_))
print('Intercept: '+ str(clf.intercept_))

train 0.6363636363636364 ; test 0.7333333333333333
Coefficient: [[-0.28  0.69]
 [-1.63  1.15]
 [-0.01  0.4 ]
 [ 1.26 -1.66]]
Intercept: [-3.32  1.2  -2.67  1.16]




### Multi-class results on the fruit dataset

In [41]:
plt.figure(figsize=(6,6))

plt.scatter(X_fruits_2d[['height']],X_fruits_2d[['width']], alpha=.7)
x_0_range = np.linspace(-10,15)

for w,b in zip(clf.coef_,clf.intercept_):
    plt.plot(x_0_range, -(x_0_range*w[0]+b)/w[1], alpha=.8)
    
        
plt.legend(target_names_fruits)
plt.xlabel('height')
plt.ylabel('width')
plt.xlim(-2, 12)
plt.ylim(-2, 15)
plt.show()

<IPython.core.display.Javascript object>

## Kernelized Support Vector Machines

### Classification

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
clf = SVC().fit(X_train, y_train)
print('default SVC:', 'train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

clf_kernel = SVC(kernel = 'poly', degree = 3).fit(X_train, y_train)
print('polynomial kernel SVC:','train',clf_kernel.score(X_train, y_train),'; test',clf_kernel.score(X_test, y_test))

default SVC: train 0.7466666666666667 ; test 0.68
polynomial kernel SVC: train 0.64 ; test 0.68


### Support Vector Machine with RBF kernel: gamma parameter

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)

for this_gamma in [0.01,1.0,10]:
    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)
    print('rbf kernel SVC:', 'gamma: ', this_gamma, '; train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))
    print()

polynomial kernel SVC: gamma:  0.01 ; train 0.7466666666666667 ; test 0.68

polynomial kernel SVC: gamma:  1.0 ; train 0.9466666666666667 ; test 0.72

polynomial kernel SVC: gamma:  10 ; train 0.9866666666666667 ; test 0.6



### Support Vector Machine with RBF kernel: using both C and gamma parameter

In [47]:
for this_gamma in [0.01,1.0,5]:
    for this_C in [0.1,1,15,250]:
        clf = SVC(kernel = 'rbf', gamma=this_gamma, C=this_C).fit(X_train, y_train)
        print('rbf kernel SVC:', 'gamma:', this_gamma,'C:', this_C, '; train:',clf.score(X_train, y_train),'; test:',clf.score(X_test, y_test))
        print()

rbf kernel SVC: gamma: 0.01 C: 0.1 ; train: 0.5866666666666667 ; test: 0.64

rbf kernel SVC: gamma: 0.01 C: 1 ; train: 0.7466666666666667 ; test: 0.68

rbf kernel SVC: gamma: 0.01 C: 15 ; train: 0.8933333333333333 ; test: 0.8

rbf kernel SVC: gamma: 0.01 C: 250 ; train: 0.9066666666666666 ; test: 0.8

rbf kernel SVC: gamma: 1.0 C: 0.1 ; train: 0.5066666666666667 ; test: 0.48

rbf kernel SVC: gamma: 1.0 C: 1 ; train: 0.9466666666666667 ; test: 0.72

rbf kernel SVC: gamma: 1.0 C: 15 ; train: 0.9733333333333334 ; test: 0.88

rbf kernel SVC: gamma: 1.0 C: 250 ; train: 0.9866666666666667 ; test: 0.76

rbf kernel SVC: gamma: 5 C: 0.1 ; train: 0.5066666666666667 ; test: 0.48

rbf kernel SVC: gamma: 5 C: 1 ; train: 0.9866666666666667 ; test: 0.68

rbf kernel SVC: gamma: 5 C: 15 ; train: 0.9866666666666667 ; test: 0.72

rbf kernel SVC: gamma: 5 C: 250 ; train: 1.0 ; test: 0.76



### Application of SVMs to a Real Dataset: Unnormalized Data

In [49]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
clf = SVC(C=10).fit(X_train,y_train)
print('train',clf.score(X_train, y_train),'; test',clf.score(X_test, y_test))

train 0.9154929577464789 ; test 0.9370629370629371


### Application of SVMs to a Real Dataset: Normalized Data with Feature Preprocessing using MinMax Scaling

In [51]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(C=10).fit(X_train_scaled,y_train)
print('train',clf.score(X_train_scaled, y_train),'; test',clf.score(X_test_scaled, y_test))

train 0.9882629107981221 ; test 0.972027972027972


In [52]:
from sklearn.model_selection import cross_val_score

clf = KNeighborsClassifier(n_neighbors = 5)
X = X_fruits_2d.as_matrix()
y = y_fruits_2d.as_matrix()
cv_scores = cross_val_score(clf, X, y)

print('Cross-validation scores (3-fold):', cv_scores)
print('Mean cross-validation score (3-fold): {:.3f}'
     .format(np.mean(cv_scores)))

from sklearn.svm import SVC
from sklearn.model_selection import validation_curve

param_range = np.logspace(-3, 3, 4)
train_scores, test_scores = validation_curve(SVC(), X, y,
                                            param_name='gamma',
                                            param_range=param_range, cv=3)

# This code based on scikit-learn validation_plot example
#  See:  http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html
plt.figure()

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title('Validation Curve with SVM')
plt.xlabel('$\gamma$ (gamma)')
plt.ylabel('Score')
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label='Training score',
            color='darkorange', lw=lw)

plt.fill_between(param_range, train_scores_mean - train_scores_std,
                train_scores_mean + train_scores_std, alpha=0.2,
                color='darkorange', lw=lw)

plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',
            color='navy', lw=lw)

plt.fill_between(param_range, test_scores_mean - test_scores_std,
                test_scores_mean + test_scores_std, alpha=0.2,
                color='navy', lw=lw)

plt.legend(loc='best')
plt.show()

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split


iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)
clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

AttributeError: 'DataFrame' object has no attribute 'as_matrix'