In [11]:
# Import scikit learn
from sklearn.datasets import load_iris

In [12]:
# This database is about plants. There's 4 measurements of the plants.
iris = load_iris()

# Collect the data
X = iris.data
y = iris.target

In [13]:
# Each row represent a sample, the 4 values are measurements.
# The columns are called 'features'
# The rows are called 'samples'.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [14]:
# There are 3 classes. We see the encoded values of the classes
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [15]:
# Selects the K closest neighbors form a given point
# So we say: Take into account 'n' nearest neighbors
# From those 'n' 5 are dogs & 4 are cats, so the label it assigns is 'dogs'
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1)
model

In [19]:
# Here we are training the model with X data
model.fit(X,y)

# We are predicting with X data
y_model = model.predict(X)

# We calculate accuracy = Total positives / Total number(Positive + Negative)
from sklearn.metrics import accuracy_score
accuracy_score(y,y_model)

1.0

In [20]:
from sklearn.model_selection import train_test_split
# 50%
# Train test split is being fed with 50% of the data to train,
X1, X2, y1, y2 = train_test_split(X,
                                  y,
                                  random_state=0,
                                  train_size=0.5)

# Now the training data is not seing any of the testing data
model.fit(X1, y1)
y2_model = model.predict(X2)
accuracy_score(y2, y2_model)

0.9066666666666666

In [22]:
# Here we are the 'other' data to train & the 'other' data to test
model.fit(X2, y2)
y1_model = model.predict(X1)
accuracy_score(y1, y1_model)

# So as a result we get somewhat a 'cross-validation'
# The standard deviation gives us a notion of how different '0.906' and '0.96' really are

0.96

In [24]:
from sklearn.model_selection import cross_val_score
cross_val_score(model, X, y, cv=5)

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
kf = KFold(n_splits=4)
for train_index, validation_index in kf.split(X):
    print(validation_index)
    print(train_index)
    X_train, X_validation = X[train_index], X[validation_index]
    y_train, y_validation = y[train_index], y[validation_index]
    model.fit(X_train, y_train)
    y_model = model.predict(X_validation)
    print(accuracy_score(y_validation, y_model))


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]
[ 38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149]
1.0
[38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
 62 63 64 65 66 67 68 69 70 71 72 73 74 75]
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 1

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=0,
                                                    train_size=0.8)
from sklearn.model_selection import KFold
kf = KFold(n_splits=4)
best_acc = 0
for train_index, validation_index in kf.split(X_train):
    X_train2, X_validation = X[train_index], X[validation_index]
    y_train2, y_validation = y[train_index], y[validation_index]
    model.fit(X_train2, y_train2)
    y_model = model.predict(X_validation)
    acc = accuracy_score(y_validation, y_model)
    if acc > best_acc:
        X_best, y_best = X_train2, y_train2
        best_acc = acc

    print(acc)
    y_model = model.fit(X_best, y_best)
    y_model = model.predict(X_test)
    print(accuracy_score(y_model, y_test))


1.0


InvalidParameterError: The 'y_true' parameter of accuracy_score must be an array-like or a sparse matrix. Got KNeighborsClassifier(n_neighbors=1) instead.