In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset = load_wine()
data = dataset['data']
target = dataset['target'].reshape((-1, 1))
columns = dataset['feature_names']
columns.append('tg')
df = pd.DataFrame(np.hstack([data, target]), columns = columns)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,tg
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2.0


In [3]:
print(load_wine()['DESCR'])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [3]:
from sklearn.model_selection import train_test_split
data_train, data_test, label_train, label_test = train_test_split(data, target, test_size=0.2)
print(len(data_train),' samples in training data\n', len(data_test),' samples in test data\n', )

142  samples in training data
 36  samples in test data



In [4]:
from sklearn.metrics.pairwise import euclidean_distances

class myKNeighborsClassifier:
    
    def __init__(self, n_neighbors=5):
        self.k = n_neighbors
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y        
        return self
    
    def predict(self, X):
        r = euclidean_distances(X, self.X_train) # расстояния от тестовых точек u до точек учебного набора X
        ri = np.argsort(r) # номера соседей в порядке увеличения расстояния 
        nc = self.y_train[ ri.flatten() ].reshape(ri.shape) # номера классов для соседей в порядке увеличения расстояния 
        knc = nc[:,:self.k] # классы ближайших k соседей
        cntr = np.vstack([np.sum(knc==0, axis=1), np.sum(knc==1, axis=1), np.sum(knc==2, axis=1)]).T
        prediction = np.argmax(cntr, axis=1)
        return prediction
    

In [5]:
def my_k_fold(n, n_folds=3, shuffle=False):
    all_idx = np.arange(n)
    if shuffle:
        np.random.shuffle(all_idx)
    folds = np.array_split(all_idx, n_folds)
    res = []
    for k in range(n_folds):
        if k == 0:
            edu = np.hstack(folds[k+1:])
        elif k == n_folds - 1:
            edu = np.hstack(folds[:k])
        else:
            edu = np.hstack((np.hstack(folds[:k]), np.hstack(folds[k+1:])))
        val = folds[k]
        res.append((edu, val))
    return res

def my_cross_val_score(clf, X, y, cv=3):
    folds = my_k_fold(X.shape[0], n_folds=cv, shuffle=True)
    acc = []
    
    for fold in folds:
        X_train = X[fold[0]]
        y_train = y[fold[0]]
        X_test = X[fold[1]]
        y_test = y[fold[1]]

        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)

        accuracy = np.sum(y_predict == y_test.ravel()) / X_test.shape[0]
        acc.append(accuracy)
        
    return acc

In [6]:
scores = my_cross_val_score(myKNeighborsClassifier(7), data, target, cv=5)
print(scores, np.mean(scores))

[0.6666666666666666, 0.7222222222222222, 0.6944444444444444, 0.7714285714285715, 0.6571428571428571] 0.7023809523809523


In [50]:
n_sample = 5
x_new = np.zeros((n_sample, data.shape[1]))
for i in range(data.shape[1]):
    x_new[:, i] = np.random.uniform(np.min(data[:, i]), np.max(data[:, i]), n_sample)

In [51]:
clf = myKNeighborsClassifier().fit(data, target)
y_new = clf.predict(x_new)
print(y_new)

[0 2 2 1 2]


In [280]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [15]:
data_train, data_test, label_train, label_test = train_test_split(data, target, test_size=0.2)
print(len(data_train),' samples in training data\n', len(data_test),' samples in test data\n', )

142  samples in training data
 36  samples in test data



In [85]:
clf = KNeighborsClassifier().fit(data_train, label_train.ravel())

In [281]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, data, target.ravel(), cv=5)
print(scores, np.mean(scores))

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <__main__.myKNeighborsClassifier object at 0x000001F957D36C48> does not.

In [48]:
from sklearn.model_selection import ShuffleSplit
n_samples = data.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(clf, data, target.ravel(), cv=cv)

array([0.75925926, 0.75925926, 0.66666667, 0.7037037 , 0.64814815])

In [110]:
x_new

array([[1.34768680e+01, 1.11514724e+00, 1.40397625e+00, 2.58672983e+01,
        1.13754980e+02, 3.79111909e+00, 3.97901705e+00, 2.39364325e-01,
        1.72583055e+00, 9.85611500e+00, 6.41871976e-01, 3.55116656e+00,
        3.86748446e+02],
       [1.37355309e+01, 2.48510163e+00, 2.87086888e+00, 1.15012366e+01,
        1.47263636e+02, 2.01207054e+00, 4.22276178e-01, 1.91535826e-01,
        1.18556791e+00, 6.10310267e+00, 5.89489760e-01, 1.81124700e+00,
        7.58292811e+02],
       [1.32185231e+01, 2.42228029e+00, 2.25284583e+00, 1.14829519e+01,
        1.10230691e+02, 3.83912502e+00, 3.28450915e+00, 6.22670364e-01,
        1.14224517e+00, 4.95527970e+00, 1.10952267e+00, 2.31722383e+00,
        1.50767843e+03],
       [1.37464940e+01, 4.05677991e+00, 2.57147406e+00, 2.39754808e+01,
        1.01580393e+02, 1.96242649e+00, 3.30251371e+00, 2.45164417e-01,
        2.58072359e+00, 3.07081837e+00, 5.48421108e-01, 2.69957565e+00,
        6.19666769e+02],
       [1.24393820e+01, 5.21763580e+

In [124]:
idx = np.random.randint(data.shape[0], size=15)
x_test = data[idx, :]

y_test = clf.predict(x_test).flatten()
print(y_test)

y_tg = target[idx].flatten()
print(y_tg)

[0 1 1 1 0 0 1 0 0 0 1 2 0 0 2]
[0 1 1 1 1 0 1 1 2 0 1 2 2 0 1]


In [96]:
from sklearn.metrics.pairwise import euclidean_distances
r = euclidean_distances(data_test, data_train) # расстояния от тестовых точек u до точек учебного набора X

In [97]:
ri = np.argsort(r) # номера соседей в порядке увеличения расстояния 

In [98]:
nc = label_train[ ri.flatten() ].reshape(ri.shape) # номера классов для соседей в порядке увеличения расстояния 
nc

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 2, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [2, 0, 2, ..., 0, 0, 0]])

In [99]:
k=3
knc = nc[:,:k] # классы ближайших k соседей
knc

array([[1, 1, 1],
       [1, 1, 1],
       [1, 2, 1],
       [2, 1, 1],
       [2, 2, 0],
       [2, 2, 1],
       [0, 0, 0],
       [2, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [2, 1, 1],
       [0, 1, 0],
       [1, 1, 1],
       [0, 0, 0],
       [2, 2, 1],
       [0, 0, 0],
       [1, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 2, 1],
       [0, 2, 0],
       [1, 2, 2],
       [2, 1, 1],
       [0, 0, 0],
       [1, 2, 1],
       [0, 0, 2],
       [1, 1, 1],
       [1, 1, 1],
       [2, 0, 2]])

In [100]:
o = (knc.sum(axis=1) > k/2).astype(int) # какого класса больше в ближайших k соседях
o

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [101]:
label_test.ravel()

array([1, 2, 2, 1, 0, 2, 1, 2, 0, 0, 0, 2, 1, 1, 1, 1, 2, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 2, 1, 1, 0, 2, 2, 2, 1, 2])

In [102]:
scores = np.mean(o == label_test.ravel())
scores

0.6388888888888888

In [258]:
clf = myKNeighborsClassifier().fit(data_train, label_train)

In [259]:
y_pred = clf.predict(data_test)
y_pred

array([1, 0, 0, 2, 1, 1, 1, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 1, 0, 0, 2, 0,
       0, 1, 0, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0], dtype=int64)

In [260]:
scores = np.mean(y_pred == label_test.ravel())
scores

0.6944444444444444

In [224]:
clf = KNeighborsClassifier().fit(data_train, label_train.ravel())

In [225]:
y_pred = clf.predict(data_test)

In [226]:
scores = np.mean(y_pred == label_test.ravel())
scores

0.6944444444444444

In [346]:
def my_k_fold(n, n_folds=3, shuffle=False):
    all_idx = np.arange(n)
    if shuffle:
        np.random.shuffle(all_idx)
    folds = np.array_split(all_idx, n_folds)
    res = []
    for k in range(n_folds):
        if k == 0:
            edu = np.hstack(folds[k+1:])
        elif k == n_folds - 1:
            edu = np.hstack(folds[:k])
        else:
            edu = np.hstack((np.hstack(folds[:k]), np.hstack(folds[k+1:])))
        val = folds[k]
        res.append((edu, val))
    return res

def my_cross_val_score(clf, X, y, cv=3):
    folds = my_k_fold(X.shape[0], n_folds=cv, shuffle=True)
    acc = []
    
    for fold in folds:
        X_train = X[fold[0]]
        y_train = y[fold[0]]
        X_test = X[fold[1]]
        y_test = y[fold[1]]

        clf.fit(X_train, y_train)
        y_predict = clf.predict(X_test)

        accuracy = np.sum(y_predict == y_test.ravel()) / X_test.shape[0]
        acc.append(accuracy)
        
    return acc

In [263]:
my_k_fold(10, shuffle=True)

[(array([9, 1, 4, 0, 6, 5]), array([2, 7, 3, 8])),
 (array([2, 7, 3, 8, 0, 6, 5]), array([9, 1, 4])),
 (array([2, 7, 3, 8, 9, 1, 4]), array([0, 6, 5]))]

In [264]:
my_k_fold(10)

[(array([4, 5, 6, 7, 8, 9]), array([0, 1, 2, 3])),
 (array([0, 1, 2, 3, 7, 8, 9]), array([4, 5, 6])),
 (array([0, 1, 2, 3, 4, 5, 6]), array([7, 8, 9]))]

In [318]:
%%timeit
my_cross_val_score(myKNeighborsClassifier(), data, target)

1.44 ms ± 5.83 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [319]:
%%timeit
cross_val_score(KNeighborsClassifier(), data, target.ravel(), cv=3)

8.31 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [356]:
cross_val_score(KNeighborsClassifier(7), data, target.ravel(), cv=5)

array([0.66666667, 0.61111111, 0.61111111, 0.74285714, 0.77142857])