In [9]:
# Basic imports
# from __future__ import print_function
import numpy as np

In [33]:
COLOUR_FIGURE = False

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap

In [34]:
# This function was called ``learn_model`` in the first edition
def fit_model(k, features, labels):
    '''Learn a k-nn model'''
    # There is no model in k-nn, just a copy of the inputs
    return k, features.copy(), labels.copy()


def plurality(xs):
    '''Find the most common element in a collection'''
    from collections import defaultdict
    counts = defaultdict(int)
    for x in xs:
        counts[x] += 1
    maxv = max(counts.values())
    for k, v in counts.items():
        if v == maxv:
            return k

# This function was called ``apply_model`` in the first edition
def predict(model, features):
    '''Apply k-nn model'''
    k, train_feats, labels = model
    results = []
    for f in features:
        label_dist = []
        # Compute all distances:
        for t, ell in zip(train_feats, labels):
            label_dist.append((np.linalg.norm(f - t), ell))
        label_dist.sort(key=lambda d_ell: d_ell[0])
        label_dist = label_dist[:k]
        results.append(plurality([ell for _, ell in label_dist]))
    return np.array(results)


def accuracy(features, labels, model):
    preds = predict(model, features)
    return np.mean(preds == labels)


In [35]:
def load_dataset(dataset_name):
    '''
    data,labels = load_dataset(dataset_name)

    Load a given dataset

    Returns
    -------
    data : numpy ndarray
    labels : list of str
    '''
    data = []
    labels = []
    with open('./data/{0}.tsv'.format(dataset_name)) as ifile:
        for line in ifile:
            tokens = line.strip().split('\t')
            data.append([float(tk) for tk in tokens[:-1]])
            labels.append(tokens[-1])
    data = np.array(data)
    labels = np.array(labels)
    return data, labels

In [37]:
feature_names = [
    'area',
    'perimeter',
    'compactness',
    'length of kernel',
    'width of kernel',
    'asymmetry coefficien',
    'length of kernel groove',
]

In [39]:
def plot_decision(features, labels):
    '''Plots decision boundary for KNN

    Parameters
    ----------
    features : ndarray
    labels : sequence

    Returns
    -------
    fig : Matplotlib Figure
    ax  : Matplotlib Axes
    '''
    y0, y1 = features[:, 2].min() * .9, features[:, 2].max() * 1.1
    x0, x1 = features[:, 0].min() * .9, features[:, 0].max() * 1.1
    X = np.linspace(x0, x1, 100)
    Y = np.linspace(y0, y1, 100)
    X, Y = np.meshgrid(X, Y)

    model = fit_model(1, features[:, (0, 2)], np.array(labels))
    C = predict(
        np.vstack([X.ravel(), Y.ravel()]).T, model).reshape(X.shape)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .6, .6), (.6, 1., .6), (.6, .6, 1.)])
    else:
        cmap = ListedColormap([(1., 1., 1.), (.2, .2, .2), (.6, .6, .6)])
    fig,ax = plt.subplots()
    ax.set_xlim(x0, x1)
    ax.set_ylim(y0, y1)
    ax.set_xlabel(feature_names[0])
    ax.set_ylabel(feature_names[2])
    ax.pcolormesh(X, Y, C, cmap=cmap)
    if COLOUR_FIGURE:
        cmap = ListedColormap([(1., .0, .0), (.0, 1., .0), (.0, .0, 1.)])
        ax.scatter(features[:, 0], features[:, 2], c=labels, cmap=cmap)
    else:
        for lab, ma in zip(range(3), "Do^"):
            ax.plot(features[labels == lab, 0], features[
                     labels == lab, 2], ma, c=(1., 1., 1.))
    return fig,ax


features, labels = load_dataset('seeds')
names = sorted(set(labels))
labels = np.array([names.index(ell) for ell in labels])

fig,ax = plot_decision(features, labels)
fig.savefig('figure4.png')

features -= features.mean(0)
features /= features.std(0)
fig,ax = plot_decision(features, labels)
fig.savefig('figure5.png')

ValueError: too many values to unpack (expected 3)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
features, labels = load_dataset('seeds')
classifier = KNeighborsClassifier(n_neighbors=4)

In [28]:
n = len(features)
print(n)
correct = 0.0
for ei in range(n):
    training = np.ones(n, bool)
    
    training[ei] = 0
    testing = ~training
    
    classifier.fit(features[training], labels[training])
    pred = classifier.predict(features[ei])
    correct += (pred == labels[ei])
print('Result of leave-one-out: {}'.format(correct/n))

210


ValueError: Expected 2D array, got 1D array instead:
array=[15.26  14.84   0.871  5.763  3.312  2.221  5.22 ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [6]:
from sklearn.cross_validation import KFold
kf = KFold(len(features),n_folds = 5, shuffle=True)
# 'means' - список усредненных верностей (по одной для каждой группы)
means = []
for training,testing in kf:
    # Обучаем модель на этой группе, затем применяем ее к
    # тестовым дынным с помощью метода 'predict':
    classifier.fit(features[training], labels[training])
    prediction = classifier.predict(features[testing])
    
    # метод np.mean, примененный к массиву булевых величин,
    # возвращает долю правильных решений для данной группы
    curmean = np.mean(prediction == labels[testing])
    means.append(curmean)
    print("Средняя верность: {:.1%}".format(np.mean(means)))

ModuleNotFoundError: No module named 'sklearn.cross_validation'