# Classification and clustering using distances (Part 1)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

## $k$-nearest neighbors

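$k$-nearest neighbors is an algorithm for classifying data points based on a notion of distance: a new point is assigned the label that is most common among the $k$ labeled points closest to it.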

<font color = 'green'> __Distance/metric__: 

<font color = 'green'> __Euclidean metric__: 

In [None]:
# Load the pets table; the first CSV column becomes the row index.
# Later cells read its 'label', 'weight' and 'height' columns.
pets = pd.read_csv('pets.csv', index_col=0)

In [None]:
# One scatter series per class, each with its own marker:
# weight on the x-axis, height on the y-axis.
class_markers = (('cat', 'o'), ('dog', 's'), ('pony', '^'))
for species, marker in class_markers:
    members = pets.loc[pets['label'] == species]
    plt.plot(members['weight'], members['height'], marker, label=species)
plt.title('Pets by height, weight, class')
plt.xlabel('Weight (kg)')
plt.ylabel('Height (cm)')
plt.legend()

In [None]:
# Three unlabeled animals to classify, written out column by column.
new_pets = pd.DataFrame({
    'pet #': [31, 32, 33],
    'height': [30.2, 22.2, 25.8],
    'weight': [8.2, 5.2, 17.8],
})
new_pets

In [None]:
# Ponies are left out of this plot because clearly none of the new pets are ponies.
for species, marker in (('cat', 'o'), ('dog', 's')):
    members = pets.loc[pets['label'] == species]
    plt.plot(members['weight'], members['height'], marker, label=species)
# Overlay the unlabeled animals as stars.
plt.plot(new_pets['weight'], new_pets['height'], '*', label='unknown')
plt.title('Pets by height, weight, class')
plt.xlabel('Weight (kg)')
plt.ylabel('Height (cm)')
plt.legend()

<font color = 'green'> __Nearest neighbors:__ 

<font color = 'green'> __$k$-nearest neighbors algorithm:__ 

In [None]:
def get_nearest_neighbors(train, instance, features, k):
    """Return the ``k`` rows of ``train`` that lie closest to ``instance``.

    Closeness is the squared Euclidean distance over the ``features``
    columns. Squaring is monotonic, so the ranking is the same as for the
    true Euclidean distance and the square root can be skipped.

    Parameters
    ----------
    train : pandas.DataFrame
        Candidate rows; must contain every column named in ``features``.
    instance : pandas.Series
        The point to compare against; must contain every label in
        ``features``.
    features : list
        Column labels to measure distance on.
    k : int
        Number of rows to keep (applied as a slice ``[:k]``).

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``train``, nearest first. Rows at equal
        distance keep their original relative order.
    """
    # Work on plain float arrays: one vectorized subtraction replaces a
    # Python-level loop over rows, and positional selection below keeps the
    # function correct even when ``train`` has repeated index labels.
    train_values = train.loc[:, features].to_numpy(dtype=float)
    target = instance.loc[features].to_numpy(dtype=float)
    squared_distances = ((train_values - target) ** 2).sum(axis=1)
    # A stable sort preserves row order among ties, like Python's sorted().
    nearest = np.argsort(squared_distances, kind='stable')[:k]
    return train.iloc[nearest]

In [None]:
# Take the new pet at row label 1 and name the two columns distances are measured on.
instance = new_pets.loc[1]
features = ['weight', 'height']
# Show only the feature values of that pet.
instance.loc[features]

In [None]:
# The 5 rows of `pets` nearest to the new pet at row label 0.
get_nearest_neighbors(pets, new_pets.loc[0], features, 5)

In [None]:
def majority_vote(nearest_neighbors):
    """Return the most frequent value in ``nearest_neighbors['label']``.

    Parameters
    ----------
    nearest_neighbors : pandas.DataFrame
        Rows whose ``'label'`` column holds the votes.

    Returns
    -------
    The label with the highest count. When several labels share the highest
    count, the one that appears first in the ``'label'`` column wins.

    Raises
    ------
    IndexError
        If there are no labels to vote on.
    """
    # Imported here so the function does not depend on another cell.
    from collections import Counter

    # Count every label in a single pass instead of re-counting the whole
    # column for each element while sorting.
    votes = Counter(nearest_neighbors['label'])
    # most_common() orders equal counts by first appearance, which keeps the
    # tie-breaking rule described above.
    label, _count = votes.most_common(1)[0]
    return label

In [None]:
# Classify the new pet at row label 1 by majority vote among its 8 nearest rows of `pets`.
nn = get_nearest_neighbors(pets, new_pets.loc[1], features, 8)
majority_vote(nn)

## Parameter tuning: how to choose $k$

<font color = 'green'> __Hyperparameter__: 

<font color = 'green'> __Parameter tuning__: 

<font color = 'green'> __Training and test data__: 

In [None]:
# Data from a survey of Japanese forests
# Classes: 's': Sugi
#          'h': Hinoki
#          'd': mixed deciduous
#          'o': other non-forest land
# Only the first ten CSV columns are read. 'class' is the target column and
# the remaining columns are the features.
training = pd.read_csv('training.csv', usecols=range(10))
X = training.drop(columns='class')
y = training['class']
training

In [None]:
# Held-out data, read with the same first ten columns and split into
# features and target the same way as the training data.
test = pd.read_csv('testing.csv', usecols=range(10))
test_X = test.drop(columns='class')
test_y = test['class']

In [None]:
# Fit a 5-nearest-neighbors classifier on the training features and labels.
# weights='uniform' is scikit-learn's setting for counting every neighbor's vote equally.
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
clf.fit(X,y)

In [None]:
# Predict a class for every test row, then tabulate true vs. predicted classes.
# With (test_y, pred) in this order, scikit-learn puts the true classes on the
# rows and the predicted classes on the columns.
pred = clf.predict(test_X)
confusion_matrix(test_y, pred)

In [None]:
# Try different values of k, 1 through 12, recording for each one the
# fraction of test rows whose predicted class matches the true class.
k_values = range(1, 13)
acc = []
for k in k_values:
    clfr = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    clfr.fit(X, y)
    hits = clfr.predict(test_X) == test_y
    acc.append(hits.sum() / len(test_y))
plt.plot(k_values, acc, 'o')

<font color = 'green'> __Cross-validation__: 

<font color = 'green'> __$k$-fold cross-validation__: 

In [None]:
# 18-fold cross-validation
# Give every row a fold number from 1 to 18. Cycling the numbers over all
# len(y) rows keeps `folds` exactly as long as the data even when the row
# count is not a multiple of 18 (fold sizes then differ by at most one),
# so boolean masks built from `folds` always line up with the rows.
n_folds = 18
folds = np.arange(len(y)) % n_folds + 1
# Shuffle in place so fold membership does not follow the row order.
# The shuffle is unseeded, so the assignment differs from run to run.
np.random.shuffle(folds)
folds

In [None]:
# Try different values of k, 1 through 12. For each k, every fold in turn is
# held out, the classifier is fit on the remaining rows, and the accuracy on
# the held-out rows is recorded. The per-fold accuracies are then averaged.
mean_acc = []
for k in range(1, 13):
    acc = []
    for i in range(1, 19):
        held_out = folds == i
        fit_rows = training.loc[~held_out]
        check_rows = training.loc[held_out]
        X0, y0 = fit_rows.drop(columns='class'), fit_rows['class']
        X1, y1 = check_rows.drop(columns='class'), check_rows['class']
        clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
        clf.fit(X0, y0)
        acc.append((clf.predict(X1) == y1).sum() / len(y1))
    mean_acc.append(np.mean(acc))
plt.plot(range(1, 13), mean_acc, 'o')