In [368]:
# To access iris dataset
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Pandas, numpy, matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Counter to help with mode calculations
from collections import Counter

# Load Data

In [339]:
# Load iris data and store in dataframe

iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

In [340]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [341]:
# Separate X and y data

X = df.drop('target', axis=1)
y = df.target

# Functions / Code Outline

Things that need to happen:

1) Function to calculate distance between points

2) Calculate distance between a test point and all known data points

3) Sort distance measurements to see closest points (nearest neighbors)

4) Use labels of those closest points to predict label of test point

5) Perform steps 1 - 4 for all test data points

## 1) Calculate distance between two points (Minkowski distance)

In [350]:
# Calculate distance between two points

def minkowski_distance(a, b, p=1):
    
    # Store the number of dimensions
    dim = len(a)
    
    # Set initial distance to 0
    distance = 0
    
    # Calculate minkowski distance using parameter p
    for d in range(dim):
        distance += abs(a[d] - b[d])**p
        
    distance = distance**(1/p)
    
    return distance

# Test the function

minkowski_distance(a=X.iloc[0], b=X.iloc[1], p=1)

0.6999999999999993

## 2) Calculate distance between a test point and all known data points

In [343]:
# Define an arbitrary test point

test_pt = [4.8, 2.7, 2.5, 0.7]

In [351]:
# Calculate distance between test_pt and all points in X

distances = []

for i in X.index:
    
    distances.append(minkowski_distance(test_pt, X.iloc[i]))
    
df_dists = pd.DataFrame(data=distances, index=X.index, columns=['dist'])
df_dists.head()

Unnamed: 0,dist
0,2.7
1,2.0
2,2.3
3,2.1
4,2.7


## 3) Sort distance measurements to see the closest points (nearest neighbors)

In [354]:
# Limit to 5 nearest points for now

df_nn = df_dists.sort_values(by=['dist'], axis=0)[:5]
df_nn

Unnamed: 0,dist
98,1.4
57,1.5
93,1.7
24,1.8
30,1.8


## 4) Use labels of those closest points to predict label of test point

In [355]:
# Use df_nn.index to get labels for nearest neighbors

y[df_nn.index]

98    1
57    1
93    1
24    0
30    0
Name: target, dtype: int32

In [356]:
# Create counter object to track the labels

counter = Counter(y[nn.index])

# Get most common label of all the nearest neighbors

counter.most_common()[0][0]

1

## 5) Perform steps 1 - 4 for all test data points

In [366]:
# Split the data - 75% train, 25% test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                   random_state=42)

In [367]:
# Check shape of data after split

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [371]:
# Scale the X data

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [375]:
# Make predicts on the test data

y_hat_test = []

for test_point in X_test:
    
    for train_point in X_train:
        
    
    

[-1.01827123  1.2864604  -1.39338902 -1.3621769 ]
[-0.7730102   2.43545215 -1.33550342 -1.49647603]
[-0.03722712 -0.78172474  0.74837808  0.92090833]
[0.20803391 0.8268637  0.4010645  0.51801093]
[1.06644751 0.13746866 0.51683569 0.3837118 ]
[-0.52774918  1.97585545 -1.45127462 -1.09357864]
[-0.52774918  1.51625875 -1.33550342 -1.3621769 ]
[-0.40511866 -1.47111979 -0.06202028 -0.28778385]
[ 0.57592545 -0.55192639  0.74837808  0.3837118 ]
[0.69855596 0.13746866 0.97992047 0.7866092 ]
[ 0.94381699 -0.09232969  0.3431789   0.24941267]
[1.67960008 1.2864604  1.32723405 1.72670311]
[-0.15985763 -0.32212804  0.2274077   0.11511354]
[ 2.17012213 -0.09232969  1.61666204  1.18950659]
[-0.28248815 -0.09232969  0.4010645   0.3837118 ]
[-0.89564072  1.05666205 -1.39338902 -1.3621769 ]
[ 2.29275265 -0.55192639  1.67454764  1.05520746]
[-0.03722712 -0.78172474  0.16952211 -0.28778385]
[-0.7730102   0.8268637  -1.39338902 -1.3621769 ]
[-1.01827123  1.05666205 -1.45127462 -1.22787777]
[-0.89564072  1.