KNN
Import packages and load data

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target


Split data into training and testing

In [2]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% testing


Initialize KNN classifier

In [3]:
# Initialize KNN classifier
# Number of neighbors is often set to 5 as a default, but it's a hyperparameter you can tune
knn = KNeighborsClassifier(n_neighbors=5)


Train model

In [4]:
# Train the model using the training set
knn.fit(X_train, y_train)


In [5]:
# Predict on the test set
y_pred = knn.predict(X_test)


Validation


In [6]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Detailed classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9777777777777777
Confusion Matrix:
 [[18  0  0]
 [ 0 13  1]
 [ 0  0 13]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      0.93      0.96        14
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



What can you play with the parameter in the KNeighboursClassifier?

What is covered in the lecture?
1. Distance metric (which distance formula to be used?)
2. Any weight (further away point less important)
3. Algorithm (did NOT cover in the lecture, you can explore it by yourself)

In [13]:
# Power of the Minkowski distance, 1 means Manhattan distance (l1), 2 means Euclidean distance (L2), and you can put any integer
#knn = KNeighborsClassifier(n_neighbors=5, p=1)

#####################################################
## For distance metric, you can choose by changing metric, default = 'minkowski'
#knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
#####################################################
# Let's try to use cosine similarity as a measure of the distance (that is to use 1-cosine similarity)
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
# Create a NearestNeighbors instance with cosine distance metric
nearest_neighbors = NearestNeighbors(metric='cosine')

# Fit the model using X_train
nearest_neighbors.fit(X_train)

# Find the K nearest neighbors of the X_test points, including their distances
distances, indices = nearest_neighbors.kneighbors(X_test, n_neighbors=3)

# Convert cosine similarity to cosine distance for the training set
cosine_distance_matrix = pairwise_distances(X_train, X_train, metric='cosine')

# Train a KNeighborsClassifier with the precomputed cosine distance matrix
knn = KNeighborsClassifier(metric='precomputed', n_neighbors=5)

# Fit the classifier
knn.fit(cosine_distance_matrix, y_train)

# Compute the distance matrix between the test set and the training set
test_distance_matrix = pairwise_distances(X_test, X_train, metric='cosine')

# Use the distance matrix to make predictions
predictions = knn.predict(test_distance_matrix)

# You can now evaluate the predictions as needed, for example:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

######################################################
# weights default is 'uniform', and you can choose 'distance' or even you can define another callable function
#knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

# Algorithm is not covered in the lecture, you can choose
# 'auto' (default), 'ball_tree', 'kd_tree', 'brute'
#knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto')

# you also have n_jobs here, same as Random Forest, for parallel cores to run the program


Accuracy: 0.9777777777777777
