# Assignment #1: KNN Classifier


In [397]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [398]:
import numpy as np
import numpy.typing as npt
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 
  
# data (as pandas dataframes) 
features = wine_quality.data.features.copy()
# features = features.drop(["density", "pH", "chlorides"], axis=1)
targets = wine_quality.data.targets.copy()

# transform quality column into binary "good" and "bad"
targets["quality"] = np.where(targets["quality"] < 5, 0, 1)


features = features.to_numpy()
targets = targets.to_numpy()

print(features)
print(targets)

[[ 7.4   0.7   0.   ...  3.51  0.56  9.4 ]
 [ 7.8   0.88  0.   ...  3.2   0.68  9.8 ]
 [ 7.8   0.76  0.04 ...  3.26  0.65  9.8 ]
 ...
 [ 6.5   0.24  0.19 ...  2.99  0.46  9.4 ]
 [ 5.5   0.29  0.3  ...  3.34  0.38 12.8 ]
 [ 6.    0.21  0.38 ...  3.26  0.32 11.8 ]]
[[1]
 [1]
 [1]
 ...
 [1]
 [1]
 [1]]


### 1. Write a function to calculate and return the Minkowski distance with optional argument p defaulting to ‘p=2’ (Euclidean) of two vectors where a vector represents a data point.

Note: Minkowski Distance is the generalized form of distance calculations with p=1 representing Manhattan distance and p=2 representing Euclidean

In [399]:
def minkowski_dist(v1: np.ndarray, v2: np.ndarray, p: int=2):
    return np.sum((np.absolute(np.power(np.subtract(v1, v2), p))))**(1.0/p)

In [400]:
# minkowski_dist([1,3,5,7],[2,4,6,8], p=1)

In [401]:
# minkowski_dist([1,3,5,7],[2,4,6,8])

In [402]:
# minkowski_dist([1,3,5,7],[2,4,6,8], p=3)

In [403]:
# minkowski_dist([1,3,5,7],[2,4,6,8], p=4)

### 2. Write a function to calculate and return the accuracy of two vectors.

In [573]:
def accuracy(v1: np.ndarray, v2: np.ndarray):
    assert len(v1) == len(v2)
    return np.sum(v1 == v2)/np.size(v1)
    # return sum([X[i] == Y[i] for i in range(len(X))])/len(X)

In [405]:
# X = np.asarray(list(range(20)))
# Y = np.asarray(list(range(20)))
# Y[3], Y[10], X[6] = 10, 3, 2

In [406]:
# accuracy(X, Y)

### 3. Write three functions to compute: precision, recall and F1 score.

In [407]:
def precision(v1: np.ndarray, v2: np.ndarray):
    # true positive over predicted positive
    # precision measures how accurate your positive predictions are
    # which percentage of your positive predictions are correct
    # !! How many retrieved items are relevant?
    # true positives out of retrieved
    true_pos = 0
    false_pos = 0
    for i in range(len(v1)):
        if v1[i] and v2[i]:
            true_pos += 1
        if not v1[i] and v2[i]:
            false_pos += 1

    return 1.0 * true_pos / (true_pos + false_pos)

In [408]:
def recall(v1: np.ndarray, v2: np.ndarray):
    # true positive over real positive
    # recall measures how well you find all the actual positives
    # which percentage of actual positive samples were correctly classified
    # !! How many relevant items are retrieved?
    # true positives out of all actual positives
    true_pos = 0
    false_neg = 0
    
    for i in range(len(v1)):
        if v1[i] and v2[i]:
            true_pos += 1
        if v1[i] and not v2[i]:
            false_neg += 1
    return true_pos/(true_pos + false_neg)

In [409]:
def F1(v1: np.ndarray, v2: np.ndarray):
    pre = precision(v1, v2)
    rec = recall(v1, v2)
    
    return 2*(pre*rec)/(pre + rec)

In [410]:
# X = np.asarray([0, 0, 1, 1, 0, 0, 0, 1])
# Y = np.asarray([0, 0, 1, 0, 1, 0, 1, 0])
# print(accuracy(X, Y))
# print(precision(X, Y))
# print(recall(X, Y))
# print(F1(X, Y))

## 4. Write a function to compute the confusion matrix of two vectors.

In [411]:
def confusion_matrix(X: np.ndarray, Y: np.ndarray):
    true_neg = 0
    false_pos = 0
    false_neg = 0
    true_pos = 0
    
    for i in range(len(X)):
        if X[i] == Y[i] and not X[i]:
            true_neg += 1
        if X[i] != Y[i] and not X[i]:
            false_pos += 1
        if X[i] != Y[i] and X[i]:
            false_neg += 1
        if X[i] == Y[i] and X[i]:
            true_pos += 1
    return [[true_neg, false_pos], [false_neg, true_pos]]

In [412]:
# print(confusion_matrix(X, Y))

## 5. Write a function to generate the Receiver Operating Characteristic (ROC) curve.

In [413]:
def roc():
    pass

## 6. Write a function to compute area under curve (AUC) for the ROC curve.

In [414]:
def auc():
    pass

## 7. Write a function to generate the precision-recall curve.

In [415]:
def precision_recall():
    pass

## 8. Implement a KNN_Classifier model class. It should have the following three methods.

#### a) __init__(self,) It’s a standard python initialization function so we can instantiate the class. Just “pass” this.

#### b) fit(self, X, Y) This method simply needs to store the relevant values as instance variables.

#### c) predict(self, X,threshold=.5) This method will use the instance variables stored by the fit method.

In [565]:
class KNN_Classifier:
    def __init__(self, n_neighbors: int, weights: str="uniform", p: int=2) -> None:
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.p = p
        self.X_ = None
        self.Y_ = None

        
    def fit(self, X, Y) -> None:
        self.X_ = X
        self.Y_ = Y
    
    def predict(self, X: np.ndarray, threshold: float=.5) -> np.ndarray:
        probabilities = self.predict_proba(X)
        predictions = []
        for prob in probabilities:
            if prob >= threshold:
                predictions.append([1])
            else:
                predictions.append([0])
        return np.asarray(predictions)
    
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        probabilities = []
        # looping through every x we want to predict
        for x in X:
            distances = []
            # looping through training data rows
            for i in range(len(self.X_)):
                # find distance
                distance = minkowski_dist(x, self.X_[i], self.p)
                # weight points based on weight metric
                factor = 1 if self.weights == "distance" else 1.0/distance
                # add tuple of x, y, and distance
                distances.append((self.X_[i], self.Y_[i], factor*distance))
            # find the k nearest neighbors
            neighbors = sorted(distances, key=lambda tup: tup[2])[:self.n_neighbors]
            # calculate and store positive class probability
            probabilities.append(sum([n[1] for n in neighbors])/self.n_neighbors)
        return np.asarray(probabilities)
    
    def get_params(self):
        return {"n_neighbors": self.n_neighbors, "weights": self.weights, "p": self.p}
    
    def set_params(self, **params: dict) -> None:
        self.n_neighbors = params.get("n_neighbors", self.n_neighbors)
        self.weights = params.get("weights", self.weights)
        self.p = params.get("p", self.p)
    

In [566]:
def partition(feature: np.ndarray, target: np.ndarray, t: float, shuffle: bool=True) -> tuple:
    training_size = int(t*len(feature))
    samples = [(feature[i], target[i]) for i in range(len(feature))]
    if shuffle:
        p = np.random.permutation(len(feature))
        feature = feature[p]
        target = target[p]
    return (feature[:training_size], feature[training_size:], target[:training_size], target[training_size:])

In [418]:
# partition(np.asarray(([1,1,1,1], [2,2,2,2], [3,3,3,3])), np.asarray((1,2,3)), 1/3)

## 14. Use your “partition” function to split the data into 80% train and 20% test. 

In [567]:
features_training, features_test, target_training, target_test = partition(features, targets, .8)
print(len(features_training), len(features_test), len(target_training), len(target_test))

5197 1300 5197 1300


In [568]:
classifier = KNN_Classifier(n_neighbors=5, weights="distance", p=2)

In [569]:
classifier.fit(features_training, target_training)

In [570]:
classifier.get_params()

{'n_neighbors': 5, 'weights': 'distance', 'p': 2}

In [571]:
b = classifier.predict(features_test)

In [None]:
# accuracies = []
# classifier = KNN_Classifier(n_neighbors=5, weights="distance", p=2)
# for i in range(5):
#     features_training, features_test, target_training, target_test = partition(features, targets, .8)
#     classifier.fit(features_training, target_training)
#     prediction = classifier.predict(features_test)
#     accuracies.append(accuracy(prediction, target_test))
# print(accuracies)
# print(sum(accuracies)/len(accuracies))

In [578]:
# print(sum([0.9723076923076923, 0.9646153846153847, 0.9607692307692308, 0.9653846153846154, 0.9638461538461538])/5)

0.9653846153846153


In [579]:
# classifier = KNN_Classifier(n_neighbors=5, weights="distance", p=2)
# classifier.fit(features, targets)
# prediction = classifier.predict(features)
# print(accuracy(prediction, targets))


0.9645990457134062


In [580]:
# print(F1(prediction, targets))

0.981904012588513


In [581]:
# print(recall(prediction, targets))

0.9660938225731538


In [582]:
# print(precision(prediction, targets))

0.9982402815549513
