<a href="https://colab.research.google.com/github/enginpaksoy/ML/blob/main/KNN2%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd

In [7]:
class KNN:
    def __init__(self, k):
        """
        Initialize the KNN class with k neighbors.
        Args:
            k (int): Number of nearest neighbors.
        """
        self.k = k
        if(k % 2 != 1):
            raise ValueError("k must be an odd number")

    def fit(self, training_features, training_labels):
        """
        Store the training data.
        Args:
            training_features (numpy array): Features of the training data.
            training_labels (numpy array): Labels of the training data.
        """
        self.training_features = training_features
        self.training_labels = training_labels

    def predict(self, test_features):
        """
        Predict the labels for the test data.
        Args:
            test_features (numpy array): Features of the test data.
        Returns:
            numpy array: Predicted labels for the test data.
        """
        predictions = []
        for test_instance in test_features:
            distances = []
            # Calculate distance from the test instance to all training instances
            for training_instance in self.training_features:
                distances.append(self.euclidean_distance(test_instance, training_instance))

            # Find the k nearest neighbors
            nearest_neighbors = np.argsort(distances)[:self.k]
            # records the indices with the lowest distance between them

            nearest_labels = [self.training_labels[i] for i in nearest_neighbors]
            # nearest_neighbors contains indices of the nearest neighbors
            # Create a list of labels for these nearest neighbors by looking up the indices in self.training_labels

            # Determine the most common label (voting)
            predicted_label = max(set(nearest_labels), key=nearest_labels.count)
            predictions.append(predicted_label)

        return np.array(predictions)

    def train_test_split(self, data, test_size=0.3):
        """
        Split the data into training and test sets.
        Args:
            data (numpy array): Data to be split.
            test_size (float): Size of the test set.
        Returns:
            tuple: Training and test sets.
        """
        num_samples = data.shape[0]  # Gets the number of samples in the data array.
        num_test_samples = int(num_samples * test_size)  # Determines the size of the test set.
        indices = np.random.permutation(num_samples)  # Creates an array of length num_samples and randomizes the elements of this array.
        test_indices = indices[:num_test_samples]  # Takes the first n(length num_samples) indices for the test set.
        train_indices = indices[num_test_samples:]  # Takes the remaining indices for the training set.
        self.test_indices = test_indices
        self.train_indices = train_indices
        return data[train_indices], data[test_indices]  # Returns the training and test sets.

    def kFold_cross_validation(self, data, kFold):
        """
        Perform k-fold cross validation.
        Args:
            data (numpy array): Data to be split.
            k-fold (int): Number of folds.
        Returns:
            tuple: Training and test sets.
        """
        num_samples = data.shape[0]  # Gets the number of samples in the data array.
        fold_size = int(num_samples // k)  # Determines the size of each fold.
        accuracy_scores = []

        for fold in range(k):
            test_indices = np.arange(fold * fold_size, (fold + 1) * fold_size)  # Creates an array of indices for the test set.
            train_indices = np.concatenate((np.arange(0, fold * fold_size), np.arange((fold + 1) * fold_size, len(data))))  # Creates an array of indices for the training set.
            train_features, train_labels = data[train_indices], labels[train_indices]
            test_features, test_labels = data[test_indices], labels[test_indices]

            self.fit(train_features, train_labels)
            predictions = self.predict(test_features)
            accuracy = np.mean(predictions == test_labels)
            accuracy_scores.append(accuracy)

        self.predictions = predictions

        return np.mean(accuracy_scores), accuracy_scores

    def euclidean_distance(self, instance1, instance2):
        """
        Calculate the Euclidean distance between two instances.
        Args:
            instance1 (numpy array): The first instance.
            instance2 (numpy array): The second instance.
        Returns:
            float: The Euclidean distance between the two instances.
        """
        instance1_new = np.array(instance1[:-1])
        instance2_new = np.array(instance2[:-1])
        # The code creates new arrays from instance1 and instance2, excluding their last elements.

        return np.sqrt(np.sum((instance1_new - instance2_new) ** 2))


In [10]:
data = pd.read_csv('/content/sample_data/iris.csv', sep=',', skiprows=0)
data_array = data.to_numpy()

In [19]:
#### PREDICTING WITHOUT K-FOLDS METHOD ####
# Create a KNN model with k=5 neighbors
knn = KNN(k=5)

# Seperating training data and testing data
# 30% of the data is reserved for testing, 70% is reserved for training
train_set, test_set = knn.train_test_split(data_array, test_size=0.3)

# Separate features and labels
# [:, :-1]  # All rows, all columns except the last
# [:, -1]   # All rows, only the last column
train_features, train_labels = train_set[:, :-1], train_set[:, -1]
test_features, test_labels = test_set[:, :-1], test_set[:, -1]

# Train the model
knn.fit(train_features, train_labels)

# Predict the labels for the test set
predictions = knn.predict(test_features)

# Evaluate the accuracy
accuracy = np.mean(predictions == test_labels)
# If the value is equal it gives 1, if not it gives 0. and creates an array from this data and takes the average of this array

print(f'# Accuracy: {accuracy:.2f}', end = "\n\n")

# Which indices are reserved for training
print("# Train Indices: #", train_features.shape[0])
print(np.sort(knn.train_indices), end = "\n\n")

# Which indices are reserved for testing
print("# Test Indices: #", test_features.shape[0])
print(np.sort(knn.test_indices), end = "\n\n")

# Different predictions from the real ones in the test data
print("# Different predictions from the real ones in the test data: ", end="\n")
for i in range(test_features.shape[0]):
    if(test_labels[i] != predictions[i]):
        print(f'Indices: {knn.test_indices[i]}, Test Labels: {test_labels[i]}, Predictions: {predictions[i]}', end = "\n")

# Accuracy: 0.98

# Train Indices: # 105
[  0   1   2   3   4   5   9  11  15  16  17  18  19  20  22  24  27  28
  29  30  31  32  34  36  37  41  42  43  44  46  47  48  49  51  53  54
  56  57  58  59  61  64  65  67  68  69  70  73  76  77  78  79  80  81
  82  83  84  87  90  91  92  93  94  95  96  97  99 100 101 104 105 106
 107 108 111 112 113 114 115 116 117 119 120 121 123 124 125 126 127 128
 130 131 133 134 135 137 138 139 140 142 143 144 145 147 148]

# Test Indices: # 45
[  6   7   8  10  12  13  14  21  23  25  26  33  35  38  39  40  45  50
  52  55  60  62  63  66  71  72  74  75  85  86  88  89  98 102 103 109
 110 118 122 129 132 136 141 146 149]

# Different predictions from the real ones in the test data: 
Indices: 72, Test Labels: Iris-versicolor, Predictions: Iris-virginica


In [None]:
#### PREDICTING WITH K-FOLDS METHOD ####


In [22]:
knnn = {}
predictions = {}
accuracy = {}
best_k, max_accuracy = 0, 0

for i in range(1, 151, 2):
    knnn[i] = KNN(k=i)
    knnn[i].fit(train_features, train_labels)
    predictions[i] = knnn[i].predict(test_features)
    accuracy[i] = np.mean(predictions[i] == test_labels)

    if(accuracy[i] > max_accuracy):
        max_accuracy = accuracy[i]
        best_k = i

    print(f'# Accuracy for k = {i}: {accuracy[i]:.4f}')

print(f'Best k: {best_k}, Max Accuracy: {max_accuracy:.2f}')

# Accuracy for k = 1: 0.9556
# Accuracy for k = 3: 0.9778
# Accuracy for k = 5: 0.9778
# Accuracy for k = 7: 0.9556
# Accuracy for k = 9: 0.9778
# Accuracy for k = 11: 0.9778
# Accuracy for k = 13: 0.9778
# Accuracy for k = 15: 0.9333
# Accuracy for k = 17: 0.9556
# Accuracy for k = 19: 0.9333
# Accuracy for k = 21: 0.9333
# Accuracy for k = 23: 0.9333
# Accuracy for k = 25: 0.9111
# Accuracy for k = 27: 0.9111
# Accuracy for k = 29: 0.9111
# Accuracy for k = 31: 0.9111
# Accuracy for k = 33: 0.9111
# Accuracy for k = 35: 0.9111
# Accuracy for k = 37: 0.9111
# Accuracy for k = 39: 0.9111
# Accuracy for k = 41: 0.9111
# Accuracy for k = 43: 0.9111
# Accuracy for k = 45: 0.9111
# Accuracy for k = 47: 0.9111
# Accuracy for k = 49: 0.8889
# Accuracy for k = 51: 0.8889
# Accuracy for k = 53: 0.8889
# Accuracy for k = 55: 0.8889
# Accuracy for k = 57: 0.8889
# Accuracy for k = 59: 0.8889
# Accuracy for k = 61: 0.8889
# Accuracy for k = 63: 0.9111
# Accuracy for k = 65: 0.8667
# Accuracy for 