<a href="https://colab.research.google.com/github/enginpaksoy/ML/blob/main/KNN_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the CSV into a DataFrame (comma-separated, no rows skipped: the pandas defaults).
data = pd.read_csv('/content/sample_data/iris.csv')
# Plain NumPy view of the table; the code below treats the last column as the label.
# data_array2 is a second name for the SAME array object, not an independent copy.
data_array2 = data_array = data.to_numpy()

In [None]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance."""

    def __init__(self, data, k):
        """
        Initialize the KNN class with k neighbors.
        Args:
            data (numpy array): Data set; only its number of rows is stored (as data_size).
            k (int): Number of nearest neighbors. Must be a positive odd number.
        Raises:
            ValueError: If k is not a positive odd number.
        """
        # `k % 2 != 1` alone lets negative odd values through (-1 % 2 == 1 in Python),
        # which would turn the `[:k]` slice in predict() into "all but the last".
        if k < 1 or k % 2 != 1:
            raise ValueError("k must be a positive odd number")
        self.data_size = data.shape[0]
        self.k = k

    def fit(self, training_features, training_labels):
        """
        Store the training data.
        Args:
            training_features (numpy array): Features of the training data (labels excluded).
            training_labels (numpy array): Labels of the training data, one per feature row.
        """
        self.training_features = training_features
        self.training_labels = training_labels

    def predict(self, test_features):
        """
        Predict the labels for the test data.
        Args:
            test_features (numpy array): Features of the test data (labels excluded).
        Returns:
            numpy array: Predicted labels for the test data.
        """
        predictions = []
        for test_instance in test_features:
            # Distance from this test instance to every stored training instance.
            distances = [
                self.euclidean_distance(test_instance, training_instance)
                for training_instance in self.training_features
            ]

            # Indices of the k smallest distances. A stable sort keeps equally distant
            # training rows in their original order, so results are reproducible.
            # If k exceeds the number of training rows, every row takes part in the vote.
            nearest_neighbors = np.argsort(distances, kind="stable")[:self.k]

            # Majority vote. The dict keeps labels in nearest-first order and max()
            # returns the first maximal key, so a tie goes to the label that appears
            # closest to the test instance instead of depending on set ordering.
            votes = {}
            for neighbor_index in nearest_neighbors:
                label = self.training_labels[neighbor_index]
                votes[label] = votes.get(label, 0) + 1
            predictions.append(max(votes, key=votes.get))

        return np.array(predictions)

    def euclidean_distance(self, instance1, instance2):
        """
        Calculate the Euclidean distance between two instances.
        Args:
            instance1 (numpy array): Feature values of the first instance (no label).
            instance2 (numpy array): Feature values of the second instance (no label).
        Returns:
            float: The Euclidean distance between the two instances.
        """
        # Every element is a feature: callers strip the label column before fit()/predict(),
        # so nothing is dropped here. Converting to float also handles object-dtype rows.
        first = np.asarray(instance1, dtype=float)
        second = np.asarray(instance2, dtype=float)

        return np.sqrt(np.sum((first - second) ** 2))


In [None]:
class KNN_without_kFold(KNN):
    """KNN classifier evaluated on a single random hold-out split.

    Construction is inherited unchanged from KNN: KNN_without_kFold(data, k).
    """

    def train_test_split(self, data, test_size=0.3, random_state=None):
        """
        Split the data into training and test sets.
        Args:
            data (numpy array): Data to be split, one sample per row.
            test_size (float): Fraction of the rows placed in the test set.
            random_state (int, optional): Seed for a reproducible shuffle. When None
                (the default) NumPy's global random state is used.
        Returns:
            tuple: (training rows, test rows).
        """
        num_samples = data.shape[0]
        # int() truncates, so the test set never exceeds the requested fraction.
        num_test_samples = int(num_samples * test_size)

        if random_state is None:
            indices = np.random.permutation(num_samples)
        else:
            # A dedicated generator keeps the seed from affecting other random draws.
            indices = np.random.default_rng(random_state).permutation(num_samples)

        # The first num_test_samples shuffled row indices form the test set,
        # the rest form the training set. Both are kept so callers can report them.
        self.test_indices = indices[:num_test_samples]
        self.train_indices = indices[num_test_samples:]

        return data[self.train_indices], data[self.test_indices]



In [None]:
class KNN_with_KFold(KNN):
    """KNN classifier evaluated with k-fold cross validation over contiguous folds."""

    def __init__(self, data, k, kFold):
        """
        Args:
            data (numpy array): Data set; only its number of rows is stored by the base class.
            k (int): Number of nearest neighbors.
            kFold (int): Number of folds; must be at least 1.
        Raises:
            ValueError: If kFold is smaller than 1.
        """
        super().__init__(data, k)
        # Fail here rather than with a ZeroDivisionError inside train_test_split.
        if kFold < 1:
            raise ValueError("kFold must be at least 1")
        self.kFold = kFold

    def train_test_split(self, data, fold):
        """
        Select one fold as the test set and the remaining rows as the training set.

        Folds are contiguous slices in row order; the rows are not shuffled. The
        result is stored on the instance as train_features / train_labels /
        test_features / test_labels, with the last column taken as the label.
        Args:
            data (numpy array): Data to be split, one sample per row.
            fold (int): Zero-based index of the fold used for testing.
        """
        num_samples = data.shape[0]
        fold_size = num_samples // self.kFold
        # When the rows do not divide evenly, the first `remainder` folds get one extra row.
        remainder = num_samples % self.kFold

        test_start_idx = fold * fold_size + min(fold, remainder)
        test_end_idx = test_start_idx + fold_size + (1 if fold < remainder else 0)

        test_indices = np.arange(test_start_idx, test_end_idx)
        train_indices = np.concatenate(
            (np.arange(0, test_start_idx), np.arange(test_end_idx, num_samples))
        )

        self.train_features, self.train_labels = data[train_indices, :-1], data[train_indices, -1]
        self.test_features, self.test_labels = data[test_indices, :-1], data[test_indices, -1]

    def cross_validation(self, data):
        """
        Perform k-fold cross validation.

        Each fold is predicted by a model fitted on the other folds. The results are
        stored as self.predicted_label (one prediction per row of data, in row
        order) and self.accuracy (fraction of rows predicted correctly).
        Args:
            data (numpy array): Data to be split; the last column holds the labels.
        """
        predictions_list = []

        for fold in range(self.kFold):
            # Use the data passed in, not a module-level array.
            self.train_test_split(data, fold)
            self.fit(self.train_features, self.train_labels)
            # Folds are visited in row order, so the collected predictions line up with data's rows.
            predictions_list.extend(self.predict(self.test_features))

        predictions_list_array = np.array(predictions_list)
        all_real_labels = data[:, -1]

        self.predicted_label = predictions_list_array
        self.accuracy = np.mean(predictions_list_array == all_real_labels)

In [None]:
#### PREDICTING WITHOUT K-FOLDS METHOD ####
# Build a classifier that votes among the 5 nearest neighbours.
knn = KNN_without_kFold(data_array, k=5)

# Randomly hold out 30% of the rows for testing; the other 70% are used for training.
train_set, test_set = knn.train_test_split(data_array, test_size=0.3)

# Split each set into features and labels after the rows have been assigned:
# the label is the last column of a row, everything before it is a feature.
train_features = train_set[:, :-1]
train_labels = train_set[:, -1]
test_features = test_set[:, :-1]
test_labels = test_set[:, -1]

# Fit on the training rows, then classify the held-out rows.
knn.fit(train_features, train_labels)
predictions = knn.predict(test_features)

# The comparison yields True/False per test row; its mean is the share of correct predictions.
accuracy = np.mean(predictions == test_labels)
print(f'# Accuracy: {accuracy:.2f}', end="\n\n")

# Row indices that ended up in the training set.
print("# Train Indices: #", train_features.shape[0])
print(np.sort(knn.train_indices), end="\n\n")

# Row indices that ended up in the test set.
print("# Test Indices: #", test_features.shape[0])
print(np.sort(knn.test_indices), end="\n\n")

# List every test row whose prediction differs from its real label.
print("# Different predictions from the real ones in the test data: ", end="\n")
for row_index, real_label, predicted in zip(knn.test_indices, test_labels, predictions):
    if real_label != predicted:
        print(f'Indices: {row_index}, Test Labels: {real_label}, Predictions: {predicted}', end="\n")

In [None]:
#### PREDICTING WITH K-FOLDS METHOD ####
knnn = []         # One evaluated model per candidate k, in increasing-k order
predictions = []  # Not filled here; resets the name used by the hold-out cell
accuracy = []     # accuracy[j] is the cross-validated accuracy for k = 2*j + 1
best_k, max_accuracy = 0, 0
best_index = 0

# Evaluate every odd k from 1 up to the number of samples with 4-fold cross validation.
for i in range(1, data_array.shape[0] + 1, 2):
    model = KNN_with_KFold(data_array, k=i, kFold=4)
    model.cross_validation(data_array)
    accuracy.append(model.accuracy)
    knnn.append(model)
    # Strict '>' keeps the smallest k when several candidates reach the same accuracy.
    if model.accuracy > max_accuracy:
        best_k = i
        best_index = (i - 1) // 2
        max_accuracy = model.accuracy


# Print the accuracy of the best model and of the next two candidate k values.
for i in range(best_k, best_k + 5, 2):
    index = (i - 1) // 2  # Position of k = i in the accuracy list
    # Skip k values that were never evaluated: past the end of the list when the
    # best k is one of the last candidates, or before its start when no model won.
    if not 0 <= index < len(accuracy):
        continue
    print(f"# Accuracy for k = {i}: {accuracy[index]:.4f}", end='')
    if best_k == i:
        print(' # BEST #', end='')
    print()

In [None]:
# The best model followed by the models for the next two candidate k values.
# NOTE(review): raises IndexError when fewer than two models follow best_index in knnn.
best_3_models = [knnn[int(best_index) + offset] for offset in range(3)]

def print_matching_columns(best_3_models, data_size, real_labels=None):
    """
    Print every row on which at least one of three models predicted the wrong label.

    Each printed line holds the three predictions, the real label, a status text and
    the row index. Rows that all three models predicted correctly are not printed.
    Args:
        best_3_models (list): Three models exposing a `predicted_label` sequence.
        data_size (int): Number of rows to inspect (indices 0 .. data_size - 1).
        real_labels (sequence, optional): True label of each row. When omitted, the
            last column of the module-level `data_array` is used.
    Raises:
        ValueError: If fewer than three models are supplied.
    """
    if real_labels is None:
        real_labels = data_array[:, -1]

    models = list(best_3_models[:3])
    if len(models) != 3:
        raise ValueError("print_matching_columns needs three models")

    # Status text keyed by how many of the three models predicted the row correctly.
    # NOTE(review): the wording does not obviously match the counts (count == 0 means
    # all three models were wrong, yet it is reported as 'Inconsistent') - confirm
    # the intended labels before relying on this column.
    status_by_count = {
        2: 'Consistently correct',
        1: 'Consistently incorrect',
        0: 'Inconsistent',
    }

    for i in range(data_size):
        real = real_labels[i]
        guesses = [model.predicted_label[i] for model in models]
        # Plain equality, the same comparison the accuracy computation uses
        # (np.char.equal would ignore trailing whitespace).
        count = sum(1 for guess in guesses if guess == real)

        if count == 3:
            continue  # Every model was right: nothing to report for this row.

        columns = " ".join(f"{guess:>19}" for guess in guesses)
        print(f"{columns}  {real:>19} {status_by_count[count]:>25}  {i:>6}")


In [None]:
# Column header: one column per model (best k and the next two odd k values),
# followed by the real label, the status text and the row index.
header = f"          KNN-{best_k}               KNN-{best_k+2}              KNN-{best_k+4}              REAL-LABEL            STATUS             INDEX()"
print(header)
# Inspect as many rows as the first model recorded at construction time.
print_matching_columns(best_3_models, data_size=best_3_models[0].data_size)