# KNN ALGORITHM

In [48]:
#imports

import numpy as np
import pandas as pd
import math
from collections import Counter
from scipy.spatial.distance import cdist

from sklearn import datasets
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

### Loading the dataset

In [42]:
# Load test data
# Load test data
test_folder = '../data/test/'
test_files = os.listdir(test_folder)
test_df = pd.DataFrame()
for file in test_files:
    df = pd.read_csv(test_folder + file)
    test_df = pd.concat([test_df, df], axis=1)

# Load train data
train_folder = '../data/train/'
train_files = os.listdir(train_folder)
train_df = pd.DataFrame()
for file in train_files:
    df = pd.read_csv(train_folder + file)
    train_df = pd.concat([train_df, df], axis=1)


# Preprocess the data
def preprocess_data(df):
    # Drop duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Fill missing values
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in categorical_cols:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Encode categorical columns
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col])

    # Standardize numeric columns
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    return df

# Preprocess train and test data
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)

# Separate features and labels
X_train = train_df.drop(['attack_cat', 'label'], axis=1)
y_train = train_df['label']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

## KNN IMPLEMENTATION

In [53]:
class KNNClassifier:
    def __init__(self, k=3, metric="euclidean", weights="uniform", batch_size=1000):
        """
        Initialize the Memory-Efficient KNN Classifier.
        
        Parameters:
        -----------
        k : int, default=3
            Number of neighbors to use
        metric : str, default="euclidean"
            Distance metric to use
        weights : str, default="uniform"
            Weight function used in prediction
        batch_size : int, default=1000
            Number of test points to process in each batch
        """
        self.k = k
        self.metric = metric
        self.weights = weights
        self.batch_size = batch_size
        self.train_data = None
        self.train_labels = None
    
    def fit(self, train_data, train_labels):
        """
        Store the training data and labels.
        """
        self.train_data = np.asarray(train_data, dtype=np.float32)
        self.train_labels = np.asarray(train_labels)
        return self
    
    def _compute_batch_distances(self, test_batch):
        """
        Compute distances for a batch of test points.
        """
        if self.metric == 'euclidean':
            distances = cdist(test_batch, self.train_data, metric='euclidean')
        elif self.metric == 'manhattan':
            distances = cdist(test_batch, self.train_data, metric='cityblock')
        elif self.metric == 'minkowski':
            distances = cdist(test_batch, self.train_data, metric='minkowski')
        else:
            raise ValueError(f"Unsupported metric: {self.metric}")
        return distances
    
    def predict(self, test_points):
        """
        Predict labels for test points using batched processing.
        """
        # Validate that model has been trained
        if self.train_data is None:
            raise ValueError("Model has not been trained. Call 'fit' first.")
        
        # Convert test points to numpy array
        test_points = np.asarray(test_points, dtype=np.float32)
        
        # Prepare for batched prediction
        predictions = []
        
        # Process test points in batches
        for i in range(0, len(test_points), self.batch_size):
            # Get current batch
            test_batch = test_points[i:i+self.batch_size]
            
            # Compute distances for current batch
            distances = self._compute_batch_distances(test_batch)
            
            # Predict for each point in the batch
            batch_predictions = []
            for point_distances in distances:
                # Get indices of k nearest neighbors
                k_indices = np.argpartition(point_distances, self.k)[:self.k]
                
                # Get labels of k nearest neighbors
                k_labels = self.train_labels[k_indices]
                k_dist = point_distances[k_indices]
                
                # Apply weighting if specified
                if self.weights == 'distance':
                    # Avoid division by zero
                    weights = 1.0 / (k_dist + 1e-8)
                    unique_labels, _ = np.unique(k_labels, return_counts=True)
                    weighted_votes = []
                    for label in unique_labels:
                        label_mask = (k_labels == label)
                        weighted_vote = np.sum(weights[label_mask])
                        weighted_votes.append((label, weighted_vote))
                    prediction = max(weighted_votes, key=lambda x: x[1])[0]
                else:
                    # Uniform weighting (most common label)
                    prediction = Counter(k_labels).most_common(1)[0][0]
                
                batch_predictions.append(prediction)
            
            # Add batch predictions to overall predictions
            predictions.extend(batch_predictions)
        
        return np.array(predictions)
    
    def score(self, X_test, y_test):
        """
        Compute the accuracy of the classifier.
        """
        predictions = self.predict(X_test)
        return np.mean(predictions == y_test)
    
    def __repr__(self):
        return (f"MemoryEfficientKNNClassifier(k={self.k}, "
                f"metric='{self.metric}', "
                f"weights='{self.weights}', "
                f"batch_size={self.batch_size})")

In [None]:
# Train the model
knn = KNNClassifier(k=3, metric="euclidean", weights="distance", batch_size=500)
knn.fit(X_train, y_train)

# Evaluate the model
train_accuracy = knn.score(X_train, y_train)