# KNN ALGORITHM

In [7]:
#imports

import numpy as np
import pandas as pd
import math
from collections import Counter
from scipy.spatial.distance import cdist
import scipy.stats as stats
import multiprocessing

from sklearn import datasets
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

### Loading the dataset

In [2]:
def load_data_from_folder(folder_path):
    files = os.listdir(folder_path)
    
    df_list = []
    
    for file in files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        df_list.append(df)
    
    combined_df = df_list[0]
    
    for df in df_list[1:]:
        combined_df = pd.merge(combined_df, df, on='id', how='outer')
    
    return combined_df

test_folder = '../data/test/'
test_df = load_data_from_folder(test_folder)

train_folder = '../data/train/'
train_df = load_data_from_folder(train_folder)


In [3]:
def preprocess_data(train_df, test_size=0.3, random_state=42):
    X = train_df.drop(['attack_cat', 'label'], axis=1) 
    y = train_df['attack_cat'] 
    
    numeric_columns = X.select_dtypes(include=['number']).columns
    categorical_columns = X.select_dtypes(exclude=['number']).columns
    
    for col in numeric_columns:
        X[col] = X[col].fillna(X[col].median())
    
    for col in categorical_columns:
        X[col] = X[col].fillna(X[col].mode()[0])
    
    scaler = StandardScaler()
    X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
    
    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    
    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    return X_train_resampled, X_test, y_train_resampled, y_test, scaler, label_encoders

In [4]:
def preprocess_test_data(test_df, scaler, label_encoders):
    X_test = test_df.copy()
    
    numeric_columns = X_test.select_dtypes(include=['number']).columns
    categorical_columns = X_test.select_dtypes(exclude=['number']).columns
    
    for col in numeric_columns:
        X_test[col] = X_test[col].fillna(X_test[col].median())
    
    for col in categorical_columns:
        X_test[col] = X_test[col].fillna(X_test[col].mode()[0])

    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    for col in categorical_columns:
        if col in label_encoders:
            le = label_encoders[col]
            if 'unknown' not in le.classes_:
                le.classes_ = np.append(le.classes_, 'unknown')
            
            X_test[col] = X_test[col].apply(lambda x: x if x in le.classes_ else 'unknown')
            X_test[col] = le.transform(X_test[col])
    
    return X_test


## KNN IMPLEMENTATION

In [11]:
class ImprovedKNNClassifier:
    def __init__(self, k=3, metric="euclidean", weights="distance", 
                 batch_size=1000, n_jobs=-1, algorithm='auto'):
        """
        Enhanced Memory-Efficient KNN Classifier with multiple optimizations.
        
        Parameters:
        -----------
        k : int, default=3
            Number of neighbors to use
        metric : str, default="euclidean"
            Distance metric to use ('euclidean', 'manhattan', 'minkowski')
        weights : str, default="distance"
            Weight function used in prediction ('uniform' or 'distance')
        batch_size : int, default=1000
            Number of test points to process in each batch
        n_jobs : int, default=-1
            Number of parallel jobs (-1 uses all available cores)
        algorithm : str, default='auto'
            Algorithm for nearest neighbor search ('auto', 'brute', 'kd_tree', 'ball_tree')
        """
        self.k = k
        self.metric = metric
        self.weights = weights
        self.batch_size = batch_size
        self.n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count()
        self.algorithm = algorithm
        
        self.train_data = None
        self.train_labels = None
        self._kdtree = None
        self._balltree = None
    
    def fit(self, train_data, train_labels):

        self.train_data = np.asarray(train_data, dtype=np.float32, order='C')
        self.train_labels = np.asarray(train_labels)

        if self.algorithm in ['kd_tree', 'auto']:
            try:
                from sklearn.neighbors import KDTree
                self._kdtree = KDTree(self.train_data, metric=self.metric)
            except ImportError:
                self.algorithm = 'brute'
        
        if self.algorithm in ['ball_tree', 'auto']:
            try:
                from sklearn.neighbors import BallTree
                self._balltree = BallTree(self.train_data, metric=self.metric)
            except ImportError:
                self.algorithm = 'brute'
        
        return self
    
    def _parallel_distance_computation(self, test_batch):
        """
        Compute distances using parallel processing.
        """
        if self.algorithm == 'kd_tree' and self._kdtree:
            distances, indices = self._kdtree.query(test_batch, k=self.k)
            return distances, indices
        
        if self.algorithm == 'ball_tree' and self._balltree:
            distances, indices = self._balltree.query(test_batch, k=self.k)
            return distances, indices
        
        # Fallback to manual distance computation
        if self.metric == 'euclidean':
            distances = cdist(test_batch, self.train_data, metric='euclidean')
        elif self.metric == 'manhattan':
            distances = cdist(test_batch, self.train_data, metric='cityblock')
        elif self.metric == 'minkowski':
            distances = cdist(test_batch, self.train_data, metric='minkowski')
        else:
            raise ValueError(f"Unsupported metric: {self.metric}")
        
        # Get indices of k nearest neighbors for each point
        k_indices = np.argsort(distances, axis=1)[:, :self.k]
        k_distances = np.take_along_axis(distances, k_indices, axis=1)
        
        return k_distances, k_indices
    
    def _weighted_voting(self, k_labels, k_distances):
        """
        Advanced voting mechanism with sophisticated weight calculation.
        """
        if self.weights == 'uniform':
            # Simple majority voting
            return stats.mode(k_labels)[0][0]
        
        elif self.weights == 'distance':
            # Weighted voting with inverse distance
            inv_distances = 1.0 / (k_distances + 1e-8)
            normalized_weights = inv_distances / np.sum(inv_distances)
            
            # Compute weighted votes
            unique_labels = np.unique(k_labels)
            weighted_votes = []
            for label in unique_labels:
                label_mask = (k_labels == label)
                weighted_vote = np.sum(normalized_weights[label_mask])
                weighted_votes.append((label, weighted_vote))
            
            return max(weighted_votes, key=lambda x: x[1])[0]
    
    def predict(self, test_points):
        """
        Predict labels for test points with enhanced efficiency.
        """
        # Validate training data
        if self.train_data is None:
            raise ValueError("Model has not been trained. Call 'fit' first.")
        
        # Prepare test points
        test_points = np.asarray(test_points, dtype=np.float32)
        predictions = []
        
        # Process in batches with potential parallel processing
        for i in range(0, len(test_points), self.batch_size):
            test_batch = test_points[i:i+self.batch_size]
            
            # Compute distances and nearest neighbor indices
            k_distances, k_indices = self._parallel_distance_computation(test_batch)
            
            # Predict for each point in the batch
            batch_predictions = []
            for distances, indices in zip(k_distances, k_indices):
                # Get labels of k nearest neighbors
                k_labels = self.train_labels[indices]
                
                # Apply advanced voting
                prediction = self._weighted_voting(k_labels, distances)
                batch_predictions.append(prediction)
            
            predictions.extend(batch_predictions)
        
        return np.array(predictions)
    
    def score(self, X_test, y_test):
        """
        Compute the accuracy of the classifier.
        """
        predictions = self.predict(X_test)
        return np.mean(predictions == y_test)
    
    def __repr__(self):
        return (f"MemoryEfficientKNNClassifier(k={self.k}, "
                f"metric='{self.metric}', "
                f"weights='{self.weights}', "
                f"batch_size={self.batch_size})")
        
    def submit(self, X_test, filename='submission.csv'):
        """
        Make predictions on test data and save them to a CSV file.
        """
        predictions = self.predict(X_test)
        submission_df = pd.DataFrame({'id': test_df['id'], 'attack_cat': predictions})
        submission_df.to_csv(filename, index=False)
        return submission_df

In [12]:
X_train, X_test, y_train, y_test, scaler, label_encoders = preprocess_data(train_df)
X_test = preprocess_test_data(test_df, scaler, label_encoders)

knn = ImprovedKNNClassifier(k=5, metric='euclidean', weights='distance', batch_size=1000, n_jobs=-1, algorithm='auto')

knn.fit(X_train, y_train)

knn.submit(X_test, filename='submission.csv')

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
