# KNN ALGORITHM

In [7]:
#imports

import numpy as np
import pandas as pd
import math
from collections import Counter
from scipy.spatial.distance import cdist
import scipy.stats as stats
import multiprocessing

from sklearn import datasets
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

### Loading the dataset

In [2]:
def load_data_from_folder(folder_path):
    """
    Read every data file in a folder and outer-merge them on the 'id' column.

    Parameters:
    -----------
    folder_path : str
        Directory containing the CSV files to combine. Each file must
        contain an 'id' column, which is used as the merge key.

    Returns:
    --------
    pandas.DataFrame
        The outer merge (on 'id') of all files, merged in sorted
        file-name order.

    Raises:
    -------
    FileNotFoundError
        If the folder contains no readable data files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the merged
    # column order reproducible (and identical across folders that hold the
    # same file names). Directories and hidden entries such as
    # '.ipynb_checkpoints' or '.DS_Store' are not data files and are skipped.
    file_names = sorted(
        name
        for name in os.listdir(folder_path)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not file_names:
        raise FileNotFoundError(f"No data files found in folder: {folder_path}")

    combined_df = None
    for name in file_names:
        df = pd.read_csv(os.path.join(folder_path, name))
        if combined_df is None:
            combined_df = df
        else:
            combined_df = pd.merge(combined_df, df, on='id', how='outer')

    return combined_df

# Locations of the two data splits; every file inside a folder is read with
# pandas and merged on 'id' by load_data_from_folder.
train_folder = '../data/train/'
test_folder = '../data/test/'

test_df = load_data_from_folder(test_folder)
train_df = load_data_from_folder(train_folder)


In [3]:
def preprocess_data(train_df, test_size=0.3, random_state=42):
    """
    Split the labelled data and prepare both splits for KNN.

    The data is split first; imputation values and the scaler are then
    learned from the training split only and applied to the hold-out split,
    so the hold-out rows do not leak into the preprocessing statistics.
    Finally the training split is oversampled with SMOTE.

    Parameters:
    -----------
    train_df : pandas.DataFrame
        Labelled data containing the 'attack_cat' (target) and 'label'
        columns; every other column is used as a feature.
    test_size : float, default=0.3
        Fraction of rows held out for evaluation.
    random_state : int, default=42
        Seed used for the stratified split and for SMOTE.

    Returns:
    --------
    tuple
        (X_train_resampled, X_test, y_train_resampled, y_test, scaler,
        label_encoders), where `scaler` is the fitted StandardScaler and
        `label_encoders` maps each categorical column name to its fitted
        LabelEncoder.
    """
    # NOTE(review): only 'attack_cat' and 'label' are removed, so an 'id'
    # column (if present) is kept as a feature -- confirm this is intended.
    X = train_df.drop(['attack_cat', 'label'], axis=1)
    y = train_df['attack_cat']

    numeric_columns = X.select_dtypes(include=['number']).columns
    categorical_columns = X.select_dtypes(exclude=['number']).columns

    # The stratified split depends only on y and random_state, so splitting
    # before preprocessing selects the same rows as splitting afterwards.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # Work on copies so the column assignments below never write into X.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Impute with statistics learned from the training split only.
    for col in numeric_columns:
        median = X_train[col].median()
        X_train[col] = X_train[col].fillna(median)
        X_test[col] = X_test[col].fillna(median)

    for col in categorical_columns:
        modes = X_train[col].mode()
        # mode() is empty when the whole column is missing; fall back to a
        # placeholder instead of failing on modes[0].
        fill_value = modes.iloc[0] if not modes.empty else 'unknown'
        X_train[col] = X_train[col].fillna(fill_value)
        X_test[col] = X_test[col].fillna(fill_value)

    scaler = StandardScaler()
    X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    label_encoders = {}
    for col in categorical_columns:
        le = LabelEncoder()
        # The encoder only needs the category vocabulary, so it is fit on
        # both splits; otherwise a category that appears only in the
        # hold-out split would make transform() fail.
        le.fit(pd.concat([X_train[col], X_test[col]]))
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        label_encoders[col] = le

    # Oversample the training split only; the hold-out split keeps its
    # original class distribution.
    smote = SMOTE(random_state=random_state)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    return X_train_resampled, X_test, y_train_resampled, y_test, scaler, label_encoders

In [4]:
def preprocess_test_data(test_df, scaler, label_encoders):
    """
    Apply the fitted scaler and label encoders to unlabelled data.

    Parameters:
    -----------
    test_df : pandas.DataFrame
        Raw feature data. It is copied, not modified.
    scaler : object
        Fitted scaler exposing `transform`; applied to all numeric columns.
    label_encoders : dict
        Maps categorical column names to fitted encoders exposing
        `classes_`. The encoders are not modified.

    Returns:
    --------
    pandas.DataFrame
        Copy of `test_df` with missing values filled, numeric columns
        scaled and the categorical columns found in `label_encoders`
        replaced by integer codes. Categories the encoder has never seen
        get the code of its 'unknown' class if it has one, otherwise
        `len(classes_)`.
    """
    X_test = test_df.copy()

    numeric_columns = X_test.select_dtypes(include=['number']).columns
    categorical_columns = X_test.select_dtypes(exclude=['number']).columns

    # NOTE(review): missing values are filled with statistics of the data
    # passed in here, not with the statistics of the data the scaler was
    # fit on -- confirm this is acceptable.
    for col in numeric_columns:
        X_test[col] = X_test[col].fillna(X_test[col].median())

    for col in categorical_columns:
        modes = X_test[col].mode()
        # mode() is empty for an all-missing column; leave it untouched
        # rather than failing (remaining gaps are encoded as unknown below).
        if not modes.empty:
            X_test[col] = X_test[col].fillna(modes.iloc[0])

    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

    for col in categorical_columns:
        if col not in label_encoders:
            continue

        classes = label_encoders[col].classes_
        # A dict gives O(1) lookups per row and yields the same codes as
        # appending 'unknown' to classes_ would, without mutating the
        # caller's encoder.
        code_of = {value: code for code, value in enumerate(classes)}
        unknown_code = code_of.get('unknown', len(classes))

        X_test[col] = np.array(
            [code_of.get(value, unknown_code) for value in X_test[col]],
            dtype=np.int64,
        )

    return X_test


## KNN IMPLEMENTATION

In [11]:
class KNNClassifier:
    """
    Brute-force k-nearest-neighbours classifier.

    Distances between test points and all stored training points are
    computed with scipy's `cdist`, one batch of test points at a time, and
    each test point is labelled by a vote among its k nearest neighbours.
    """

    def __init__(self, k=3, metric="euclidean", weights="uniform", batch_size=1000, p=2):
        """
        Initialize the KNN Classifier.

        Parameters:
        -----------
        k : int, default=3
            Number of neighbors to use. If k exceeds the number of training
            points, all training points are used.
        metric : str, default="euclidean"
            Distance metric to use: "euclidean", "manhattan" or "minkowski".
        weights : str, default="uniform"
            Weight function used in prediction. "distance" weights each
            neighbor by the inverse of its distance; any other value gives
            a plain majority vote.
        batch_size : int, default=1000
            Number of test points to process in each batch
        p : float, default=2
            Order of the norm for the "minkowski" metric (ignored by the
            other metrics). The default of 2 makes it equal to euclidean.
        """
        self.k = k
        self.metric = metric
        self.weights = weights
        self.batch_size = batch_size
        self.p = p
        self.train_data = None
        self.train_labels = None

    def fit(self, train_data, train_labels):
        """
        Store the training data and labels.

        Raises ValueError if the number of samples and labels differ.
        Returns self so calls can be chained.
        """
        train_data = np.asarray(train_data, dtype=np.float32)
        train_labels = np.asarray(train_labels)
        if len(train_data) != len(train_labels):
            raise ValueError(
                f"train_data has {len(train_data)} samples but "
                f"train_labels has {len(train_labels)}"
            )
        self.train_data = train_data
        self.train_labels = train_labels
        return self

    def _compute_batch_distances(self, test_batch):
        """
        Compute distances for a batch of test points.

        Returns an array with one row per test point and one column per
        training point. Raises ValueError for an unsupported metric.
        """
        if self.metric == 'euclidean':
            return cdist(test_batch, self.train_data, metric='euclidean')
        if self.metric == 'manhattan':
            return cdist(test_batch, self.train_data, metric='cityblock')
        if self.metric == 'minkowski':
            return cdist(test_batch, self.train_data, metric='minkowski', p=self.p)
        raise ValueError(f"Unsupported metric: {self.metric}")

    def _vote(self, k_labels, k_dist):
        """Return the winning label among neighbours ordered nearest-first."""
        if self.weights == 'distance':
            # The small constant avoids division by zero for exact matches.
            weights = 1.0 / (k_dist + 1e-8)
            unique_labels = np.unique(k_labels)
            totals = [np.sum(weights[k_labels == label]) for label in unique_labels]
            return unique_labels[int(np.argmax(totals))]
        # Uniform weighting (most common label). Counter keeps insertion
        # order, so a tie goes to the label whose neighbour is nearest.
        return Counter(k_labels).most_common(1)[0][0]

    def predict(self, test_points):
        """
        Predict labels for test points.

        Parameters:
        -----------
        test_points : array-like
            Two-dimensional collection of points, one row per point.

        Returns:
        --------
        numpy.ndarray
            One predicted label per test point.

        Raises:
        -------
        ValueError
            If the model has not been fitted, the training set is empty,
            k is smaller than 1, or the metric is unsupported.
        """
        if self.train_data is None:
            raise ValueError("Model has not been trained. Call 'fit' first.")
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")

        n_train = len(self.train_data)
        if n_train == 0:
            raise ValueError("Cannot predict with an empty training set.")

        # argpartition requires kth < n_train, so never ask for more
        # neighbours than there are training points.
        k = min(self.k, n_train)

        test_points = np.asarray(test_points, dtype=np.float32)

        predictions = []

        # Process test points in batches to bound the size of the
        # (batch x n_train) distance matrix.
        for start in range(0, len(test_points), self.batch_size):
            test_batch = test_points[start:start + self.batch_size]
            distances = self._compute_batch_distances(test_batch)

            # Indices of the k nearest neighbours of every point in the batch
            # (kth = k - 1 puts the k smallest distances in the first k slots).
            neighbor_idx = np.argpartition(distances, k - 1, axis=1)[:, :k]
            neighbor_dist = np.take_along_axis(distances, neighbor_idx, axis=1)

            # argpartition leaves those k in unspecified order; sort them
            # nearest-first so vote tie-breaking is well defined.
            order = np.argsort(neighbor_dist, axis=1, kind='stable')
            neighbor_idx = np.take_along_axis(neighbor_idx, order, axis=1)
            neighbor_dist = np.take_along_axis(neighbor_dist, order, axis=1)

            for row_idx, row_dist in zip(neighbor_idx, neighbor_dist):
                predictions.append(self._vote(self.train_labels[row_idx], row_dist))

        return np.array(predictions)

    def score(self, X_test, y_test):
        """Return the fraction of points in X_test predicted as y_test."""
        predictions = self.predict(X_test)
        return np.mean(predictions == np.asarray(y_test))

    def submit(self, X_test, filename='submission.csv', ids=None):
        """
        Predict X_test and write an 'id' / 'attack_cat' CSV file.

        Parameters:
        -----------
        X_test : array-like
            Points to predict.
        filename : str, default='submission.csv'
            Path of the CSV file to write.
        ids : array-like, optional
            Identifiers matching the rows of X_test. When omitted, the 'id'
            column of the module-level `test_df` is used.

        Returns:
        --------
        pandas.DataFrame
            The submission table that was written.
        """
        predictions = self.predict(X_test)
        if ids is None:
            # Backward-compatible default: relies on the global test_df.
            ids = test_df['id']
        submission_df = pd.DataFrame({'id': ids, 'attack_cat': predictions})
        submission_df.to_csv(filename, index=False)
        return submission_df

    def __repr__(self):
        # p is only shown when it differs from its default.
        extra = f", p={self.p}" if self.p != 2 else ""
        return (f"KNNClassifier(k={self.k}, "
                f"metric='{self.metric}', "
                f"weights='{self.weights}', "
                f"batch_size={self.batch_size}{extra})")
        

## Perbandingan metric dan weights yang digunakan

In [None]:
# Evaluate the custom classifier (k=3) on the hold-out split for every
# combination of distance metric and weighting scheme.
X_train, X_test, y_train, y_test, scaler, label_encoders = preprocess_data(train_df)

for metric, weights in [
    (m, w)
    for m in ('euclidean', 'manhattan', 'minkowski')
    for w in ('uniform', 'distance')
]:
    # fit() returns the classifier itself, so construction and fitting chain.
    knn = KNNClassifier(k=3, metric=metric, weights=weights).fit(X_train, y_train)
    print(f"Metric: {metric}, Weights: {weights}, Accuracy: {knn.score(X_test, y_test)}")

## Perbandingan dengan library

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Reference run: scikit-learn's KNeighborsClassifier with k=3, euclidean
# distance and uniform weights, scored on the hold-out split.
X_train, X_test, y_train, y_test, scaler, label_encoders = preprocess_data(train_df)

knn_lib = KNeighborsClassifier(
    n_neighbors=3,
    metric='euclidean',
    weights='uniform',
).fit(X_train, y_train)

accuracy = knn_lib.score(X_test, y_test)
print(f"Library KNN Accuracy: {accuracy}")