In [19]:
import numpy as np
import pandas as pd

In [20]:
# Define the KNN class
class KNN:
    def __init__(self, k=5, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        probabilities = []

        distances = self.compute_distance(X, self.X_train)

        # Iterate over each sample's distances
        for dist in distances:
            k_indices = np.argsort(dist)[:self.k]
            k_nearest_labels = self.y_train[k_indices].astype(int)

            prob_class_1 = np.sum(k_nearest_labels == 1) / self.k
            probabilities.append(prob_class_1)

        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1[:, np.newaxis] - X2)**2, axis=2))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1[:, np.newaxis] - X2), axis=2)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

In [21]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop unnecessary columns
    drop_cols = ['CustomerId', 'Surname']
    train_data.drop(columns=drop_cols, inplace=True)
    test_data.drop(columns=drop_cols, inplace=True)

    # Separate features and target
    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']

    # Reset index to avoid misalignment issues
    X_train.reset_index(drop=True, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    # Identify numerical and categorical columns
    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # --- Manually Scale Numerical Columns ---
    def manual_robust_scaler(data):
        median = np.median(data, axis=0)
        q1 = np.percentile(data, 25, axis=0)
        q3 = np.percentile(data, 75, axis=0)
        iqr = q3 - q1
        return (data - median) / (iqr + 1e-9)

    # Apply scaling to numerical columns
    X_train_num = manual_robust_scaler(X_train[numerical_cols].values)
    X_test_num = manual_robust_scaler(test_data[numerical_cols].values)

    # --- Manually One-Hot Encode Categorical Columns ---
    def manual_one_hot_encoding(train, test):
        train_encoded = []
        test_encoded = []

        for col in train.columns:
            unique_vals = train[col].unique()
            # Skip first category to avoid dummy variable
            for val in unique_vals[1:]:
                train_encoded.append((train[col] == val).astype(int))
                test_encoded.append((test[col] == val).astype(int))

        return np.column_stack(train_encoded), np.column_stack(test_encoded)

    # Apply one-hot encoding to categorical columns
    X_train_cat, X_test_cat = manual_one_hot_encoding(X_train[categorical_cols], test_data[categorical_cols])

    # Concatenate numerical and categorical columns
    X_train = np.hstack([X_train_num, X_train_cat])
    X_test = np.hstack([X_test_num, X_test_cat])

    return np.array(X_train), np.array(y_train), np.array(X_test)

In [22]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    roc_auc_scores = []

    for i in range(n_splits):
        # Create training and validation indices
        val_start = i * fold_size
        val_end = (i + 1) * fold_size if (i + 1) * fold_size < len(X) else len(X)

        # Split the data into training and validation sets
        X_train = np.concatenate((X[:val_start], X[val_end:]), axis=0)
        y_train = np.concatenate((y[:val_start], y[val_end:]), axis=0)
        X_val = X[val_start:val_end]
        y_val = y[val_start:val_end]

        # Train the model
        knn.fit(X_train, y_train)

        # Predict probabilities on the validation set
        y_pred_prob = knn.predict(X_val)

        # Sort probabilities and true labels
        sorted_indices = np.argsort(y_pred_prob)[::-1]
        y_true_sorted = y_val[sorted_indices]
        y_pred_sorted = y_pred_prob[sorted_indices]

        # Calculate AUC using the trapezoidal rule
        tp = 0  # True positives
        fp = 0  # False positives
        auc = 0.0
        prev_tpr = 0  # Previous True Positive Rate
        prev_fpr = 0  # Previous False Positive Rate

        pos_count = np.sum(y_true_sorted)
        total_negatives = len(y_true_sorted) - pos_count

        for label in y_true_sorted:
            if label == 1:
                tp += 1
            else:
                fp += 1
                # Current True Positive Rate and False Positive Rate
                tpr = tp / pos_count if pos_count > 0 else 0
                fpr = fp / total_negatives if total_negatives > 0 else 0

                # Incremental AUC calculation using the trapezoidal rule
                auc += (fpr - prev_fpr) * (tpr + prev_tpr) / 2
                prev_tpr = tpr
                prev_fpr = fpr

        # Handle the last point (final TPR and FPR)
        tpr = tp / pos_count if pos_count > 0 else 0
        fpr = fp / total_negatives if total_negatives > 0 else 0
        auc += (fpr - prev_fpr) * (tpr + prev_tpr) / 2

        # Store the computed AUC score for this fold
        roc_auc_scores.append(auc)

    return np.mean(roc_auc_scores)

In [23]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
k_values = range(1, 31)
roc_auc_scores_euclidean = []
roc_auc_scores_manhattan = []

# Perform cross-validation
print("Cross-validation scores:")
for k in k_values:
    # Evaluate with Euclidean distance
    knn_euclidean = KNN(k=k, distance_metric='euclidean')
    scores_euclidean = cross_validate(X, y, knn_euclidean, n_splits=5)
    mean_score_euclidean = np.mean(scores_euclidean)
    roc_auc_scores_euclidean.append(mean_score_euclidean)

    # Evaluate with Manhattan distance
    knn_manhattan = KNN(k=k, distance_metric='manhattan')
    scores_manhattan = cross_validate(X, y, knn_manhattan, n_splits=5)
    mean_score_manhattan = np.mean(scores_manhattan)
    roc_auc_scores_manhattan.append(mean_score_manhattan)

    # Print both scores in one line
    print(f'k = {k}, Euclidean ROC-AUC: {mean_score_euclidean:.4f}, Manhattan ROC-AUC: {mean_score_manhattan:.4f}')

Cross-validation scores:
k = 1, Euclidean ROC-AUC: 0.7510, Manhattan ROC-AUC: 0.7569
k = 2, Euclidean ROC-AUC: 0.8136, Manhattan ROC-AUC: 0.8188
k = 3, Euclidean ROC-AUC: 0.8446, Manhattan ROC-AUC: 0.8462
k = 4, Euclidean ROC-AUC: 0.8607, Manhattan ROC-AUC: 0.8637
k = 5, Euclidean ROC-AUC: 0.8708, Manhattan ROC-AUC: 0.8715
k = 6, Euclidean ROC-AUC: 0.8777, Manhattan ROC-AUC: 0.8770
k = 7, Euclidean ROC-AUC: 0.8827, Manhattan ROC-AUC: 0.8842
k = 8, Euclidean ROC-AUC: 0.8876, Manhattan ROC-AUC: 0.8869
k = 9, Euclidean ROC-AUC: 0.8885, Manhattan ROC-AUC: 0.8902
k = 10, Euclidean ROC-AUC: 0.8912, Manhattan ROC-AUC: 0.8925
k = 11, Euclidean ROC-AUC: 0.8919, Manhattan ROC-AUC: 0.8946
k = 12, Euclidean ROC-AUC: 0.8939, Manhattan ROC-AUC: 0.8968
k = 13, Euclidean ROC-AUC: 0.8959, Manhattan ROC-AUC: 0.8987
k = 14, Euclidean ROC-AUC: 0.8967, Manhattan ROC-AUC: 0.8988
k = 15, Euclidean ROC-AUC: 0.8974, Manhattan ROC-AUC: 0.8996
k = 16, Euclidean ROC-AUC: 0.8981, Manhattan ROC-AUC: 0.9011
k = 17, 

In [24]:
# TODO: hyperparamters tuning
best_k_euclidean = k_values[np.argmax(roc_auc_scores_euclidean)]
best_k_manhattan = k_values[np.argmax(roc_auc_scores_manhattan)]

if roc_auc_scores_euclidean[np.argmax(roc_auc_scores_euclidean)] >= roc_auc_scores_manhattan[np.argmax(roc_auc_scores_manhattan)]:
    best_k = best_k_euclidean
    best_distance_metric = 'euclidean'
else:
    best_k = best_k_manhattan
    best_distance_metric = 'manhattan'

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='manhattan')
knn.fit(X, y)

BATCH_SIZE = 1000

test_predictions = []

for i in range(0, len(X_test), BATCH_SIZE):
    batch = X_test[i:i + BATCH_SIZE]
    batch_predictions = knn.predict(batch)
    test_predictions.extend(batch_predictions)

test_predictions = np.array(test_predictions)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)
print("Predictions saved into \'submissions.csv\'")

Predictions saved into 'submissions.csv'
