In [1]:
import numpy as np
import pandas as pd

In [2]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X2 - X1, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

    def predict(self, X):
        return np.apply_along_axis(self.predict_single, 1, X.values)

    def predict_single(self, x):
        distances = self.compute_distance(x, self.X_train.values)
        k_indices = np.argpartition(distances, self.k)[:self.k]
        k_nearest_labels = self.y_train.iloc[k_indices]
        return np.mean(k_nearest_labels)


In [3]:
def preprocess_data(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    full_data = pd.concat([train_df, test_df], ignore_index=True)

    full_data = pd.get_dummies(full_data, columns=['Geography', 'Gender'], drop_first=True)

    selected_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] + \
                        [col for col in full_data.columns if col.startswith('Geography_') or col.startswith('Gender_')]

    full_data[selected_features] = (full_data[selected_features] - full_data[selected_features].mean()) / full_data[selected_features].std()

    X_train = full_data.loc[:len(train_df)-1, selected_features]
    y_train = train_df['Exited']
    X_test = full_data.loc[len(train_df):, selected_features]

    return X_train, y_train, X_test


In [4]:
def cross_validate(X, y, knn, n_splits=5):
    np.random.seed(42)
    indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    scores = []

    for i in range(n_splits):
        val_indices = indices[i * fold_size:(i + 1) * fold_size]
        train_indices = np.setdiff1d(indices, val_indices)

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        knn.fit(X_train, y_train)
        y_est_prob = knn.predict(X_val)
        score = compute_auc_score(y_val, y_est_prob)
        scores.append(score)

    return scores

def compute_auc_score(y_true, y_est_prob):
    positive_indices = np.where(y_true == 1)[0]
    negative_indices = np.where(y_true == 0)[0]

    if len(positive_indices) == 0 or len(negative_indices) == 0:
        return 0.5

    positive_predictions = y_est_prob[positive_indices]
    negative_predictions = y_est_prob[negative_indices]

    correct_order = np.sum(positive_predictions[:, None] > negative_predictions)
    return correct_order / (len(positive_indices) * len(negative_indices))

In [5]:
X, y, X_test = preprocess_data('./train.csv', './test.csv')

knn = KNN(k=5, distance_metric='euclidean')

cross_val_scores = cross_validate(X, y, knn)

print("Cross-validation AUC scores:", cross_val_scores)
print("Mean AUC score:", np.mean(cross_val_scores))

k_values = [3, 5, 8, 10, 12, 16]  
distance_metrics = ['euclidean', 'manhattan']
best_auc = 0
best_hyperparams = {}

for k in k_values:
    for metric in distance_metrics:
        knn_model = KNN(k=k, distance_metric=metric)
        auc_scores = cross_validate(X, y, knn_model)
        avg_auc = np.mean(auc_scores)

        if avg_auc > best_auc:
            best_auc = avg_auc
            best_hyperparams = {'k': k, 'distance_metric': metric}

print("Best hyperparameters:", best_hyperparams)
print("Best AUC score:", best_auc)

knn = KNN(**best_hyperparams)
knn.fit(X, y)

test_predictions = knn.predict(X_test)

rounded_preds = np.round(test_predictions, 2)
pd.DataFrame({'id': pd.read_csv('./test.csv')['id'], 'Exited': rounded_preds.ravel()}).to_csv('submissions.csv', index=False)

Cross-validation AUC scores: [0.7827393099966703, 0.819753998373543, 0.8114446802742773, 0.7861315496098105, 0.8229182427660009]
Mean AUC score: 0.8045975562040603
Best hyperparameters: {'k': 16, 'distance_metric': 'manhattan'}
Best AUC score: 0.8783080816498605
