In [37]:
import numpy as np
import pandas as pd

In [78]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        distances = self.compute_distance(self.X_train, X)
        predictions = []
        for i in range(X.shape[0]):
            k_indices = np.argsort(distances[:, i])[:self.k]
            k_nearest_labels = self.y_train[k_indices].astype(int)  # Cast to int
            prediction = np.argmax(np.bincount(k_nearest_labels))
            predictions.append(prediction)
        return np.array(predictions)

    def compute_distance(self, X_train, X_test):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(X_train**2, axis=1).reshape(-1, 1) +np.sum(X_test**2, axis=1) - 2 * np.dot(X_train, X_test.T))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X_train[:, np.newaxis] - X_test), axis=2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [79]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
    test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    X_train = train_data.drop('Exited', axis=1).values
    y_train = train_data['Exited'].values
    X_test = test_data.values
    for i in range(X_train.shape[1]):
        mean = np.mean(X_train[:, i])
        std = np.std(X_train[:, i])
        if std == 0:
            std = 1
        X_train[:, i] = (X_train[:, i] - mean) / std
        X_test[:, i] = (X_test[:, i] - mean) / std
    return X_train, y_train, X_test

In [80]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    n = len(y)
    indices = np.arange(n)
    unique_classes, y_indices = np.unique(y, return_inverse=True)
    fold_indices = [[] for _ in range(n_splits)]
    for c in unique_classes:
        class_indices = indices[y == c]
        np.random.shuffle(class_indices)
        for i, idx in enumerate(np.array_split(class_indices, n_splits)):
            fold_indices[i].extend(idx)
    auc_scores = []
    for i, val_idx in enumerate(fold_indices):
        train_idx = np.setdiff1d(indices, val_idx)
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        knn.fit(X_train, y_train)
        
        y_pred_proba = []
        distances = knn.compute_distance(X_train, X_val)
        for j in range(X_val.shape[0]):
            k_indices = np.argsort(distances[:, j])[:knn.k]
            k_nearest_labels = y_train[k_indices]
            prob_class_1 = np.sum(k_nearest_labels == 1) / knn.k
            y_pred_proba.append(prob_class_1)

        y_pred_proba = np.array(y_pred_proba)
        
        if len(np.unique(y_val)) > 1:
            auc = roc_auc(y_val, y_pred_proba)
            auc_scores.append(abs(auc))
        else:
            auc_scores.append(0.5)
    
    return auc_scores

def roc_auc(y_true, y_pred_proba):
    pos_label = 1
    thresholds = np.sort(np.unique(y_pred_proba))
    tpr_list, fpr_list = [], []

    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        tp = np.sum((y_true == pos_label) & (y_pred == pos_label))
        fp = np.sum((y_true != pos_label) & (y_pred == pos_label))
        fn = np.sum((y_true == pos_label) & (y_pred != pos_label))
        tn = np.sum((y_true != pos_label) & (y_pred != pos_label))

        tpr = tp / (tp + fn) if tp + fn > 0 else 0
        fpr = fp / (fp + tn) if fp + tn > 0 else 0

        tpr_list.append(tpr)
        fpr_list.append(fpr)

    tpr_list = np.array(tpr_list)
    fpr_list = np.array(fpr_list)

    auc = np.trapz(tpr_list, fpr_list)
    return auc

In [86]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparameters tuning
best_auc = 0
best_k = 1
best_distance = 'euclidean'

# Loop over both 'euclidean' and 'manhattan' metrics
for distance_metric in ['euclidean', 'manhattan']:
    for k in range(15, 28):
        knn = KNN(k=k, distance_metric=distance_metric)
        auc_scores = cross_validate(X, y, knn)
        avg_auc = np.mean(auc_scores)
        if avg_auc > best_auc:
            best_auc = avg_auc
            best_k = k
            best_distance = distance_metric

print(f"Best K: {best_k}, Best Distance Metric: {best_distance}, Best AUC: {best_auc}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance)
knn.fit(X, y)
test_predictions = []
distances_test = knn.compute_distance(knn.X_train, X_test)

for j in range(X_test.shape[0]):
    k_indices = np.argsort(distances_test[:, j])[:knn.k]
    k_nearest_labels = knn.y_train[k_indices]
    prob_class_1 = np.sum(k_nearest_labels == 1) / knn.k
    test_predictions.append(prob_class_1)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.8794511677326211, 0.8576503724990676, 0.8624633489633067, 0.875579419621862, 0.8623729276396089]
Best K: 27, Best Distance Metric: manhattan, Best AUC: 0.904740291527725
