In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for test_point in X:
            # Compute distances between the test point and all training points
            distances = [self.compute_distance(test_point, train_point) for train_point in self.X_train]
            
            # Get the indices of the k-nearest neighbors
            k_indices = np.argsort(distances)[:self.k]
            
            # Get the labels of the k-nearest neighbors
            k_nearest_labels = [self.y_train[i] for i in k_indices]
            
            # Calculate the probability of churn (class 1) as the proportion of neighbors with label 1
            prob_churn = sum(k_nearest_labels) / self.k
            
            # Append the probability to the predictions list
            predictions.append(prob_churn)
        
        return np.array(predictions)


    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        return np.sqrt(np.sum((X1 - X2) ** 2))

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # One-hot encode categorical variables (Geography and Gender)
    X_train = train_data.drop(columns=['Exited', 'CustomerId', 'Surname', 'id'], errors='ignore')
    y_train = train_data['Exited']
    X_test = test_data.drop(columns=['CustomerId', 'Surname'], errors='ignore')  # Keep 'id' in test set for final output later

    # Encode Gender (Male: 0, Female: 1)
    X_train['Gender'] = X_train['Gender'].map({'Male': 0, 'Female': 1})
    X_test['Gender'] = X_test['Gender'].map({'Male': 0, 'Female': 1})

    # One-hot encode Geography
    X_train = pd.concat([X_train, pd.get_dummies(X_train['Geography'], drop_first=True)], axis=1)
    X_test = pd.concat([X_test, pd.get_dummies(X_test['Geography'], drop_first=True)], axis=1)

    # Align columns between X_train and X_test
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Drop Geography column
    X_train.drop(columns=['Geography'], inplace=True)
    X_test.drop(columns=['Geography'], inplace=True)

    # Feature scaling (standardization: (value - mean) / std) using training data mean/std
    numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    for col in numeric_columns:
        mean = X_train[col].mean()
        std = X_train[col].std()
        X_train[col] = (X_train[col] - mean) / std
        X_test[col] = (X_test[col] - mean) / std

    # Check for missing or invalid data
    if X_train.isnull().values.any() or X_test.isnull().values.any():
        raise ValueError("Preprocessed data contains missing values!")

    # Ensure all columns are numeric
    assert np.isreal(X_train).all(), "X_train contains non-numeric values!"
    assert np.isreal(X_test).all(), "X_test contains non-numeric values!"

    # Return only numpy arrays for training and test sets
    return X_train.values, y_train.values, X_test.values

In [4]:
def calculate_roc_auc(y_true, y_prob):
    # Sort the true labels based on the predicted probabilities
    sorted_indices = np.argsort(y_prob)[::-1]
    y_true_sorted = y_true[sorted_indices]
    
    # Calculate total number of positive and negative examples
    P = np.sum(y_true_sorted == 1)
    N = np.sum(y_true_sorted == 0)
    
    # Initialize True Positive Rate (TPR) and False Positive Rate (FPR)
    tpr = 0
    fpr = 0
    
    # Keep track of previous point for trapezoidal calculation
    prev_tpr = 0
    prev_fpr = 0
    auc = 0
    
    # Iterate through sorted true labels
    for i in range(len(y_true_sorted)):
        if y_true_sorted[i] == 1:
            tpr += 1 / P  # Increment TPR
        else:
            fpr += 1 / N  # Increment FPR
            
        # Compute area using the trapezoidal rule
        auc += (fpr - prev_fpr) * (tpr + prev_tpr) / 2
        
        # Update previous TPR and FPR
        prev_tpr = tpr
        prev_fpr = fpr
    
    return auc

In [5]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    fold_size = len(X) // n_splits
    roc_auc_scores = []
    
    for i in range(n_splits):
        # Create train/test split for this fold
        start = i * fold_size
        end = start + fold_size
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        # Index numpy arrays directly
        X_test_fold = X[test_indices]
        y_test_fold = y[test_indices]
        X_train_fold = X[train_indices]
        y_train_fold = y[train_indices]
        
        # Fit the KNN model on the training fold
        knn.fit(X_train_fold, y_train_fold)
        
        # Predict probabilities for the test fold
        y_prob = knn.predict(X_test_fold)
        
        # Compute the ROC AUC score for this fold using a custom calculate_roc_auc function
        auc_score = calculate_roc_auc(y_test_fold, y_prob)
        roc_auc_scores.append(auc_score)
    
    # Return the average ROC AUC score across all folds
    return np.mean(roc_auc_scores)

In [6]:
# Load and preprocess data
print("Loading and preprocessing data...")
X, y, X_test = preprocess_data('train.csv', 'test.csv')
print("Data preprocessing complete.")

# Create and evaluate model
print("Initializing KNN model...")
knn = KNN(k=5, distance_metric='euclidean')

k_values = [3, 6, 10, 20, 50, 100, 120]
# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning

best_k = None
best_score = 0
scores = []

print("Tuning hyperparameters...")
for k in k_values:
    knn = KNN(k=k, distance_metric='euclidean')
    cv_score = cross_validate(X, y, knn)  # Cross-validate to get ROC AUC score for each k
    scores.append(cv_score)
    print(f"k={k}: ROC AUC={cv_score:.4f}")
    if cv_score > best_score:
        best_score = cv_score
        best_k = k

print(f"Best k: {best_k} with ROC AUC score: {best_score:.4f}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Loading and preprocessing data...
Data preprocessing complete.
Initializing KNN model...
Cross-validation scores: 0.8780783461357181
Tuning hyperparameters...
k=3: ROC AUC=0.8573
k=6: ROC AUC=0.8831
k=10: ROC AUC=0.8968
k=20: ROC AUC=0.9086
k=50: ROC AUC=0.9130
k=100: ROC AUC=0.9107
k=120: ROC AUC=0.9098
Best k: 50 with ROC AUC score: 0.9130
