In [41]:
import numpy as np
import pandas as pd

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data and labels"""
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        """Compute the distance between two points using the specified metric"""
        if self.distance_metric == 'euclidean':
            diff = X2 - X1
            return np.sqrt(np.sum(diff * diff, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.abs(X2 - X1).sum(axis=1)
        else:
            raise ValueError(f"Distance metric '{self.distance_metric}' is not supported")

    def predict(self, X):
        """Predict the labels for a given set of input data"""
        predictions = np.zeros(X.shape[0])
        for idx, row in enumerate(X.values):
            predictions[idx] = self._predict_single(row)
        return predictions

    def _predict_single(self, instance):
        """Predict the label for a single instance"""
        dists = self.compute_distance(instance, self.X_train.values)
        nearest_indices = np.argpartition(dists, self.k)[:self.k]
        nearest_labels = self.y_train.iloc[nearest_indices]
        return nearest_labels.mean()  # Returning mean as probability of churn


In [44]:
def preprocess_data(train_path, test_path):
    """Preprocess the dataset by handling missing values, encoding, and scaling"""
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Combine train and test data for consistent processing
    combined_df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

    # Apply one-hot encoding on categorical variables
    combined_df = pd.get_dummies(combined_df, columns=['Geography', 'Gender'], drop_first=True)

    # Define the features to scale
    features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
                'IsActiveMember', 'EstimatedSalary'] + \
               [col for col in combined_df.columns if col.startswith('Geography_') or col.startswith('Gender_')]

    # Scale the features using standardization
    combined_df[features] = (combined_df[features] - combined_df[features].mean()) / combined_df[features].std()

    # Split back into train and test sets
    X_train = combined_df.loc[:len(train_df)-1, features]
    y_train = train_df['Exited']
    X_test = combined_df.loc[len(train_df):, features]

    return X_train, y_train, X_test


In [45]:
def cross_validate(X, y, knn, n_splits=5):
    """Perform k-fold cross-validation"""
    np.random.seed(42)
    shuffled_indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    auc_scores = []

    for fold_idx in range(n_splits):
        val_indices = shuffled_indices[fold_idx * fold_size: (fold_idx + 1) * fold_size]
        train_indices = np.setdiff1d(shuffled_indices, val_indices)

        X_train_fold = X.iloc[train_indices]
        X_val_fold = X.iloc[val_indices]
        y_train_fold = y.iloc[train_indices]
        y_val_fold = y.iloc[val_indices]

        # Fit the KNN model and predict probabilities
        knn.fit(X_train_fold, y_train_fold)
        y_pred_prob = knn.predict(X_val_fold)

        # Compute the AUC score
        fold_auc = auc_score(y_val_fold, y_pred_prob)
        auc_scores.append(fold_auc)

    return auc_scores


In [46]:
def auc_score(y_true, y_pred_prob):
    """Compute the AUC score based on actual labels and predicted probabilities"""
    pos_idx = np.where(y_true == 1)[0]
    neg_idx = np.where(y_true == 0)[0]

    if len(pos_idx) == 0 or len(neg_idx) == 0:
        return 0.5  # Return 0.5 if there are no positive or negative examples

    pos_preds = y_pred_prob[pos_idx]
    neg_preds = y_pred_prob[neg_idx]

    # Count the number of correct rankings (positive predictions greater than negative)
    num_correct = np.sum(pos_preds[:, None] > neg_preds)
    total_possible = len(pos_idx) * len(neg_idx)

    return num_correct / total_possible

In [47]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/drive/My Drive/CS 506/assignment5/train.csv',
                               '/content/drive/My Drive/CS 506/assignment5/test.csv')

knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

# Hyperparameter tuning
distance_options = ['euclidean', 'manhattan']
highest_score = 0
optimal_params = {}

# Tune hyperparameters for k
for k_val in range(1,22):
    for dist_metric in distance_options:
        current_model = KNN(k=k_val, distance_metric=dist_metric)
        validation_scores = cross_validate(X, y, current_model)
        avg_score = np.mean(validation_scores)

        # Update best hyperparameters based on the highest score
        if avg_score > highest_score:
            highest_score = avg_score
            optimal_params = {'k': k_val, 'distance_metric': dist_metric}

# Output the best hyperparameters and score
print(f"Best parameters: k = {optimal_params['k']}, metric = {optimal_params['distance_metric']}, Best score: {highest_score}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(**optimal_params)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions to a CSV file for submission
pd.DataFrame({'id': pd.read_csv('/content/drive/My Drive/CS 506/assignment5/test.csv')['id'],
              'Exited': test_predictions}).to_csv('/content/drive/My Drive/CS 506/assignment5/submissions.csv', index=False)


Best parameters: k = 21, metric = manhattan, Best score: 0.8851120871203924
