In [27]:
import numpy as np
import pandas as pd

In [28]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = np.array(y)  # Convert y to NumPy array to avoid KeyError

    def predict(self, X):
        predictions = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            # Get indices of the k nearest neighbors
            k_indices = np.argsort(distances)[:self.k]
            # Get the labels of the k nearest samples
            k_nearest_labels = self.y_train[k_indices].astype(int)
            # Calculate the probability of churn
            churn_prob = np.mean(k_nearest_labels)  # Average of the labels gives probability
            predictions.append(churn_prob)
        return np.array(predictions)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)


In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop 'Customer ID' and 'Surname', they are not useful features
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])

    # Separate target from features
    X_train = train_data.drop(columns=['Exited'])
    y_train = train_data['Exited']
    
    # Handle categorical variables (Geography, Gender)
    categorical_features = ['Geography', 'Gender']
    numeric_features = X_train.columns.difference(categorical_features)
    
    # Preprocessing for numeric data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Combine the steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Fit and transform training data
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(test_data)

    return X_train, y_train, X_test


In [30]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits)
    auc_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        auc = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc)

    return auc_scores


In [31]:
# Load and preprocess data
X, y, X_test = preprocess_data('./train.csv', './test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

best_k = None
best_score = 0
for k in range(1, 11):
    knn = KNN(k=k, distance_metric='euclidean')
    scores = cross_validate(X, y, knn)
    mean_score = np.mean(scores)
    
    if mean_score > best_score:
        best_score = mean_score
        best_k = k

print(f'Best K: {best_k}, Best ROC AUC Score: {best_score}')



# Train final model with optimal k
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X, y)

# Make predictions on test set
test_predictions = knn.predict(X_test)

# Save predictions to a CSV file
pd.DataFrame({
    'id': pd.read_csv('./test.csv')['id'], 
    'Exited': test_predictions
}).to_csv('submissions.csv', index=False)


Cross-validation scores: [0.8707447648827682, 0.8845499892270707, 0.8659528011992275, 0.8775597988921934, 0.8703617450706411]
Best K: 10, Best ROC AUC Score: 0.8938806841856296
