In [11]:
import numpy as np
import pandas as pd

In [42]:
import numpy as np

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Stores the training data.
        """
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        # Ensure that X1 and X2 are NumPy arrays
        X1 = np.array(X1)
        X2 = np.array(X2)
        
        # Check for non-numeric data and handle it
        if not np.issubdtype(X1.dtype, np.number) or not np.issubdtype(X2.dtype, np.number):
            raise ValueError("Non-numeric data encountered in the distance computation")
        
        # Compute distances based on the specified metric
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X1[:, np.newaxis] - X2) ** 2, axis=2))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X1[:, np.newaxis] - X2), axis=2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")
        
        return distances

    def predict(self, X):
        """
        Predicts the class labels for the provided data.
        """
        # Compute distances
        distances = self.compute_distance(X, self.X_train)
        
        # Get indices of k nearest neighbors
        neighbor_indices = np.argsort(distances, axis=1)[:, :self.k]
        
        # Retrieve neighbor labels for each sample
        neighbor_labels = np.array([self.y_train[indices] for indices in neighbor_indices])
        
        # Majority vote
        y_pred = np.round(np.mean(neighbor_labels, axis=1)).astype(int)
        return y_pred
    
    def predict_proba(self, X):
        """
        Predicts class probabilities for the provided data.
        """
        # Compute distances
        distances = self.compute_distance(X, self.X_train)
        
        # Get indices of k nearest neighbors
        neighbor_indices = np.argsort(distances, axis=1)[:, :self.k]
        
        # If self.y_train is a NumPy array, we can directly index it using the neighbor indices
        neighbor_labels = np.array([self.y_train[indices] for indices in neighbor_indices])
        
        # Compute probabilities as the mean of neighbor labels
        y_proba = np.mean(neighbor_labels, axis=1)
        
        return y_proba




In [31]:
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Separate features and target for train data
    X_train = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])
    y_train = train_data['Exited']
    
    # Separate features for test data (test data doesn't have target 'Exited')
    X_test = test_data.drop(columns=['id', 'CustomerId', 'Surname'])
    
    # Handle missing values manually without inplace modifications
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    categorical_columns = ['Geography', 'Gender']
    
    for col in numerical_columns:
        X_train[col] = X_train[col].fillna(X_train[col].mean())
        X_test[col] = X_test[col].fillna(X_test[col].mean())
    
    for col in categorical_columns:
        X_train[col] = X_train[col].fillna(X_train[col].mode()[0])
        X_test[col] = X_test[col].fillna(X_test[col].mode()[0])
    
    # One-hot encode categorical variables
    X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)
    
    # Ensure the test set has the same columns as the training set (handle any missing columns)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    
    # Convert boolean columns to integers
    X_train = X_train.astype({col: 'int' for col in X_train.select_dtypes('bool').columns})
    X_test = X_test.astype({col: 'int' for col in X_test.select_dtypes('bool').columns})
    
    # Standardize numerical columns (manually scale data)
    for col in numerical_columns:
        mean = X_train[col].mean()
        std = X_train[col].std()
        X_train[col] = (X_train[col] - mean) / std
        X_test[col] = (X_test[col] - mean) / std
    
    return X_train, y_train, X_test

In [33]:
def cross_validate(X, y, model, n_splits=5):
    """
    Custom cross-validation function. Splits data into n_splits and calculates
    the average performance of the model.
    """
    fold_size = len(X) // n_splits
    scores = []
    
    for i in range(n_splits):
        # Split data into training and validation sets
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]
        X_train = np.concatenate([X[:i * fold_size], X[(i + 1) * fold_size:]], axis=0)
        y_train = np.concatenate([y[:i * fold_size], y[(i + 1) * fold_size:]], axis=0)
        
        # Fit the model
        model.fit(X_train, y_train)
        
        # Predict probabilities
        y_scores = model.predict_proba(X_val)
        
        # Compute ROC AUC for this fold
        roc_auc = compute_roc_auc(y_val, y_scores)
        scores.append(roc_auc)
    
    return scores

def compute_roc_auc(y_true, y_scores):
    """
    Custom ROC AUC calculation without sklearn.
    """
    # Convert y_true to a NumPy array to ensure correct indexing
    y_true = np.array(y_true)
    
    # Sort by predicted probabilities (y_scores)
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true = y_true[sorted_indices]
    
    # Calculate true positive rate (TPR) and false positive rate (FPR)
    tpr = np.cumsum(y_true) / np.sum(y_true)
    fpr = np.cumsum(1 - y_true) / np.sum(1 - y_true)
    
    # Calculate Area Under the Curve (AUC) using the trapezoidal rule
    auc = np.trapz(tpr, fpr)
    return auc




In [43]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Hyperparameter tuning
k_values = [20, 22]

distance_metrics = ['euclidean', 'manhattan']

best_score = 0
best_k = None
best_distance_metric = None
hyperparameter_results = {}

for distance_metric in distance_metrics:
    for k in k_values:
        print(f"Evaluating k={k}, distance_metric='{distance_metric}'")
        
        # Initialize KNN classifier
        knn = KNN(k=k, distance_metric=distance_metric)
        
        # Perform cross-validation
        cv_scores = cross_validate(X, y, knn, n_splits=5)
        average_score = np.mean(cv_scores)
        print(f"Average ROC AUC Score: {average_score:.4f}\n")
        
        # Store the results
        hyperparameter_results[(k, distance_metric)] = average_score
        
        # Update the best hyperparameters
        if average_score > best_score:
            best_score = average_score
            best_k = k
            best_distance_metric = distance_metric

print("Best Hyperparameters Found:")
print(f"k = {best_k}")
print(f"Distance Metric = '{best_distance_metric}'")
print(f"Best Average ROC AUC Score: {best_score:.4f}")

# Train on full dataset with optimal hyperparameters
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)

# Make predictions on the test set
test_probabilities = knn.predict_proba(X_test)  # Returns probabilities for both classes

# Since predict_proba returns probabilities for both classes [prob_class_0, prob_class_1],
# We want to submit only the probabilities for class 1 (churn probability)
churn_probabilities = test_probabilities  # Use this if you're predicting binary classification (1 output column)

# Create a submission dataframe with the 'id' from the test data and the churn probabilities
test_data = pd.read_csv('test.csv')  # Ensure you load the test data for submission
submission = pd.DataFrame({
    'id': test_data['id'],           # Customer IDs
    'Exited': churn_probabilities    # Churn probabilities (use probabilities for class 1)
})

# Save to CSV for submission to Kaggle
submission.to_csv('submissions.csv', index=False)

print("Submission file 'submissions.csv' with predicted probabilities has been created successfully.")

Evaluating k=20, distance_metric='euclidean'


  auc = np.trapz(tpr, fpr)


Average ROC AUC Score: 0.9109

Evaluating k=22, distance_metric='euclidean'
Average ROC AUC Score: 0.9106

Evaluating k=20, distance_metric='manhattan'
Average ROC AUC Score: 0.9088

Evaluating k=22, distance_metric='manhattan'
Average ROC AUC Score: 0.9104

Best Hyperparameters Found:
k = 20
Distance Metric = 'euclidean'
Best Average ROC AUC Score: 0.9109
Submission file 'submissions.csv' with predicted probabilities has been created successfully.
