In [5]:
import numpy as np
import pandas as pd

# Define the KNN class with flexible options
class KNN:
    """Brute-force k-nearest-neighbours classifier for binary (0/1) labels.

    The predicted score for a query point is the (optionally
    distance-weighted) mean of the labels of its ``k`` closest training
    points, i.e. an estimate of the probability of class 1.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query point. If ``k`` exceeds the
        number of training points, all training points are used.
    distance_metric : {'euclidean', 'manhattan', 'cosine'}
        Metric used by :meth:`compute_distance`.
    weighting : {'uniform', 'inverse_distance'}
        ``'uniform'`` averages the neighbour labels; ``'inverse_distance'``
        weights each neighbour by ``1 / (distance + 1e-5)``.
    """

    _WEIGHTINGS = ('uniform', 'inverse_distance')

    def __init__(self, k=15, distance_metric='euclidean', weighting='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weighting = weighting  # 'uniform', 'inverse_distance'

    def fit(self, X, y):
        """Memorise the training set.

        ``X`` is coerced to a 2-D float array and ``y`` to an array so that
        plain lists (and unsigned integer arrays, whose differences would
        otherwise wrap around) behave correctly.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X
        self.y_train = y
        # Empirical class frequencies (kept for inspection; not used in prediction).
        self.class_priors = {
            0: np.mean(y == 0),
            1: np.mean(y == 1)
        }

    def predict_proba(self, X):
        """Return the estimated probability of class 1 for every row of ``X``.

        Raises
        ------
        ValueError
            If ``weighting`` is unsupported, ``k`` is smaller than 1, or the
            distance metric is unsupported.
        """
        # Validate once up front so misconfiguration is reported even when X
        # is empty, instead of only inside the per-sample loop.
        if self.weighting not in self._WEIGHTINGS:
            raise ValueError("Unsupported weighting scheme")
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        X = np.asarray(X, dtype=float)
        proba = np.empty(len(X), dtype=float)
        for i, x in enumerate(X):
            distances = self.compute_distance(self.X_train, x)
            k_indices = distances.argsort()[:self.k]
            k_nearest_labels = self.y_train[k_indices]

            if self.weighting == 'uniform':
                proba[i] = np.mean(k_nearest_labels)
            else:
                # The small epsilon avoids division by zero for exact matches.
                weights = 1 / (distances[k_indices] + 1e-5)
                proba[i] = np.sum(weights * k_nearest_labels) / np.sum(weights)
        return proba

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 predictions: 1 where the score is >= ``threshold``."""
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

    def compute_distance(self, X, x):
        """Return the distance from every row of ``X`` to the single point ``x``.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported metrics.
        """
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((X - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(X - x), axis=1)
        elif self.distance_metric == 'cosine':
            # Cosine distance = 1 - cosine similarity; epsilon guards against
            # zero-norm vectors.
            dot_product = np.sum(X * x, axis=1)
            norm_x = np.linalg.norm(x)
            norm_X = np.linalg.norm(X, axis=1)
            distances = 1 - (dot_product / (norm_X * norm_x + 1e-5))
        else:
            raise ValueError("Unsupported distance metric")
        return distances

# Function to compute ROC AUC manually
def compute_roc_auc(y_true, y_scores, num_thresholds=100):
    """Approximate the area under the ROC curve for binary labels.

    The curve is sampled at ``num_thresholds`` evenly spaced thresholds in
    [0, 1] (scores are therefore expected to be probabilities), a sample is
    predicted positive when ``score >= threshold``, and the area is obtained
    with the trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives, entries equal
        to 0 are negatives.
    y_scores : array-like
        Predicted scores, same length as ``y_true``.
    num_thresholds : int
        Number of thresholds sampled between 0 and 1.

    Returns
    -------
    float
        The estimated AUC. ``0.0`` is returned when either class is absent,
        because the ROC curve is undefined in that case.
    """
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores, dtype=float)

    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = int(np.sum(is_positive))
    n_negative = int(np.sum(is_negative))
    if n_positive == 0 or n_negative == 0:
        return 0.0

    # Every ROC curve runs from (0, 0) to (1, 1). The sampled thresholds do
    # not guarantee these end points (a score of exactly 1.0 is still
    # predicted positive at the highest threshold), so anchor them explicitly;
    # otherwise the area left of the smallest sampled FPR would be dropped.
    roc_points = {(0.0, 0.0), (1.0, 1.0)}
    for thresh in np.linspace(0, 1, num_thresholds):
        predicted_positive = y_scores >= thresh
        tpr = np.sum(predicted_positive & is_positive) / n_positive  # Sensitivity
        fpr = np.sum(predicted_positive & is_negative) / n_negative  # 1 - Specificity
        roc_points.add((float(fpr), float(tpr)))

    # Order the points along the curve (by FPR, then TPR) before integrating.
    fpr_sorted, tpr_sorted = (np.array(axis) for axis in zip(*sorted(roc_points)))

    # Trapezoidal rule, written out so it does not depend on np.trapz.
    widths = np.diff(fpr_sorted)
    mean_heights = (tpr_sorted[1:] + tpr_sorted[:-1]) / 2.0
    return float(np.sum(widths * mean_heights))

# Function to generate parameter combinations without itertools
def generate_param_combinations(param_grid):
    """Expand a parameter grid into the list of all parameter settings.

    ``param_grid`` maps each parameter name to the candidate values to try.
    The result holds one dict per element of the Cartesian product, ordered
    so that the last parameter varies fastest.
    """
    names = list(param_grid)
    partial_settings = [()]
    for name in names:
        candidates = param_grid[name]
        extended = []
        for prefix in partial_settings:
            for candidate in candidates:
                extended.append(prefix + (candidate,))
        partial_settings = extended
    return [dict(zip(names, setting)) for setting in partial_settings]

# Define data preprocessing function with options
def preprocess_data(train_path, test_path, scaling_method='standard', feature_engineering=True, features_to_drop=None):
    """Load the train/test CSV files and convert them to numeric feature matrices.

    Parameters
    ----------
    train_path, test_path : str
        Paths of the training and test CSV files. Both must contain the
        'Gender' and 'Geography' columns; the training file must contain
        'Exited'. 'CustomerId', 'Surname' and 'id' are optional.
    scaling_method : {'standard', 'minmax', 'none'}
        How numeric features are rescaled.
    feature_engineering : bool
        When true, ratio and age-bucket features are added.
    features_to_drop : list of str or None
        Columns removed before scaling; unknown names are ignored.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, train_ids, test_ids)``. The id entries
        are ``None`` when the data has no 'id' column.

    Raises
    ------
    ValueError
        If ``scaling_method`` is not supported.

    Notes
    -----
    Train and test rows are processed together, so imputation means and
    scaling statistics are computed over both sets.
    """
    # Fail fast instead of discovering a bad option inside the scaling loop.
    if scaling_method not in ('standard', 'minmax', 'none'):
        raise ValueError("Unsupported scaling method")

    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    n_train = len(train_data)

    # Combine train and test data for consistent preprocessing
    data = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)

    # Drop identifier columns that carry no predictive signal (if present)
    data = data.drop(columns=['CustomerId', 'Surname'], errors='ignore')

    # Handle missing values in 'Gender' and 'Geography'
    data['Gender'] = data['Gender'].fillna('Unknown')
    data['Geography'] = data['Geography'].fillna('Unknown')

    # Map 'Gender' values to numeric; any label outside the mapping is treated
    # as 'Unknown' rather than silently turning into NaN.
    gender_mapping = {'Male': 0, 'Female': 1, 'Unknown': 2}
    data['Gender'] = (
        data['Gender']
        .map(gender_mapping)
        .fillna(gender_mapping['Unknown'])
        .astype(int)
    )

    # One-hot encode 'Geography' as integer indicator columns
    geography_dummies = pd.get_dummies(data['Geography'], prefix='Geography', dtype=int)
    data = pd.concat([data.drop(columns='Geography'), geography_dummies], axis=1)

    # Convert numeric columns to numeric data types (only those present)
    numeric_cols = [
        col
        for col in ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                    'EstimatedSalary')
        if col in data.columns
    ]
    for col in numeric_cols:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Handle missing values in numeric columns with the column mean
    if numeric_cols:
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Convert boolean columns to integers
    for col in ('HasCrCard', 'IsActiveMember'):
        if col in data.columns:
            data[col] = data[col].astype(int)

    # Ensure 'Exited' is integer type (test rows have no label and get 0)
    if 'Exited' in data.columns:
        data['Exited'] = data['Exited'].fillna(0).astype(int)

    # Feature Engineering (epsilon keeps the ratios finite for zero denominators)
    if feature_engineering:
        data['Age_CreditScore_Ratio'] = data['Age'] / (data['CreditScore'] + 1e-5)
        data['Balance_Salary_Ratio'] = data['Balance'] / (data['EstimatedSalary'] + 1e-5)
        data['IsYoung'] = (data['Age'] < 30).astype(int)
        data['IsSenior'] = (data['Age'] > 60).astype(int)
        data['Tenure_By_Age'] = data['Tenure'] / (data['Age'] + 1e-5)

    # Drop specified features
    if features_to_drop:
        data = data.drop(columns=features_to_drop, errors='ignore')

    # Columns that are never treated as model features
    non_feature_cols = ['Exited', 'id']

    # Feature scaling of every numeric feature column
    if scaling_method != 'none':
        numeric_features = data.select_dtypes(include=[np.number]).columns.tolist()
        for feature in numeric_features:
            if feature in non_feature_cols:
                continue
            column = data[feature]
            if scaling_method == 'minmax':
                offset = column.min()
                spread = column.max() - offset
            else:  # 'standard'
                offset = column.mean()
                spread = column.std()
            # Constant columns carry no information; zero them instead of
            # dividing by zero.
            data[feature] = (column - offset) / spread if spread != 0 else 0

    # Split data back into train and test sets
    train = data.iloc[:n_train].reset_index(drop=True)
    test = data.iloc[n_train:].reset_index(drop=True)

    # 'id' is optional, so it must not be required when dropping non-features.
    X_train = train.drop(columns=non_feature_cols, errors='ignore').values.astype(float)
    y_train = train['Exited'].values.astype(int)
    X_test = test.drop(columns=non_feature_cols, errors='ignore').values.astype(float)

    train_ids = train['id'].values if 'id' in train.columns else None
    test_ids = test['id'].values if 'id' in test.columns else None

    return X_train, y_train, X_test, train_ids, test_ids

# Cross-validation function without external libraries
def cross_validate(X, y, param_grid, n_splits=5, random_state=None):
    """Grid-search KNN hyperparameters with stratified k-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels aligned with ``X``.
    param_grid : dict
        Maps the KNN parameter names ('k', 'distance_metric', 'weighting') to
        the candidate values to try. Parameters that are left out fall back
        to the KNN defaults.
    n_splits : int
        Number of folds (at least 2).
    random_state : int or None
        Seed for the fold shuffling. ``None`` (the default) draws from NumPy's
        global random state, so ``np.random.seed`` is still honoured.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination holding the parameter values and the
        mean validation score in the 'roc_auc' column.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2.
    """
    if n_splits < 2:
        raise ValueError("n_splits must be at least 2")

    X = np.asarray(X)
    y = np.asarray(y)

    if random_state is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.RandomState(random_state).shuffle

    # Stratify the folds: shuffle each class separately and deal its samples
    # evenly across the folds so every fold keeps the class balance.
    fold_chunks = [[] for _ in range(n_splits)]
    for label in np.unique(y):
        members = np.flatnonzero(y == label)
        shuffle(members)
        for chunks, part in zip(fold_chunks, np.array_split(members, n_splits)):
            chunks.append(part)
    fold_indices = [
        np.concatenate(chunks) if chunks else np.empty(0, dtype=int)
        for chunks in fold_chunks
    ]

    # The splits do not depend on the hyperparameters, so build them once.
    splits = []
    for i, valid_idx in enumerate(fold_indices):
        train_idx = np.concatenate(
            [idx for j, idx in enumerate(fold_indices) if j != i]
        )
        splits.append((train_idx, valid_idx))

    knn_param_names = ('k', 'distance_metric', 'weighting')
    results = []
    for param_dict in generate_param_combinations(param_grid):
        knn_kwargs = {
            name: param_dict[name] for name in knn_param_names if name in param_dict
        }
        fold_scores = []
        for train_idx, valid_idx in splits:
            knn = KNN(**knn_kwargs)
            knn.fit(X[train_idx], y[train_idx])
            y_proba = knn.predict_proba(X[valid_idx])

            # Compute ROC AUC manually
            fold_scores.append(compute_roc_auc(y[valid_idx], y_proba))
        results.append({
            **param_dict,
            'roc_auc': np.mean(fold_scores)
        })
    return pd.DataFrame(results)

# Build the model inputs from the raw CSV files
X, y, X_test, train_ids, test_ids = preprocess_data(
    train_path='train.csv',
    test_path='test.csv',
    scaling_method='standard',
    feature_engineering=True,
    features_to_drop=None,  # Optionally list feature names to exclude
)

# Hyperparameter candidates explored during cross-validation
param_grid = dict(
    k=[5, 20, 40],
    distance_metric=['euclidean', 'manhattan'],
    weighting=['uniform', 'inverse_distance'],
)

# Score every combination with 5-fold cross-validation
cv_results = cross_validate(X, y, param_grid, n_splits=5)

# Rank the combinations once, best ROC AUC first, and reuse the ranking below
ranked_results = cv_results.sort_values(by='roc_auc', ascending=False)
print("\nCross-validation results (sorted by ROC AUC):")
print(ranked_results.head(10))

# The top-ranked row holds the winning hyperparameters
best_result = ranked_results.iloc[0]
best_params = best_result.copy()
print("\nBest parameters:")
for key, value in best_params.items():
    if key == 'roc_auc':
        continue  # The score itself is not a hyperparameter
    print(f"{key}: {value}")

# Refit on the complete training set with the winning hyperparameters
knn = KNN(
    k=int(best_params['k']),
    distance_metric=best_params['distance_metric'],
    weighting=best_params['weighting'],
)
knn.fit(X, y)
test_proba = knn.predict_proba(X_test)

# Write the predicted probabilities (not hard class labels) for the test set
submission = pd.DataFrame({'id': test_ids, 'Exited': test_proba})
submission.to_csv('submissions.csv', index=False)



Cross-validation results (sorted by ROC AUC):
     k distance_metric         weighting   roc_auc
11  40       manhattan  inverse_distance  0.910006
10  40       manhattan           uniform  0.908250
9   40       euclidean  inverse_distance  0.906110
7   20       manhattan  inverse_distance  0.905177
8   40       euclidean           uniform  0.903672
6   20       manhattan           uniform  0.903375
5   20       euclidean  inverse_distance  0.902033
4   20       euclidean           uniform  0.899811
3    5       manhattan  inverse_distance  0.875166
2    5       manhattan           uniform  0.873640

Best parameters:
k: 40
distance_metric: manhattan
weighting: inverse_distance
