In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/content/train.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               15000 non-null  int64  
 1   CustomerId       15000 non-null  float64
 2   Surname          15000 non-null  object 
 3   CreditScore      15000 non-null  float64
 4   Geography        15000 non-null  object 
 5   Gender           15000 non-null  object 
 6   Age              15000 non-null  float64
 7   Tenure           15000 non-null  float64
 8   Balance          15000 non-null  float64
 9   NumOfProducts    15000 non-null  float64
 10  HasCrCard        15000 non-null  float64
 11  IsActiveMember   15000 non-null  float64
 12  EstimatedSalary  15000 non-null  float64
 13  Exited           15000 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 1.6+ MB
None


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, features, labels):
        self.train_data = features
        self.train_labels = labels

    def predict(self, data_points):
      group_size = 500
      num_samples = data_points.shape[0]
      result_predictions = np.empty(num_samples)  # Preallocate array

      for start_idx in range(0, num_samples, group_size):
          stop_idx = min(start_idx + group_size, num_samples)
          data_batch = data_points[start_idx:stop_idx]

          computed_distances = self.calculate_distances(data_batch, self.train_data)
          nearest_indices = np.argsort(computed_distances, axis=1)[:, :self.k]
          nearest_labels = self.train_labels[nearest_indices]

          result_predictions[start_idx:stop_idx] = np.mean(nearest_labels, axis=1)

      return result_predictions

    def calculate_distances(self, data1, data2):
        if self.distance_metric == 'euclidean':
           distances = np.sqrt(np.maximum(np.sum(np.square(data1), axis=1, keepdims=True) - 2 * np.dot(data1, data2.T) + np.sum(np.square(data2), axis=1, keepdims=True).T, 0))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(data1[:, np.newaxis] - data2), axis=2)
        else:
            raise ValueError(f"Invalid distance metric: {self.distance_metric}")
        return distances


In [5]:
def preprocess_data(training_file, testing_file):
    # Load the training and testing datasets
    train_df = pd.read_csv(training_file)
    test_df = pd.read_csv(testing_file)

    # Separate features and target from training data
    features_train = train_df.drop(columns=['Exited', 'CustomerId', 'Surname'])
    target_train = train_df['Exited'].values
    features_test = test_df.drop(columns=['CustomerId', 'Surname'])

    # Combine train and test features for consistent preprocessing
    merged_data = pd.concat([features_train, features_test], axis=0, ignore_index=True)

    # Identify categorical columns for one-hot encoding
    categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    merged_data = pd.get_dummies(merged_data, columns=categorical_columns, dtype=float)

    # Convert all columns to float
    merged_data = merged_data.astype(float)

    # Normalize features
    feature_names = merged_data.columns.tolist()
    for feature in feature_names:
        feature_min = merged_data[feature].min()
        feature_max = merged_data[feature].max()
        range_value = feature_max - feature_min
        if range_value > 0:
            merged_data[feature] = (merged_data[feature] - feature_min) / range_value
        else:
            merged_data[feature] = 0

    # Split the processed data back into training and testing sets
    processed_train_features = merged_data.iloc[:len(features_train)].values
    processed_test_features = merged_data.iloc[len(features_train):].values

    return processed_train_features, target_train, processed_test_features




In [6]:
def cross_validate(features, target, knn_model, num_folds=5):
    total_samples = features.shape[0]
    sample_indices = np.arange(total_samples)
    np.random.shuffle(sample_indices)

    unique_classes, target_indices = np.unique(target, return_inverse=True)
    folds = [[] for _ in range(num_folds)]

    for unique_class in unique_classes:
        class_indices = sample_indices[target[sample_indices] == unique_class]
        np.random.shuffle(class_indices)
        class_fold_chunks = np.array_split(class_indices, num_folds)

        for fold_index in range(num_folds):
            folds[fold_index].extend(class_fold_chunks[fold_index])

    area_under_curve_scores = []

    for fold_index in range(num_folds):
        validation_indices = np.array(folds[fold_index])
        training_indices = np.setdiff1d(sample_indices, validation_indices)

        X_train_split = features[training_indices]
        y_train_split = target[training_indices]
        X_val_split = features[validation_indices]
        y_val_split = target[validation_indices]

        knn_model.fit(X_train_split, y_train_split)
        predictions_val = knn_model.predict(X_val_split)

        auc_score = calculate_roc_auc(y_val_split, predictions_val)
        area_under_curve_scores.append(auc_score)

    return np.mean(area_under_curve_scores)

def calculate_tpr_fpr(y_actual, y_probabilities, threshold_values):
    positive_count = np.sum(y_actual == 1)
    negative_count = np.sum(y_actual == 0)

    true_positive_rates, false_positive_rates = zip(*[
        (
            np.sum((y_actual == 1) & (y_probabilities >= threshold)) / positive_count if positive_count > 0 else 0,
            np.sum((y_actual == 0) & (y_probabilities >= threshold)) / negative_count if negative_count > 0 else 0
        )
        for threshold in threshold_values
    ])

    return np.array(true_positive_rates), np.array(false_positive_rates)

def calculate_roc_auc(y_actual, y_probabilities):
    unique_thresholds = np.sort(np.unique(y_probabilities))[::-1]
    true_positive_rates, false_positive_rates = calculate_tpr_fpr(y_actual, y_probabilities, unique_thresholds)

    # Prepend and append 0 and 1 for true and false positive rates
    true_positive_rates = np.r_[0, true_positive_rates, 1]
    false_positive_rates = np.r_[0, false_positive_rates, 1]

    return np.trapz(true_positive_rates, false_positive_rates)

In [7]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

best_k = None
best_metric = None
best_cv_score = 0

for distance_metric in ['euclidean', 'manhattan']:
  for k in [3, 5, 7, 9]:
    knn = KNN(k=k, distance_metric=distance_metric)
    cv_score = cross_validate(X, y, knn)
    print(f"k={k}, distance_metric={distance_metric}, cv_score={cv_score}")
    if cv_score > best_cv_score:
      best_cv_score = cv_score
      best_k = k
      best_metric = distance_metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}, cv_score={best_cv_score}")

knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

k=3, distance_metric=euclidean, cv_score=0.8351045710405733
k=5, distance_metric=euclidean, cv_score=0.8593752033800804
k=7, distance_metric=euclidean, cv_score=0.8725500247753744
k=9, distance_metric=euclidean, cv_score=0.881341260660989
k=3, distance_metric=manhattan, cv_score=0.826956612971762
k=5, distance_metric=manhattan, cv_score=0.8594398663955497
k=7, distance_metric=manhattan, cv_score=0.8733790407515618
k=9, distance_metric=manhattan, cv_score=0.8793407844155899
Best hyperparameters: k=9, distance_metric=euclidean, cv_score=0.881341260660989
