In [70]:
import numpy as np
import pandas as pd

In [71]:
# Define the KNN class
class KNN:
    """k-nearest-neighbours classifier using a majority vote.

    Labels must be non-negative integers (or castable to them) because votes
    are counted with ``np.bincount``. When several labels share the highest
    vote count the smallest one wins, since ``np.argmax`` returns the first
    maximum.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction.
    distance_metric : str
        ``'euclidean'`` (L2) or ``'manhattan'`` (L1). Any other value makes
        ``compute_distance`` raise ``ValueError``.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training set; KNN has no training step beyond this.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like of labels, same length as ``X``
        """
        # Converting to ndarrays makes later indexing positional (a pandas
        # Series would be indexed by label), and the float cast keeps
        # unsigned/integer features from wrapping around when differences
        # are squared.
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Returns
        -------
        list
            One predicted label (a NumPy integer) per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict called before fit")

        y_pred = []
        for x in np.asarray(X, dtype=float):
            distances = self.compute_distance(self.X_train, x)
            # Indices of the k training rows closest to x.
            k_indices = np.argsort(distances)[:self.k]
            votes = np.bincount(self.y_train[k_indices].astype(int))
            y_pred.append(np.argmax(votes))
        return y_pred

    def compute_distance(self, X1, X2):
        """Return the distance from each row of ``X1`` to the point ``X2``.

        The metric is chosen by ``self.distance_metric``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric name.
        """
        diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        raise ValueError(
            "Unsupported distance_metric %r; expected 'euclidean' or 'manhattan'"
            % (self.distance_metric,)
        )

In [72]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return standardised numeric feature arrays.

    The training file must contain the columns ``'Exited'`` (target) and
    ``'id'``; the test file must contain ``'id'``. Only numeric feature
    columns are kept: non-numeric (e.g. categorical/text) columns are dropped,
    not encoded.

    Parameters
    ----------
    train_path : str or path-like
        CSV file with features, ``'id'`` and ``'Exited'``.
    test_path : str or path-like
        CSV file with features and ``'id'``.

    Returns
    -------
    X_train : np.ndarray
        Training features scaled to zero mean and unit standard deviation.
    y_train : np.ndarray
        Integer target values from ``'Exited'``.
    X_test : np.ndarray
        Test features scaled with the training mean and standard deviation.

    Raises
    ------
    KeyError
        If a required column, or a numeric training feature column, is missing
        from the corresponding file.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Assuming 'Exited' is the target variable and 'id' is a non-feature column
    y_train = train_data['Exited'].astype(int).values
    train_features = train_data.drop(columns=['Exited', 'id'])
    test_features = test_data.drop(columns=['id'])

    # Pick the numeric columns once, from the training frame, and use that same
    # list for the test frame so both matrices share column order and width.
    numeric_cols = train_features.select_dtypes(include=[np.number]).columns
    X_train = train_features[numeric_cols].to_numpy(dtype=float)
    X_test = test_features[numeric_cols].to_numpy(dtype=float)

    # Normalize/scale the features (mean = 0, std = 1) using training statistics
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    # A constant column has std 0; divide by 1 instead so it becomes all zeros
    # rather than NaN/inf.
    std = np.where(std == 0, 1.0, std)
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test

In [73]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with contiguous, unshuffled k-fold cross-validation.

    The data is cut into ``n_splits`` consecutive folds in its existing order;
    the last fold absorbs any remainder rows. For each fold the model is
    refit on the other folds and scored on the held-out one.

    Note: the score is classification **accuracy** (fraction of held-out
    predictions equal to the true label), not ROC AUC.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like of labels, same length as ``X``
    knn : object with ``fit(X, y)`` and ``predict(X)`` methods
    n_splits : int
        Number of folds; must satisfy ``2 <= n_splits <= len(X)``.

    Returns
    -------
    list
        One accuracy value per fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same length, got %d and %d" % (n_samples, len(y))
        )
    # Fewer than 2 folds leaves no training data; more folds than samples
    # would create empty validation folds whose mean accuracy is NaN.
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            "n_splits must be between 2 and the number of samples (%d), got %r"
            % (n_samples, n_splits)
        )

    fold_size = n_samples // n_splits
    accuracy_scores = []

    for i in range(n_splits):
        start = i * fold_size
        # The final fold runs to the end so remainder rows are not dropped.
        end = (i + 1) * fold_size if i < n_splits - 1 else n_samples

        X_val = X[start:end]
        y_val = y[start:end]

        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])

        knn.fit(X_train, y_train)
        y_pred = np.asarray(knn.predict(X_val))

        # Accuracy on the held-out fold
        accuracy_scores.append(np.mean(y_pred == y_val))

    return accuracy_scores

In [74]:
# Read both CSV files and turn them into scaled feature arrays plus labels
X, y, X_test = preprocess_data(
    train_path='/content/train.csv',
    test_path='/content/test.csv',
)

# Baseline model: 5 neighbours, Euclidean distance
knn = KNN(5, 'euclidean')

# Score the baseline with the default number of cross-validation folds
cv_scores = cross_validate(X, y, knn=knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparameter tuning
def hyperparameters_tuning(X, knn, labels=None, k_values=(3, 5, 7, 9)):
    """Pick the ``k`` with the highest mean cross-validation score.

    A fresh ``KNN`` is built for every candidate ``k`` and scored with
    ``cross_validate``; the best ``k`` and its mean score are printed.

    Parameters
    ----------
    X : array-like
        Training features.
    knn : KNN or None
        Template model. Only its ``distance_metric`` is reused for the
        candidates (``'euclidean'`` if the attribute is absent); the object
        itself is not modified.
    labels : array-like, optional
        Training labels. When omitted, the notebook-level variable ``y`` is
        used so the original two-argument call keeps working.
    k_values : iterable of int
        Candidate neighbour counts to try.

    Returns
    -------
    int or None
        The first ``k`` that achieved the highest mean score, or ``None`` if
        no candidate scored above -1 (e.g. every mean score was NaN).

    Raises
    ------
    ValueError
        If ``k_values`` is empty.
    """
    if labels is None:
        labels = y  # fall back to the notebook-level labels

    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one candidate k")

    metric = getattr(knn, 'distance_metric', 'euclidean')

    best_k = None
    best_score = -1

    for k in k_values:
        candidate = KNN(k=k, distance_metric=metric)
        avg_score = np.mean(cross_validate(X, labels, candidate))
        # Strict '>' keeps the earliest candidate when scores tie.
        if avg_score > best_score:
            best_score = avg_score
            best_k = k

    print("Best k:", best_k)
    print("Best score:", best_score)
    return best_k

# Search for the best neighbour count
best_k = hyperparameters_tuning(X, knn)

# Refit on the complete training set with the selected k, then label the test rows
knn = KNN(best_k, 'euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Write the submission file: one row per test id with its predicted 'Exited' value
test_ids = pd.read_csv('/content/test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

Cross-validation scores: [0.8656666666666667, 0.874, 0.8753333333333333, 0.8683333333333333, 0.8733333333333333]
Best k: 9
Best score: 0.8750666666666668
