# Data Preprocessing:
- Handle missing values and ensure that all relevant features are properly scaled.
- Analyze categorical and numerical variables to determine which are most important for churn prediction.
- Perform feature selection or engineering to improve model performance.

# Model Building:
- Implement K-Nearest Neighbors (KNN) from scratch. Pre-built libraries (e.g., scikit-learn, TensorFlow) for KNN are not allowed.
- Split the dataset and train the model to classify customers into churn and non-churn categories.
- Explore different values for K and choose the optimal one based on performance metrics.
- Tune the model by adjusting hyperparameters such as the distance metric (Euclidean, Manhattan, etc.).

# Model Evaluation:
- Evaluate the model using metrics such as accuracy, precision, recall, F1-score, and area under the ROC curve (ROC AUC).
- Use cross-validation to ensure the model is not overfitting to the training data.

In [3]:
import numpy as np
import pandas as pd

In [5]:
# Define the KNN class
class KNN:
    """K-Nearest Neighbors classifier implemented with NumPy only.

    The model memorises the training set in ``fit`` and, for every query row,
    votes among the ``k`` training rows with the smallest distance.

    Args:
        k: Number of neighbours that take part in the vote.
        distance_metric: One of ``'euclidean'``, ``'manhattan'``,
            ``'cosine'`` or ``'chebyshev'``.
    """

    # Cap on the number of float64 values held in one temporary array while
    # predicting; query rows are processed in blocks sized to respect it.
    _MAX_BLOCK_ELEMENTS = 2 * 10 ** 7

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        # Populated by fit().
        self.X_train = None
        self.y_train = None
        self.classes_ = None
        self._y_codes = None

    def euclidean_distance(self, point1, point2):
        """Return the L2 distance between two points."""
        return np.linalg.norm(point1 - point2)

    def manhattan_distance(self, point1, point2):
        """Return the L1 distance between two points."""
        return np.sum(np.abs(point1 - point2))

    def cosine_distance(self, point1, point2):
        """Return ``1 - cosine similarity`` between two points.

        The similarity of a zero vector is undefined; it is treated as 0, so
        the distance is 1.0 instead of NaN from a division by zero.
        """
        norm_product = np.linalg.norm(point1) * np.linalg.norm(point2)
        if norm_product == 0:
            return 1.0
        return 1 - np.dot(point1, point2) / norm_product

    def chebyshev_distance(self, point1, point2):
        """Return the L-infinity distance between two points."""
        return np.max(np.abs(point1 - point2))

    def fit(self, X, y):
        """Store the training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Returns:
            The fitted estimator itself.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, or its row count does
                not match ``y``.
        """
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y).ravel()
        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if features.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and y must have the same number of samples")

        self.X_train = features
        self.y_train = labels
        # Integer codes let the vote be counted with plain comparisons.
        self.classes_, codes = np.unique(labels, return_inverse=True)
        self._y_codes = np.asarray(codes).ravel()
        return self

    def predict_proba(self, X):
        """Return the share of the k nearest neighbours in each class.

        Args:
            X: 2-D array-like of shape (n_queries, n_features).

        Returns:
            Array of shape (n_queries, n_classes); columns follow the sorted
            order of ``self.classes_``.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If ``X`` has the wrong shape or ``k`` is not in
                ``[1, n_train]``.
        """
        if self.X_train is None:
            raise RuntimeError("fit must be called before predict")
        queries = np.asarray(X, dtype=float)
        n_train, n_features = self.X_train.shape
        if queries.ndim != 2 or queries.shape[1] != n_features:
            raise ValueError("X must have shape (n_queries, %d)" % n_features)
        k = int(self.k)
        if not 1 <= k <= n_train:
            raise ValueError("k must be between 1 and the number of training samples")

        # Manhattan/Chebyshev build an (rows, n_train, n_features) temporary;
        # the other metrics only need (rows, n_train).
        needs_3d = self.distance_metric in ('manhattan', 'chebyshev')
        per_row = n_train * (n_features if needs_3d else 1)
        rows_per_block = max(1, self._MAX_BLOCK_ELEMENTS // max(1, per_row))

        n_classes = len(self.classes_)
        proba = np.zeros((queries.shape[0], n_classes))
        for start in range(0, queries.shape[0], rows_per_block):
            stop = start + rows_per_block
            distances = self.compute_distance(queries[start:stop], self.X_train)
            # argpartition finds the k smallest without a full sort.
            nearest = np.argpartition(distances, k - 1, axis=1)[:, :k]
            neighbour_codes = self._y_codes[nearest]
            for code in range(n_classes):
                proba[start:stop, code] = np.mean(neighbour_codes == code, axis=1)
        return proba

    def predict(self, X):
        """Return the majority class among the k nearest neighbours.

        Vote ties resolve to the smallest label in sorted order.
        """
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]

    def compute_distance(self, X1, X2):
        """Return the pairwise distance matrix between rows of X1 and X2.

        Args:
            X1: Array-like of shape (n1, n_features) (a single point is
                promoted to one row).
            X2: Array-like of shape (n2, n_features).

        Returns:
            Array of shape (n1, n2) using ``self.distance_metric``.

        Raises:
            ValueError: On mismatched feature counts or an unknown metric.
        """
        left = np.atleast_2d(np.asarray(X1, dtype=float))
        right = np.atleast_2d(np.asarray(X2, dtype=float))
        if left.shape[1] != right.shape[1]:
            raise ValueError("X1 and X2 must have the same number of features")

        metric = self.distance_metric
        if metric == 'euclidean':
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b avoids a 3-D temporary.
            squared = (
                np.sum(left ** 2, axis=1)[:, None]
                + np.sum(right ** 2, axis=1)[None, :]
                - 2.0 * (left @ right.T)
            )
            # Rounding can push tiny distances slightly below zero.
            np.maximum(squared, 0.0, out=squared)
            return np.sqrt(squared)
        if metric == 'manhattan':
            return np.abs(left[:, None, :] - right[None, :, :]).sum(axis=2)
        if metric == 'chebyshev':
            return np.abs(left[:, None, :] - right[None, :, :]).max(axis=2)
        if metric == 'cosine':
            dots = left @ right.T
            norm_products = (
                np.linalg.norm(left, axis=1)[:, None]
                * np.linalg.norm(right, axis=1)[None, :]
            )
            # Zero vectors get similarity 0 (distance 1), as in cosine_distance.
            similarity = np.divide(
                dots, norm_products, out=np.zeros_like(dots), where=norm_products > 0
            )
            return 1.0 - similarity
        raise ValueError("Unknown distance metric: %r" % (metric,))

In [6]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into scaled numeric matrices.

    Steps (all statistics come from the training set only, so nothing about
    the test set leaks into the transformation):

    * the ``'Exited'`` column of the training file becomes the target and the
      ``'id'`` column is dropped from the features;
    * numeric columns: missing values are replaced by the train median, then
      the column is standardised with the train mean and standard deviation;
    * non-numeric columns with at most 20 distinct train values: missing
      values become their own ``'missing'`` category and the column is
      one-hot encoded; categories that only occur in the test set encode as
      all zeros;
    * non-numeric columns with more distinct values are dropped.

    NOTE(review): numeric identifier-like columns other than ``'id'`` are
    kept as ordinary features; drop them upstream if the dataset has any.

    Args:
        train_path: Path (or file-like object) of the training CSV.
        test_path: Path (or file-like object) of the test CSV.

    Returns:
        Tuple ``(X, y, X_test)`` of NumPy arrays: training features, training
        labels, and test features with the same column layout as ``X``.

    Raises:
        ValueError: If the training data has no ``'Exited'`` column.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    target_column = 'Exited'
    id_column = 'id'
    max_categories = 20

    if target_column not in train_data.columns:
        raise ValueError("training data must contain an 'Exited' column")
    y = train_data[target_column].to_numpy()

    dropped = [target_column, id_column]
    train_features = train_data.drop(columns=dropped, errors='ignore')
    test_features = test_data.drop(columns=dropped, errors='ignore')

    # Only columns available in both files can be used at prediction time.
    shared = [c for c in train_features.columns if c in test_features.columns]
    numeric_cols = [
        c for c in shared if pd.api.types.is_numeric_dtype(train_features[c])
    ]
    categorical_cols = [
        c for c in shared
        if c not in numeric_cols
        and train_features[c].nunique(dropna=True) <= max_categories
    ]

    train_parts = []
    test_parts = []

    if numeric_cols:
        train_num = train_features[numeric_cols].apply(pd.to_numeric, errors='coerce')
        test_num = test_features[numeric_cols].apply(pd.to_numeric, errors='coerce')
        # A column that is entirely missing has a NaN median; fall back to 0.
        medians = train_num.median().fillna(0.0)
        train_num = train_num.fillna(medians)
        test_num = test_num.fillna(medians)
        means = train_num.mean()
        # Constant columns have zero spread; dividing by 1 keeps them at 0.
        stds = train_num.std(ddof=0).replace(0, 1.0)
        train_parts.append(((train_num - means) / stds).to_numpy(dtype=float))
        test_parts.append(((test_num - means) / stds).to_numpy(dtype=float))

    if categorical_cols:
        train_cat = train_features[categorical_cols].astype(object).fillna('missing').astype(str)
        test_cat = test_features[categorical_cols].astype(object).fillna('missing').astype(str)
        train_dummies = pd.get_dummies(train_cat, columns=categorical_cols, dtype=float)
        test_dummies = pd.get_dummies(test_cat, columns=categorical_cols, dtype=float)
        # Align the test encoding to the columns learned from the train set.
        test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0.0)
        train_parts.append(train_dummies.to_numpy(dtype=float))
        test_parts.append(test_dummies.to_numpy(dtype=float))

    if train_parts:
        X = np.hstack(train_parts)
        X_test = np.hstack(test_parts)
    else:
        X = np.empty((len(train_data), 0))
        X_test = np.empty((len(test_data), 0))

    return X, y, X_test

In [7]:
# Define cross-validation function
def _roc_auc(is_positive, scores):
    """Return the ROC AUC of *scores* for boolean labels, or NaN if undefined.

    Uses the rank-sum (Mann-Whitney U) formulation with average ranks so that
    tied scores contribute one half each.
    """
    n_pos = int(np.count_nonzero(is_positive))
    n_neg = int(is_positive.size) - n_pos
    if n_pos == 0 or n_neg == 0:
        # AUC needs at least one sample of each class.
        return float('nan')
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    last_rank = np.cumsum(counts)
    # A tie group occupying ranks (last - count + 1 .. last) shares their mean.
    mean_rank = last_rank - (counts - 1) / 2.0
    ranks = mean_rank[np.asarray(inverse).ravel()]
    rank_sum = ranks[is_positive].sum()
    return float((rank_sum - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg))


def cross_validate(X, y, knn, n_splits=5):
    """Estimate ROC AUC with stratified k-fold cross-validation.

    Samples of every class are shuffled with a fixed seed and dealt
    round-robin into ``n_splits`` folds, so each fold keeps roughly the
    overall class balance. For each fold the model is refit on the remaining
    folds and scored on the held-out one. The largest label in ``y`` is
    treated as the positive class.

    Scores come from ``knn.predict_proba`` when the model has it (last column
    is used, assumed to belong to the largest label), otherwise from
    ``knn.predict``, whose output must then be numeric.

    Args:
        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like of n_samples labels.
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``; it is refit in
            place and is left fitted on the last training split.
        n_splits: Number of folds (at least 2, at most n_samples).

    Returns:
        NumPy array with one ROC AUC value per fold (NaN where a held-out
        fold is empty or contains a single class).

    Raises:
        ValueError: On mismatched lengths or an invalid ``n_splits``.
        RuntimeError: If the model returns ``None`` instead of predictions.
    """
    features = np.asarray(X)
    labels = np.asarray(y).ravel()
    if features.shape[0] != labels.shape[0]:
        raise ValueError("X and y must have the same number of samples")
    if n_splits < 2 or n_splits > labels.size:
        raise ValueError("n_splits must be between 2 and the number of samples")

    classes = np.unique(labels)
    positive_label = classes[-1]

    # Fixed seed keeps the folds reproducible between runs.
    rng = np.random.default_rng(0)
    fold_of = np.empty(labels.size, dtype=int)
    for label in classes:
        members = rng.permutation(np.flatnonzero(labels == label))
        fold_of[members] = np.arange(members.size) % n_splits

    fold_scores = []
    for fold in range(n_splits):
        held_out = fold_of == fold
        if not held_out.any():
            fold_scores.append(float('nan'))
            continue
        knn.fit(features[~held_out], labels[~held_out])
        if hasattr(knn, 'predict_proba'):
            raw = knn.predict_proba(features[held_out])
        else:
            raw = knn.predict(features[held_out])
        if raw is None:
            raise RuntimeError("the model returned None instead of predictions")
        raw = np.asarray(raw, dtype=float)
        if raw.ndim == 2:
            raw = raw[:, -1]
        fold_scores.append(_roc_auc(labels[held_out] == positive_label, raw.ravel()))
    return np.array(fold_scores)

In [None]:
# Load and preprocess data
train_path = '/path/of/train.csv'
test_path = '/path/of/test.csv'
X, y, X_test = preprocess_data(train_path, test_path)

# Create and evaluate a baseline model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning: grid-search k and the distance metric, keeping the
# configuration with the highest mean cross-validation score. The baseline
# above is the starting point, so the result is never worse than it.
best_params = {'k': knn.k, 'distance_metric': knn.distance_metric}
best_score = np.mean(cv_scores)
for distance_metric in ('euclidean', 'manhattan', 'chebyshev', 'cosine'):
    for k in (3, 5, 7, 9, 11, 15, 21):
        candidate = KNN(k=k, distance_metric=distance_metric)
        candidate_score = np.mean(cross_validate(X, y, candidate))
        if candidate_score > best_score:
            best_score = candidate_score
            best_params = {'k': k, 'distance_metric': distance_metric}

print("Best hyperparameters:", best_params, "mean CV score:", best_score)

# Train on the full dataset with the best hyperparameters and predict the test set
knn = KNN(**best_params)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions alongside the ids from the test file
test_ids = pd.read_csv(test_path)['id']
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv('submissions.csv', index=False)