# Data Preprocessing:
- Handle missing values and ensure that all relevant features are properly scaled.
- Analyze categorical and numerical variables to determine which are most important for churn prediction.
- Perform feature selection or engineering to improve model performance.

# Model Building:
- Implement K-Nearest Neighbors (KNN) from scratch. Pre-built libraries (e.g., scikit-learn, TensorFlow) for KNN are not allowed.
- Split the dataset and train the model to classify customers into churn and non-churn categories.
- Explore different values for K and choose the optimal one based on performance metrics.
- Tune the model by adjusting hyperparameters such as the distance metric (Euclidean, Manhattan, etc.).

# Model Evaluation:
- Evaluate the model using metrics such as accuracy, precision, recall, F1-score, area under the ROC curve.
- Use cross-validation to ensure the model is not overfitting to the training data.

In [34]:
import numpy as np
import pandas as pd

In [35]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean', p=2):
        self.k = k
        self.distance_metric = distance_metric
        self.p = p  # For Minkowski distance
    
    def euclidean_distance(self, X1, X2):
        return np.sqrt(np.sum((X1 - X2)**2, axis=1))
    
    def manhattan_distance(self, X1, X2):
        return np.sum(np.abs(X1 - X2), axis=1)

    def cosine_distance(self, X1, X2):
        dot_product = np.sum(X1 * X2, axis=1)
        norm_X1 = np.linalg.norm(X1, axis=1)
        norm_X2 = np.linalg.norm(X2)
        return 1 - dot_product / (norm_X1 * norm_X2)
    
    def minkowski_distance(self, point1, point2):
        return np.power(np.sum(np.power(np.abs(point1 - point2), self.p)), 1/self.p)

    def chebyshev_distance(self, point1, point2):
        return np.max(np.abs(point1 - point2))

    def compute_distance(self, X1, X2):
        if self.distance_metric == "euclidean":
            return self.euclidean_distance(X1, X2)
        elif self.distance_metric == 'manhattan':
            return self.manhattan_distance(X1, X2)
        elif self.distance_metric == 'cosine':
            return self.cosine_distance(X1, X2)
        elif self.distance_metric == 'minkowski':
            return self.minkowski_distance(X1, X2)
        elif self.distance_metric == 'chebyshev':
            return self.chebyshev_distance(X1, X2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

    # Use Training data to store the data in KNN Model
    def fit(self, X, y):
        # TODO: Implement the fit method
        self.train_x = np.array(X)
        self.train_y = np.array(y)
        if (len(self.train_x) != len(self.train_y)):
            raise ValueError(f"Train X and Train Y size differ, Train size: {len(self.train_x)} Test size: {len(self.train_y)}")
        self.train_size = len(self.train_x)
        
    def n_closest_to(self, X):
        distances = self.compute_distance(self.train_x, X)
        return np.argsort(distances)[:self.k]
    
    def majority(self, closest_points):
        labels = self.train_y[closest_points]
        unique_labels, counts = np.unique(labels, return_counts=True)
        return unique_labels[np.argmax(counts)]

    # Predict the Class of Test_X data 
    def predict(self, X_test):
        predictions = np.empty(X_test.shape[0], dtype=self.train_y.dtype)
        for i, x in enumerate(X_test):
            closest_points = self.n_closest_to(x)
            predictions[i] = self.majority(closest_points)
        return predictions
    
    def predict_proba(self, X_test):
        probabilities = np.empty(X_test.shape[0])
        for i, x in enumerate(X_test):
            closest_points = self.n_closest_to(x)
            closest_labels = self.train_y[closest_points]
            probabilities[i] = np.mean(closest_labels)
        return probabilities

class SimpleEnsembleKNN:
    def __init__(self, n_models=5):
        self.models = [KNN(k=np.random.randint(3, 20)) for _ in range(n_models)]
    
    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)
    
    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.models])
        return np.round(np.mean(predictions, axis=0)).astype(int)
    
    def predict_proba(self, X):
        probas = np.array([model.predict_proba(X) for model in self.models])
        return np.mean(probas, axis=0)


In [39]:
def create_new_features(X):
    X['Age_squared'] = X['Age'] ** 2
    X['Balance_log'] = np.log1p(X['Balance'])
    X['CreditScore_scaled'] = X['CreditScore'] / 1000
    X['Age_CreditScore_interaction'] = X['Age'] * X['CreditScore_scaled']
    return X

def simple_feature_selection(X, y, k=5):
    y_series = pd.Series(y) if isinstance(y, np.ndarray) else y
    correlations = X.corrwith(y_series).abs().sort_values(ascending=False)
    selected_features = correlations.head(k).index
    return X[selected_features], selected_features

def svd_reduction(X, X_test, n_components=2):
    # X의 SVD 수행
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    
    # 처음 n_components 개의 특이값과 특이벡터만 사용
    U_reduced = U[:, :n_components]
    s_reduced = s[:n_components]
    Vt_reduced = Vt[:n_components, :]
    
    # 축소된 특징 공간으로 데이터 변환
    X_reduced = np.dot(U_reduced, np.diag(s_reduced))
    
    # 테스트 데이터를 같은 특징 공간으로 변환
    X_test_reduced = np.dot(X_test, Vt_reduced.T)
    
    return X_reduced, X_test_reduced
# Manually implementing one-hot encoding
def manual_one_hot_encoding(df, column):
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataframe")
    
    unique_values = df[column].unique()  # Find unique values in the column
    
    for unique_value in unique_values:
        new_column_name = f"{column}_{unique_value}"
        df[new_column_name] = df[column].apply(lambda x: 1 if x == unique_value else 0)
    
    df = df.drop(column, axis=1)  # Drop the original column
    return df

# Function to apply manual feature scaling
def manual_standard_scaler(df, column):
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataframe")
    
    mean = np.mean(df[column])
    std = np.std(df[column])
    
    # Apply the standardization formula
    df[column] = (df[column] - mean) / std
    return df

def preprocess_common(data, y=None, selected_features=None):
    data = data.drop(['id', 'CustomerId', 'Surname'], axis=1)
    data = manual_one_hot_encoding(data, 'Geography')
    data = manual_one_hot_encoding(data, 'Gender')
    
    data = create_new_features(data)
    
    if y is not None and selected_features is None:
        data, selected_features = simple_feature_selection(data, y)
    elif selected_features is not None:
        data = data[selected_features]
    
    for column in ['CreditScore', 'Age', 'Balance', 'CreditScore_scaled']:
        if column in data.columns:
            data = manual_standard_scaler(data, column)
    
    return data.to_numpy(), selected_features

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    X_test = pd.read_csv(test_path)
    
    y = train_data['Exited']  # Keep y as a Pandas Series
    X, selected_features = preprocess_common(train_data.drop(['Exited'], axis=1), y)
    X_test, _ = preprocess_common(X_test, selected_features=selected_features)
    
    return X, y.to_numpy(), X_test

def preprocess_data_svd(train_path, test_path, n_components=2):
    X, y, X_test = preprocess_data(train_path, test_path)
    X_reduced, X_test_reduced = svd_reduction(X, X_test, n_components=n_components)
    return X_reduced, y, X_test_reduced

In [37]:
def compute_metrics(y_true, y_pred):
    confusion_matrix = np.zeros((2, 2))
    for i in range(2):
        for j in range(2):
            confusion_matrix[i, j] = np.sum((y_true == i) & (y_pred == j))
    
    tn, fp, fn, tp = confusion_matrix.ravel()
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

def compute_roc_auc(y_true, y_scores):
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true = y_true[sorted_indices]
    
    tp_cumsum = np.cumsum(y_true)
    fp_cumsum = np.cumsum(1 - y_true)
    
    tpr = tp_cumsum / np.sum(y_true)
    fpr = fp_cumsum / np.sum(1 - y_true)
    
    # AUC 계산
    auc = np.trapz(tpr, fpr)
    
    return auc

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    metrics = {
        'accuracy': [], 'precision': [], 'recall': [], 'f1_score': [], 'roc_auc': []
    }

    for i in range(n_splits):
        start = i * fold_size
        end = start + fold_size if i < n_splits - 1 else len(X)
        
        X_train = np.concatenate([X[:start], X[end:]])
        y_train = np.concatenate([y[:start], y[end:]])
        X_test = X[start:end]
        y_test = y[start:end]
        
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        y_scores = knn.predict_proba(X_test)

        accuracy, precision, recall, f1 = compute_metrics(y_test, y_pred)
        roc_auc = compute_roc_auc(y_test, y_scores)

        for metric, value in zip(metrics.keys(), [accuracy, precision, recall, f1, roc_auc]):
            metrics[metric].append(value)

    return {metric: np.mean(values) for metric, values in metrics.items()}

In [18]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')
X
# Create and evaluate model
knn = KNN(k=5, distance_metric='cosine')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)


Cross-validation scores: {'accuracy': 0.8586666666666666, 'precision': 0.7185034965841597, 'recall': 0.4959334840734737, 'f1_score': 0.5864332190375403, 'roc_auc': 0.8412420520662793}


In [149]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

def compute_model_score(X, y, k, distance_metric):
    # KNN 모델 생성 및 평가
    knn = KNN(k=k, distance_metric=distance_metric)
    scores = cross_validate(X, y, knn)  # 사용자 정의 cross-validation 함수
    return scores # 평균 점수 반환

# TODO: hyperparamters tuning
k_values = [3, 5, 10, 15]
distance_metrics = ['euclidean', 'manhattan']

best_score = 0
best_params = {}

for k in k_values:
    for distance_metric in distance_metrics:
        score = compute_model_score(X, y, k, distance_metric)
        print(f"k: {k}, distance_metric: {distance_metric}, Score: {score}\n")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# knn = ...
# knn.fit(X, y)
# test_predictions = knn.predict(X_test)

# # Save test predictions
# pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: {'accuracy': 0.7716000000000001, 'precision': 0.2668292377308332, 'recall': 0.0725209283940105, 'f1_score': 0.11375525816834915, 'roc_auc': -0.5335686112592383}
k: 3, distance_metric: euclidean, Score: {'accuracy': 0.7558666666666667, 'precision': 0.28402735013687613, 'recall': 0.13439466270438308, 'f1_score': 0.18205683315625296, 'roc_auc': -0.5442510683978025}

k: 3, distance_metric: manhattan, Score: {'accuracy': 0.7641333333333333, 'precision': 0.3159710056110793, 'recall': 0.14201601155555837, 'f1_score': 0.19571620765256964, 'roc_auc': -0.5759630274095184}

k: 3, distance_metric: chebyshev, Score: {'accuracy': 0.7461333333333333, 'precision': 0.24686502150063233, 'recall': 0.1231219750833908, 'f1_score': 0.1637603948192148, 'roc_auc': -0.5218395882741795}

k: 5, distance_metric: euclidean, Score: {'accuracy': 0.7716000000000001, 'precision': 0.2668292377308332, 'recall': 0.0725209283940105, 'f1_score': 0.11375525816834915, 'roc_auc': -0.5335686112592383}


In [41]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

def compute_model_score(X, y, k, distance_metric, p=2):
    knn = KNN(k=k, distance_metric=distance_metric, p=p)
    scores = cross_validate(X, y, knn)
    return scores['roc_auc']

# Hyperparameters tuning
k_values = [3, 5, 7, 10, 15, 20]
distance_metrics = ['euclidean', 'manhattan', 'cosine', 'minkowski', 'chebyshev']
p_values = [1, 2, 3]  # For Minkowski distance

best_score = 0
best_params = {}
results = []

# Grid search
for k in k_values:
    for distance_metric in distance_metrics:
        if distance_metric == 'minkowski':
            for p in p_values:
                score = compute_model_score(X, y, k, distance_metric, p)
                results.append({
                    'k': k,
                    'distance_metric': distance_metric,
                    'p': p,
                    'roc_auc': score
                })
                print(f"k: {k}, distance_metric: {distance_metric}, p: {p}, ROC AUC: {score:.4f}")
                
                if score > best_score:
                    best_score = score
                    best_params = {'k': k, 'distance_metric': distance_metric, 'p': p}
        else:
            score = compute_model_score(X, y, k, distance_metric)
            results.append({
                'k': k,
                'distance_metric': distance_metric,
                'p': None,
                'roc_auc': score
            })
            print(f"k: {k}, distance_metric: {distance_metric}, ROC AUC: {score:.4f}")
            
            if score > best_score:
                best_score = score
                best_params = {'k': k, 'distance_metric': distance_metric, 'p': None}

print("\nBest Parameters:")
print(f"k: {best_params['k']}")
print(f"Best Distance Metric: {best_params['distance_metric']}")
print(f"Best ROC AUC Score: {best_score:.4f}")

# Convert results to a DataFrame and sort by ROC AUC
import pandas as pd

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('roc_auc', ascending=False)

print("\nTop 5 Models:")
print(results_df.head())

# Final Model
best_knn = KNN(k=best_params['k'], distance_metric=best_params['distance_metric'])
final_scores = cross_validate(X, y, best_knn)

print("\nFinal Model Performance:")
for metric, score in final_scores.items():
    print(f"{metric}: {score:.4f}")

# Test Data Prediction
best_knn.fit(X, y)
test_predictions = best_knn.predict(X_test)
test_probabilities = best_knn.predict_proba(X_test)

print("\nComplete Predictions")

k: 3, distance_metric: euclidean, ROC AUC: 0.8104
k: 3, distance_metric: manhattan, ROC AUC: 0.8121
k: 3, distance_metric: cosine, ROC AUC: 0.8227
k: 3, distance_metric: minkowski, p: 1, ROC AUC: 0.4998
k: 3, distance_metric: minkowski, p: 2, ROC AUC: 0.4998
k: 3, distance_metric: minkowski, p: 3, ROC AUC: 0.4998
k: 3, distance_metric: chebyshev, ROC AUC: 0.4998
k: 5, distance_metric: euclidean, ROC AUC: 0.8329
k: 5, distance_metric: manhattan, ROC AUC: 0.8356
k: 5, distance_metric: cosine, ROC AUC: 0.8476
k: 5, distance_metric: minkowski, p: 1, ROC AUC: 0.4998
k: 5, distance_metric: minkowski, p: 2, ROC AUC: 0.4998
k: 5, distance_metric: minkowski, p: 3, ROC AUC: 0.4998
k: 5, distance_metric: chebyshev, ROC AUC: 0.4998
k: 7, distance_metric: euclidean, ROC AUC: 0.8450
k: 7, distance_metric: manhattan, ROC AUC: 0.8491
k: 7, distance_metric: cosine, ROC AUC: 0.8619
k: 7, distance_metric: minkowski, p: 1, ROC AUC: 0.4998
k: 7, distance_metric: minkowski, p: 2, ROC AUC: 0.4998
k: 7, dista

In [43]:
# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

knn = KNN(k=20, distance_metric='manhattan')
knn.fit(X, y)
test_predictions = knn.predict_proba(X_test)

# # Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)