In [152]:
import numpy as np
import pandas as pd
import gower

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [153]:
# Define the KNN class
def most_common(lst):
    '''Returns the most common element in a list'''
    return max(set(lst), key=lst.count)

class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """
        Predict method for KNN, computes predictions for the test data.
        """
    
        neighbors = []
        for x in X.itertuples(index=False):
            distances = self.compute_distance(x, self.X_train)
            y_sorted = [y for _, y in sorted(zip(distances, self.y_train))]
            neighbors.append(y_sorted[:self.k])
        return list(map(most_common, neighbors))
            

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X2 - X1) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))
        elif self.distance_metric == 'gower':
            # Use Gower distance (X2 is a DataFrame, X1 is a single row)
            X1_reshaped = np.array(X1).reshape(1, -1)  # Reshape to 2D for compatibility
            return gower.gower_matrix(X2, X1_reshaped).flatten()

In [154]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc
    
    X = train_data.drop(['id', 'CustomerId', 'Surname', 'Exited'], axis=1)
    y = train_data['Exited']
    X_test = test_data.drop(['id', 'CustomerId', 'Surname'], axis=1)
    
    # process xtrain and test
    def process(df, scalar=None):
        #encode geography and gender
        le = LabelEncoder()
        df['Geography'] = le.fit_transform(df['Geography'])
        df['Gender'] = le.fit_transform(df['Gender'])
        #scale everyting
        if scalar is None:  
            scalar = StandardScaler()
            df_scaled = scalar.fit_transform(df)
        else:  
            df_scaled = scalar.transform(df)
        df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
        return df_scaled, scalar
            
    X, scalar = process(X)
    X_test, _ = process(X_test, scalar)
    
    return X, y, X_test

In [155]:
X, y, X_test = preprocess_data('./train.csv', './test.csv')

# knn = KNN(k=5, distance_metric='euclidean')


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNN(k=5, distance_metric='gower')
knn.fit(X_train, y_train)

y_val_pred = knn.predict(X_val)

accuracy = np.sum(y_val_pred == y_val) / len(y_val)
print(f"Validation Accuracy: {accuracy}")


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [156]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    auc_scores = []
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit the model on the training set
        knn.fit(X_train, y_train)

        # Make predictions on the validation set
        y_val_pred = knn.predict(X_val)
        print(y_val_pred)

        # Calculate the ROC AUC score
        auc = roc_auc_score(y_val, y_val_pred)
        auc_scores.append(auc)
    
    
    return auc_scores

In [157]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')



# # Create and evaluate model
# knn = KNN(k=5, distance_metric='manhattan')

# # Perform cross-validation
# cv_scores = cross_validate(X, y, knn)

# print("Cross-validation scores:", cv_scores)




# knn = KNN(k=5, distance_metric='euclidean')
# cv_scores = cross_validate(X, y, knn)
# print("Cross-validation scores:", cv_scores)



k_scores = []

for i in range(10, 26, 2):
    knn = KNN(k=i, distance_metric='gower')
    cv_scores = cross_validate(X, y, knn, n_splits=3)
    print("==========================")
    print(f"When k = {i}")
    print("Cross-validation scores:", cv_scores)
    print(np.mean(cv_scores))
    k_scores.append(np.mean(cv_scores))
    
print(k_scores)
    
    
# TODO: hyperparamters tuning


# # TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
# knn = KNN(k=5, distance_metric='euclidean')
# knn.fit(X, y)
# test_predictions = knn.predict(X_test)

# # Save test predictions
# pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [146]:
# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=20, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)
# print(test_predictions)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [149]:
print(len(test_predictions))

df = pd.read_csv('submissions.csv')

print(len(df))

10000
10000
