In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset
data = pd.read_csv('UniversalBank.csv')

# Drop irrelevant columns: ID and ZIP Code
data = data.drop(['ID', 'ZIP Code'], axis=1)

# One-hot encode the 'Education' column
data = pd.get_dummies(data, columns=['Education'], drop_first=True)

# Separate features (X) and target (y)
X = data.drop('Personal Loan', axis=1)
y = data['Personal Loan']

# Standardize the features for k-NN
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training (60%) and validation (40%) sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.4, random_state=42)

# Part (a): k-NN classification with k=1
knn_k1 = KNeighborsClassifier(n_neighbors=1)
knn_k1.fit(X_train, y_train)

# Define the new customer
new_customer = [[40, 10, 84, 2, 2, 0, 0, 0, 1, 1, 1, 0]]  # Includes Education_2 and Education_3
new_customer_scaled = scaler.transform(new_customer)

# Classify the new customer
new_customer_prediction_k1 = knn_k1.predict(new_customer_scaled)
print("Part (a): New customer classification with k=1:", new_customer_prediction_k1[0])

# Part (b): Determine the best k
k_values = range(1, 21)
validation_accuracies = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_val_pred = knn.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    validation_accuracies.append(accuracy)

optimal_k = k_values[np.argmax(validation_accuracies)]
print("Part (b): Optimal k =", optimal_k)

# Part (c): Confusion matrix for the validation data using the best k
knn_best = KNeighborsClassifier(n_neighbors=optimal_k)
knn_best.fit(X_train, y_train)
y_val_pred_best = knn_best.predict(X_val)
confusion_matrix_val = confusion_matrix(y_val, y_val_pred_best)
print("Part (c): Confusion matrix for validation data using best k:\n", confusion_matrix_val)

# Part (d): Classify the same customer using the best k
new_customer_prediction_best_k = knn_best.predict(new_customer_scaled)
print("Part (d): New customer classification with best k:", new_customer_prediction_best_k[0])

# Part (e): Repartition the data into training (50%), validation (30%), and test (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=42)

# Apply the k-NN method with the optimal k
knn_repartitioned = KNeighborsClassifier(n_neighbors=optimal_k)
knn_repartitioned.fit(X_train, y_train)

# Generate predictions for all sets
y_train_pred = knn_repartitioned.predict(X_train)
y_val_pred = knn_repartitioned.predict(X_val)
y_test_pred = knn_repartitioned.predict(X_test)

# Generate confusion matrices
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_val = confusion_matrix(y_val, y_val_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

print("Part (e): Confusion matrix for training data:\n", confusion_matrix_train)
print("Part (e): Confusion matrix for validation data:\n", confusion_matrix_val)
print("Part (e): Confusion matrix for test data:\n", confusion_matrix_test)



Part (a): New customer classification with k=1: 0
Part (b): Optimal k = 1
Part (c): Confusion matrix for validation data using best k:
 [[1782   17]
 [  63  138]]
Part (d): New customer classification with best k: 0
Part (e): Confusion matrix for training data:
 [[2276    0]
 [   0  224]]
Part (e): Confusion matrix for validation data:
 [[1320   15]
 [  60  105]]
Part (e): Confusion matrix for test data:
 [[896  13]
 [ 27  64]]
