In [1]:
import pandas as pd

In [2]:
# Load the data
banking_df = pd.read_csv('bank-additional-full.csv', sep = ';')

In [3]:
banking_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
banking_df["y"] = banking_df["y"].apply(lambda x: 1 if x=="yes" else 0)

In [5]:
banking_df.dtypes.value_counts()

object     10
int64       6
float64     5
Name: count, dtype: int64

In [6]:
banking_df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [7]:
banking_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911,0.112654
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528,0.316173
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,0.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,0.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1,1.0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [9]:
banking_df = pd.get_dummies(data = banking_df, drop_first = True)

In [10]:
correlations = abs(banking_df.corr())

In [11]:
top_5_features = correlations["y"].sort_values(ascending=False)[1:6].index

In [12]:
print(correlations["y"].sort_values(ascending=False)[1:6])

duration            0.405274
nr.employed         0.354678
pdays               0.324914
poutcome_success    0.316269
euribor3m           0.307771
Name: y, dtype: float64


In [13]:
X = banking_df.drop(["y"], axis=1)
y = banking_df["y"]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X[top_5_features], y, test_size=0.2, random_state=417)

In [15]:
scaler = MinMaxScaler()

In [16]:
X_train_scaled = scaler.fit_transform(X_train)

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Create a list that stores integers from 1 to 5
num_neighbors = [num for num in range(1, 6)]

In [19]:
X_val_scaled = scaler.fit_transform(X_val)

In [20]:
# Create an empty dictionary named "accuracies."
accuracies = {}

In [21]:
for neighbors in num_neighbors:
    knn = KNeighborsClassifier(n_neighbors = neighbors)
    knn.fit(X_train_scaled, y_train)
    val_accuracy = knn.score(X_val_scaled, y_val)
    accuracies[neighbors] = val_accuracy

print(accuracies)
    

{1: 0.8611313425588735, 2: 0.8924496236950716, 3: 0.8792182568584608, 4: 0.8942704539936878, 5: 0.8911143481427531}


Using Weight points in each neighborhood by the inverse of their distance and set the power parameter to 5.

In [22]:
for neighbors in num_neighbors:
    knn1 = KNeighborsClassifier(n_neighbors = neighbors, weights = "distance", p = 5)
    knn1.fit(X_train_scaled, y_train)
    val_accuracy1 = knn1.score(X_val_scaled, y_val)
    accuracies[neighbors] = val_accuracy1
    
print(accuracies)

{1: 0.8628307841709153, 2: 0.8664724447681476, 3: 0.8784899247390143, 4: 0.8798252002913328, 5: 0.8846807477543093}


Using GridSearch

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X[top_5_features], y, test_size=0.2, random_state=417)

In [25]:
grid_params = {"n_neighbors": range(1, 10),
                "metric": ["minkowski", "manhattan"]
              }


In [26]:
 knn1 = KNeighborsClassifier()

In [27]:
knn_search = GridSearchCV(knn1, grid_params, scoring='accuracy')

In [28]:
knn_search.fit(X_train_scaled, y_train)

In [30]:
best_score = knn_search.best_score_
best_params = knn_search.best_params_

In [31]:
print(f"Best model's accuracy: {best_score*100:.2f}")
print(f"Best model's parameters: {best_params}")

Best model's accuracy: 90.99
Best model's parameters: {'metric': 'minkowski', 'n_neighbors': 9}


In [33]:
X_test_scaled = scaler.transform(X_test)
accuracy = knn_search.best_estimator_.score(X_test_scaled, y_test)

In [34]:
print(f" Model Accuracy on test set: {accuracy*100:.2f}")

 Model Accuracy on test set: 89.31
