In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 
from pprint import pp

# Data import en opschonen

In [3]:
# Load the semicolon-separated training set.
data = pd.read_csv("../data/train.csv", sep=";")

# These two columns hold numbers written with a decimal comma, so they are
# handled as strings here: swap the comma for a dot, then cast to float.
for column in ("overcrowding", "meaneduc"):
    data[column] = data[column].str.replace(",", ".").astype("float")

In [4]:
# Separate the label column from the feature matrix.
y = data["target"]
X = data.drop("target", axis="columns")

In [5]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training, the remainder to validation.
# The fixed random_state keeps the split reproducible across runs.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=1,
)

# 1x kNN trainen

In [5]:
# Hyperparameters for the single baseline kNN model.
params = dict(n_neighbors=5)

# Build the classifier (n_jobs=-1: use all available cores for neighbour
# queries) and fit it on the training split; fit() returns the estimator.
clf = KNeighborsClassifier(n_jobs=-1, **params).fit(train_X, train_y)

In [6]:
# Mean accuracy of the baseline classifier on the validation split.
clf.score(X=val_X, y=val_y)

0.7334205933682374

# Hyperparameter tuning (grid search)

In [36]:
# Search space: neighbourhood sizes 3..49, Minkowski power p=1 (Manhattan
# distance) and distance-weighted voting.
n_neighbors = np.arange(3, 50)
p = [1]

param_grid = [
    dict(n_neighbors=n_neighbors, p=p, weights=["distance"]),
]

# GridSearchCV is already imported in the notebook's first cell.
# 5-fold cross-validation over every combination, using all cores.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

In [37]:
# Run the cross-validated grid search on the training split.
# NOTE(review): the recorded output below reports 157 candidates, but the grid
# as defined in this notebook yields 47 (n_neighbors 3..49 x one p x one
# weighting) - the output looks stale; re-run to refresh it.
grid.fit(X=train_X, y=train_y)

Fitting 5 folds for each of 157 candidates, totalling 785 fits


In [38]:
# Show the best hyperparameter combination found by the search.
pp(grid.best_params_)

# Predict the validation split with the search's best estimator.
grid_predictions = grid.predict(val_X)

# Per-class precision / recall / F1 on the validation split.
print("\n______________________________________________________")
print(f"Classification report: \n{classification_report(val_y, grid_predictions)}")

{'n_neighbors': 8, 'p': 1, 'weights': 'distance'}

______________________________________________________
Classification report: 
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      1467
           1       0.76      0.65      0.70       825

    accuracy                           0.80      2292
   macro avg       0.79      0.77      0.78      2292
weighted avg       0.80      0.80      0.80      2292

