In [9]:
import time

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier




In [None]:
def KNeighbors_test(path, cv=5, test_size=0.3, random_state=None):
    """Tune and evaluate a k-nearest-neighbours classifier on a CSV dataset.

    The CSV is loaded, split into train and test subsets, and a
    ``KNeighborsClassifier`` is tuned over ``n_neighbors`` in ``[7, 8, 9]``
    with a cross-validated grid search on the training subset. The best
    estimator is then scored on the held-out test subset. All results are
    printed, and the confusion matrix is drawn as a seaborn heatmap.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``. The file
        must contain a ``"Label"`` column holding the target; every other
        column is used as a feature (presumably numeric, since the values are
        fed straight to the distance-based classifier -- confirm for your data).
    cv : int or cross-validation splitter, default 5
        Cross-validation strategy forwarded to ``GridSearchCV``.
    test_size : float or int, default 0.3
        Size of the test subset, forwarded to ``train_test_split``.
    random_state : int or None, default None
        Seed for the train/test split. When ``None`` a seed is drawn at random
        from ``[0, 123)``; the seed in use is always printed so a run can be
        repeated by passing it back in.

    Returns
    -------
    None
        Results are reported through printed output and the heatmap only.
    """

    def _split_features_target(frame):
        # Return (X, y) as NumPy arrays without mutating `frame`.
        target = frame["Label"].to_numpy()
        features = frame.drop(columns="Label").to_numpy()
        return features, target

    # Import dataset
    df_dataset = pd.read_csv(path)

    # Use the caller's seed when given; otherwise draw one, and print it either way
    seed = np.random.randint(123) if random_state is None else random_state
    print(f'Random Seed:{seed}')

    # Split dataset in train and test
    train, test = train_test_split(df_dataset, test_size=test_size, random_state=seed)

    # Count how many instances there are in each label (whole dataset)
    print(df_dataset["Label"].value_counts())

    # Separate features (X) and target (y)
    X_train, y_train = _split_features_target(train)
    print(f'Tipo X_train: {type(X_train)} Tipo y_train: {type(y_train)} Shape X_train:{X_train.shape} Shape y_train: {y_train.shape}')

    X_test, y_test = _split_features_target(test)
    print(f'Tipo X_test: {type(X_test)} Tipo y_test: {type(y_test)} Shape X_test:{X_test.shape} Shape y_test: {y_test.shape}')

    # Base classifier; n_neighbors is overridden by the grid search below
    model = KNeighborsClassifier(n_neighbors=7, p=2, metric='minkowski')

    # Grid of hyperparameters to explore
    param_grid = {
        'n_neighbors': [7, 8, 9],
    }

    # Cross-validated search; `cv` comes from the caller instead of being hard-coded
    clf = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        verbose=1,
        n_jobs=-1,
    )

    # Fit on the training data, timed with plain Python so the function
    # does not depend on IPython line magics
    fit_start = time.perf_counter()
    clf.fit(X=X_train, y=y_train)
    print(f'Fit time: {time.perf_counter() - fit_start:.3f} s')

    # Print best parameters
    print("Accuracy score on Validation set: \n")
    print(clf.best_score_)
    print("---------------")
    print("Best performing hyperparameters on Validation set: ")
    print(clf.best_params_)
    print("---------------")
    print(clf.best_estimator_)

    # Test the refitted best model on the held-out data and print metrics
    best_model = clf.best_estimator_
    predictions = best_model.predict(X_test)
    print(classification_report(y_test, predictions, digits=10))
    print(f'Accuracy: {accuracy_score(y_test, predictions)}')

    # Confusion matrix: rows are true labels, columns are predicted labels
    cf_matrix = confusion_matrix(y_test, predictions)

    # fmt="d" shows integer counts; the default format switches to
    # scientific notation for large cells
    ax = sns.heatmap(cf_matrix, annot=True, fmt="d")
    ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix")