In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

#Leave One Out (LOO)
from sklearn.model_selection import LeaveOneOut

### Dividindo a base em treino e teste

In [19]:
# Load the dataset.
df = pd.read_csv('data/input-knn.csv', engine='python', sep=',')

# Separate the selected alternatives (features) from the suggested contents (target).
X_raw = df.drop('Content', axis=1).values
y = df['Content'].values

# Split into train (80%) and test (20%) BEFORE scaling, so that the test rows
# cannot influence the mean/variance learned by the scaler (data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=1, stratify=y
)

# Standardize the alternative ids using statistics from the training rows only,
# then apply that same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix on the same scale, kept under the name X for later cells
# that iterate over the whole data set.
X = scaler.transform(X_raw)

# KNN with the chosen number of neighbours; the fitted estimator is the cell output.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

# Final estimator with the tuned hyper-parameters:
#KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
#                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
#                     weights='uniform')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

### Aplicando o Leave-One-Out (LOO) para definir o melhor valor de K e a acurácia média.

In [20]:
# Candidate values of K: the odd numbers 1, 3, 5, 7, 9.
neighbors = range(1, 10, 2)
train_accuracy_mean, test_accuracy_mean = [], []

loo = LeaveOneOut()
n_samples = len(X)

# NOTE(review): X arrives already standardized by an earlier cell, so each
# held-out sample may have contributed to the scaling statistics.
for k in neighbors:
    # One slot per left-out sample; every slot is filled by the inner loop.
    train_accuracy, test_accuracy = np.empty(n_samples), np.empty(n_samples)

    for train_idx, test_idx in loo.split(X):
        # Fold-local names on purpose: reusing X_train / X_test / y_train /
        # y_test / knn here would overwrite the hold-out split and the fitted
        # model created by the previous cell.
        X_fold_train, y_fold_train = X[train_idx], y[train_idx]
        X_fold_test, y_fold_test = X[test_idx], y[test_idx]

        fold_knn = KNeighborsClassifier(n_neighbors=k)
        fold_knn.fit(X_fold_train, y_fold_train)

        # Accuracy on the n-1 training samples and on the single held-out sample.
        train_accuracy[test_idx] = fold_knn.score(X_fold_train, y_fold_train)
        test_accuracy[test_idx] = fold_knn.score(X_fold_test, y_fold_test)

    train_accuracy_mean.append(train_accuracy.mean())
    test_accuracy_mean.append(test_accuracy.mean())

train_accuracy_mean, test_accuracy_mean = np.array(train_accuracy_mean), np.array(test_accuracy_mean)

# Pick the K with the highest mean LOO accuracy. argmax returns the first
# maximum, so ties resolve to the smallest K, and a valid K is selected even
# when every mean accuracy is 0.
best_index = int(np.argmax(test_accuracy_mean))
best_k = neighbors[best_index]
test_accuracy_max = test_accuracy_mean[best_index]

print(f"Melhor K: {best_k} - Acurácia: {round(test_accuracy_max * 100, 2)}")
# Recorded output: Melhor K: 7 - Acurácia: 64.56

Melhor K: 7 - Acurácia: 64.56
