## Problema
> O algorítimo irá aprender a predizer se um cidadão americano ganha mais ou menos que US$ 50 mil por ano segundo determinadas features (idade, escolaridade, sexo e etc). Para isso, foi utilizado a base de dados (dataset) adult disponível gratuitamente pelo repositório UCI.

In [None]:
# Bibliotecas
import pandas as pd
import numpy as np
from sklearn import preprocessing

# Definição dos nomes das colunas
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
         'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week','native-country',
        'class']


# importar a base e colocar os nomes das colunas
df = pd.read_csv('adult.data.txt.txt', header=None, names=names)

In [None]:
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [None]:
# Excluir as colunas que não serão utilizadas no modelo
df = df.drop(['fnlwgt','education-num'], axis=1)

# Exlcuir as linhas que possuem valores nulos
df.dropna(inplace=True)

# Transformar features não-numericas em features numéricas.
non_numerical = ['workclass', 'education', 'marital-status', 'occupation',
'relationship', 'race', 'sex', 'native-country']

le = preprocessing.LabelEncoder()
for x in non_numerical:
    le.fit(df[x])
    df[x] = le.transform(df[x].astype(str))

In [None]:
# Separação das Features e Labels
X = np.array(df.iloc[:, 0:12]) 	# features
y = np.array(df['class'])

In [None]:
# Criação de intervalo de números ímpares de K para KNN
neighbors = list(range(1,50,2))


# Criação de intervalo de f para k-fold
cv_list = list(range(10,40))


# Criação da estrutura de listas para armazenar valores
k_list = []
fold_list = []
cv_scores = []

In [17]:
# importar bibliotecas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

# executar KNN e k-fold cross validation
for k in neighbors:
  for f in cv_list:
      knn = KNeighborsClassifier(n_neighbors=k)
      scores = cross_val_score(knn, X, y, cv=f, scoring='accuracy')
      cv_scores.append(scores.mean())  #popular listas
      k_list.append(k)
      fold_list.append(f)
  print(f'K[{k}/50] | F[{f}/40] | Score : {scores.mean()}')

K[1/50] | F[39/40] | Score : 0.8095885705070214
K[3/50] | F[39/40] | Score : 0.826632991165796
K[5/50] | F[39/40] | Score : 0.8334198969704935
K[7/50] | F[39/40] | Score : 0.8361847417505885
K[9/50] | F[39/40] | Score : 0.834833376964941
K[11/50] | F[39/40] | Score : 0.8366452853378282
K[13/50] | F[39/40] | Score : 0.8375671457306746
K[15/50] | F[39/40] | Score : 0.8364617012055947
K[17/50] | F[39/40] | Score : 0.8366765086318785
K[19/50] | F[39/40] | Score : 0.8357244927227264
K[21/50] | F[39/40] | Score : 0.8367687793569843
K[23/50] | F[39/40] | Score : 0.8354175986709481
K[25/50] | F[39/40] | Score : 0.8357246031824932
K[27/50] | F[39/40] | Score : 0.8368605346031788
K[29/50] | F[39/40] | Score : 0.8370449287737014
K[31/50] | F[39/40] | Score : 0.8360621314095661
K[33/50] | F[39/40] | Score : 0.8350183602542196
K[35/50] | F[39/40] | Score : 0.8356632243721374
K[37/50] | F[39/40] | Score : 0.836031570874116
K[39/50] | F[39/40] | Score : 0.8353252911259201
K[41/50] | F[39/40] | Score 

In [None]:
# calcular o erro
MSE = [1 - x for x in cv_scores]

# contrução do dataframe
df_1 = pd.DataFrame (k_list, columns=['k_list'])
df_2 = pd.DataFrame (fold_list, columns=['fold_list'])
df_3 = pd.DataFrame (MSE, columns=['MSE'])
df_knn = pd.concat([df_1, df_2, df_3], axis=1)

# retorna o menor erro obtido
optimal_k = min(df_knn['MSE'])

# retorna os valores de k e f do menor erro obtido
index_opt = df_knn[df_knn['MSE'] == optimal_k].index.item()

In [None]:
# importar bibliotecas
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# printar os resultados
print ("The optimal number of neighbors k is %d" % df_knn.loc[index_opt, 'k_list'] )
print ("The optimal number of folds f is %d" % df_knn.loc[index_opt, 'fold_list'] )
print ("Misclassification Error of %f" % optimal_k )

# plotar misclassification error vs k vs f (k-fold)
plt.rcParams['figure.figsize'] = (11,7)
fig = plt.figure()
ax = plt.axes(projection='3d')
color= ['red' if l == index_opt else 'grey' for l in df_knn.index.tolist()]
fc= ['red' if l == index_opt else 'none' for l in df_knn.index.tolist()]
ax.scatter3D(k_list, fold_list, MSE, s=20, facecolor = fc, edgecolors=color,
depthshade=False)

# nomear o gráfico e os eixos
ax.set_title('Soluções algorítimo KNN')
ax.set_xlabel('k_list')
ax.set_ylabel('fold_list')
ax.set_zlabel('MSE')