<a href="https://colab.research.google.com/github/felipersdf/machine-learning/blob/main/KNNImproveBase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import math
from collections import Counter
import numpy

# Location of the raw glass data file (comma-separated, no header row).
url = "https://raw.githubusercontent.com/felipersdf/machine-learning/main/datasets/glass.data"

# LEGEND -- meaning of each column in the data file
#    1. Id number: 1 to 214
#    2. RI: refractive index
#    3. Na: Sodium (unit of measurement: weight percent in the corresponding
#                   oxide, as are attributes 4-10)
#    4. Mg: Magnesium
#    5. Al: Aluminum
#    6. Si: Silicon
#    7. K: Potassium
#    8. Ca: Calcium
#    9. Ba: Barium
#   10. Fe: Iron
#   11. Type of glass (class attribute):
#       -- 1 building_windows_float_processed
#       -- 2 building_windows_non_float_processed
#       -- 3 vehicle_windows_float_processed
#       -- 4 vehicle_windows_non_float_processed (none in this database)
#       -- 5 containers
#       -- 6 tableware
#       -- 7 headlamps

# GLASS: column names in file order; the first is the row id and the last
# ('tipo') is the class label.
col_names = ['id', 'ri', 'na', 'mg', 'al', 'si', 'K', 'ca', 'ba', 'fe', 'tipo']

# Every column between the id and the class label is used as a feature.
feature_cols = col_names[1:-1]

# Load the data set into a DataFrame, supplying the column names ourselves
# because the file has no header line.
dataset = pd.read_csv(url, names=col_names, header=None)

# Feature matrix and target vector.
X = dataset[feature_cols]
y = dataset["tipo"]

In [11]:
# 80% of the rows go to training and 20% to testing; the split is stratified
# on the class label and random_state=None leaves it unseeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=None
)

# Number of nearest neighbours that vote on each prediction.
K = 6

# Convert the pandas objects to plain Python lists, which is what the
# hand-written KNN functions index into.
train_x, train_y = X_train.values.tolist(), y_train.values.tolist()
test_x, test_y = X_test.values.tolist(), y_test.values.tolist()

In [12]:
# Predictions of the plain KNN baseline. It stays empty here; to compare,
# fill it with knn(train_x, train_y, sample, K) for each test sample and
# score it the same way as below.
resultKNN = []

# One radius per training sample, computed once and reused for every query.
raios = calcular_raios(train_x, train_y)

# Predicted class of every test sample using the radius-scaled KNN.
resultKNN_improve = [
  knn_improve(train_x, train_y, amostra, K, raios) for amostra in test_x
]

# accuracy_score is documented as accuracy_score(y_true, y_pred): ground
# truth first, predictions second.
acc2 = metrics.accuracy_score(test_y, resultKNN_improve)
show2 = round(acc2 * 100)
print("{}%".format(show2))

# Predicted labels followed by the true labels, for visual comparison.
print(resultKNN_improve)
print(test_y)

67.0%
[1, 2, 7, 7, 1, 1, 2, 1, 1, 1, 7, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 7, 7, 2, 2, 1, 1, 7, 6, 1, 7, 2, 5, 7, 1, 2, 7, 1, 2, 2, 2, 2]
[1, 2, 7, 7, 3, 1, 2, 1, 3, 1, 7, 1, 1, 1, 5, 2, 3, 2, 1, 2, 2, 2, 7, 2, 2, 2, 1, 1, 7, 6, 1, 2, 2, 5, 6, 1, 2, 7, 1, 2, 5, 2, 1]


In [4]:
###############
#
# KNN Improve
#
###############
def calcular_raios(train_x, train_y):
  """Return, for each training sample, its distance to the nearest other-class sample.

  The radius of sample ``i`` is the Euclidean distance from ``train_x[i]`` to
  the closest sample whose label differs from ``train_y[i]``, minus a tiny
  epsilon. ``knn_improve`` divides distances by these radii.

  Args:
    train_x: list of feature vectors (each a sequence of numbers).
    train_y: list of class labels, aligned with ``train_x``.

  Returns:
    A list of floats, one radius per training sample, always strictly
    positive. A sample with no other-class sample in the training set gets
    ``math.inf``.
  """
  e = 1e-20
  raios = []

  for i, ponto in enumerate(train_x):
    classe = train_y[i]
    # Stays infinite when every sample shares this sample's class, instead
    # of running off the end of the candidate list.
    menor = math.inf

    for j, outro in enumerate(train_x):
      # Same-class samples (including the sample itself) never bound the radius.
      if train_y[j] == classe:
        continue

      # Euclidean distance
      d = math.sqrt(sum((ponto[k] - outro[k]) ** 2 for k in range(len(ponto))))
      if d < menor:
        menor = d

    # Shrink the radius slightly, but keep it positive: an other-class
    # sample with identical features would otherwise yield a negative
    # radius and therefore negative scaled distances downstream.
    raios.append(max(menor - e, e))

  return raios

def knn_improve(train_x, train_y, test, k, raios):
  """Classify ``test`` by majority vote among its k radius-scaled nearest neighbours.

  The distance from ``test`` to each training sample is the Euclidean
  distance divided by that sample's entry in ``raios``.

  Args:
    train_x: list of training feature vectors.
    train_y: list of class labels aligned with ``train_x``.
    test: feature vector of the sample to classify.
    k: number of neighbours that vote.
    raios: one radius per training sample, aligned with ``train_x``.

  Returns:
    The most frequent class label among the k selected neighbours.
  """
  # Euclidean distance to each training sample, scaled by that sample's radius.
  escaladas = [
    math.sqrt(sum((test[j] - linha[j]) ** 2 for j in range(len(test)))) / raios[pos]
    for pos, linha in enumerate(train_x)
  ]

  # Positions of the k smallest scaled distances.
  vizinhos = numpy.argsort(escaladas)[0:k]

  # Count the class of each selected neighbour and return the top one.
  votos = Counter(train_y[pos] for pos in vizinhos)
  vencedor, _ = votos.most_common(1)[0]
  return vencedor

In [5]:
def knn(train_x, train_y, test, k):
  """Classify ``test`` by majority vote among its k nearest training samples.

  Args:
    train_x: list of training feature vectors.
    train_y: list of class labels aligned with ``train_x``.
    test: feature vector of the sample to classify.
    k: number of neighbours that vote.

  Returns:
    The most frequent class label among the k nearest neighbours, using
    plain Euclidean distance.
  """
  # Euclidean distance from the query to every training sample.
  distancias = [
    math.sqrt(sum((test[j] - linha[j]) ** 2 for j in range(0, len(test))))
    for linha in train_x
  ]

  # Positions of the k smallest distances.
  vizinhos = numpy.argsort(distancias)[0:k]

  # Count the class of each selected neighbour and return the top one.
  votos = Counter(train_y[pos] for pos in vizinhos)
  vencedor, _ = votos.most_common(1)[0]
  return vencedor