In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import knn
import random

In [52]:
separacion = 10
clusterNum = int(2000/4)

data = [[1, random.random() * separacion + 40, random.random() * separacion + 30] for i in range(clusterNum)]
data += [[2, random.random() * separacion + 0, random.random() * separacion + 30] for i in range(clusterNum)]
data += [[3, random.random() * separacion + 20, random.random() * separacion + -10] for i in range(clusterNum)]
data += [[4, random.random() * separacion + -5, random.random() * separacion + 80] for i in range(clusterNum)]
data += [[5, random.random() * separacion + -10, random.random() * separacion + 75] for i in range(clusterNum)]
random.shuffle(data)
df = pd.DataFrame(np.array(data), columns=["Class", "A", "B"])
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Class   2000 non-null   float64
 1   A       2000 non-null   float64
 2   B       2000 non-null   float64
dtypes: float64(3)
memory usage: 47.0 KB


In [53]:
# Se sustituyen valores categóricos por valores numéricos

dfNum = pd.DataFrame()
for col, t in zip(df.columns, df.dtypes):
    pos = 0
    keys = pd.DataFrame()    
    if (t == 'object'):
        key = dict([(k,i) for i,k in enumerate(df.loc[:, col].unique())])
        dfNum[col] = df[col].map(key)
    else:
        dfNum[col] = df[col]

In [54]:
# Se filtran las columnas redundantes
r = 0.0
corrMat = dfNum.corr()
relevantCols = corrMat.loc[(corrMat['Class'] > r) | (corrMat['Class'] < -r)].transpose().columns
dfNum = dfNum.loc[:, relevantCols.to_list()]

In [55]:
# Se separan la columna a predecir de los atributos

individuals = pd.DataFrame(dfNum.iloc[:,1:])
individuals

Unnamed: 0,A,B
0,20.867796,-4.179468
1,0.720704,34.793914
2,22.033792,-2.354455
3,-1.880027,88.768109
4,49.306918,37.018355
...,...,...
1995,25.777600,-1.776601
1996,-1.872656,85.532276
1997,1.716777,81.337349
1998,5.189061,31.562332


In [56]:
target = pd.DataFrame(dfNum['Class'])
target

Unnamed: 0,Class
0,3.0
1,2.0
2,3.0
3,4.0
4,1.0
...,...
1995,3.0
1996,4.0
1997,4.0
1998,2.0


In [57]:
# Normalización
individuals = pd.DataFrame((individuals - individuals.min())/(individuals.max() - individuals.min()))
individuals

Unnamed: 0,A,B
0,0.470350,0.057503
1,0.103885,0.447584
2,0.491559,0.075770
3,0.056579,0.987806
4,0.987643,0.469848
...,...,...
1995,0.559657,0.081553
1996,0.056713,0.955419
1997,0.122003,0.913432
1998,0.185162,0.415239


In [58]:
# Se separa el conjunto de datos en entrenamiento y test

xTrain, xTest, yTrain, yTest = train_test_split(individuals, target, test_size=0.3)
classes = dfNum['Class'].unique()
classes


array([3., 2., 4., 1.])

In [59]:
# Se instancia el clasificador y se añaden los datos

classifier = knn.Knn(5)
classifier.fit(xTrain, yTrain)
# classifier.predSingle(yTrain[0])

In [60]:
# classifier.predSingle(xTrain.iloc[1])

In [63]:
predicted = classifier.pred(xTest)
precision, confusionMatrix = classifier.precision(predicted, yTest, classes)
confusionMatrix = pd.DataFrame(confusionMatrix, columns=np.concatenate((np.array(["Class"]), classes)))
print("Fila = predicho, Columna = real")
print("Precisión = " + str(precision))
confusionMatrix


Fila = predicho, Columna = real
Precisión = 1.0


Unnamed: 0,Class,3.0,2.0,4.0,1.0
0,3.0,142.0,0.0,0.0,0.0
1,2.0,0.0,149.0,0.0,0.0
2,4.0,0.0,0.0,163.0,0.0
3,1.0,0.0,0.0,0.0,146.0


In [64]:
cm = confusion_matrix(yTest, predicted)
dfcm = pd.DataFrame(cm)
dfcm

Unnamed: 0,0,1,2,3
0,146,0,0,0
1,0,149,0,0
2,0,0,142,0
3,0,0,0,163
