In [52]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

In [53]:
dataset = pd.read_csv('data/mammography.csv.zip')
dataset.sample(frac=1)
dataset.drop_duplicates(inplace=True)

In [54]:
dataset.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,-1
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,-1
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,-1
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,-1
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,-1


In [55]:
dataset.tail()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
11178,-0.250012,-0.3773,-0.321142,1.269157,3.652984,1.092791,1
11179,0.281343,-0.417112,-0.366224,0.85101,2.789649,1.3457,1
11180,1.204988,1.763724,-0.501468,1.562408,6.489072,0.931294,1
11181,0.736644,-0.222474,-0.050653,1.509665,0.539269,1.315229,1
11182,0.177003,-0.191508,-0.501468,1.578864,7.750705,1.555951,1


In [56]:
target = 'class'
# No dataset de mamografia as classes estão definidas como -1 e 1
# É preciso que os valores sejam 0 e 1
if dataset.loc[dataset[target] == -1][target].count() != 0:
    # Todos os valores da coluna 'class' que são -1 são trocados para 0
    dataset[target] = dataset[target].map({-1 : 0, 1 : 1})

In [57]:
dataset.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0


In [58]:
classA = dataset.loc[dataset[target] == 0] # Cria dataset só com os exemplos da classe 0
classB = dataset.loc[dataset[target] == 1] # Cria dataset só com os exemplos da classe 1

In [59]:
classA.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
0,0.23002,5.072578,-0.276061,0.832444,-0.377866,0.480322,0
1,0.155491,-0.16939,0.670652,-0.859553,-0.377866,-0.945723,0
2,-0.784415,-0.443654,5.674705,-0.859553,-0.377866,-0.945723,0
3,0.546088,0.131415,-0.456387,-0.859553,-0.377866,-0.945723,0
4,-0.102987,-0.394994,-0.140816,0.979703,-0.377866,1.013566,0


In [60]:
classB.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
1093,-0.335889,-0.452501,-0.591631,0.817254,-0.377866,1.034896,1
1094,4.592985,1.91855,-0.456387,2.264523,1.91884,0.702761,1
1095,0.660252,-0.3773,-0.411305,1.708401,3.520855,1.400548,1
1096,-0.32183,-0.452501,-0.276061,1.325275,13.750423,1.013566,1
1097,-0.520347,-0.452501,-0.276061,3.00166,13.750423,0.702761,1


In [61]:
lenB = classB[target].count()
halfB = int(round(lenB*0.5))

# Divide o dataset da classe B em três datasets com 50%-25%-25%
classB_1 = classB[:halfB]
classB_2 = classB[halfB:halfB+int(round(halfB/2))]
classB_3 = classB[halfB+int(round(halfB/2)):]

In [62]:
classB_1[target].count()

127

In [63]:
classB_2[target].count()

63

In [64]:
classB_3[target].count()

64

In [65]:
kmeans = KMeans(n_clusters=lenB, random_state=0).fit(classA)
classA = pd.DataFrame(kmeans.cluster_centers_.tolist())
classA.columns = ['X1','X2','X3','X4','X5','X6','class']
classA[target] = classA[target].map({1.0 : int(1), 0.0 : int(0)})
classA.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
0,1.449115,-0.439516,-0.409851,-0.859553,-0.377866,-0.945723,0
1,0.359804,-0.110153,-0.027132,1.448556,0.638238,1.264753,0
2,1.732134,4.671051,-0.338481,1.369612,-0.281552,0.30734,0
3,-0.385311,-0.34486,4.397394,0.010778,-0.12351,1.182172,0
4,-0.245983,-0.3876,0.01697,0.518234,-0.377866,0.991888,0


In [66]:
# Divide o dataset da classe A em três datasets com 50%-25%-25%
classA_1 = classA[:halfB]
classA_2 = classA[halfB:halfB+int(round(halfB/2))]
classA_3 = classA[halfB+int(round(halfB/2)):]

In [67]:
classA_1[target].count()

127

In [68]:
classA_2[target].count()

63

In [69]:
classA_3[target].count()

64

In [70]:
# Cria os datasets de treino, validação e teste concatenando os conjuntos
# das classes A e B e aleatorizando a ordem dos exemplos
train = pd.concat([classA_1, classB_1]).sample(frac=1)
train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
42,-0.068572,-0.379664,-0.029424,-0.859553,-0.377866,-0.945723,0
3356,4.166479,0.657823,-0.366224,4.942182,1.484884,1.391407,1
1106,-0.134831,-0.32864,-0.321142,1.679286,7.722192,1.290852,1
2233,-0.032355,-0.288828,-0.276061,1.4565,3.245072,1.010519,1
34,-0.092547,-0.318193,-0.26455,0.70613,2.753197,1.031978,0


In [71]:
validation = pd.concat([classA_2, classB_2]).sample(frac=1)
validation.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
134,1.401324,2.013104,-0.292966,0.778593,1.098582,0.654388,0
157,-0.237901,-0.224685,12.617266,0.782655,-0.001628,0.620489,0
7822,2.259227,0.215463,-0.54655,3.00166,13.750423,0.961765,1
8924,0.001014,-0.412689,0.264918,1.218101,1.071244,1.680882,1
154,-0.104963,-0.302169,0.184773,0.144742,0.596204,1.117119,0


In [72]:
test = pd.concat([classA_3, classB_3]).sample(frac=1)
test.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,class
10064,0.439716,-0.381723,-0.456387,1.570003,4.072156,1.211628,1
11159,0.75426,0.016401,0.084592,4.942182,1.421602,1.25124,1
10057,0.151765,-0.333064,-0.140816,1.39152,1.583726,1.629081,1
11178,-0.250012,-0.3773,-0.321142,1.269157,3.652984,1.092791,1
8932,0.443103,-0.023412,-0.411305,1.624012,4.371546,1.184204,1


In [73]:
train.to_csv("data/undersampling-km/mammography_train.csv", index=False)

IOError: [Errno 2] No such file or directory: 'data/undersampling-km/mammography_train.csv'

In [None]:
validation.to_csv("data/undersampling-km/mammography_validation.csv", index=False)

In [None]:
test.to_csv("data/undersampling-km/mammography_test.csv", index=False)