In [1]:
# Global random seed shared by the K-fold splitter and the stochastic models,
# so that every run of the notebook produces the same results.
SEED = 185

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix

In [3]:
# Location of the mammography dataset (CSV), relative to the notebook.
DATA = 'data/mamografias.csv'


def readData(data_file):
    """Read the dataset from a comma-separated file.

    Parameters
    ----------
    data_file : str or path-like
        Path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame
        The parsed table; cells containing '?' are read as missing values.
    """
    frame = pd.read_csv(data_file, na_values='?', sep=',')
    return frame

# Load the data and rename 'BI-RADS' so the column can be referred to as data.BiRads.
data = readData(DATA).rename(columns={'BI-RADS': 'BiRads'})

# BiRads 0 means an insufficient radiograph, and Shape 'N' plays the same role,
# so both are treated as missing values.
# The columns are reassigned explicitly: calling `.replace(..., inplace=True)`
# on `data.BiRads` / `data.Shape` is a chained in-place update, which pandas
# deprecates and which does not modify `data` when Copy-on-Write is enabled.
data = data.assign(
    BiRads=data['BiRads'].replace(0, np.nan),
    Shape=data['Shape'].replace('N', np.nan),
)

# Drop the instances with missing values (825 instances remain).
# This must happen before 'Density' is removed so that rows with a missing
# density are discarded as well.
data = data.dropna()

# The distribution of the Density feature is practically degenerate at 3,
# so the column is removed.
data = data.drop(columns='Density')

# Merge the BiRads categories: 6 -> 5, and 2, 3 -> 4.
data = data.assign(BiRads=data['BiRads'].replace({6: 5, 2: 4, 3: 4}))

In [4]:
# scikit-learn needs numeric data, even for nominal attributes.
le = preprocessing.LabelEncoder()
# The same encoder object is refitted for each column, so after the loop it
# holds the mapping of the last one (Severity).
for column in ('Shape', 'Severity'):
    data[column] = le.fit_transform(data[column])
# Show which Severity label maps to 0 and to 1; malignant is taken as the positive class.
print(le.inverse_transform([0,1]))

['benigno' 'maligno']


In [5]:
# Print the column order so the columns of the array below can be identified.
print(data.columns)
# Plain NumPy copy of the table; the last column is the target (Severity).
dataArray = np.array(data)
dataArray

Index(['BiRads', 'Age', 'Shape', 'Margin', 'Severity'], dtype='object')


array([[ 5., 67.,  1.,  5.,  1.],
       [ 5., 58.,  0.,  5.,  1.],
       [ 4., 28.,  3.,  1.,  0.],
       ...,
       [ 4., 64.,  0.,  5.,  0.],
       [ 5., 66.,  0.,  5.,  1.],
       [ 4., 62.,  1.,  3.,  0.]])

In [6]:
# Split into features (every column but the last) and target/label (last column).
x, y = dataArray[:, :-1], dataArray[:, -1]

In [7]:
# Binarization (one-hot encoding) of the nominal attributes.
x_nom = x[:, 2:4]  # columns holding nominal variables: Shape and Margin
x = np.delete(x, [2, 3], axis=1)  # remove those columns from the feature matrix
# The `sparse` argument of OneHotEncoder was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed to `sparse_output`), so `sparse=False` fails on
# current releases. Keeping the default sparse output and densifying it with
# `.toarray()` gives the same dense matrix on every version.
onehot = preprocessing.OneHotEncoder()
x_nom = onehot.fit_transform(x_nom).toarray()
print(x_nom.shape)
print(x_nom)
# Append the binary indicator columns to the remaining features.
x = np.hstack((x, x_nom))
print(x.shape)
print(x)

(825, 9)
[[0. 1. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 1.]
 [0. 1. 0. ... 1. 0. 0.]]
(825, 11)
[[ 5. 67.  0. ...  0.  0.  1.]
 [ 5. 58.  1. ...  0.  0.  1.]
 [ 4. 28.  0. ...  0.  0.  0.]
 ...
 [ 4. 64.  1. ...  0.  0.  1.]
 [ 5. 66.  1. ...  0.  0.  1.]
 [ 4. 62.  0. ...  1.  0.  0.]]


In [8]:
# To compute the confusion matrix under cross-validation we add up the matrices
# obtained on the individual folds.
# https://stats.stackexchange.com/questions/147175/how-is-the-confusion-matrix-reported-from-k-fold-cross-validation
# https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
def KFoldConfusionMatrix(model, data, target, n_splits=5):
    """Return the confusion matrix of `model` accumulated over a shuffled K-fold split.

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`. It is fitted in place,
        once per fold, and is left fitted on the last training partition.
    data : numpy.ndarray
        Feature matrix, indexed by row with integer index arrays.
    target : numpy.ndarray
        Class label of every row of `data`.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    numpy.ndarray
        Square integer matrix: the element-wise sum of the per-fold confusion
        matrices, with rows/columns ordered by the sorted unique values of
        `target`.
    """
    # Fix the label set from the whole target so every fold yields a matrix of
    # the same shape and ordering, even if a class is absent from both the true
    # and the predicted labels of that fold. Otherwise the sum would fail or
    # misalign.
    labels = np.unique(target)
    total = np.zeros((labels.size, labels.size), dtype=np.int64)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for train_index, test_index in kf.split(data):
        model.fit(data[train_index], target[train_index])
        predicted = model.predict(data[test_index])
        total += confusion_matrix(target[test_index], predicted, labels=labels)
    return total

In [9]:
def mcToArray(mc):
    """Serialize a 2x2 confusion matrix as the string 'TP, TN, FP, FN'.

    The confusion matrix is laid out as

                  Pred:    0  1
        (Benign)    Real=0: TN FP
        (Malignant) Real=1: FN TP

    and the values are emitted in the order TP, TN, FP, FN, separated by ', '.
    """
    tn, fp = mc[0][0], mc[0][1]
    fn, tp = mc[1][0], mc[1][1]
    return ', '.join(str(value) for value in (tp, tn, fp, fn))

In [10]:
# Dummy baseline: always predicts malignant (even though benign is more frequent).
dummy = DummyClassifier(constant=1, strategy='constant')
dummy_cf = KFoldConfusionMatrix(model=dummy, data=x, target=y)
print(dummy_cf)

[[  0 425]
 [  0 400]]


In [11]:
# Decision tree, seeded for reproducibility.
dt = tree.DecisionTreeClassifier(random_state=SEED)
dt_cf = KFoldConfusionMatrix(model=dt, data=x, target=y)
print(dt_cf)

[[347  78]
 [103 297]]


In [12]:
# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
gnb_cf = KFoldConfusionMatrix(model=gnb, data=x, target=y)
print(gnb_cf)

[[321 104]
 [ 46 354]]


In [13]:
# Multinomial naive Bayes with default settings.
mnb = MultinomialNB()
mnb_cf = KFoldConfusionMatrix(model=mnb, data=x, target=y)
print(mnb_cf)

[[326  99]
 [ 64 336]]


In [14]:
# Random forest, seeded for reproducibility and trained with 4 parallel jobs.
rf = RandomForestClassifier(random_state=SEED, n_jobs=4)
rf_cf = KFoldConfusionMatrix(model=rf, data=x, target=y)
print(rf_cf)

[[344  81]
 [ 81 319]]


In [15]:
# K-nearest neighbours; K=5 by default.
knn = KNeighborsClassifier()
knn_cf = KFoldConfusionMatrix(model=knn, data=x, target=y)
print(knn_cf)

[[332  93]
 [ 68 332]]


In [16]:
# Neural network (multi-layer perceptron).
# max_iter is raised to 500 because a convergence warning was received otherwise.
rn = MLPClassifier(random_state=SEED, max_iter=500)
rn_cf = KFoldConfusionMatrix(model=rn, data=x, target=y)
print(rn_cf)

[[337  88]
 [ 58 342]]


In [17]:
# Write one "TP, TN, FP, FN" line per classifier, in the order in which the
# models were evaluated above. No trailing newline is written.
with open("results/results_binarization.csv",'w+') as outfile:
    outfile.write('\n'.join(
        mcToArray(matrix)
        for matrix in (dummy_cf, dt_cf, gnb_cf, mnb_cf, rf_cf, knn_cf, rn_cf)
    ))