In [1]:
# Random seed passed to the K-fold splitter and to the seeded models (decision tree,
# random forest, MLP) so that repeated runs use the same splits and initialisations.
SEED=185

In [2]:
import os

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import tree
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
DATA='data/mamografias.csv'

def readData(data_file):
    """Load a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    data_file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; any cell whose text is exactly '?' is read as a
        missing value.
    """
    frame = pd.read_csv(data_file, sep=',', na_values='?')
    return frame

data = readData(DATA) # Lectura de los datos

In [4]:
data.head() # Check the data format: 6 columns, the last one being the variable to predict

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5.0,67.0,L,5.0,3.0,maligno
1,4.0,43.0,R,1.0,,maligno
2,5.0,58.0,I,5.0,3.0,maligno
3,4.0,28.0,R,1.0,3.0,benigno
4,5.0,74.0,R,5.0,,maligno


In [5]:
# Rename the column so it is a valid Python identifier and can be accessed as data.BiRads.
# The frame is rebound instead of mutated with inplace=True, so the cell has no
# hidden in-place side effect on a frame that an earlier cell already displayed.
data = data.rename(columns={'BI-RADS': 'BiRads'})
data.head()

Unnamed: 0,BiRads,Age,Shape,Margin,Density,Severity
0,5.0,67.0,L,5.0,3.0,maligno
1,4.0,43.0,R,1.0,,maligno
2,5.0,58.0,I,5.0,3.0,maligno
3,4.0,28.0,R,1.0,3.0,benigno
4,5.0,74.0,R,5.0,,maligno


In [6]:
data.shape[0] # 961 instances

961

In [7]:
# Check that the classes are balanced: roughly 46% malignant vs 54% benign.
# Prints the malignant count first, then the benign count.
for label in ('maligno', 'benigno'):
    print((data['Severity'] == label).sum())

445
516


In [8]:
data.isna().sum() # There are some missing values

BiRads       2
Age          5
Shape        0
Margin      48
Density     76
Severity     0
dtype: int64

In [9]:
# Recode the sentinel values that really mean "missing" so that dropna() removes them.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# `data.BiRads` is an in-place operation on a column selection, which raises a
# FutureWarning on pandas 2.x and has no effect on `data` under Copy-on-Write.
# np.nan is used rather than pd.NA so the float column keeps a float dtype.
data['BiRads'] = data['BiRads'].replace(0, np.nan)  # BiRads 0 means the radiograph was insufficient
data['Shape'] = data['Shape'].replace('N', np.nan)  # The same applies to Shape 'N'
data.isna().sum()

BiRads       7
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [10]:
data=data.dropna() # Drop the instances that have missing values
data.shape[0] # 825 instances remain

825

In [11]:
# The classes are still balanced after dropping rows: 48.5% malignant vs 51.5% benign.
# Prints the malignant count first, then the benign count.
for label in ('maligno', 'benigno'):
    print((data['Severity'] == label).sum())

400
425


In [12]:
# scikit-learn needs numeric data (even for nominal attributes), so both string
# columns are replaced by integer codes. The same encoder object is refitted for
# each column, Shape first and Severity last.
le = preprocessing.LabelEncoder()
for column in ('Shape', 'Severity'):
    data[column] = le.fit_transform(data[column])
# `le` now holds the Severity mapping; malignant (1) is treated as the positive class.
print(le.inverse_transform([0,1]))

['benigno' 'maligno']


In [13]:
data.head()

Unnamed: 0,BiRads,Age,Shape,Margin,Density,Severity
0,5.0,67.0,1,5.0,3.0,1
2,5.0,58.0,0,5.0,3.0,1
3,4.0,28.0,3,1.0,3.0,0
8,5.0,57.0,3,5.0,3.0,1
10,5.0,76.0,3,4.0,3.0,1


In [14]:
# Convert the cleaned, encoded DataFrame into a NumPy array
# (one row per instance; the last column is the encoded Severity).
dataArray=np.array(data)
dataArray

array([[ 5., 67.,  1.,  5.,  3.,  1.],
       [ 5., 58.,  0.,  5.,  3.,  1.],
       [ 4., 28.,  3.,  1.,  3.,  0.],
       ...,
       [ 4., 64.,  0.,  5.,  3.,  0.],
       [ 5., 66.,  0.,  5.,  3.,  1.],
       [ 4., 62.,  1.,  3.,  3.,  0.]])

In [15]:
# Split into features (every column except the last) and target/label (last column)
x=dataArray[:,:-1]
y=dataArray[:,-1]

In [16]:
# The cross-validated confusion matrix is obtained by summing the matrices of the individual folds
# https://stats.stackexchange.com/questions/147175/how-is-the-confusion-matrix-reported-from-k-fold-cross-validation
# https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
def KFoldConfusionMatrix(model, data, target, n_splits=5):
    """Return the confusion matrix of `model` summed over a shuffled K-fold split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        on the training part of every fold.
    data : array indexable by integer index arrays, one row per instance.
    target : array of class labels aligned with `data`.
    n_splits : int, default 5
        Number of folds. The split is shuffled and seeded with the
        module-level SEED.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per distinct label in `target`
        (sorted), rows being the true class and columns the predicted class.
    """
    # Fix the label set up front: without `labels=`, a fold in which a class is
    # absent from both the truth and the predictions yields a smaller matrix,
    # and matrices of different shapes cannot be summed.
    labels = np.unique(target)
    total = np.zeros((len(labels), len(labels)), dtype=np.int64)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for train_index, test_index in kf.split(data):
        model.fit(data[train_index], target[train_index])
        predictions = model.predict(data[test_index])
        total += confusion_matrix(target[test_index], predictions, labels=labels)
    return total

In [17]:
""" La matriz de confusión aparece como
          Pred:    0  1
(Benigno) Real=0: TN FP
(Maligno) Real=1: FN TP
"""
def mcToArray(mc):
    """Format a 2x2 confusion matrix as the string 'TP, TN, FP, FN'.

    `mc` is indexed as mc[real][predicted], with class 1 taken as the positive
    class, so mc[1][1] is TP, mc[0][0] is TN, mc[0][1] is FP and mc[1][0] is FN.
    The four values are converted with str() and joined with ', '.
    """
    tp = mc[1][1]
    tn = mc[0][0]
    fp = mc[0][1]
    fn = mc[1][0]
    return ', '.join(str(count) for count in (tp, tn, fp, fn))

In [18]:
# Dummy baseline: always predicts malignant (class 1), even though benign is the more frequent class
dummy=DummyClassifier(strategy='constant',constant=1)
dummy_cf=KFoldConfusionMatrix(dummy, x, y)
print(dummy_cf)

[[  0 425]
 [  0 400]]


In [19]:
# Decision Tree with default hyper-parameters, seeded with SEED
dt=tree.DecisionTreeClassifier(random_state=SEED)
dt_cf=KFoldConfusionMatrix(dt, x, y)
print(dt_cf)

[[341  84]
 [112 288]]


In [20]:
# Gaussian Naive Bayes with default parameters
gnb=GaussianNB()
gnb_cf=KFoldConfusionMatrix(gnb, x, y)
print(gnb_cf)

[[340  85]
 [ 59 341]]


In [21]:
# Multinomial Naive Bayes with default parameters
# NOTE(review): this model is intended for count-like features; the columns used here
# are ordinal/encoded values and age, so confirm it is meant only as a comparison point.
mnb=MultinomialNB()
mnb_cf=KFoldConfusionMatrix(mnb, x, y)
print(mnb_cf)

[[325 100]
 [ 70 330]]


In [22]:
# Random Forest with default hyper-parameters, seeded with SEED and run with n_jobs=4
rf=RandomForestClassifier(n_jobs=4, random_state=SEED)
rf_cf=KFoldConfusionMatrix(rf, x, y)
print(rf_cf)

[[345  80]
 [ 88 312]]


In [23]:
# KNN
# NOTE(review): the features are not scaled anywhere above, so Age (the column with
# the largest values) probably dominates the distance computation; confirm this is intended.
knn=KNeighborsClassifier() # K=5 by default
knn_cf=KFoldConfusionMatrix(knn, x, y)
print(knn_cf)

[[333  92]
 [ 70 330]]


In [24]:
# Neural Network (multi-layer perceptron), seeded with SEED
rn=MLPClassifier(max_iter=500,random_state=SEED) # max_iter=500 because a convergence warning was raised otherwise
rn_cf=KFoldConfusionMatrix(rn, x, y)
print(rn_cf)

[[332  93]
 [ 63 337]]


In [25]:
# Persist one "TP, TN, FP, FN" line per model, in this fixed order:
# dummy, decision tree, Gaussian NB, multinomial NB, random forest, KNN, neural network.
RESULTS_FILE = "results/results_basic.csv"
# open() does not create missing directories, so make sure the output folder exists;
# otherwise the cell fails on a fresh checkout.
os.makedirs(os.path.dirname(RESULTS_FILE), exist_ok=True)
# Plain write mode is enough: the file is never read back through this handle.
with open(RESULTS_FILE, 'w') as outfile:
    outfile.write('\n'.join(
        mcToArray(cf)
        for cf in (dummy_cf, dt_cf, gnb_cf, mnb_cf, rf_cf, knn_cf, rn_cf)
    ))