In [1]:
SEED = 185  # Random seed passed as random_state to the K-fold splitter and the stochastic classifiers

In [2]:
import os

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn import tree
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix

In [3]:
DATA = 'data/mamografias.csv'  # Location of the input CSV


def readData(data_file):
    """Load a comma-separated file into a DataFrame.

    Cells containing '?' are parsed as missing values (NaN).

    Parameters
    ----------
    data_file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame with the parsed contents.
    """
    frame = pd.read_csv(data_file, sep=',', na_values='?')
    return frame

data = readData(DATA)  # Load the raw dataset

# Rename the column so it can also be accessed as an attribute (data.BiRads).
data = data.rename(columns={'BI-RADS': 'BiRads'})

# BiRads 0 means the radiograph was insufficient, and Shape 'N' is treated the same way:
# both are turned into missing values.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on `data.BiRads`: that form mutates an intermediate Series, which raises a chained-assignment
# warning on recent pandas and leaves `data` unchanged under Copy-on-Write.
# np.nan is used directly, so no pd.NA -> np.nan conversion pass is needed afterwards.
data['BiRads'] = data['BiRads'].replace(0, np.nan)
data['Shape'] = data['Shape'].replace('N', np.nan)

data.isna().sum()  # Missing values per column

BiRads       7
Age          5
Shape       31
Margin      48
Density     76
Severity     0
dtype: int64

In [4]:
# Fill missing values with the column median for these columns
median_cols = ['BiRads', 'Age', 'Density']
medianImputer = SimpleImputer(missing_values=np.nan, strategy='median')
data[median_cols] = medianImputer.fit_transform(data[median_cols])

# For the nominal columns the mode (most frequent value) is used instead
mode_cols = ['Shape', 'Margin']
modeImputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data[mode_cols] = modeImputer.fit_transform(data[mode_cols])

# NOTE(review): both imputers are fitted on the whole dataset, i.e. before any
# cross-validation split is made further down.
data.isna().sum()  # Missing values per column after imputation

BiRads      0
Age         0
Shape       0
Margin      0
Density     0
Severity    0
dtype: int64

In [5]:
# scikit-learn needs numeric data (even for nominal attributes), so labels are mapped to integer codes
le = preprocessing.LabelEncoder()
data['Shape'] = le.fit_transform(data['Shape'])
data['Severity'] = le.fit_transform(data['Severity'])

# `le` was last fitted on Severity, so this prints the labels encoded as 0 and 1.
# Malignant will be considered the positive class.
print(le.inverse_transform([0, 1]))

['benigno' 'maligno']


In [6]:
# Convert the frame to a plain float matrix. The dtype is requested explicitly because
# np.array(data) yields an object-dtype array whenever any column is object-typed (the
# mode-imputed columns can end up that way depending on the pandas version), and
# scikit-learn does not accept an object-dtype array of numbers as classification labels.
dataArray = data.to_numpy(dtype=float)
dataArray

array([[ 5., 67.,  1.,  5.,  3.,  1.],
       [ 4., 43.,  3.,  1.,  3.,  1.],
       [ 5., 58.,  0.,  5.,  3.,  1.],
       ...,
       [ 4., 64.,  0.,  5.,  3.,  0.],
       [ 5., 66.,  0.,  5.,  3.,  1.],
       [ 4., 62.,  1.,  3.,  3.,  0.]])

In [7]:
# Split into features (every column but the last) and target label (last column)
x, y = dataArray[:, :-1], dataArray[:, -1]

In [8]:
# To compute a confusion matrix under cross-validation, the matrices obtained on the individual folds are summed
# https://stats.stackexchange.com/questions/147175/how-is-the-confusion-matrix-reported-from-k-fold-cross-validation
# https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
def KFoldConfusionMatrix(model, data, target, n_splits=5):
    """Return the confusion matrix of `model` accumulated over a shuffled K-fold split.

    The model is fitted on each training partition and evaluated on the matching
    test partition; the per-fold confusion matrices are summed element-wise.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted
        in place on every fold.
    data : array indexable with integer index arrays (one row per sample).
    target : array of class labels, aligned with `data`.
    n_splits : number of folds (defaults to 5).

    Returns
    -------
    Square array with one row/column per distinct label in `target`
    (rows: true class, columns: predicted class), labels in sorted order.
    """
    # Fix the label set from the full target so that every fold produces a matrix of the
    # same shape and ordering, even if a class is missing from a fold's test labels and
    # predictions. Without this, per-fold matrices could differ in shape and not be summable.
    labels = np.unique(target)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    fold_matrices = []
    for train_index, test_index in kf.split(data):
        model.fit(data[train_index], target[train_index])
        predictions = model.predict(data[test_index])
        fold_matrices.append(confusion_matrix(target[test_index], predictions, labels=labels))
    return np.sum(fold_matrices, axis=0)

In [9]:
""" La matriz de confusión aparece como
          Pred:    0  1
(Benigno) Real=0: TN FP
(Maligno) Real=1: FN TP
"""
# Desired output order: TP, TN, FP, FN
def mcToArray(mc):
    """Format a 2x2 confusion matrix as the string "TP, TN, FP, FN".

    `mc` is indexed as mc[real][predicted], with class 1 as the positive class:
    mc[0][0]=TN, mc[0][1]=FP, mc[1][0]=FN, mc[1][1]=TP.
    """
    true_neg, false_pos = mc[0][0], mc[0][1]
    false_neg, true_pos = mc[1][0], mc[1][1]
    ordered = (true_pos, true_neg, false_pos, false_neg)
    return ', '.join(str(count) for count in ordered)

In [10]:
# Dummy baseline: always predicts malignant (class 1), even though benign is more frequent
dummy = DummyClassifier(strategy='constant', constant=1)
dummy_cf = KFoldConfusionMatrix(model=dummy, data=x, target=y)
print(dummy_cf)

[[  0 516]
 [  0 445]]


In [11]:
# Decision tree, seeded so the result is reproducible
dt = tree.DecisionTreeClassifier(random_state=SEED)
dt_cf = KFoldConfusionMatrix(model=dt, data=x, target=y)
print(dt_cf)

[[420  96]
 [127 318]]


In [12]:
# Gaussian Naive Bayes with default settings
gnb = GaussianNB()
gnb_cf = KFoldConfusionMatrix(model=gnb, data=x, target=y)
print(gnb_cf)

[[413 103]
 [ 71 374]]


In [13]:
# Multinomial Naive Bayes with default settings
mnb = MultinomialNB()
mnb_cf = KFoldConfusionMatrix(model=mnb, data=x, target=y)
print(mnb_cf)

[[390 126]
 [ 82 363]]


In [14]:
# Random forest, trained with 4 parallel jobs and seeded so the result is reproducible
rf = RandomForestClassifier(n_jobs=4, random_state=SEED)
rf_cf = KFoldConfusionMatrix(model=rf, data=x, target=y)
print(rf_cf)

[[420  96]
 [110 335]]


In [15]:
# k-nearest neighbours (K=5 by default)
knn = KNeighborsClassifier()
knn_cf = KFoldConfusionMatrix(model=knn, data=x, target=y)
print(knn_cf)

[[414 102]
 [ 99 346]]


In [16]:
# Neural network (multi-layer perceptron).
# max_iter is set to 500 because a convergence warning was raised otherwise.
rn = MLPClassifier(max_iter=500, random_state=SEED)
rn_cf = KFoldConfusionMatrix(model=rn, data=x, target=y)
print(rn_cf)

[[410 106]
 [ 79 366]]


In [17]:
# Write one "TP, TN, FP, FN" line per classifier, in this order:
# dummy, decision tree, Gaussian NB, multinomial NB, random forest, KNN, neural network.
results_path = "results/results_median.csv"

# Create the output directory first so the cell does not fail with FileNotFoundError
# when it is run from a checkout that has no results/ folder yet.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Plain write mode is enough: the file is only written here, never read back.
with open(results_path, 'w') as outfile:
    outfile.write('\n'.join([
            mcToArray(dummy_cf),
            mcToArray(dt_cf),
            mcToArray(gnb_cf),
            mcToArray(mnb_cf),
            mcToArray(rf_cf),
            mcToArray(knn_cf),
            mcToArray(rn_cf)
        ]))