In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Directory prefix of the per-preprocessing result files (used as RESULTS + name + '.csv').
RESULTS='results/mamografias/'
# Preprocessing variants; each name is both a CSV basename and a column label of the score tables.
PROCS=['dropna', 'median', 'features','binarization','stdScaler']
# Display labels for the models (tick labels and plot titles).
# NOTE(review): these are matched to the result rows by position only; the CSV index spells
# some models differently (e.g. GaussianNB, SupportVectorM) - confirm the order stays in sync.
MODELS=['Dummy','DecisionTree','Naive-Bayes','SVM','RandomForest','KNN','NeuralNetwork']
# Path of the raw dataset CSV.
DATA='data/mamografias.csv'

# Métricas por modelo y preprocesamiento

In [16]:
def readResults(results_file):
    """Read one results table written by the previous assignment.

    The first row is taken as the header and the first column as the index;
    the last two lines of the file are skipped and every value is rounded
    to 3 decimals.
    """
    table = pd.read_csv(
        results_file,
        header=0,
        index_col=0,
        skipfooter=2,
        engine='python',  # skipfooter is not supported by the C parser
    )
    return table.round(3)

In [17]:
# Holds one metrics table per entry of PROCS, in the same order.
res_proc=[]

In [18]:
# Load one metrics table per preprocessing, in PROCS order.
# The list is rebuilt from scratch (instead of appended to), so re-running
# this cell does not leave duplicated tables in res_proc.
res_proc = [readResults(RESULTS + p + '.csv') for p in PROCS]

In [19]:
# The measured metrics are stored for each of the 5 preprocessings;
# display the table at index 1 (PROCS[1] is 'median') as an example.
res_proc[1]

Unnamed: 0,TP,TN,FP,FN,Acc,TPR,FPR,AUC,F1-score,G-measure
Dummy,445,0,516,0,0.463,1.0,1.0,0.5,0.633,0.68
DecisionTree,337,458,58,108,0.827,0.757,0.112,0.822,0.802,0.804
GaussianNB,374,413,103,71,0.819,0.84,0.2,0.82,0.811,0.812
SupportVectorM,298,381,135,147,0.707,0.67,0.262,0.704,0.679,0.679
RandomForest,344,419,97,101,0.794,0.773,0.188,0.792,0.776,0.776
KNN,368,399,117,77,0.798,0.827,0.227,0.8,0.791,0.792
NeuralNetwork,366,410,106,79,0.808,0.822,0.205,0.808,0.798,0.799


In [20]:
# Accuracy table: one row per model, one column per preprocessing.
# Columns come straight from PROCS/res_proc, so the cell no longer assumes
# that there are exactly five preprocessings. Rows follow the index of the
# first results table; each 'Acc' column is aligned to it.
results_acc = pd.DataFrame(
    {proc: table['Acc'] for proc, table in zip(PROCS, res_proc)},
    index=res_proc[0].index,
)
results_acc

Unnamed: 0,dropna,median,features,binarization,stdScaler
Dummy,0.485,0.463,0.485,0.485,0.485
DecisionTree,0.834,0.827,0.835,0.836,0.836
GaussianNB,0.826,0.819,0.84,0.818,0.818
SupportVectorM,0.721,0.707,0.721,0.691,0.842
RandomForest,0.811,0.794,0.807,0.806,0.807
KNN,0.794,0.798,0.79,0.8,0.829
NeuralNetwork,0.811,0.808,0.801,0.823,0.834


In [21]:
# F1-score table: one row per model, one column per preprocessing.
# Columns come straight from PROCS/res_proc, so the cell no longer assumes
# that there are exactly five preprocessings. Rows follow the index of the
# first results table; each 'F1-score' column is aligned to it.
results_f1 = pd.DataFrame(
    {proc: table['F1-score'] for proc, table in zip(PROCS, res_proc)},
    index=res_proc[0].index,
)
results_f1

Unnamed: 0,dropna,median,features,binarization,stdScaler
Dummy,0.653,0.633,0.653,0.653,0.653
DecisionTree,0.818,0.802,0.82,0.82,0.82
GaussianNB,0.826,0.811,0.84,0.825,0.825
SupportVectorM,0.721,0.679,0.72,0.686,0.841
RandomForest,0.805,0.776,0.802,0.802,0.804
KNN,0.799,0.791,0.798,0.8,0.823
NeuralNetwork,0.812,0.798,0.798,0.824,0.829


In [29]:
def autolabel(rects, ax=None):
    """Write each bar's height as a text label just above the bar.

    Parameters
    ----------
    rects : iterable of bar patches
        Bars to label, e.g. the container returned by ``Axes.bar``. Each
        item must provide ``get_height()``, ``get_x()`` and ``get_width()``.
    ax : Axes, optional
        Axes to draw the labels on. When omitted, every label is drawn on
        the Axes its own bar belongs to (``rect.axes``), so the function
        does not depend on a global variable named ``ax``.
    """
    for rect in rects:
        target = rect.axes if ax is None else ax
        height = rect.get_height()
        target.annotate('{}'.format(height),
                        # anchor at the top centre of the bar
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 2),  # 2 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom')

In [30]:
# Select the Qt backend for matplotlib figures.
# NOTE(review): presumably figures then open in separate windows instead of being
# embedded in the notebook output, and a Qt binding plus a display are needed - confirm this is intended.
%matplotlib qt

Accuracy y F1-score de los modelos para cada preprocesamiento

In [31]:
# One grouped bar chart per preprocessing: accuracy and F1-score of every
# model, drawn side by side.
x = np.arange(len(MODELS))  # one bar group per model
width = 0.4                 # width of each bar within a group

for proc in PROCS:
    # The Axes keeps the name `ax`; other cells may refer to it.
    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width/2, results_acc[proc].to_numpy(), width, label='Accuracy')
    rects2 = ax.bar(x + width/2, results_f1[proc].to_numpy(), width, label='F1-score')

    ax.set(ylabel='Scores', title='Scores para el procesamiento '+proc)
    ax.set_xticks(x)
    ax.set_xticklabels(MODELS)
    ax.legend()

    # Print the numeric value on top of every bar.
    for rects in (rects1, rects2):
        autolabel(rects)

    fig.tight_layout()
    plt.show()

Accuracy y F1-score de cada modelo para todos los preprocesamientos

In [40]:
# One grouped bar chart per model: its accuracy and F1-score under every
# preprocessing, drawn side by side.
x = np.arange(len(PROCS))  # one bar group per preprocessing
width = 0.4                # width of each bar within a group

for i, mod in enumerate(MODELS):
    # Row i of the score tables is taken to be model `mod` (matched by position).
    # The Axes keeps the name `ax`; other cells may refer to it.
    fig, ax = plt.subplots()
    rects1 = ax.bar(x - width/2, results_acc.iloc[i].to_numpy(), width, label='Accuracy')
    rects2 = ax.bar(x + width/2, results_f1.iloc[i].to_numpy(), width, label='F1-score')

    ax.set(ylabel='Scores', title='Scores del modelo '+mod)
    ax.set_xticks(x)
    ax.set_xticklabels(PROCS)
    ax.legend(loc=4)

    # Print the numeric value on top of every bar.
    for rects in (rects1, rects2):
        autolabel(rects)

    fig.tight_layout()
    plt.show()

# Relación de los atributos con la severidad

In [63]:
def readData(data_file):
    """Read the comma-separated dataset, treating '?' cells as missing values."""
    read_options = dict(sep=',', na_values='?')
    return pd.read_csv(data_file, **read_options)

# Load the dataset and normalise its "no information" markers in one chain.
# The columns are replaced by explicit assignment instead of
# `data.col.replace(..., inplace=True)`: that chained in-place call works on
# a temporary Series and is not guaranteed to write back into `data`
# (it does not under pandas Copy-on-Write). Building `data` in a single
# expression also makes the cell safe to re-run.
data = (
    readData(DATA)
    .rename(columns={'BI-RADS': 'BiRads'})  # so the column can be accessed as data.BiRads
    .assign(
        # BiRads 0 means the mammogram was insufficient
        BiRads=lambda df: df['BiRads'].replace(0, pd.NA),
        # the same applies to Shape 'N'
        Shape=lambda df: df['Shape'].replace('N', pd.NA),
    )
)
data

Unnamed: 0,BiRads,Age,Shape,Margin,Density,Severity
0,5.0,67.0,L,5.0,3.0,maligno
1,4.0,43.0,R,1.0,,maligno
2,5.0,58.0,I,5.0,3.0,maligno
3,4.0,28.0,R,1.0,3.0,benigno
4,5.0,74.0,R,5.0,,maligno
...,...,...,...,...,...,...
956,4.0,47.0,O,1.0,3.0,benigno
957,4.0,56.0,I,5.0,3.0,maligno
958,4.0,64.0,I,5.0,3.0,benigno
959,5.0,66.0,I,5.0,3.0,maligno


In [64]:
# Split the rows by the value of the Severity column.
severity = data['Severity']
datam = data.loc[severity == 'maligno']  # rows labelled 'maligno'
datab = data.loc[severity == 'benigno']  # rows labelled 'benigno'

In [65]:
# Quick look at the column at position 3 (the fourth column of `data`).
data[(data.columns[3])]

0      5.0
1      1.0
2      5.0
3      1.0
4      5.0
      ... 
956    1.0
957    5.0
958    5.0
959    5.0
960    3.0
Name: Margin, Length: 961, dtype: float64

In [90]:
# For every attribute (all columns except the last one), draw a stacked bar
# chart with the number of 'maligno' and 'benigno' rows per attribute value.
width = 0.4

for c in data.columns[:-1]:
    # Distinct non-missing values of the attribute, in ascending order.
    values = sorted(set(data[c].dropna()))

    # Occurrences of each value within each class (0 when a value is absent).
    malignos = Counter(datam[c].dropna())
    benignos = Counter(datab[c].dropna())
    countm = [malignos[v] for v in values]
    countb = [benignos[v] for v in values]

    fig, ax = plt.subplots()

    # Benign counts are stacked on top of the malignant ones.
    ax.bar(values, countm, width, label='Maligno')
    ax.bar(values, countb, width, bottom=countm, label='Benigno')

    ax.set(
        xlabel=c,
        ylabel='Ejemplos',
        title='Distribución de los casos según el atributo '+c,
    )
    ax.legend()

    plt.show()