In [3]:
SEED=185

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [40]:
DATA='data/mamografias.csv'

# Función para leer los datos
def readData(data_file):
    return pd.read_csv(data_file,sep=',')

data = readData(DATA) # Lectura de los datos

In [41]:
data.head() # Comprobamos el formato de los datos, 6 columnas siendo la última la variable a predecir

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5,67,L,5,3,maligno
1,4,43,R,1,?,maligno
2,5,58,I,5,3,maligno
3,4,28,R,1,3,benigno
4,5,74,R,5,?,maligno


In [5]:
data.shape[0] # 961 instancias

961

In [6]:
# Comprobamos que las clases están balanceadas: 46% frente a 54% aprox.
print(data[data['Severity']=='maligno'].shape[0])
print(data[data['Severity']=='benigno'].shape[0])

445
516


In [7]:
data.isna().sum() # Hay algunos valores perdidos

BI-RADS     0
Age         0
Shape       0
Margin      0
Density     0
Severity    0
dtype: int64

In [8]:
data=data.dropna() # Eliminamos las instancias con valores perdidos
data.shape[0] # Nos quedamos con 847 instancias

961

In [9]:
print(data[data['Severity']=='maligno'].shape[0]) # Las clases siguen balanceadas
print(data[data['Severity']=='benigno'].shape[0])

445
516


In [44]:
data=np.array(data)
data

array([['5', '67', 'L', '5', '3', 'maligno'],
       ['4', '43', 'R', '1', '?', 'maligno'],
       ['5', '58', 'I', '5', '3', 'maligno'],
       ...,
       ['4', '64', 'I', '5', '3', 'benigno'],
       ['5', '66', 'I', '5', '3', 'maligno'],
       ['4', '62', 'L', '3', '3', 'benigno']], dtype=object)

In [47]:
# Sklearn Models convert data to numeric
le = preprocessing.LabelEncoder()
data[:,2] = le.fit_transform(data[:,2]) # Shape
#data[['Margin']] = le.fit_transform(data[['Margin']]) # Esta columna no daría problema, porque los valores son strings que representan números
#data[['Severity']] = le.fit_transform(data[['Severity']])

In [48]:
data

array([['5', '67', 1, '5', '3', 'maligno'],
       ['4', '43', 4, '1', '?', 'maligno'],
       ['5', '58', 0, '5', '3', 'maligno'],
       ...,
       ['4', '64', 0, '5', '3', 'benigno'],
       ['5', '66', 0, '5', '3', 'maligno'],
       ['4', '62', 1, '3', '3', 'benigno']], dtype=object)

In [50]:
# Separamos en datos y target (label)
x=data[:,:-1]
y=data[:,-1]

In [51]:
# Comprobación de que mantiene la proporción de instancias de cada clase
from collections import Counter
print(Counter(y))
Counter(y)

Counter({'benigno': 516, 'maligno': 445})


Counter({'benigno': 516, 'maligno': 445})

In [52]:
# https://stats.stackexchange.com/questions/147175/how-is-the-confusion-matrix-reported-from-k-fold-cross-validation
# https://stackoverflow.com/questions/40057049/using-confusion-matrix-as-scoring-metric-in-cross-validation-in-scikit-learn
def KFoldConfusionMatrix(model, data, target):
    conf_matrix_list_of_arrays = []
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_index, test_index in kf.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        conf_matrix = confusion_matrix(y_test, model.predict(X_test))
        conf_matrix_list_of_arrays.append(conf_matrix)
    return np.sum(conf_matrix_list_of_arrays, axis=0)

In [53]:
# Decision Tree
dt=DecisionTreeClassifier()
dt_cf=KFoldConfusionMatrix(dt, x, y)
dt_cf

ValueError: could not convert string to float: '?'

In [308]:
# Gaussian Naive Bayes
gnb=GaussianNB()
gnb_cf=KFoldConfusionMatrix(gnb, data_train, target_train)
gnb_cf

array([[251,  77],
       [ 45, 262]])

In [309]:
# Multinomial Naive Bayes
mnb=MultinomialNB()
mnb_cf=KFoldConfusionMatrix(mnb, data_train, target_train)
mnb_cf

array([[245,  83],
       [ 54, 253]])

In [310]:
# Random Forest
rf=RandomForestClassifier(n_jobs=4)
rf_cf=KFoldConfusionMatrix(rf, data_train, target_train)
rf_cf

array([[270,  58],
       [ 69, 238]])

In [311]:
# KNN
knn=KNeighborsClassifier()
knn_cf=KFoldConfusionMatrix(knn, data_train, target_train)
knn_cf

array([[249,  79],
       [ 58, 249]])

In [312]:
# Neural Network
rn=MLPClassifier(max_iter=500)
rn_cf=KFoldConfusionMatrix(rn, data_train, target_train)
rn_cf

array([[251,  77],
       [ 52, 255]])