In [1]:
from sklearn import datasets

In [3]:
df = datasets.load_breast_cancer()

print('Información del dataset:')
print(df.keys())

Información del dataset:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [5]:
#Características del dataset:
print(df.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [6]:
#Seleccionamos todas las columnas
X = df.data

#Datos correspondientes a las etiquetas
y = df.target

## Regresión Logística 

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Escalar los datos para achicar las magnitudes y rangos llevándolos a un mismo nivel 
from sklearn.preprocessing import StandardScaler
escalar = StandardScaler()
X_train = escalar.fit_transform(X_train)
X_test = escalar.fit_transform(X_test)

In [9]:
#Defino el algoritmo a utilizaar
from sklearn.linear_model import LogisticRegression
algoritmo = LogisticRegression()

In [10]:
#Entrenamiento del modelo
algoritmo.fit(X_train, y_train)

LogisticRegression()

In [11]:
#Predicción
y_pred = algoritmo.predict(X_test)

In [13]:
print(y_pred)
print(y_test)

[0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 0 0 1 0
 1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0
 0 0 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0
 1 0 1]
[0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1 1 1 0 0 1 0
 1 1 1 0 0 0 0 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0
 0 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0
 1 0 1]


## Cálculo de métricas para verificar el rendimiento del modelo

In [16]:
#Verifico la matriz de confusión - Para modelos de clasificación
from sklearn.metrics import confusion_matrix

matriz = confusion_matrix(y_test, y_pred)
print(matriz)

[[40  5]
 [ 2 67]]


40 datos verdaderos positivos (datos 1 predecidos correctamente).  
67 verdaderos negativos (datos 0 predecidos correctamente).  
5 falsos negativos (no predecidos correctamente, era 1 y predijo 0).  
2 falsos positivos (0 y predijo 1).  
Por lo tanto 7 resultados no fueron predecidos correctamente.

In [17]:
#Para Datasets desbalanceados: 
#Precisión del modelo
from sklearn.metrics import precision_score
precision = precision_score(y_test, y_pred)
print(precision)

0.9305555555555556


In [18]:
#Exactitud del modelo
from sklearn.metrics import accuracy_score
exactitud = accuracy_score(y_test, y_pred)
print(exactitud)

0.9385964912280702


In [19]:
#Sensibilidad (recall)
from sklearn.metrics import recall_score
sensibilidad = recall_score(y_test, y_pred)
print(sensibilidad)

0.9710144927536232


In [20]:
#Cálculo de puntaje F1 (Combinación entre precisión y sensibilidad)
from sklearn.metrics import f1_score
pf1 = f1_score(y_test, y_pred)
print(pf1)

0.9503546099290779


In [21]:
#Cálculo curva ROC - AUC
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, y_pred)
print(roc_auc)

0.929951690821256
