In [1]:
from statsmodels.datasets import get_rdataset
# Fetch the `biopsy` dataset from the R package MASS; the table itself is
# reached through the `.data` attribute in the cells below.
biopsy = get_rdataset(dataname="biopsy", package="MASS")

In [2]:
# Encode the diagnosis as a binary target column `cl`: 0 = benign, 1 = malignant.
# An explicit lookup with Series.map is used rather than Series.replace:
# replacing strings with ints depends on pandas implicitly downcasting the
# object column, which pandas 2.2 deprecates (FutureWarning).
# Any label other than the two keys below would become NaN.
biopsy.data['cl'] = biopsy.data['class'].map({'benign': 0, 'malignant': 1})

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
# We fix a seed so that the split is repeatable.
# This is generally not necessary (nor advisable!);
# here we do it for teaching purposes.
SEED = 1234
# Seed NumPy's global generator as well, for any later code that draws from it.
np.random.seed(SEED)
# Passing random_state pins the split itself, so it no longer depends on how
# many random numbers other code consumed from the global stream beforehand.
biopsy_train, biopsy_test = train_test_split(
    biopsy.data, test_size=0.25, random_state=SEED
)
# Sizes of the two partitions (75% train / 25% test).
print(len(biopsy_train))
print(len(biopsy_test))

524
175


In [5]:
from statsmodels.formula.api import logit
# Logistic regression of the binary label `cl` on five of the predictors,
# fitted by maximum likelihood on the training partition only.
formula = 'cl ~ V1 + V4 + V6 + V7 + V8'
model = logit(formula, data=biopsy_train).fit()
# Last expression of the cell: renders the coefficient table.
model.summary()

Optimization terminated successfully.
         Current function value: 0.087157
         Iterations 10


0,1,2,3
Dep. Variable:,cl,No. Observations:,509.0
Model:,Logit,Df Residuals:,503.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 04 Mar 2021",Pseudo R-squ.:,0.8646
Time:,22:29:40,Log-Likelihood:,-44.363
converged:,True,LL-Null:,-327.56
Covariance Type:,nonrobust,LLR p-value:,3.69e-120

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-10.3177,1.308,-7.885,0.000,-12.882,-7.753
V1,0.7604,0.149,5.116,0.000,0.469,1.052
V4,0.5434,0.147,3.691,0.000,0.255,0.832
V6,0.3649,0.094,3.887,0.000,0.181,0.549
V7,0.4723,0.158,2.988,0.003,0.163,0.782
V8,0.4147,0.120,3.445,0.001,0.179,0.651


In [6]:
# Drop the test rows containing NaN first, so that no NaN shows up among
# the predicted probabilities.
biopsy_test_complete = biopsy_test.dropna()
test_probs = model.predict(biopsy_test_complete)
test_probs.head()

341    0.001093
532    0.001093
687    0.007106
83     0.045540
428    0.000682
dtype: float64

In [7]:
# Turn probabilities into hard labels: round to the nearest integer,
# then cast the result to an integer dtype.
rounded_probs = test_probs.round()
test_preds = rounded_probs.astype(int)
test_preds.head()

341    0
532    0
687    0
83     0
428    0
dtype: int32

In [8]:
# Ground-truth labels, taken from the same NaN-free test rows that were
# passed to the model, so they line up with the predictions.
test_rows_complete = biopsy_test.dropna()
test_gt = test_rows_complete['cl']
test_gt.head()

341    0
532    0
687    0
83     0
428    0
Name: cl, dtype: int64

In [10]:
# Element-wise AND between two boolean Series.
# hit:  the true label is 1 and the prediction is 1.
# miss: the true label is 1 but the prediction is 0.
is_positive = test_gt == 1
hit = is_positive & (test_preds == 1)
miss = is_positive & (test_preds == 0)
# Summing a boolean Series counts its True entries.
print("Numero di hit:", hit.sum())
print("Numero di miss:", miss.sum())

Numero di hit: 61
Numero di miss: 3


In [11]:
# False alarms: the true label is 0 but the prediction is 1.
false_alarms = (test_gt == 0) & (test_preds == 1)
print("Numero di falsi allarmi:", false_alarms.sum())

Numero di falsi allarmi: 2


In [12]:
# True negatives: the true label is 0 and the prediction is 0.
true_negatives = (test_gt == 0) & (test_preds == 0)
print("Numero di veri negativi:", true_negatives.sum())

Numero di veri negativi: 108


In [15]:
from sklearn.metrics import confusion_matrix
# Compute the confusion matrix once and reuse it for everything below.
# Rows are the true labels and columns the predicted labels.
cm = confusion_matrix(test_gt, test_preds)

# For binary 0/1 labels, flattening the 2x2 matrix row by row yields the
# four counts in the order tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
print("Numero di True Negative:", tn)
print("Numero di False Positive:", fp)
print("Numero di False Negative:", fn)
print("Numero di True Positive:", tp)

print(cm)
# Row totals (number of samples per true class); the reshape turns the
# vector into a column vector.
cm.sum(axis=1).reshape(-1, 1)

Numero di True Negative: 108
Numero di False Positive: 2
Numero di False Negative: 3
Numero di Ture Positive: 61
[[108   2]
 [  3  61]]


array([[110],
       [ 64]], dtype=int64)

In [17]:
# Row-normalise the confusion matrix: each row is divided by its own total,
# so every row sums to 1. The reshape turns the totals into a column vector
# so that the division broadcasts row by row.
# NOTE: `cm` is rebound here from raw counts to per-row rates.
row_totals = cm.sum(axis=1).reshape(-1, 1)
cm = cm / row_totals
cm

array([[0.98181818, 0.01818182],
       [0.046875  , 0.953125  ]])

In [18]:
# Unpack the row-normalised matrix, flattened row by row, into the four
# rates: true-negative, false-positive, false-negative and true-positive.
tnr, fpr, fnr, tpr = cm.flatten()
for label, rate in (("TNR:", tnr), ("FPR:", fpr), ("FNR:", fnr), ("TPR:", tpr)):
    print(label, rate)

TNR: 0.9818181818181818
FPR: 0.01818181818181818
FNR: 0.046875
TPR: 0.953125


array([[0.98181818, 0.01818182],
       [0.046875  , 0.953125  ]])