# Regressione logistica 

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
# LogisticRegression is imported from the public `sklearn.linear_model` package:
# the private `sklearn.linear_model.logistic` module was deprecated in
# scikit-learn 0.22 and removed in 0.24, so the old path fails on current versions.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the dataset from 'pid.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='pid.csv')

In [7]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,1,6,148,72,35,0,33.6,0.627,50,pos
1,2,1,85,66,29,0,26.6,0.351,31,neg
2,3,8,183,64,0,0,23.3,0.672,32,pos
3,4,1,89,66,23,94,28.1,0.167,21,neg
4,5,0,137,40,35,168,43.1,2.288,33,pos


In [9]:
# Remove the 'Unnamed: 0' column.
# The result is re-assigned (rather than using `inplace=True`) and a missing
# column is ignored, so re-running this cell does not raise a KeyError.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [10]:
# Preview the first five rows again to check the frame after the column drop.
df.head(n=5)

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,pos
1,1,85,66,29,0,26.6,0.351,31,neg
2,8,183,64,0,0,23.3,0.672,32,pos
3,1,89,66,23,94,28.1,0.167,21,neg
4,0,137,40,35,168,43.1,2.288,33,pos


In [11]:
# Descriptive names for the nine columns, in column order (eight features, then the label).
columns = [
    'Number of times pregnant',
    'Plasma glucose concentration',
    'Diastolic blood pressure (mm Hg)',
    'Triceps skin fold thickness',
    '2-Hour serum insulin (mu U/ml)',
    'Body mass index',
    'Diabetes pedigree function',
    'Age',
    'Class',
]

In [12]:
# Replace the column labels positionally with the descriptive names in `columns`.
df.columns = pd.Index(columns)

In [13]:
# Display the (rows, columns) dimensions of the frame after renaming.
df.shape

(768, 9)

In [18]:
# Encode the string labels as integers: 'pos' -> 1, 'neg' -> 0.
cl_f = {'pos': 1, 'neg': 0}
# Values that are not keys of `cl_f` pass through unchanged, so re-running this
# cell on an already-encoded column is a no-op instead of turning every label
# into NaN (which is what mapping with the bare dict does).
df['Class'] = df['Class'].map(lambda label: cl_f.get(label, label))
# Fail loudly if anything other than the two expected classes is present.
assert df['Class'].isin(list(cl_f.values())).all(), "unexpected value in 'Class' column"

In [19]:
# Keep the encoded labels as a standalone NumPy array (the model target).
cl = df.loc[:, 'Class'].values

In [20]:
# Display the extracted label array to check the 0/1 encoding.
# NOTE(review): this prints every label; a slice such as `cl[:10]` would keep the output short.
cl

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

In [21]:
# The labels already live in `cl`; remove them from the frame so that only the
# feature columns remain.
# Re-assigning (rather than `inplace=True`) and ignoring a missing column makes
# the cell safe to re-run: a second execution no longer raises a KeyError.
df = df.drop('Class', axis=1, errors='ignore')

In [23]:
# Preview the first five rows of the feature-only frame.
df.head(n=5)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure (mm Hg),Triceps skin fold thickness,2-Hour serum insulin (mu U/ml),Body mass index,Diabetes pedigree function,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [24]:
# Standardize every column of `df` (zero mean, unit variance) with a scaler
# fitted on `df` itself; the result is stored in `df_sc`.
df_sc = StandardScaler().fit(df).transform(df)

In [25]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# The raw features are split first and the scaler is fitted on the training rows
# only, so that test-set statistics do not leak into the preprocessing step.
x_train, x_test, y_train, y_test = train_test_split(df, cl, test_size=0.3, random_state=12345)
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [26]:
# The solver is pinned explicitly. The recorded fit output shows
# solver='liblinear', which was the library default when this notebook was run;
# scikit-learn 0.22 changed the default to 'lbfgs', so relying on the default
# would fit a different model on newer versions.
lr = LogisticRegression(solver='liblinear')

In [27]:
# Fit the classifier on the training split; the fitted estimator is the cell output.
lr.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Predict class labels for the held-out test rows.
lr_pred = lr.predict(X=x_test)

In [29]:
from sklearn.metrics import confusion_matrix, classification_report

In [30]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=lr_pred))

[[150   8]
 [ 30  43]]


In [31]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_true=y_test, y_pred=lr_pred))

             precision    recall  f1-score   support

          0       0.83      0.95      0.89       158
          1       0.84      0.59      0.69        73

avg / total       0.84      0.84      0.83       231

