In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('diabetes_preprocessed.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,4.0,110.0,66.0,0.0,0.0,31.9,0.471,29.0,0.0
1,1.0,109.0,60.0,8.0,182.0,25.4,0.947,21.0,0.0
2,2.0,84.0,0.0,0.0,0.0,0.0,0.304,21.0,0.0
3,5.0,117.0,92.0,0.0,0.0,34.1,0.337,38.0,0.0
4,0.0,119.0,64.0,18.0,92.0,34.9,0.725,23.0,0.0
...,...,...,...,...,...,...,...,...,...
531,4.0,83.0,86.0,19.0,0.0,29.3,0.317,34.0,0.0
532,2.0,141.0,58.0,34.0,128.0,25.4,0.699,24.0,0.0
533,4.0,90.0,0.0,0.0,0.0,28.0,0.610,31.0,0.0
534,6.0,134.0,70.0,23.0,130.0,35.4,0.542,29.0,1.0


## Scaling the variables

In [3]:
targets = df['Outcome']

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
df.columns.values

array(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype=object)

In [6]:
unscaled_inputs = df.columns[:-1]

In [7]:
scaler = StandardScaler()

In [8]:
scaled_data = scaler.fit_transform(df[unscaled_inputs])

In [9]:
scaled_df = pd.DataFrame(scaled_data, columns=unscaled_inputs)
scaled_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-0.026481,-0.463686,-0.151051,-1.256405,-0.675364,-0.096596,-0.038767,-0.45632
1,-0.878108,-0.49333,-0.443867,-0.772138,0.760248,-0.884382,1.349807,-1.134788
2,-0.594232,-1.234432,-3.372021,-1.256405,-0.675364,-3.962804,-0.525935,-1.134788
3,0.257395,-0.256178,1.117816,-1.256405,-0.675364,0.170039,-0.429668,0.306956
4,-1.161983,-0.19689,-0.248656,-0.166805,0.05033,0.266997,0.702195,-0.965171


## Splitting the dataset

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_test_split(scaled_data,targets)

[array([[ 0.54127013, -0.19688965, -0.93189247, ..., -0.67834559,
          2.43207785, -0.11708629],
        [ 2.24452331, -0.58226244,  0.5321848 , ..., -1.1025377 ,
         -1.01310227,  0.81580661],
        [-1.16198304,  0.09955095,  1.99626208, ...,  4.16956421,
         -0.48217696, -0.7107454 ],
        ...,
        [-0.87810751, -0.9083471 , -0.15105126, ..., -1.58732868,
         -0.43841938, -0.79555385],
        [-1.16198304,  0.36634749, -0.44386671, ...,  0.2306375 ,
          0.14501501, -1.13478763],
        [ 0.2573946 , -1.17514365, -0.0534461 , ..., -0.30263258,
         -0.35090422, -0.88036229]]),
 array([[-0.87810751, -0.25617778,  0.92260541, ...,  0.21851773,
         -0.23713452,  0.47657283],
        [-0.02648093, -0.4636862 , -0.15105126, ..., -0.09659641,
         -0.03876682, -0.45632007],
        [ 1.10902119,  0.75172028,  0.43457965, ...,  1.23657878,
          0.09250592,  0.13733905],
        ...,
        [ 1.10902119, -0.16724559, -3.37202125, ..., -

In [12]:
x_train,x_test,y_train,y_test = train_test_split(scaled_data,targets,train_size=0.8,random_state=20)

In [13]:
x_train.shape

(428, 8)

In [14]:
y_train.shape

(428,)

In [15]:
x_test.shape

(108, 8)

In [16]:
y_test.shape

(108,)

## Create model

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [18]:
reg = LogisticRegression(penalty='l1',solver='saga',max_iter=1000,random_state=20)

In [19]:
reg.fit(x_train,y_train)

In [20]:
reg.score(x_train,y_train)

0.7523364485981309

In [21]:
model_outputs = reg.predict(x_train)

In [22]:
model_outputs.shape

(428,)

In [34]:
print (reg.score(x_test,y_test)*100,'%')

75.0 %


## Find intercepts and coefficients

In [24]:
reg.intercept_

array([-0.03164824])

In [25]:
reg.coef_

array([[ 0.44790029,  1.18435064, -0.16229852,  0.        , -0.19904526,
         0.69349578,  0.43127151,  0.11613871]])

In [26]:
df.columns.values

array(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype=object)

In [27]:
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [28]:
summary_table = pd.DataFrame(columns=['Feature Name'], data = columns)
summary_table['Coefficient'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Pregnancies,0.4479
1,Glucose,1.184351
2,BloodPressure,-0.162299
3,SkinThickness,0.0
4,Insulin,-0.199045
5,BMI,0.693496
6,DiabetesPedigreeFunction,0.431272
7,Age,0.116139


In [29]:
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()

In [30]:
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-0.031648
1,Pregnancies,0.4479
2,Glucose,1.184351
3,BloodPressure,-0.162299
4,SkinThickness,0.0
5,Insulin,-0.199045
6,BMI,0.693496
7,DiabetesPedigreeFunction,0.431272
8,Age,0.116139


## Interpret Coefficients

In [32]:
summary_table['Odds Ratio'] = np.exp(summary_table.Coefficient)
summary_table.sort_values(['Odds Ratio'], ascending=False)

Unnamed: 0,Feature Name,Coefficient,Odds Ratio
2,Glucose,1.184351,3.268564
6,BMI,0.693496,2.000697
1,Pregnancies,0.4479,1.565023
7,DiabetesPedigreeFunction,0.431272,1.539213
8,Age,0.116139,1.123152
4,SkinThickness,0.0,1.0
0,Intercept,-0.031648,0.968847
3,BloodPressure,-0.162299,0.850187
5,Insulin,-0.199045,0.819513
