# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

In [2]:
# Load the dataset from diabetes.csv into a DataFrame.
diabetes = pd.read_csv('diabetes.csv')

In [3]:
# Print column names, non-null counts, and dtypes as a quick sanity check.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
def outlier_check(value, mean, std, n_std=3):
    """Flag a value lying more than ``n_std`` standard deviations from the mean.

    Parameters
    ----------
    value : number
        The observation to test.
    mean : number
        Mean of the distribution the observation came from.
    std : number
        Standard deviation of that distribution.
    n_std : number, optional
        Half-width of the accepted band, in standard deviations. Defaults to
        3, the threshold that used to be hard-coded.

    Returns
    -------
    list
        ``[is_outlier, value]``. ``is_outlier`` is ``True`` when ``value`` is
        strictly above ``mean + n_std*std`` or strictly below
        ``mean - n_std*std``; a value exactly on a bound is not an outlier.
    """
    upper_bound = mean + n_std * std
    lower_bound = mean - n_std * std
    if (upper_bound < value) or (lower_bound > value):
        return [True, value]
    return [False, value]
    
# Report, for every column, the values that outlier_check flags as lying too
# far from that column's mean.
for x in diabetes.columns:
    print("\n")
    print(x)
    column = diabetes[x]
    # Check the dtype up front instead of relying on a bare ``except``, which
    # would also hide unrelated errors (typos, KeyboardInterrupt, ...).
    if not pd.api.types.is_numeric_dtype(column):
        print("Non-numeric column")
        continue
    # The statistics are identical for every value in the column, so compute
    # them once rather than on each iteration.
    column_mean = column.mean()
    column_std = column.std()
    outlier_list = [
        value for value in column
        if outlier_check(value, column_mean, column_std)[0]
    ]
    print(outlier_list)



Pregnancies
[15, 17, 14, 14]


Glucose
[0, 0, 0, 0, 0]


BloodPressure
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


SkinThickness
[99]


Insulin
[543, 846, 495, 485, 495, 478, 744, 680, 545, 465, 579, 474, 480, 600, 440, 540, 480, 510]


BMI
[0.0, 0.0, 0.0, 0.0, 0.0, 67.1, 0.0, 0.0, 59.4, 0.0, 0.0, 57.3, 0.0, 0.0]


DiabetesPedigreeFunction
[2.2880000000000003, 1.893, 1.781, 2.329, 1.476, 2.137, 1.7309999999999999, 1.6, 2.42, 1.699, 1.6980000000000002]


Age
[69, 72, 81, 70, 69]


Outcome
[]


All zero values for blood pressure, glucose levels, skin thickness, and BMI must be errors (otherwise the patients would be deceased).

Rows containing those zero values can be removed.

In [5]:
# Keep only the rows whose BMI, blood pressure, glucose, and skin-thickness
# readings are all non-zero.
zero_invalid_columns = ['BMI', 'BloodPressure', 'Glucose', 'SkinThickness']
rows_without_zeros = (diabetes[zero_invalid_columns] != 0).all(axis=1)
diabetes = diabetes[rows_without_zeros]

In [6]:
# Display the filtered frame; the index keeps its original labels, so gaps mark the dropped rows.
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


Next, we would need to create dummy variables for Pregnancies and Age, as they're classifications. (This is not done yet: the model below uses both columns as plain numeric features.)

In [7]:
# Separate the target column from the predictor columns.
target_column = "Outcome"
y = diabetes[target_column]
X = diabetes.drop(columns=[target_column])

In [8]:
# Hold out a quarter of the rows for testing; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [9]:
# Build the cross-validated logistic regression first, then fit it on the
# training split; fit() returns the estimator itself, so log_model is the
# fitted model either way.
log_model = LogisticRegressionCV(Cs=3, fit_intercept=True, max_iter=250)
log_model.fit(X_train, y_train)
# One coefficient per feature column of X_train.
log_model.coef_

array([[ 8.27441318e-02,  3.60417518e-02, -1.32602610e-02,
         1.83982988e-02, -8.22349250e-04,  6.61708633e-02,
         1.18703482e+00,  3.43714114e-02]])

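Setting `Cs=3` (a grid of three candidate regularization strengths for cross-validation to choose from) seems to yield the largest model coefficients. Note that these are regression coefficients, not correlation coefficients.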

In [10]:
# Predict the class label for every row of the held-out test set.
predictions= log_model.predict(X_test)

In [13]:
# Confusion matrix normalized over the true labels, so each row sums to 1.
sklearn.metrics.confusion_matrix(
    y_true=y_test,
    y_pred=predictions,
    normalize='true',
)

array([[0.93333333, 0.06666667],
       [0.48837209, 0.51162791]])

The model correctly identifies about 93% of the patients who truly don't have diabetes (the true-negative rate), but only about 51% of the patients who actually do have diabetes (the true-positive rate). We may need to remove the columns that correlate the least with the outcome.

In [12]:
# Matthews correlation coefficient of the test-set predictions.
sklearn.metrics.matthews_corrcoef(
    y_true=y_test,
    y_pred=predictions,
)

0.510509701219046

Since this Matthews correlation coefficient is positive (about 0.51 on a scale from -1 to 1), it suggests the model is moderately good at predicting who does or doesn't have diabetes.