In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score

# Logistic regression demo for computer-generated stroke data

# read data (replace with your own path)
df = pd.read_csv('stroke_v1.csv', sep=',')
df.head(10)



Unnamed: 0,ID,Weight,Smoking,Exercise,Cholesterol,Income,Exphappiness,Birthyear,Sex,Stroke
0,1,117,1,2,8.0,1080,27,1913,M,1
1,2,62,0,8,5.5,2120,55,1949,M,0
2,3,74,0,6,4.8,3170,65,1976,M,0
3,4,77,0,5,4.2,4740,61,1973,F,0
4,5,67,0,8,4.5,1900,53,1929,M,0
5,6,76,0,6,6.2,3410,72,1959,F,0
6,7,63,0,7,4.1,3640,71,1979,F,0
7,8,75,0,5,5.2,2500,99,1960,F,0
8,9,70,0,6,4.9,2110,48,1922,F,0
9,10,82,0,5,5.8,2560,34,2007,F,1


In [2]:
# re-encode gender column
df['Sex'].replace(['M','F'], [1,2], inplace=True)

# drop ID
df.drop('ID', axis=1, inplace=True)

df.head(10)

Unnamed: 0,Weight,Smoking,Exercise,Cholesterol,Income,Exphappiness,Birthyear,Sex,Stroke
0,117,1,2,8.0,1080,27,1913,1,1
1,62,0,8,5.5,2120,55,1949,1,0
2,74,0,6,4.8,3170,65,1976,1,0
3,77,0,5,4.2,4740,61,1973,2,0
4,67,0,8,4.5,1900,53,1929,1,0
5,76,0,6,6.2,3410,72,1959,2,0
6,63,0,7,4.1,3640,71,1979,2,0
7,75,0,5,5.2,2500,99,1960,2,0
8,70,0,6,4.9,2110,48,1922,2,0
9,82,0,5,5.8,2560,34,2007,2,1


In [3]:
df.describe()

Unnamed: 0,Weight,Smoking,Exercise,Cholesterol,Income,Exphappiness,Birthyear,Sex,Stroke
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,78.752,0.223,5.119,5.6584,2827.99,51.024,1955.716,1.494,0.4
std,13.939038,0.416467,1.924199,1.312262,1105.714549,16.805888,21.149341,0.500214,0.490143
min,27.0,0.0,0.0,0.1,-790.0,0.0,1878.0,1.0,0.0
25%,70.0,0.0,4.0,4.8,2087.5,40.0,1942.0,1.0,0.0
50%,78.0,0.0,5.0,5.6,2830.0,51.0,1957.0,1.0,0.0
75%,88.0,0.0,6.0,6.5,3562.5,62.0,1971.0,2.0,1.0
max,130.0,1.0,10.0,9.9,5860.0,100.0,2026.0,2.0,1.0


In [4]:
# split into explanatory and response variables 
X = df.iloc[:,:8]
Y = df.iloc[:,8]


In [5]:
# build and fit model
reg = LogisticRegression()
reg.fit(X,Y)

print("Coefficients: ",reg.coef_)
print("Intercept: ", reg.intercept_)

# compute predicted values from training set
Y_pred = reg.predict(X)

cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print("Accuracy calculated from the training set = %.3f" % (accuracy))

print(classification_report(Y, Y_pred, target_names=['no', 'yes']))


Coefficients:  [[ 7.85136330e-02  5.05374309e-02 -2.96795154e-01  1.53651555e-01
   5.16102078e-06 -2.99075413e-03 -3.07638384e-03  1.28246222e-02]]
Intercept:  [-0.00091598]
Confusion matrix:
 [[504  96]
 [145 255]]
Accuracy calculated from the training set = 0.759
              precision    recall  f1-score   support

          no       0.78      0.84      0.81       600
         yes       0.73      0.64      0.68       400

    accuracy                           0.76      1000
   macro avg       0.75      0.74      0.74      1000
weighted avg       0.76      0.76      0.76      1000



In [6]:
# cross-validate
# number of folds
k = 10
scores = cross_val_score(estimator=reg,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

Accuracies from 10 individual folds:
[0.75 0.79 0.75 0.69 0.69 0.76 0.77 0.81 0.67 0.78]
Accuracy calculated using 10-fold cross validation = 0.746


In [7]:
# retrieve estimated probabilities (from training set)
reg.predict_proba(X)

array([[0.0193815 , 0.9806185 ],
       [0.94261884, 0.05738116],
       [0.81431552, 0.18568448],
       ...,
       [0.24648923, 0.75351077],
       [0.11986899, 0.88013101],
       [0.6354463 , 0.3645537 ]])