In [4]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score
## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
# removing observations with NA
heart=heart.dropna()

In [10]:
# defining the input and target
X=heart[['age','currentSmoker','totChol','BMI','heartRate']]
Y=heart[['TenYearCHD']]

#splitting the data
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = .2)


In [12]:
#build the logistic model
logit_md = LogisticRegression().fit(X_train,Y_train)

#predict on test dataset
logit_pred = logit_md.predict_proba(X_test)[:,1]
logit_pred

  y = column_or_1d(y, warn=True)


array([0.34068754, 0.3017265 , 0.22959097, 0.15171913, 0.2981333 ,
       0.15013727, 0.15934743, 0.05238756, 0.10896679, 0.09222478,
       0.10937473, 0.1324004 , 0.15899621, 0.16376026, 0.1505368 ,
       0.12055148, 0.11220532, 0.09414008, 0.11569443, 0.19256998,
       0.22579007, 0.21633915, 0.03266837, 0.18457303, 0.10208955,
       0.08792379, 0.25481118, 0.32457707, 0.14563606, 0.13989661,
       0.07533387, 0.09553404, 0.27563287, 0.06126309, 0.3853116 ,
       0.16373873, 0.09033948, 0.23018701, 0.06949665, 0.12263591,
       0.22196028, 0.06566959, 0.26796331, 0.05749066, 0.04848886,
       0.22673646, 0.10419491, 0.1295995 , 0.03300386, 0.12076748,
       0.08970661, 0.21887499, 0.04064262, 0.09258246, 0.26518305,
       0.0571313 , 0.0370131 , 0.22350595, 0.14544424, 0.16367919,
       0.05943135, 0.10463242, 0.15869327, 0.1280339 , 0.06923567,
       0.11665815, 0.26079538, 0.09360746, 0.14454472, 0.04874575,
       0.04258607, 0.20457745, 0.06886285, 0.1646839 , 0.08661

In [13]:
#changing liklihoods to label
logit_label= np.where(logit_pred<0.25,0,1)
logit_label

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,

In [15]:
# constructing confusion matrix
confusion_matrix(Y_test, logit_label)

array([[538,  84],
       [ 71,  39]])

In [16]:
#computing accuaracy
accuracy_score(Y_test, logit_label)

0.7882513661202186