In [6]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)


## define csv file to read in the bucket
file_key= 'framingham.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
## removing missing observations (with NAs)
heart = heart.dropna()

In [8]:
## defining input and output variables
X = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
Y = heart['TenYearCHD']

In [9]:
## splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [12]:
## building the logistic model
logit_md = LogisticRegression().fit(X_train, Y_train)

## predicting on the test dataset
logit_pred = logit_md.predict_proba(X_test)[:,1]
logit_pred

## top left, likelihood that the patient has CHD

array([0.05726281, 0.29368643, 0.24552021, 0.11131363, 0.22196366,
       0.06183494, 0.06101575, 0.04537235, 0.22339669, 0.3403973 ,
       0.11906943, 0.05405138, 0.09075918, 0.03337003, 0.2151506 ,
       0.19875048, 0.09548719, 0.1925549 , 0.05579235, 0.18959642,
       0.050165  , 0.18275963, 0.21970724, 0.18666797, 0.16185476,
       0.03931398, 0.09790791, 0.16357859, 0.09576095, 0.07992765,
       0.13945803, 0.04874724, 0.07931807, 0.12125192, 0.17971346,
       0.14690724, 0.07337613, 0.27153104, 0.20051416, 0.26980819,
       0.04596296, 0.0472874 , 0.05386718, 0.10934516, 0.13937692,
       0.08020817, 0.2338416 , 0.06511019, 0.04140228, 0.10316783,
       0.1765051 , 0.07491139, 0.18100866, 0.06583426, 0.22857276,
       0.111212  , 0.10575413, 0.16021808, 0.11205989, 0.19552665,
       0.22450773, 0.15969151, 0.07773768, 0.20073999, 0.16606108,
       0.06320713, 0.12586711, 0.20553333, 0.07821083, 0.15257702,
       0.08393108, 0.13307485, 0.06801407, 0.10652281, 0.10844

In [14]:
## changing likelihoods to labels
logit_label = np.where(logit_pred < 0.25, 0, 1)
logit_label

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,

In [15]:
## constructing the confusion matrix
confusion_matrix(Y_test, logit_label)

array([[527,  87],
       [ 75,  43]])

In [16]:
## calculating the accuracy score
accuracy_score(Y_test, logit_label)

0.7786885245901639