In [5]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'framingham.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading data file
heart = pd.read_csv(file_content_stream)
heart.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [6]:
# removing NA 
heart = heart.dropna()

In [7]:
# defining input and target variables 
x = heart[['age', 'currentSmoker', 'totChol', 'BMI', 'heartRate']]
y = heart['TenYearCHD']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)


In [11]:
# building logistic regression model
logit_md = LogisticRegression().fit(x_train, y_train)

# predicting on the test data
logit_pred = logit_md.predict_proba(x_test)[:,1]
logit_pred

array([0.07486516, 0.32826559, 0.1898292 , 0.02914356, 0.31261405,
       0.07952922, 0.06479974, 0.13509657, 0.08790162, 0.52360514,
       0.052317  , 0.17656731, 0.02504475, 0.08156755, 0.14232775,
       0.08321398, 0.08964741, 0.14039399, 0.084374  , 0.07746416,
       0.49035744, 0.10748013, 0.29952958, 0.17177337, 0.06207523,
       0.0894431 , 0.21114722, 0.07217415, 0.19231817, 0.06626578,
       0.27600561, 0.45314545, 0.04919182, 0.15506565, 0.02493592,
       0.12459974, 0.10495214, 0.14922108, 0.05816096, 0.03339432,
       0.07516203, 0.30788053, 0.33629257, 0.05782545, 0.04356199,
       0.15162101, 0.2329426 , 0.11189881, 0.03133945, 0.16470535,
       0.13628648, 0.24679031, 0.07704375, 0.07048565, 0.18654813,
       0.280083  , 0.04799696, 0.21495335, 0.10487127, 0.17200755,
       0.07017869, 0.05945898, 0.05624247, 0.12922012, 0.06365831,
       0.08265261, 0.04584001, 0.20408515, 0.09455705, 0.36743305,
       0.20583541, 0.07140881, 0.12529538, 0.21935173, 0.11393

In [12]:
# changing likelyhoods to labels
logit_label = np.where(logit_pred < .25, 0, 1)
logit_label

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [13]:
# constructing confusion matrix 
confusion_matrix(y_test, logit_label)

array([[529,  76],
       [ 91,  36]])

In [15]:
# computing the accuracy
accuracy_score(y_test, logit_label)

0.7718579234972678