**Predicting Heart Disease**

Import Dependencies

In [1]:
import io
from google.colab import files
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


Import Data

In [2]:
import os
spark_version = 'spark-3.1.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.18                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.180% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Get:4 https://developer.download.nvidia.com/comp

In [3]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2021-05-06 22:14:31--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2021-05-06 22:14:32 (6.03 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
# start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

Import Data from AWS S3

In [5]:
# Read in data from S3 Buckets
url ="https://group4-heart-study.s3.amazonaws.com/framingham.csv"
heart_disease_df = pd.read_csv(url)
url2 ="https://group4-heart-study.s3.amazonaws.com/heart_attack.csv"
heart_attack_df = pd.read_csv(url2)

In [6]:
# Show DataFrame
heart_disease_df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
# Show DataFrame
heart_attack_df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Cleaning the Data

In [8]:
# Drop the null columns where all values are null
heart_disease_df = heart_disease_df.dropna(axis='columns', how='all')
heart_attack_df = heart_attack_df.dropna(axis='columns', how='all')
# Drop the null rows
heart_disease_df = heart_disease_df.dropna()
heart_attack_df = heart_attack_df.dropna()

In [9]:
# checking data types
heart_disease_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3656 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             3656 non-null   int64  
 1   age              3656 non-null   int64  
 2   education        3656 non-null   float64
 3   currentSmoker    3656 non-null   int64  
 4   cigsPerDay       3656 non-null   float64
 5   BPMeds           3656 non-null   float64
 6   prevalentStroke  3656 non-null   int64  
 7   prevalentHyp     3656 non-null   int64  
 8   diabetes         3656 non-null   int64  
 9   totChol          3656 non-null   float64
 10  sysBP            3656 non-null   float64
 11  diaBP            3656 non-null   float64
 12  BMI              3656 non-null   float64
 13  heartRate        3656 non-null   float64
 14  glucose          3656 non-null   float64
 15  TenYearCHD       3656 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 485.6 KB


In [10]:
# changing all datatypes to int
heart_disease_df['education'] = heart_disease_df['education'].astype('int')
heart_disease_df['cigsPerDay'] = heart_disease_df['cigsPerDay'].astype('int')
heart_disease_df['BPMeds'] = heart_disease_df['BPMeds'].astype('int')
heart_disease_df['totChol'] = heart_disease_df['totChol'].astype('int')
heart_disease_df['sysBP'] = heart_disease_df['sysBP'].astype('int')
heart_disease_df['diaBP'] = heart_disease_df['diaBP'].astype('int')
heart_disease_df['BMI'] = heart_disease_df['BMI'].astype('int')
heart_disease_df['heartRate'] = heart_disease_df['heartRate'].astype('int')
heart_disease_df['glucose'] = heart_disease_df['glucose'].astype('int')
heart_disease_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3656 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   male             3656 non-null   int64
 1   age              3656 non-null   int64
 2   education        3656 non-null   int64
 3   currentSmoker    3656 non-null   int64
 4   cigsPerDay       3656 non-null   int64
 5   BPMeds           3656 non-null   int64
 6   prevalentStroke  3656 non-null   int64
 7   prevalentHyp     3656 non-null   int64
 8   diabetes         3656 non-null   int64
 9   totChol          3656 non-null   int64
 10  sysBP            3656 non-null   int64
 11  diaBP            3656 non-null   int64
 12  BMI              3656 non-null   int64
 13  heartRate        3656 non-null   int64
 14  glucose          3656 non-null   int64
 15  TenYearCHD       3656 non-null   int64
dtypes: int64(16)
memory usage: 485.6 KB


In [11]:
# checking data types
heart_attack_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.5 KB


In [12]:
# changing all datatypes to int
heart_attack_df['oldpeak'] = heart_attack_df['oldpeak'].astype('int')
heart_attack_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    int64
 1   sex       303 non-null    int64
 2   cp        303 non-null    int64
 3   trtbps    303 non-null    int64
 4   chol      303 non-null    int64
 5   fbs       303 non-null    int64
 6   restecg   303 non-null    int64
 7   thalachh  303 non-null    int64
 8   exng      303 non-null    int64
 9   oldpeak   303 non-null    int64
 10  slp       303 non-null    int64
 11  caa       303 non-null    int64
 12  thall     303 non-null    int64
 13  output    303 non-null    int64
dtypes: int64(14)
memory usage: 35.5 KB


Export Clean Data to Database: Postgres & Amazon RDS 

In [16]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://heartstudy.coqvcapoz4j9.us-east-2.rds.amazonaws.com:5432/heartstudy"
config = {"user":"postgres", 
          "password": <password>, 
          "driver":"org.postgresql.Driver"}

In [19]:
# Convert to Spark DataFrame
heart_disease_sparkdf = spark.createDataFrame(heart_disease_df)
heart_attack_sparkdf = spark.createDataFrame(heart_attack_df)

In [21]:
# Write heart_disease_df to table in RDS
heart_disease_sparkdf.write.jdbc(url=jdbc_url, table='Heart_Disease', mode=mode, properties=config)

In [22]:
# Write heart_disease_df to table in RDS
heart_attack_sparkdf.write.jdbc(url=jdbc_url, table='Heart_Attack', mode=mode, properties=config)

Separate the Features (X) from the Target (y)

In [23]:
# Target (y) feature is 10 year risk of coronary heart disease CHD (TenYearCHD)
#1 ="yes", we will interpret as will get or has CHD 0="no", we will interpret as does not have or likely to avoid CHD

#Expected Features:
#sex
#gender
#age
#education
#smoking (yes/no)
#cigs per day
#Blood Presure Meds
#Prevalent Stroke: whether or not the patient had previously had a stroke (yes/no)
#Prevalent Hyp: whether or not the patient was hypertensive  (yes/no)
#Diabetes: whether or not the patient had diabetes 
#Tot Chol: total cholesterol level 
#Sys BP: systolic blood pressure 
#Dia BP: diastolic blood pressure
#BMI: Body Mass Index 
#Heart Rate: heart rate (Continuous - In medical research,  continuous because of large number of possible values.)
#glucose: glucose level (Continuous)

In [24]:
#scale split data
data_scaler = StandardScaler()
data_scaler.fit_transform(heart_disease_df)

array([[ 1.11982472, -1.23335117,  1.97575158, ...,  0.35633992,
        -0.20312694, -0.42395212],
       [-0.8929969 , -0.41559058,  0.01979497, ...,  1.60828949,
        -0.24495595, -0.42395212],
       [ 1.11982472, -0.1819447 , -0.95818334, ..., -0.06097661,
        -0.49592996, -0.42395212],
       ...,
       [ 1.11982472,  0.05170118, -0.95818334, ..., -0.81214636,
         0.17333408,  2.35875694],
       [ 1.11982472,  0.16852412,  0.99777327, ..., -0.89560966,
        -0.57958797, -0.42395212],
       [-0.8929969 ,  0.28534706,  0.01979497, ...,  0.35633992,
         1.05174313, -0.42395212]])

In [25]:
y = heart_disease_df["TenYearCHD"]
X = heart_disease_df.drop(columns="TenYearCHD")

Split our data into training and testing

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=42, 
                                                    stratify=y)
X_train.shape

(2742, 15)

Create a Logistic Regression Model

In [27]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=10000,
                                random_state=42)

Fit (train) or model using the training data

In [28]:
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Make predictions

In [29]:
#Model wants to predict ability based on set of factors whether or not individual will/has high risk of CHD
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(5)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [30]:
#review of model accuracy for testing data
print(accuracy_score(y_test, y_pred))

0.862144420131291


In [31]:
#review of model accuracy for training data
y_train_pred=classifier.predict(X_train)
print(accuracy_score(y_train,y_train_pred))

0.8533916849015317


In [32]:
#review of confusion matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[770   5]
 [121  18]]


In [33]:
#sensitivity and precision
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.99      0.92       775
           1       0.78      0.13      0.22       139

    accuracy                           0.86       914
   macro avg       0.82      0.56      0.57       914
weighted avg       0.85      0.86      0.82       914



Expected predictions- We want to be able to predict with a high degree of accuracy (+75%) wether or not someone will get coronary heart disease