## 2. Preliminary Exploratory Data Analysis

In [None]:
library(tidyverse)
library(ggplot2)

In [None]:
# Read the processed Cleveland heart-disease data directly from the UCI
# repository.
#
# The file has no header row, so the column names are passed to read_csv()
# via `col_names`. Without this, read_csv() treats the first observation as
# the header and that row is silently lost once the names are overwritten.
heart_disease <- read_csv(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
  col_names = c(
    "age", "sex", "chest_pain", "trest_bps", "chol", "fbs", "resting_ecg",
    "max_hr", "exer_agina", "depr_excer_vs_rest", "exercise_segment_slope",
    "num_major_vessels", "heart_condition", "heart_disease_diagnosis"
  )
)


heart_disease

In [None]:
# Clean up the table: turn the numerically coded categorical variables into
# factors and give their levels descriptive names.

# Show the raw codes present in heart_condition before they are recoded.
heart_disease |> distinct(heart_condition)

heart_disease <- heart_disease |>
  mutate(
    sex = fct_recode(as.factor(sex), "male" = "1", "female" = "0"),
    # NOTE(review): "non-aginal pain" is presumably a misspelling of
    # "non-anginal pain"; the label is kept as-is in case later code
    # refers to it by name.
    chest_pain = fct_recode(
      as.factor(chest_pain),
      "typical angina" = "1", "atypical angina" = "2",
      "non-aginal pain" = "3", "asymptomatic" = "4"
    ),
    # The codes are matched as "3.0"/"6.0"/"7.0", i.e. the column is read as
    # text. Any other value in the column (presumably a missing-value
    # marker -- TODO confirm) is left as its own level.
    heart_condition = fct_recode(
      as.factor(heart_condition),
      "normal" = "3.0", "fixed defect" = "6.0",
      "reversible defect" = "7.0"
    ),
    # Slope of the peak exercise ST segment. The previous labels
    # ("unsloping", "reversible defect") were a typo and a copy-paste from
    # heart_condition; the UCI coding is 1 = upsloping, 2 = flat,
    # 3 = downsloping.
    exercise_segment_slope = fct_recode(
      as.factor(exercise_segment_slope),
      "upsloping" = "1", "flat" = "2",
      "downsloping" = "3"
    ),
    # Level "2" used to be skipped and stayed a bare "2" among the named
    # levels. "probable" is chosen to fit between "possible" and "likely".
    # NOTE(review): the UCI documentation only distinguishes 0 (absence)
    # from 1-4 (presence); confirm the wording of these labels.
    heart_disease_diagnosis = fct_recode(
      as.factor(heart_disease_diagnosis),
      "no disease" = "0", "possible" = "1", "probable" = "2",
      "likely" = "3", "certain" = "4"
    )
  )

heart_disease

In [None]:
# Not every variable is needed: keep the columns that seem most useful,
# together with the diagnosis column.
heart_disease_train <- select(
  heart_disease,
  age,
  chest_pain,
  trest_bps,
  chol,
  max_hr,
  depr_excer_vs_rest,
  heart_condition,
  heart_disease_diagnosis
)

heart_disease_train

In [None]:
# Now it is time for visualization!
# Let's look at a few different plots to see what relationships exist
# within the data.

# Scatter plot of maximum heart rate against age, coloured by the heart
# disease diagnosis.
options(repr.plot.width = 10, repr.plot.height = 6)

age_vs_max_hr <- heart_disease_train |>
  ggplot(aes(x = age, y = max_hr, color = heart_disease_diagnosis)) +
  geom_point() +
  labs(
    title = "Heart Disease Diagnosis based on Maximum Heart Rate vs Age",
    x = "Age (years)",
    y = "Maximum Heart Rate (bpm)",
    color = "Heart Disease Diagnosis"
  ) +
  theme(text = element_text(size = 16))

age_vs_max_hr

In [None]:
# NOTE(review): this cell rebuilds the same age vs. maximum heart rate plot
# as the previous cell (same data, aesthetics and object name). It is
# probably a leftover copy; remove it or replace it with the plot that was
# intended here.
options(repr.plot.width = 10, repr.plot.height = 6)

age_vs_max_hr <- heart_disease_train |>
  ggplot(aes(x = age, y = max_hr, color = heart_disease_diagnosis)) +
  geom_point() +
  labs(
    title = "Heart Disease Diagnosis based on Maximum Heart Rate vs Age",
    x = "Age (years)",
    y = "Maximum Heart Rate (bpm)",
    color = "Heart Disease Diagnosis"
  ) +
  theme(text = element_text(size = 16))

age_vs_max_hr

In [None]:
# Scatter plot of blood pressure (trest_bps) against cholesterol (chol),
# coloured by heart_condition.
options(repr.plot.width = 8, repr.plot.height = 6)

chol_vs_trest_bps <- heart_disease_train |>
  ggplot(aes(x = chol, y = trest_bps, color = heart_condition)) +
  geom_point() +
  labs(
    # The title used to be copied from the age vs. heart rate plot and
    # described the wrong variables.
    title = "Heart Conditions based on Blood Pressure vs Cholesterol",
    x = "Cholesterol (mg/dl)",
    y = "Blood Pressure (mm Hg)",
    color = "Heart Condition"
  ) +
  theme(text = element_text(size = 16))

chol_vs_trest_bps