In [18]:
# Run cell before starting workspace - loads necessary packages

library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

In [19]:
# Read data into df
url <- "https://raw.githubusercontent.com/incribo-inc/stroke_prediction/main/stroke_prediction_dataset.csv"
stroke <- read.csv(url)

In [27]:
# set seed - don't change!
set.seed(1234)

# wrangling
stroke_selected <- stroke |>
    select(Age, Gender, Hypertension, Heart.Disease, Alcohol.Intake, 
           Physical.Activity, Family.History.of.Stroke,
           Dietary.Habits, Stress.Levels, Diagnosis)

# rename columns
stroke_renamed <- stroke_selected |>
    rename(age = Age,
           gender = Gender,
           hypertension = Hypertension,
           heart_disease = Heart.Disease,
           alcohol_intake = Alcohol.Intake,
           activity = Physical.Activity,
           family_history = Family.History.of.Stroke,
           diet = Dietary.Habits,
           stress = Stress.Levels,
           dx = Diagnosis)

stroke_renamed



age,gender,hypertension,heart_disease,alcohol_intake,activity,family_history,diet,stress,dx
<int>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>
56,Male,0,1,Social Drinker,Moderate,Yes,Vegan,3.48,Stroke
80,Male,0,0,Never,Low,No,Paleo,1.73,Stroke
26,Male,1,1,Rarely,High,Yes,Paleo,7.31,Stroke
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
35,Male,0,0,Rarely,High,No,Paleo,0.51,Stroke
73,Male,0,0,Never,High,Yes,Paleo,1.53,No Stroke
64,Female,0,0,Rarely,Moderate,No,Vegan,4.57,Stroke


In [28]:
unique(stroke_renamed$diet)

In [26]:
# split data
# we are using 9 predictors, so 67% data used as training, 33% predicting

stroke_split <- initial_split(stroke_renamed, prop = 0.67, strata = dx)
stroke_train <- training(stroke_split)
stroke_test <- testing(stroke_split)

#stroke_train
#stroke_test

In [24]:
# summarizing observations per class, means of variables, missing data counts for diagnosed strokes

stroke_summary <- stroke_train |>
    group_by(dx) |>
    summarize(
        count = n(),
        mean_age = mean(age, na.rm = TRUE),
        mean_gender = mean(gender, na.rm = TRUE),
        mean_hypertension = mean(hypertension, na.rm = TRUE),
        mean_heart_disease = mean(heart_disease, na.rm = TRUE),
        mean_alcohol = mean(alcohol_intake, na.rm = TRUE),
        mean_activity = mean(activity, na.rm = TRUE),
        mean_famhist = mean(family_history, na.rm = TRUE),
        mean_diet = mean(diet, na.rm = TRUE),
        mean_stress = mean(stress, na.rm = TRUE),
        missing_age = sum(is.na(age)),
        missing_gender = sum(is.na(gender)),
        missing_hypertension = sum(is.na(hypertension)),
        missing_heart_disease = sum(is.na(heart_disease)),
        missing_alcohol = sum(is.na(alcohol_intake)),
        missing_activity = sum(is.na(activity)),
        missing_famhist = sum(is.na(family_history)),
        missing_diet = sum(is.na(diet)),
        missing_stress = sum(is.na(stress)),
        )

# after printing the above table, there were no missing values for any columns
# additionally
# for clarity, these columns were omitted

stroke_summary <- stroke_summary |>
    select(mean_age, mean_gender, mean_hypertension, 
           mean_heart_disease, mean_alcohol, mean_activity,
           mean_famhist, mean_diet, mean_stress)

stroke_summary
        
      

[1m[22m[36mℹ[39m In argument: `mean_gender = mean(gender, na.rm = TRUE)`.
[36mℹ[39m In group 1: `dx = "No Stroke"`.
[33m![39m argument is not numeric or logical: returning NA


mean_age,mean_gender,mean_hypertension,mean_heart_disease,mean_alcohol,mean_activity,mean_famhist,mean_diet,mean_stress
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
54.0759,,0.2471264,0.4992073,,,,,5.076141
54.3306,,0.2448531,0.498301,,,,,4.970406
