In [1]:
library(tidyverse)
library(tidymodels)
library(dplyr)


# Location of the raw diabetes prediction dataset (a CSV hosted on GitHub).
url <- "https://raw.githubusercontent.com/danialtaj0/Dataset/main/diabetes_prediction_dataset.csv"

# Read the CSV, then convert the outcome column to a factor; it is used as the
# stratification variable for the split below.
diabetes_data <- read_csv(url)
diabetes_data <- mutate(diabetes_data, diabetes = as_factor(diabetes))

# Fix the RNG state so the random train/test partition is reproducible.
set.seed(1)

# 75% training / 25% testing, stratified on the outcome.
diabetes_split <- initial_split(
  diabetes_data,
  prop = 0.75,
  strata = diabetes
)
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)

# Column-by-column preview of the training partition.
glimpse(diabetes_train)

# Tally the training observations in each outcome class; the tally column is
# named `Count`.
class_distribution <- count(diabetes_train, diabetes, name = "Count")

class_distribution

# Mean of bmi, age and blood_glucose_level over the training set, ignoring
# missing values. Result columns are named mean_<column>.
predictor_means <- diabetes_train |>
  summarise(
    across(
      c(bmi, age, blood_glucose_level),
      \(x) mean(x, na.rm = TRUE),
      .names = "mean_{.col}"
    )
  )

predictor_means

# Keep only the training rows that have a missing value in at least one of
# bmi, age, blood_glucose_level or diabetes.
rows_with_missing_data <- diabetes_train |>
  filter(if_any(c(bmi, age, blood_glucose_level, diabetes), is.na))

print(rows_with_missing_data)

# Count the number of rows with missing data
num_rows_with_missing_data <- nrow(rows_with_missing_data)

# Wrap the count in a one-row tibble and display it.
missing_data <- tibble(n_rows_missing_data = num_rows_with_missing_data)

missing_data

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.5     [32m✔[39m [34mrsample     [39

Rows: 75,000
Columns: 9
$ gender              [3m[90m<chr>[39m[23m "Female", "Female", "Female", "Female", "Female", …
$ age                 [3m[90m<dbl>[39m[23m 80, 20, 44, 79, 32, 53, 54, 78, 67, 76, 78, 15, 42…
$ hypertension        [3m[90m<dbl>[39m[23m 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ heart_disease       [3m[90m<dbl>[39m[23m 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smoking_history     [3m[90m<chr>[39m[23m "never", "never", "never", "No Info", "never", "ne…
$ bmi                 [3m[90m<dbl>[39m[23m 25.19, 27.32, 19.31, 23.86, 27.32, 27.32, 54.70, 3…
$ HbA1c_level         [3m[90m<dbl>[39m[23m 6.6, 6.6, 6.5, 5.7, 5.0, 6.1, 6.0, 5.0, 5.8, 5.0, …
$ blood_glucose_level [3m[90m<dbl>[39m[23m 140, 85, 200, 85, 100, 85, 100, 130, 200, 160, 126…
$ diabetes            [3m[90m<fct>[39m[23m 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…


diabetes,Count
<fct>,<int>
0,68653
1,6347


mean_bmi,mean_age,mean_blood_glucose_level
<dbl>,<dbl>,<dbl>
27.30796,41.96079,138.1894


[90m# A tibble: 0 × 9[39m
[90m# ℹ 9 variables: gender <chr>, age <dbl>, hypertension <dbl>,[39m
[90m#   heart_disease <dbl>, smoking_history <chr>, bmi <dbl>, HbA1c_level <dbl>,[39m
[90m#   blood_glucose_level <dbl>, diabetes <fct>[39m
