In [21]:
library(tidyverse)
library(tidymodels)
library(dplyr)


# Location of the raw CSV file downloaded for this analysis.
url <- "https://raw.githubusercontent.com/danialtaj0/Dataset/main/diabetes_prediction_dataset.csv"

# Read the CSV into a tibble and store the `diabetes` column as a factor.
diabetes_data <- mutate(read_csv(url), diabetes = as_factor(diabetes))

# Fix the RNG seed before the random split so reruns give the same partition.
set.seed(1)

# Put 75% of the rows in the training set, stratifying on `diabetes`;
# the remaining rows form the test set.
diabetes_split <- diabetes_data |>
  initial_split(prop = 0.75, strata = diabetes)
diabetes_train <- diabetes_split |> training()
diabetes_test <- diabetes_split |> testing()

# Show each training column with its type and first few values.
diabetes_train |> glimpse()

# Tally the training rows per `diabetes` level; the tally column is `Count`.
class_distribution <- diabetes_train |>
  count(diabetes, name = "Count")

class_distribution

# Mean of bmi, age and blood_glucose_level over the training set, ignoring NAs.
# `.names` prefixes each result column with "mean_" (e.g. `mean_bmi`).
predictor_means <- diabetes_train |>
  summarise(
    across(
      c(bmi, age, blood_glucose_level),
      \(x) mean(x, na.rm = TRUE),
      .names = "mean_{.col}"
    )
  )

predictor_means

# Keep the training rows that have an NA in at least one of
# bmi, age, blood_glucose_level or diabetes.
rows_with_missing_data <- diabetes_train |>
  filter(if_any(c(bmi, age, blood_glucose_level, diabetes), is.na))

print(rows_with_missing_data)

# Report how many such rows were found.
num_rows_with_missing_data <- nrow(rows_with_missing_data)
print(num_rows_with_missing_data)

Rows: 100000 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): gender, smoking_history
dbl (7): age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_l...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows: 75,000
Columns: 9
$ gender              <chr> "Female", "Female", "Male", "Female", "Female", "M…
$ age                 <dbl> 54, 36, 76, 20, 79, 42, 53, 54, 78, 67, 76, 78, 15…
$ hypertension        <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ heart_disease       <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smoking_history     <chr> "No Info", "current", "current", "never", "No Info…
$ bmi                 <dbl> 27.32, 23.45, 20.14, 27.32, 23.86, 33.64, 27.32, 5…
$ HbA1c_level         <dbl> 6.6, 5.0, 4.8, 6.6, 5.7, 4.8, 6.1, 6.0, 5.0, 5.8, …
$ blood_glucose_level <dbl> 80, 155, 155, 85, 85, 145, 85, 100, 130, 200, 160,…
$ diabetes            <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…


diabetes,Count
<fct>,<int>
0,68688
1,6312


mean_bmi,mean_age,mean_blood_glucose_level
<dbl>,<dbl>,<dbl>
27.3311,41.89038,137.9643


# A tibble: 0 × 9
# ℹ 9 variables: gender <chr>, age <dbl>, hypertension <dbl>,
#   heart_disease <dbl>, smoking_history <chr>, bmi <dbl>, HbA1c_level <dbl>,
#   blood_glucose_level <dbl>, diabetes <fct>
[1] 0
