In [32]:
library(ggplot2)
library(ggpubr)
library(fitdistrplus)
library(nortest)
library(car)

Loading required package: carData



In [2]:
# Load the cleaned cardiovascular dataset. colClasses fixes the type of each
# of the 14 columns by position; columns 7 to 12 are read as factors.
data <- read.csv(
    file = "../results/cardiovascular_disease_clean.csv",
    colClasses = c(
        "numeric", "factor", "integer", "numeric", "integer", "integer",
        rep("factor", 6),
        "numeric", "numeric"
    )
)

In [3]:
# Positions of the quantitative columns of `data` (all other columns are
# read as factors).
quantitative.cols <- c(1, 3:6, 13:14)


### Distribuciones

In [4]:
# Density plot of one quantitative column of `data`, filled by cardio outcome.
#
#   column: name (character scalar) of a column of the global `data`.
#
# Returns a ggplot object; nothing is drawn until it is printed or saved.
plot_histograms <- function(column){
    # The .data pronoun looks the column up inside the plot's own data,
    # instead of indexing the global data frame from within aes().
    ggplot(data, aes(x = .data[[column]], fill = cardio)) +
    geom_density(alpha = 0.5) +
    labs(x = column)
}
plots <- lapply(colnames(data)[quantitative.cols], plot_histograms)

# Lay the seven panels out on a 2 x 4 grid; the sixth plot is placed fourth.
plot.arrange <- ggarrange(plots[[1]], plots[[2]], plots[[3]], plots[[6]], plots[[4]],
          plots[[5]], plots[[7]], nrow = 2, ncol = 4)

ggsave("../docs/assests/plot.png", bg = "white", plot = plot.arrange, width = 15, height = 10)



In [123]:
# Split the dataset by cardio outcome: one data frame per level.
cardio.yes <- data[data[["cardio"]] == "Yes", , drop = FALSE]
cardio.no <- data[data[["cardio"]] == "No", , drop = FALSE]

In [124]:
# Save the descdist() plot of one column of `cardio.yes` as a PNG file.
#
#   colname: name (character scalar) of a column of `cardio.yes`.
#
# Writes "../docs/assests/<colname>_cardio_yes_dists.png" and returns the
# value of descdist().
plot_descdist <- function(colname){
    png(paste0("../docs/assests/", colname, "_cardio_yes_dists.png"), bg = "white", width = 800, height = 600)
    # Close the device even if descdist() fails, so that an error does not
    # leave an open PNG device behind.
    on.exit(dev.off(), add = TRUE)
    descdist(cardio.yes[, colname])
}

lapply(colnames(cardio.yes)[quantitative.cols], plot_descdist)

In [7]:
# Save the descdist() plot of one column of `cardio.no` as a PNG file.
#
#   colname: name (character scalar) of a column of `cardio.no`.
#
# Writes "../docs/assests/<colname>_cardio_no_dists.png" and returns the
# value of descdist().
plot_descdist <- function(colname){
    png(paste0("../docs/assests/", colname, "_cardio_no_dists.png"), bg = "white", width = 800, height = 600)
    # Close the device even if descdist() fails, so that an error does not
    # leave an open PNG device behind.
    on.exit(dev.off(), add = TRUE)
    descdist(cardio.no[, colname])
}

lapply(colnames(cardio.no)[quantitative.cols], plot_descdist)

### Prueba de normalidad

In [12]:
# Run the Lilliefors normality test on every quantitative column of `data`
# (both cardio groups pooled); the result is a list with one test per column.
lapply(data[, quantitative.cols], lillie.test)

$age_year

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.059375, p-value < 2.2e-16


$height

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.051846, p-value < 2.2e-16


$weight

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.079516, p-value < 2.2e-16


$ap_hi

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.2716, p-value < 2.2e-16


$ap_lo

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.28659, p-value < 2.2e-16


$IMC

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.079099, p-value < 2.2e-16


$pulse

	Lilliefors (Kolmogorov-Smirnov) normality test

data:  X[[i]]
D = 0.3057, p-value < 2.2e-16



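Con base en los resultados del test de Lilliefors (una corrección del test de Kolmogorov-Smirnov para el caso en que la media y la varianza se estiman a partir de la muestra), rechazamos la hipótesis de normalidad en todas las variables cuantitativas (p < 2.2e-16 en todos los casos). Por lo tanto, en el resto del análisis aplicaremos pruebas no paramétricas.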

### Homocedasticidad

In [70]:
# Fligner-Killeen test of homogeneity of variances for one vector of values,
# grouped by a column of the global data frame `data`.
#   x:   vector of observations (here, one quantitative column of `data`).
#   col: name of the column of `data` used as the grouping variable.
# Returns the result of fligner.test().
apply_fligner <- function(x, col) {
    fligner.test(x = x, g = data[, col])
}
# Test every quantitative column, grouping by the cardio outcome.
lapply(data[, quantitative.cols], apply_fligner, col="cardio")

$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 145.74, df = 1, p-value < 2.2e-16


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 53.315, df = 1, p-value = 2.841e-13


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 817.97, df = 1, p-value < 2.2e-16


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 11337, df = 1, p-value < 2.2e-16


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1878.6, df = 1, p-value < 2.2e-16


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1550.5, df = 1, p-value < 2.2e-16


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and 

In [69]:
# Levene test (centred on the median) of equal variances of one column of
# `data` across the groups defined by another column.
#
#   col:  name of the response column of `data`.
#   col2: name of the grouping column of `data` (a factor).
#
# Returns the table produced by leveneTest().
apply_levene <- function(col, col2){
    # reformulate() builds the formula `col ~ col2` from the two names, so the
    # variables are resolved through the `data` argument.
    leveneTest(reformulate(col2, response = col), data = data, center = "median")
}
# Iterate over a self-named vector so that each result carries the name of
# the column it was computed from, rather than a bare list index.
levene.results <- lapply(
    setNames(nm = colnames(data)[quantitative.cols]),
    apply_levene,
    col2 = "cardio"
)
print(levene.results)

[[1]]
Levene's Test for Homogeneity of Variance (center = "median")
         Df F value    Pr(>F)    
group     1  203.25 < 2.2e-16 ***
      61213                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

[[2]]
Levene's Test for Homogeneity of Variance (center = "median")
         Df F value  Pr(>F)    
group     1  53.105 3.2e-13 ***
      61213                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

[[3]]
Levene's Test for Homogeneity of Variance (center = "median")
         Df F value    Pr(>F)    
group     1  834.87 < 2.2e-16 ***
      61213                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

[[4]]
Levene's Test for Homogeneity of Variance (center = "median")
         Df F value    Pr(>F)    
group     1   14612 < 2.2e-16 ***
      61213                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

[[5]]
Levene's Test for Homogeneity of Variance (cente

In [86]:
# Fligner-Killeen test of homogeneity of variances for one vector of values,
# grouped by a column of the global data frame `data`.
#   x:   vector of observations (here, one quantitative column of `data`).
#   col: name of the column of `data` used as the grouping variable.
# Returns the result of fligner.test().
apply_fligner <- function(x, col) {
    fligner.test(x = x, g = data[, col])
}

# Grouping columns to test, in order; the values are the headings printed
# before each set of results.
grouping.labels <- c(
    cholesterol = "Cholesterol",
    gender = "Gender",
    gluc = "Gluc",
    smoke = "smoke",
    alco = "alco",
    active = "active",
    cardio = "cardio"
)

# For every grouping column, test each quantitative column. Results are
# printed explicitly because values are not auto-printed inside a loop.
for (group.col in names(grouping.labels)) {
    print(grouping.labels[[group.col]])
    print(lapply(data[, quantitative.cols], apply_fligner, col = group.col))
}



[1] "Cholesterol"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 268.24, df = 2, p-value < 2.2e-16


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 82.682, df = 2, p-value < 2.2e-16


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 422.2, df = 2, p-value < 2.2e-16


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1577.4, df = 2, p-value < 2.2e-16


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 560.73, df = 2, p-value < 2.2e-16


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 579.04, df = 2, p-value < 2.2e-16


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and da

[1] "Gender"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 85.725, df = 1, p-value < 2.2e-16


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 15.516, df = 1, p-value = 8.182e-05


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 99.848, df = 1, p-value < 2.2e-16


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1.3441, df = 1, p-value = 0.2463


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 7.6894, df = 1, p-value = 0.005555


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 755.38, df = 1, p-value < 2.2e-16


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and

[1] "Gluc"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 116.33, df = 2, p-value < 2.2e-16


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 34.999, df = 2, p-value = 2.512e-08


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 188.65, df = 2, p-value < 2.2e-16


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 636.51, df = 2, p-value < 2.2e-16


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 125.61, df = 2, p-value < 2.2e-16


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 270.78, df = 2, p-value < 2.2e-16


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and

[1] "smoke"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 2.2084, df = 1, p-value = 0.1373


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 32.563, df = 1, p-value = 1.154e-08


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 28.535, df = 1, p-value = 9.202e-08


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 35.351, df = 1, p-value = 2.754e-09


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 14.951, df = 1, p-value = 0.0001103


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 26.837, df = 1, p-value = 2.213e-07


$pulse

	Fligner-Killeen test of homogeneity of variances

data:

[1] "alco"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 0.12201, df = 1, p-value = 0.7269


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 6.9104, df = 1, p-value = 0.008569


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 27.106, df = 1, p-value = 1.926e-07


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 53.406, df = 1, p-value = 2.712e-13


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 60.398, df = 1, p-value = 7.75e-15


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1.8771, df = 1, p-value = 0.1707


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x 

[1] "active"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 0.040573, df = 1, p-value = 0.8404


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 4.215, df = 1, p-value = 0.04007


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 6.4643, df = 1, p-value = 0.01101


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 10.482, df = 1, p-value = 0.001206


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 8.1413, df = 1, p-value = 0.004327


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 6.1294, df = 1, p-value = 0.0133


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and 

[1] "cardio"


$age_year

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 145.74, df = 1, p-value < 2.2e-16


$height

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 53.315, df = 1, p-value = 2.841e-13


$weight

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 817.97, df = 1, p-value < 2.2e-16


$ap_hi

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 11337, df = 1, p-value < 2.2e-16


$ap_lo

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1878.6, df = 1, p-value < 2.2e-16


$IMC

	Fligner-Killeen test of homogeneity of variances

data:  x and data[, col]
Fligner-Killeen:med chi-squared = 1550.5, df = 1, p-value < 2.2e-16


$pulse

	Fligner-Killeen test of homogeneity of variances

data:  x and 

In [98]:
# Fligner-Killeen tests of the variance of age_year across the groups formed
# by combining cardio with each categorical variable in turn; paste() joins
# the two values into a single group label per row.
fligner.test(x = data$age_year, g = paste(data$cardio, data$cholesterol))

fligner.test(x = data$age_year, g = paste(data$cardio, data$gluc))

fligner.test(x = data$age_year, g = paste(data$cardio, data$gender))

fligner.test(x = data$age_year, g = paste(data$cardio, data$smoke))

fligner.test(x = data$age_year, g = paste(data$cardio, data$alco))

fligner.test(x = data$age_year, g = paste(data$cardio, data$active))


	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$cholesterol)
Fligner-Killeen:med chi-squared = 317.9, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$gluc)
Fligner-Killeen:med chi-squared = 247.81, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$gender)
Fligner-Killeen:med chi-squared = 306.7, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$smoke)
Fligner-Killeen:med chi-squared = 170.42, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$alco)
Fligner-Killeen:med chi-squared = 148.37, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$age_year and paste(data$cardio, data$active)
Fligner-Killeen:med chi-squared = 145.82, df = 3, p-value < 2.2e-16


In [99]:
# Fligner-Killeen tests of the variance of ap_hi across the groups formed by
# combining cardio with each categorical variable in turn; paste() joins the
# two values into a single group label per row.
fligner.test(x = data$ap_hi, g = paste(data$cardio, data$cholesterol))

fligner.test(x = data$ap_hi, g = paste(data$cardio, data$gluc))

fligner.test(x = data$ap_hi, g = paste(data$cardio, data$gender))

fligner.test(x = data$ap_hi, g = paste(data$cardio, data$smoke))

fligner.test(x = data$ap_hi, g = paste(data$cardio, data$alco))

fligner.test(x = data$ap_hi, g = paste(data$cardio, data$active))


	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$cholesterol)
Fligner-Killeen:med chi-squared = 10975, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$gluc)
Fligner-Killeen:med chi-squared = 11321, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$gender)
Fligner-Killeen:med chi-squared = 11445, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$smoke)
Fligner-Killeen:med chi-squared = 11201, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$alco)
Fligner-Killeen:med chi-squared = 11236, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_hi and paste(data$cardio, data$active)
Fligner-Killeen:med chi-squared = 11357, df = 3, p-value < 2.2e-16


In [100]:
# Fligner-Killeen tests of the variance of ap_lo across the groups formed by
# combining cardio with each categorical variable in turn; paste() joins the
# two values into a single group label per row.
fligner.test(x = data$ap_lo, g = paste(data$cardio, data$cholesterol))

fligner.test(x = data$ap_lo, g = paste(data$cardio, data$gluc))

fligner.test(x = data$ap_lo, g = paste(data$cardio, data$gender))

fligner.test(x = data$ap_lo, g = paste(data$cardio, data$smoke))

fligner.test(x = data$ap_lo, g = paste(data$cardio, data$alco))

fligner.test(x = data$ap_lo, g = paste(data$cardio, data$active))


	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$cholesterol)
Fligner-Killeen:med chi-squared = 1924.8, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$gluc)
Fligner-Killeen:med chi-squared = 2178.4, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$gender)
Fligner-Killeen:med chi-squared = 2039.1, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$smoke)
Fligner-Killeen:med chi-squared = 2104, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$alco)
Fligner-Killeen:med chi-squared = 1812.4, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$ap_lo and paste(data$cardio, data$active)
Fligner-Killeen:med chi-squared = 1907.6, df = 3, p-value < 2.2e-16


In [101]:
# Fligner-Killeen tests of the variance of IMC across the groups formed by
# combining cardio with each categorical variable in turn; paste() joins the
# two values into a single group label per row.
fligner.test(x = data$IMC, g = paste(data$cardio, data$cholesterol))

fligner.test(x = data$IMC, g = paste(data$cardio, data$gluc))

fligner.test(x = data$IMC, g = paste(data$cardio, data$gender))

fligner.test(x = data$IMC, g = paste(data$cardio, data$smoke))

fligner.test(x = data$IMC, g = paste(data$cardio, data$alco))

fligner.test(x = data$IMC, g = paste(data$cardio, data$active))


	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$cholesterol)
Fligner-Killeen:med chi-squared = 1607.3, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$gluc)
Fligner-Killeen:med chi-squared = 1604, df = 5, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$gender)
Fligner-Killeen:med chi-squared = 2289.6, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$smoke)
Fligner-Killeen:med chi-squared = 1595.3, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$alco)
Fligner-Killeen:med chi-squared = 1548.2, df = 3, p-value < 2.2e-16



	Fligner-Killeen test of homogeneity of variances

data:  data$IMC and paste(data$cardio, data$active)
Fligner-Killeen:med chi-squared = 1553.6, df = 3, p-value < 2.2e-16


### Normalizar

In [107]:
# Compare candidate normalizing transformations for every quantitative column.
# The printed output reports an out-of-sample estimate via repeated
# cross-validation; the seed is fixed so that reruns give the same result
# (NOTE(review): assumes the folds are drawn from R's RNG - confirm).
set.seed(1234)
# The function is called through its namespace because the notebook's
# library() cell does not attach the bestNormalize package.
lapply(data[, quantitative.cols], bestNormalize::bestNormalize)










$age_year
Best Normalizing transformation with 61215 Observations
 Estimated Normality Statistics (Pearson P / df, lower => more normal):
 - arcsinh(x): 60.0781
 - Box-Cox: 55.2542
 - Center+scale: 57.8941
 - Double Reversed Log_b(x+a): 54.7672
 - Exp(x): 3712.9255
 - Log_b(x+a): 60.0793
 - orderNorm (ORQ): 1.1441
 - sqrt(x + a): 61.3754
 - Yeo-Johnson: 55.321
Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
 
Based off these, bestNormalize chose:
orderNorm Transformation with 61215 nonmissing obs and ties
 - 7990 unique values 
 - Original quantiles:
    0%    25%    50%    75%   100% 
29.584 48.292 53.948 58.375 64.967 

$height
Best Normalizing transformation with 61215 Observations
 Estimated Normality Statistics (Pearson P / df, lower => more normal):
 - arcsinh(x): 182.0352
 - Box-Cox: 181.8664
 - Center+scale: 182.1074
 - Double Reversed Log_b(x+a): 183.4496
 - Exp(x): 6293.2881
 - Log_b(x+a): 182.0352
 - orderNorm (ORQ): 182.4126
 - sqrt(x + a): 181.9576
 - Y

In [110]:
# Apply the orderNorm transformation to one column of `data`.
#
#   col: name (character scalar) of a column of the global `data`.
#
# Returns the transformed values (the `x.t` element of the orderNorm result).
# Note: defining `transform` here masks base::transform for this session.
transform <- function(col){
    # Called through its namespace because the notebook's library() cell does
    # not attach the bestNormalize package.
    bestNormalize::orderNorm(data[, col])$x.t
}

# Iterate over a self-named vector so that each element of `normalized` is
# labelled with the column it was computed from.
normalized <- lapply(setNames(nm = colnames(data)[quantitative.cols]), transform)

“Ties in data, Normal distribution not guaranteed
”


“Ties in data, Normal distribution not guaranteed
”
“Ties in data, Normal distribution not guaranteed
”
“Ties in data, Normal distribution not guaranteed
”
“Ties in data, Normal distribution not guaranteed
”
“Ties in data, Normal distribution not guaranteed
”
“Ties in data, Normal distribution not guaranteed
”


In [119]:
# Bind the transformed vectors into a data frame whose columns carry the
# names of the original quantitative columns, then preview the first rows.
normalized.df <- setNames(
    as.data.frame(normalized),
    colnames(data[, quantitative.cols])
)
head(normalized.df)

Unnamed: 0_level_0,age_year,height,weight,ap_hi,ap_lo,IMC,pulse
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,-0.4057892,0.41887384,-0.8913307,-1.2311587,-0.07993353,-1.3301027,-1.6360867
2,0.1663161,-1.12195809,0.9485029,0.9672581,1.02832263,1.5384435,0.6883186
3,-0.3141891,0.05229387,-0.7177043,0.5243535,-1.25106063,-0.7747778,1.333283
4,-0.6757626,0.57095417,0.7639328,1.4449331,1.96966384,0.5130873,0.6883186
5,-0.74729,-1.12195809,-1.5222658,-2.0530072,-2.29198696,-0.9727312,-0.3429522
6,0.9134493,-1.80144499,-0.4138947,-0.2475163,-0.07993353,0.6209632,-0.3429522


In [121]:
# Density plot of one normalized column, filled by cardio outcome.
#
#   column: name (character scalar) of a column of `normalized.df`.
#
# Returns a ggplot object; nothing is drawn until it is printed or saved.
plot_histograms <- function(column){
    # Put the values and the outcome into a single plotting frame so aes()
    # refers to its own data by column name; `normalized.df` and `data` are
    # row-aligned because the former was computed column by column from the
    # latter.
    plot.data <- data.frame(value = normalized.df[[column]], cardio = data$cardio)
    ggplot(plot.data, aes(x = value, fill = cardio)) +
    geom_density(alpha = 0.5) +
    labs(x = column)
}
plots <- lapply(colnames(normalized.df), plot_histograms)

# Lay the seven panels out on a 2 x 4 grid; the sixth plot is placed fourth.
plot.arrange <- ggarrange(plots[[1]], plots[[2]], plots[[3]], plots[[6]], plots[[4]],
          plots[[5]], plots[[7]], nrow = 2, ncol = 4)

ggsave("../docs/assests/plot_norm.png", bg = "white", plot = plot.arrange, width = 15, height = 10)


In [125]:
# Save the descdist() plot of one normalized column, restricted to the rows
# where cardio is "Yes", as a PNG file.
#
#   colname: name (character scalar) of a column of `normalized.df`.
#
# Writes "../docs/assests/<colname>_cardio_yes_dists_norm.png" and returns the
# value of descdist().
plot_descdist <- function(colname){
    png(paste0("../docs/assests/", colname, "_cardio_yes_dists_norm.png"), bg = "white", width = 800, height = 600)
    # Close the device even if descdist() fails.
    on.exit(dev.off(), add = TRUE)
    # Use the normalized values: the raw `cardio.yes` frame would only
    # reproduce the un-normalized plots under a "_norm" file name.
    descdist(normalized.df[data$cardio == "Yes", colname])
}

lapply(colnames(normalized.df), plot_descdist)

In [126]:
# Save the descdist() plot of one normalized column, restricted to the rows
# where cardio is "No", as a PNG file.
#
#   colname: name (character scalar) of a column of `normalized.df`.
#
# Writes "../docs/assests/<colname>_cardio_no_dists_norm.png" and returns the
# value of descdist().
plot_descdist <- function(colname){
    png(paste0("../docs/assests/", colname, "_cardio_no_dists_norm.png"), bg = "white", width = 800, height = 600)
    # Close the device even if descdist() fails.
    on.exit(dev.off(), add = TRUE)
    # Use the normalized values of the "No" group: `cardio.yes` is both the
    # wrong group and the un-normalized data for this file.
    descdist(normalized.df[data$cardio == "No", colname])
}
lapply(colnames(normalized.df), plot_descdist)

In [132]:
# Lilliefors normality test on each normalized variable, run separately for
# the rows where cardio is "Yes" and the rows where it is "No".
lillie.test(normalized.df[data$cardio == "Yes", "age_year"])
lillie.test(normalized.df[data$cardio == "No", "age_year"])
lillie.test(normalized.df[data$cardio == "Yes", "height"])
lillie.test(normalized.df[data$cardio == "No", "height"])
lillie.test(normalized.df[data$cardio == "Yes", "weight"])
lillie.test(normalized.df[data$cardio == "No", "weight"])
lillie.test(normalized.df[data$cardio == "Yes", "ap_hi"])
lillie.test(normalized.df[data$cardio == "No", "ap_hi"])
lillie.test(normalized.df[data$cardio == "Yes", "ap_lo"])
lillie.test(normalized.df[data$cardio == "No", "ap_lo"])
lillie.test(normalized.df[data$cardio == "Yes", "IMC"])
lillie.test(normalized.df[data$cardio == "No", "IMC"])
lillie.test(normalized.df[data$cardio == "Yes", "pulse"])
lillie.test(normalized.df[data$cardio == "No", "pulse"])


	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "age_year"]
D = 0.0076844, p-value = 0.0003229



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "age_year"]
D = 0.0079086, p-value = 0.0003162



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "height"]
D = 0.041096, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "height"]
D = 0.046406, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "weight"]
D = 0.028629, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "weight"]
D = 0.038379, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "ap_hi"]
D = 0.17317, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "ap_hi"]
D = 0.35255, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "ap_lo"]
D = 0.26263, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "ap_lo"]
D = 0.36741, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "IMC"]
D = 0.0079905, p-value = 0.000143



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "IMC"]
D = 0.013979, p-value = 5.84e-14



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "Yes", "pulse"]
D = 0.22389, p-value < 2.2e-16



	Lilliefors (Kolmogorov-Smirnov) normality test

data:  normalized.df[data$cardio == "No", "pulse"]
D = 0.36725, p-value < 2.2e-16


In [None]:
# Lilliefors normality test on the normalized age_year values, with both
# cardio groups pooled.
lillie.test(normalized.df$age_year)

In [135]:
# Pearson chi-squared test of independence between cardio and cholesterol.
chisq.test(x = data$cardio, y=data$cholesterol)


	Pearson's Chi-squared test

data:  data$cardio and data$cholesterol
X-squared = 3755.2, df = 2, p-value < 2.2e-16


In [137]:
# Pearson chi-squared test of independence between cardio and smoke; the
# result is reported with Yates' continuity correction.
chisq.test(x = data$cardio, y=data$smoke)


	Pearson's Chi-squared test with Yates' continuity correction

data:  data$cardio and data$smoke
X-squared = 23.456, df = 1, p-value = 1.278e-06


In [146]:
# Contingency table of cardio outcome (rows) against smoking status (columns).
table(data[c("cardio", "smoke")])

      smoke
cardio    No   Yes
   No  26946  2796
   Yes 28865  2608