upload our package

frangam · Mar 16, 2023 · 1ea090d · 1ea090d
1 parent 44001ef
commit 1ea090d
Show file tree

Hide file tree

Showing 9 changed files with 4,310 additions and 0 deletions.
diff --git a/examples/extract-features.R b/examples/extract-features.R
@@ -0,0 +1,120 @@
+#------------------------------------------------------
+# © 2020 Francisco M. Garcia-Moreno
+#------------------------------------------------------
+
+source("utils/utils.R")
+source("utils/predict_fun.R")
+source("utils/multiplot.R")
+source("utils/sensor_utils.R")
+Sys.setlocale("LC_ALL", "en_US.UTF-8")
+
+#------------------------------------------------------
+# Extract features and generate dataset windowed
+#------------------------------------------------------
+
+# Load questionnaire dataset (where we have the target label CLASS - frail, pre-frail, non-frail)
+questionnaires_frailty<-read.csv("Datasets/questionnaires/preprocessed/all_questionnaires_frailty.csv", row.names = "X")
+questionnaires_frailty <- questionnaires_frailty[questionnaires_frailty$Edad>=65,]
+subjects_excluded <- rownames(questionnaires_frailty[questionnaires_frailty$Edad<65,])
+
+sampling_rates<-c(0.1, 0.01, 0.06, 0.08, 0.05, 0.15, 0.25, 0.5) #desired sampling rates
+windows_sizes_in_sec<-c(0.5, 1, 1.5, 2, 2.5, 3) #desired window sizes
+
+# Generate the wearable dataset
+complete_dataset_extracting_features(subjects_excluded, devices=c("smartwatch_empatica"),
+                                     split_events = T, # split_events: True if we have a column "event" where we have labelled some known events
+                                     windows_sizes_in_sec = c(1, 1.5, 2, 2.5, 3),
+                                     sampling_rates = )
+
+#------------------------------------------------------
+# Load generated dataset (whitout event column)
+#------------------------------------------------------
+dfs <- load_complete_dataset_with_features_extracted(questionnaires_frailty, devices=c(
+                                                    "smartwatch_empatica"
+                                                    ),
+                                                     windows_sizes_in_sec = windows_sizes_in_sec,
+                                                     sampling_rates = sampling_rates,
+                                                     split_events = split_events,
+                                                     y_col_name = "fried_frailty_risk", new_cols=c(),
+                                                     del_cols=c(
+                                                         built_col_name_of_basic_stats(c("acc", "gyr", "gra", "linacc"), c("corr.xy", "corr.xz", "corr.yz"))
+                                                       # , built_col_name_of_basic_stats(c("pres", "lig", "noise"), c(statisticsID(), "energy"))
+                                                       , built_col_name_of_basic_stats(c("pres"), c(statisticsID(), "energy"))
+                                                       , built_col_name_of_basic_stats(c("lig"), c(statisticsID(), "energy"))
+                                                       , built_col_name_of_basic_stats(c("noise"), c(statisticsID(), "energy"))
+
+                                                       , built_col_name_of_basic_stats(c("linacc.x", "linacc.y", "linacc.z"), c(statisticsID(), "energy"))
+                                                       , built_col_name_of_basic_stats(c("gra.x", "gra.y", "gra.z"), c(statisticsID(), "energy"))
+                                                       # built_col_name_of_basic_stats(SMARTWATCH_TRIAXIAL_SENSORS_COL_NAMES, c(statisticsID(), "energy")),
+                                                       # , built_col_name_of_basic_stats(c("eda"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("temp"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("hr"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("gyr.x", "gyr.y", "gyr.z"), c(statisticsID(), "energy"))
+                                                       #
+                                                       # , built_col_name_of_basic_stats(c("acc.x", "acc.y", "acc.z"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("acc.x"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("acc.y"), c(statisticsID(), "energy"))
+                                                       # , built_col_name_of_basic_stats(c("acc.z"), c(statisticsID(), "energy"))
+
+
+
+
+                                                       )
+)
+
+
+#------------------------------------------------------
+# Example: Prepare only data for EMPATICA E4 device
+# for the different sampling-rates and window-sizes configurations
+#------------------------------------------------------
+dfs_smart_emp <- dfs$smartwatch_empatica
+
+
+#Sampling rate = 1/0.1 = 10Hz
+df_smart_emp_sr0.1_w0.5 <- dfs_smart_emp$`0.1S`$`0.5s`
+df_smart_emp_sr0.1_w1 <- dfs_smart_emp$`0.1S`$`1s`
+df_smart_emp_sr0.1_w1.5 <- dfs_smart_emp$`0.1S`$`1.5s`
+df_smart_emp_sr0.1_w2 <- dfs_smart_emp$`0.1S`$`2s`
+df_smart_emp_sr0.1_w2.5 <- dfs_smart_emp$`0.1S`$`2.5s`
+df_smart_emp_sr0.1_w3 <- dfs_smart_emp$`0.1S`$`3s`
+
+#Sampling rate = 1/0.06 = ~15Hz
+df_smart_emp_sr0.06_w0.5 <- dfs_smart_emp$`0.06S`$`0.5s`
+df_smart_emp_sr0.06_w1 <- dfs_smart_emp$`0.06S`$`1s`
+df_smart_emp_sr0.06_w1.5 <- dfs_smart_emp$`0.06S`$`1.5s`
+df_smart_emp_sr0.06_w2 <- dfs_smart_emp$`0.06S`$`2s`
+df_smart_emp_sr0.06_w2.5 <- dfs_smart_emp$`0.06S`$`2.5s`
+df_smart_emp_sr0.06_w3 <- dfs_smart_emp$`0.06S`$`3s`
+
+#Sampling rate = 1/0.04 = 25Hz
+df_smart_emp_sr0.04_w0.5 <- dfs_smart_emp$`0.04S`$`0.5s`
+df_smart_emp_sr0.04_w1 <- dfs_smart_emp$`0.04S`$`1s`
+df_smart_emp_sr0.04_w1.5 <- dfs_smart_emp$`0.04S`$`1.5s`
+df_smart_emp_sr0.04_w2 <- dfs_smart_emp$`0.04S`$`2s`
+df_smart_emp_sr0.04_w2.5 <- dfs_smart_emp$`0.04S`$`2.5s`
+df_smart_emp_sr0.04_w3 <- dfs_smart_emp$`0.04S`$`3s`
+
+#Sampling rate = 1/0.02 = 50Hz
+df_smart_emp_sr0.02_w0.5 <- dfs_smart_emp$`0.02S`$`0.5s`
+df_smart_emp_sr0.02_w1 <- dfs_smart_emp$`0.02S`$`1s`
+df_smart_emp_sr0.02_w1.5 <- dfs_smart_emp$`0.02S`$`1.5s`
+df_smart_emp_sr0.02_w2 <- dfs_smart_emp$`0.02S`$`2s`
+df_smart_emp_sr0.02_w2.5 <- dfs_smart_emp$`0.02S`$`2.5s`
+df_smart_emp_sr0.02_w3 <- dfs_smart_emp$`0.02S`$`3s`
+
+#Sampling rate = 1/0.01 = 100Hz
+df_smart_emp_sr0.01_w0.5 <- dfs_smart_emp$`0.01S`$`0.5s`
+df_smart_emp_sr0.01_w1 <- dfs_smart_emp$`0.01S`$`1s`
+df_smart_emp_sr0.01_w1.5 <- dfs_smart_emp$`0.01S`$`1.5s`
+df_smart_emp_sr0.01_w2 <- dfs_smart_emp$`0.01S`$`2s`
+df_smart_emp_sr0.01_w2.5 <- dfs_smart_emp$`0.01S`$`2.5s`
+df_smart_emp_sr0.01_w3 <- dfs_smart_emp$`0.01S`$`3s`
+
+
+#------------------------------------------------------
+# For testing the library:
+# We can select a single configuration:
+# - sampling rate = 0.04
+# - window size = 0.5
+#------------------------------------------------------
+data <- df_smart_emp_sr0.04_w0.5
diff --git a/examples/hypeparemter-tuning.R b/examples/hypeparemter-tuning.R
@@ -0,0 +1,69 @@
+#------------------------------------------------------
+# © 2020 Francisco M. Garcia-Moreno
+#------------------------------------------------------
+
+source("utils/utils.R")
+source("utils/predict_fun.R")
+source("utils/multiplot.R")
+source("utils/sensor_utils.R")
+Sys.setlocale("LC_ALL", "en_US.UTF-8")
+
+#------------------------------------------------------
+# Hyperparameter tuning example
+#------------------------------------------------------
+set.seed(33)
+if (Sys.getenv("JAVA_HOME")!="") Sys.setenv(JAVA_HOME="")
+
+
+#------------------------------------------------------
+# For testing purposes we will use the data generated
+# from /examples/extract-features.R
+# We can select a single configuration:
+# - sampling rate = 0.04
+# - window size = 0.5
+#------------------------------------------------------
+data <- df_smart_emp_sr0.04_w0.5
+
+#we have to pass a validation data split
+parts <- createDataPartition(data$y, p = 0.8, list = F)
+validation_data <- data[-parts, ]
+train_data <- data[parts, ]
+
+#------------------------------------------------------
+# Examples of hyperparameter tuning:
+#------------------------------------------------------
+
+
+#++++++
+#sr: 0.04; wz: 0.5
+# we test 3 models: Random forest ("rf"), SVM ("svm") and k-nn ("knn")
+# evaluation metric = f1-score ("f1"); 10-fold (non-repeated)
+#++++++
+f1_1_1 <- search_features_selection(train_data, method="rf", model="rf", minfeatTune=1, metric="f1", search_method="rfe", folds=10, reps=1, verboseOnlyFeatSel=T, verbose=F)
+f1_1_2 <- search_features_selection(train_data, method="rf", model="svm", minfeatTune=1, metric="f1", search_method="rfe", folds=10, reps=1, verboseOnlyFeatSel=T, verbose=F)
+f1_1_3 <- search_features_selection(train_data, method="rf", model="knn", minfeatTune=1, metric="f1", search_method="rfe", folds=10, reps=1, verboseOnlyFeatSel=T, verbose=F)
+
+#++++++
+# plotting results
+#++++++
+plot_features_vs_acc_list_features(list(f1_1_1$all, f1_1_2$all, f1_1_3$all), metric="f1", tech=c("RF", "SVM", "k-NN"), lineBest=F, interv = 8)
+plot_features_vs_acc_list_features(list(f1_1_3$all), metric="f1", tech=c("k-NN"), lineBest=F, interv = 8)
+ggsave("../../../Datasets")
+f1_1_3$all[f1_1_3$total_features]
+f1_1_3$all[25]
+length(f1_1_3$all)
+model_knn<-model.frame(f1_1_3$features,data)
+colnames(model_knn)
+
+
+#------------------------------------------------------
+# Validation
+#------------------------------------------------------
+features <- f1_1_3
+
+res <- run_cv_prediction(features$features, train_data, validation_data, "knn", features$bestParam)
+validation_met <- get_tpr_tnr_auc_in_multiclass(unique(train_data$y), res$prediction, validation_data$y) #get get_metrics(res$prediction, validation_data[,ncol(validation_data)], posClass = 1) #get_tpr_tnr_auc_in_multiclass(unique(train$y), prediction, test_labels, verbose=verbose)
+cat(paste("\n   >> VALIDATION Accuracy:", validation_met$acc, ". Precision:", validation_met$prec, ". Recall:", validation_met$rec, ". F1-Score:", validation_met$f, ". TPR:", validation_met$tpr, ". TNR:", validation_met$tnr))
+
+f1_1_3$all[f1_1_3$total_features]
+
diff --git a/examples/preprocess-data.R b/examples/preprocess-data.R
@@ -0,0 +1,46 @@
+#------------------------------------------------------
+# © 2020 Francisco M. Garcia-Moreno
+#------------------------------------------------------
+
+source("utils/utils.R")
+source("utils/predict_fun.R")
+source("utils/multiplot.R")
+source("utils/sensor_utils.R")
+Sys.setlocale("LC_ALL", "en_US.UTF-8")
+
+#------------------------------------------------------
+# Preprocessing example
+#------------------------------------------------------
+set.seed(33)
+if (Sys.getenv("JAVA_HOME")!="") Sys.setenv(JAVA_HOME="")
+
+
+#------------------------------------------------------
+# For testing purposes we will use the data generated
+# from /examples/extract-features.R
+# We can select a single configuration:
+# - sampling rate = 0.04
+# - window size = 0.5
+#------------------------------------------------------
+data <- df_smart_emp_sr0.04_w0.5
+
+#we have to pass a validation data split
+parts <- createDataPartition(data$y, p = 0.8, list = F)
+validation_data <- data[-parts, ]
+train_data <- data[parts, ]
+
+#------------------------------------------------------
+# Examples of preprocessing pipelines:
+#------------------------------------------------------
+
+
+# -- Feature selection: Random forest ("rf")
+# -- Classifier: K-Nearest Neighbors ("knn")
+# -- Evaluation metric: F1-Score ("f1")
+# Preprocessing methods in this order:
+# -- 1) impute missing values with 0 (na=0)
+# -- 2) smote for balancing data (up-60; down-300)
+prep_models <- preprocess_data_several_methods(train_data, validation_data, 
+	fs_method="rf", model="knn", metric = "f1",
+    prep_methods=list(na=0, smote=c(60, 300)),
+    folds=5, reps=1, verbose=T)
diff --git a/utils/install_packages.R b/utils/install_packages.R
@@ -0,0 +1,19 @@
+#------------------------------------------------------
+# © 2020 Francisco M. Garcia-Moreno
+#------------------------------------------------------
+#R Version 4.0.1: https://cloud.r-project.org/bin/macosx/
+#Compiling R on Mac: https://mac.r-project.org/tools/
+
+install.packages("prettyR")
+install.packages("parallelMap")
+install.packages("psych")
+install.packages("caret", type="source")
+install.packages("e1071")
+install.packages("ggplot2")
+install.packages("DMwR")
+install.packages("ROCR")
+install.packages("PRROC")
+install.packages("neuralnet")
+install.packages("naivebayes")
+install.packages("RWeka")
+install.packages("FSelector")
diff --git a/utils/multiclass_utils.R b/utils/multiclass_utils.R
@@ -0,0 +1,30 @@
+#------------------------------------------------------
+# © 2020 Francisco M. Garcia-Moreno
+#------------------------------------------------------
+
+calculate.accuracy <- function(predictions, ref.labels) {
+  return(length(which(predictions == ref.labels)) / length(ref.labels))
+}
+
+calculate.w.accuracy <- function(predictions, ref.labels, weights) {
+  lvls <- levels(ref.labels)
+  if (length(weights) != length(lvls)) {
+    stop("Number of weights should agree with the number of classes.")
+  }
+  if (sum(weights) != 1) {
+    stop("Weights do not sum to 1")
+  }
+  accs <- lapply(lvls, function(x) {
+    idx <- which(ref.labels == x)
+    return(calculate.accuracy(predictions[idx], ref.labels[idx]))
+  })
+  acc <- mean(unlist(accs))
+  return(acc)
+}
+
+calculate_multiclass_accuracy <- function(predictions, ref.labels){
+  overall_acc <- calculate.accuracy(predictions, ref.labels)
+  weights <- rep(1 / length(levels(ref.labels)), length(levels(ref.labels)))
+  w.acc <- calculate.w.accuracy(predictions, ref.labels, weights)
+  return(list(overall_acc=overall_acc, weighted_acc=w.acc))
+}
diff --git a/utils/multiplot.R b/utils/multiplot.R
@@ -0,0 +1,46 @@
+#
+# FILENAME: multiplot.R                                                          #
+#
+
+# Multiple plot function.
+# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+# - cols:   Number of columns in layout
+# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+#
+# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+# then plot 1 will go in the upper left, 2 will go in the upper right, and
+# 3 will go all the way across the bottom.
+#
+# Source: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
+multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
+  library(grid)
+
+  # Make a list from the ... arguments and plotlist
+  plots <- c(list(...), plotlist)
+  numPlots = length(plots)
+
+  # If layout is NULL, then use 'cols' to determine layout
+  if (is.null(layout)) {
+    # Make the panel
+    # ncol: Number of columns of plots
+    # nrow: Number of rows needed, calculated from # of cols
+    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
+                     ncol = cols, nrow = ceiling(numPlots/cols))
+  }
+  if (numPlots==1) {
+    print(plots[[1]])
+  } else {
+    # Set up the page
+    grid.newpage()
+    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+
+    # Make each plot, in the correct location
+    for (i in 1:numPlots) {
+      # Get the i,j matrix positions of the regions that contain this subplot
+      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+
+      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                      layout.pos.col = matchidx$col))
+    }
+  }
+}