-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_selection.R
80 lines (57 loc) · 3.24 KB
/
feature_selection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
###scripts to perform feature selection in biomarkers data of COVID, sepsis, and septic shock patients
#I am isolating the dependent variables (X) from the other clinical information present in the data.
#then, I append the Y (label) to the new dataset.
data <- read.csv("~/Documents/covid/benchmark_paper/datasets/all_patients.csv")
data_x <- data[,7:36]
data_x$mof <- data$MOF
#####Boruta
install.packages('Boruta')
library(Boruta)
colnames(data_x[,1:30]) <- c("ICAM-1", "Lipo", "MPO",
"VCAM", "PDL1", "GCSF", "IL1b", "VEGFC", "DDimer", "ESelectin",
"Ferritin", "SPD", "IL10", "IL17A", "GMCSF", "IL7", "CXCL10", "ANG2",
"IL1ra", "IL6", "CCL2", "IL12", "IL2", "IL4", "IL15", "GranzymeB",
"IFNg", "TNFa", "sIL6R", "sGP130")
boruta_output <- Boruta(data_x$mof ~ ., data=data_x[,1:30], doTrace=0)
names(boruta_output)
boruta_signif <- getSelectedAttributes(boruta_output, withTentative = TRUE)
print(boruta_signif)
roughFixMod <- TentativeRoughFix(boruta_output)
boruta_signif <- getSelectedAttributes(roughFixMod)
print(boruta_signif)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
#####Lasso regression
install.packages("glmnet")
library(glmnet)
x <- as.matrix(data_x[,1:30])
y <- as.double(as.matrix(ifelse(data_x$mof==0, 0, 1))) # Only Class
set.seed(100)
cv.lasso <- cv.glmnet(x, y, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
plot(cv.lasso)
cat('Min Lambda: ', cv.lasso$lambda.min, '\n 1Sd Lambda: ', cv.lasso$lambda.1se)
df_coef <- round(as.matrix(coef(cv.lasso, s=cv.lasso$lambda.min)), 12)
df_coef[df_coef[, 1] != 0, ]
####information gain
#these values were obtained in Python with the script namely information_gain.py
information_gain <- data.frame(c(0.10020154, 0. , 0.07415846, 0.05956189, 0.11999551,
0.07305173, 0.05569018, 0.09154714, 0. , 0.00695067,
0. , 0. , 0.01781756, 0.01804687, 0.09119376,
0.06876391, 0.01225613, 0. , 0.1560334 , 0.05213682,
0.02987779, 0.04186097, 0.0242144 , 0.03960486, 0.15434779,
0.09888347, 0. , 0.04390859, 0.02396775, 0. ))
colnames(information_gain) <- "information_gain"
information_gain$variable <- c("ICAM-1", "Lipo", "MPO",
"VCAM", "PDL1", "GCSF", "IL1b", "VEGFC", "DDimer", "ESelectin",
"Ferritin", "SPD", "IL10", "IL17A", "GMCSF", "IL7", "CXCL10", "ANG2",
"IL1ra", "IL6", "CCL2", "IL12", "IL2", "IL4", "IL15", "GranzymeB",
"IFNg", "TNFa", "sIL6R", "sGP130")
information_gain <- sort(information_gain$information_gain)
ggplot(data = information_gain, aes(x = variable, y = information_gain))+
geom_bar(stat = 'identity', color = "blue", fill = "blue", position = position_dodge(width=0.1), width=0.8)+
coord_flip()+
geom_hline(yintercept = 0.09888347, linetype = 2)+
xlab("")+
ylab("Information gain")