**Classification ability of the three-metabolite model based on machine learning algorithms**

In [None]:
# load library
# dplyr supplies the %>% pipe and the select/left_join/group_by/summarize/mutate
# verbs used throughout this notebook.
library(dplyr)
# Display the installed dplyr version.
packageVersion('dplyr')
# readxl supplies read_excel(), used to read the metabolite annotation sheet.
library(readxl)
# Display the installed readxl version.
packageVersion('readxl')

In [None]:
# Directory configuration.
# The paths are left as '...' in this notebook and must be filled in before
# running. results.dir holds the model-result CSVs read and written below;
# mets.dir (a sub-directory of regeps.dir) holds the metabolite annotation
# workbook.
project.dir <- '...'
data.dir <- '...'
results.dir <- '...'
regeps.dir <- '...'
mets.dir <- file.path(regeps.dir, '...')

In [None]:
# Read the machine-learning results for the validated significant metabolites
# from results.dir, then show the table's dimensions and first rows.
three.mets <- read.csv(
  file = file.path(
    results.dir,
    'Prediction_model_result_of_validated_sig_metabolites.csv'
  )
)
dim(three.mets)
head(three.mets)

In [None]:
# Build the metabolite annotation lookup used to attach chemical names.
# Source: the "Chemical Annotation" sheet of the annotation workbook.
mets.info <- read_excel(
  path = file.path(mets.dir, "DATA TABLES.XLSX"),
  sheet = "Chemical Annotation"
)
# Join key: 'M' followed by CHEM_ID. This is the same column name
# ('metabolite') that the machine-learning result table is joined on.
mets.info[["metabolite"]] <- paste0('M', mets.info[["CHEM_ID"]])
# Keep only the pathway / name annotation plus the join key.
mets.info <- select(
  mets.info,
  SUPER_PATHWAY, SUB_PATHWAY, CHEMICAL_NAME, metabolite
)

In [None]:
# Attach the annotation columns to the machine-learning results.
# A left join keeps every result row; rows whose 'metabolite' value has no
# match in mets.info get NA in the annotation columns.
three.mets <- left_join(three.mets, mets.info, by = 'metabolite')
dim(three.mets)
head(three.mets)

In [None]:
# Reorder columns so the table starts with seed, metabolite, and then the
# annotation columns; all remaining columns follow in their existing order.
columns.to.move <- c("SUPER_PATHWAY", "SUB_PATHWAY", "CHEMICAL_NAME")

three.mets <- select(
  three.mets,
  all_of(c("seed", "metabolite", columns.to.move)),
  everything()
)
head(three.mets)

In [None]:
# Rows with no chemical name (NA after the annotation join) are labelled "all".
# NOTE(review): presumably these are the rows for the combined
# three-metabolite model; confirm against the input CSV.
three.mets <- mutate(
  three.mets,
  CHEMICAL_NAME = replace(CHEMICAL_NAME, is.na(CHEMICAL_NAME), "all")
)
head(three.mets)

In [None]:
# Logistic regression: mean AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (mean)
log.train.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_train_mean_auc = round(mean(logistic_train_auc), digits = 2)
)
log.train.mean

# Test AUC (mean)
log.test.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_test_mean_auc = round(mean(logistic_test_auc), digits = 2)
)
log.test.mean

In [None]:
# Logistic regression: minimum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (min)
log.train.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_train_min_auc = round(min(logistic_train_auc), digits = 2)
)
log.train.min

# Test AUC (min)
log.test.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_test_min_auc = round(min(logistic_test_auc), digits = 2)
)
log.test.min

In [None]:
# Logistic regression: maximum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (max)
log.train.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_train_max_auc = round(max(logistic_train_auc), digits = 2)
)
log.train.max

# Test AUC (max)
log.test.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  logistic_test_max_auc = round(max(logistic_test_auc), digits = 2)
)
log.test.max

In [None]:
# Combine the logistic-regression AUC summaries (mean, min, max for train and
# test) into one table keyed by CHEMICAL_NAME, then render each split as a
# single "mean (min, max)" string.
log.test <- log.train.mean %>%
  left_join(log.test.mean, by = 'CHEMICAL_NAME') %>%
  left_join(log.train.min, by = 'CHEMICAL_NAME') %>%
  left_join(log.test.min, by = 'CHEMICAL_NAME') %>%
  left_join(log.train.max, by = 'CHEMICAL_NAME') %>%
  left_join(log.test.max, by = 'CHEMICAL_NAME')
# sprintf() is used instead of paste(): paste()'s default separator produced
# strings such as "0.85 ( 0.8 , 0.9 )" (stray spaces, trailing zeros dropped).
# '%.2f' keeps two decimals for every value, e.g. "0.85 (0.80, 0.90)".
log.test$logistic_train_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  log.test$logistic_train_mean_auc,
  log.test$logistic_train_min_auc,
  log.test$logistic_train_max_auc
)
log.test$logistic_test_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  log.test$logistic_test_mean_auc,
  log.test$logistic_test_min_auc,
  log.test$logistic_test_max_auc
)
# Keep only the label and the two formatted columns.
log.test <- log.test %>% select(CHEMICAL_NAME, logistic_train_auc, logistic_test_auc)
log.test

In [None]:
# SVC: mean AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (mean)
svc.train.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_train_mean_auc = round(mean(svc_train_auc), digits = 2)
)
svc.train.mean

# Test AUC (mean)
svc.test.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_test_mean_auc = round(mean(svc_test_auc), digits = 2)
)
svc.test.mean

In [None]:
# SVC: minimum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (min)
svc.train.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_train_min_auc = round(min(svc_train_auc), digits = 2)
)
svc.train.min

# Test AUC (min)
svc.test.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_test_min_auc = round(min(svc_test_auc), digits = 2)
)
svc.test.min

In [None]:
# SVC: maximum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (max)
svc.train.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_train_max_auc = round(max(svc_train_auc), digits = 2)
)
svc.train.max

# Test AUC (max)
svc.test.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  svc_test_max_auc = round(max(svc_test_auc), digits = 2)
)
svc.test.max

In [None]:
# Combine the SVC AUC summaries (mean, min, max for train and test) into one
# table keyed by CHEMICAL_NAME, then render each split as a single
# "mean (min, max)" string.
svc.test <- svc.train.mean %>%
  left_join(svc.test.mean, by = 'CHEMICAL_NAME') %>%
  left_join(svc.train.min, by = 'CHEMICAL_NAME') %>%
  left_join(svc.test.min, by = 'CHEMICAL_NAME') %>%
  left_join(svc.train.max, by = 'CHEMICAL_NAME') %>%
  left_join(svc.test.max, by = 'CHEMICAL_NAME')
# sprintf() is used instead of paste(): paste()'s default separator produced
# strings such as "0.85 ( 0.8 , 0.9 )" (stray spaces, trailing zeros dropped).
# '%.2f' keeps two decimals for every value, e.g. "0.85 (0.80, 0.90)".
svc.test$svc_train_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  svc.test$svc_train_mean_auc,
  svc.test$svc_train_min_auc,
  svc.test$svc_train_max_auc
)
svc.test$svc_test_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  svc.test$svc_test_mean_auc,
  svc.test$svc_test_min_auc,
  svc.test$svc_test_max_auc
)
# Keep only the label and the two formatted columns.
svc.test <- svc.test %>% select(CHEMICAL_NAME, svc_train_auc, svc_test_auc)
svc.test

In [None]:
# Forest model: mean AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (mean)
forest.train.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_train_mean_auc = round(mean(forest_train_auc), digits = 2)
)
forest.train.mean

# Test AUC (mean)
forest.test.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_test_mean_auc = round(mean(forest_test_auc), digits = 2)
)
forest.test.mean

In [None]:
# Forest model: minimum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (min)
forest.train.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_train_min_auc = round(min(forest_train_auc), digits = 2)
)
forest.train.min

# Test AUC (min)
forest.test.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_test_min_auc = round(min(forest_test_auc), digits = 2)
)
forest.test.min

In [None]:
# Forest model: maximum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (max)
forest.train.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_train_max_auc = round(max(forest_train_auc), digits = 2)
)
forest.train.max

# Test AUC (max)
forest.test.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  forest_test_max_auc = round(max(forest_test_auc), digits = 2)
)
forest.test.max

In [None]:
# Combine the forest-model AUC summaries (mean, min, max for train and test)
# into one table keyed by CHEMICAL_NAME, then render each split as a single
# "mean (min, max)" string.
forest.test <- forest.train.mean %>%
  left_join(forest.test.mean, by = 'CHEMICAL_NAME') %>%
  left_join(forest.train.min, by = 'CHEMICAL_NAME') %>%
  left_join(forest.test.min, by = 'CHEMICAL_NAME') %>%
  left_join(forest.train.max, by = 'CHEMICAL_NAME') %>%
  left_join(forest.test.max, by = 'CHEMICAL_NAME')
# sprintf() is used instead of paste(): paste()'s default separator produced
# strings such as "0.85 ( 0.8 , 0.9 )" (stray spaces, trailing zeros dropped).
# '%.2f' keeps two decimals for every value, e.g. "0.85 (0.80, 0.90)".
forest.test$forest_train_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  forest.test$forest_train_mean_auc,
  forest.test$forest_train_min_auc,
  forest.test$forest_train_max_auc
)
forest.test$forest_test_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  forest.test$forest_test_mean_auc,
  forest.test$forest_test_min_auc,
  forest.test$forest_test_max_auc
)
# Keep only the label and the two formatted columns.
forest.test <- forest.test %>% select(CHEMICAL_NAME, forest_train_auc, forest_test_auc)
forest.test

In [None]:
# GBC: mean AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (mean)
gbc.train.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_train_mean_auc = round(mean(gbc_train_auc), digits = 2)
)
gbc.train.mean

# Test AUC (mean)
gbc.test.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_test_mean_auc = round(mean(gbc_test_auc), digits = 2)
)
gbc.test.mean

In [None]:
# GBC: minimum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (min)
gbc.train.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_train_min_auc = round(min(gbc_train_auc), digits = 2)
)
gbc.train.min

# Test AUC (min)
gbc.test.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_test_min_auc = round(min(gbc_test_auc), digits = 2)
)
gbc.test.min

In [None]:
# GBC: maximum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (max)
gbc.train.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_train_max_auc = round(max(gbc_train_auc), digits = 2)
)
gbc.train.max

# Test AUC (max)
gbc.test.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  gbc_test_max_auc = round(max(gbc_test_auc), digits = 2)
)
gbc.test.max

In [None]:
# Combine the GBC AUC summaries (mean, min, max for train and test) into one
# table keyed by CHEMICAL_NAME, then render each split as a single
# "mean (min, max)" string.
gbc.test <- gbc.train.mean %>%
  left_join(gbc.test.mean, by = 'CHEMICAL_NAME') %>%
  left_join(gbc.train.min, by = 'CHEMICAL_NAME') %>%
  left_join(gbc.test.min, by = 'CHEMICAL_NAME') %>%
  left_join(gbc.train.max, by = 'CHEMICAL_NAME') %>%
  left_join(gbc.test.max, by = 'CHEMICAL_NAME')
# sprintf() is used instead of paste(): paste()'s default separator produced
# strings such as "0.85 ( 0.8 , 0.9 )" (stray spaces, trailing zeros dropped).
# '%.2f' keeps two decimals for every value, e.g. "0.85 (0.80, 0.90)".
gbc.test$gbc_train_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  gbc.test$gbc_train_mean_auc,
  gbc.test$gbc_train_min_auc,
  gbc.test$gbc_train_max_auc
)
gbc.test$gbc_test_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  gbc.test$gbc_test_mean_auc,
  gbc.test$gbc_test_min_auc,
  gbc.test$gbc_test_max_auc
)
# Keep only the label and the two formatted columns.
gbc.test <- gbc.test %>% select(CHEMICAL_NAME, gbc_train_auc, gbc_test_auc)
gbc.test

In [None]:
# MLP: mean AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (mean)
mlp.train.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_train_mean_auc = round(mean(mlp_train_auc), digits = 2)
)
mlp.train.mean

# Test AUC (mean)
mlp.test.mean <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_test_mean_auc = round(mean(mlp_test_auc), digits = 2)
)
mlp.test.mean

In [None]:
# MLP: minimum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (min)
mlp.train.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_train_min_auc = round(min(mlp_train_auc), digits = 2)
)
mlp.train.min

# Test AUC (min)
mlp.test.min <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_test_min_auc = round(min(mlp_test_auc), digits = 2)
)
mlp.test.min

In [None]:
# MLP: maximum AUC per CHEMICAL_NAME group, rounded to 2 decimals.

# Training AUC (max)
mlp.train.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_train_max_auc = round(max(mlp_train_auc), digits = 2)
)
mlp.train.max

# Test AUC (max)
mlp.test.max <- summarise(
  group_by(three.mets, CHEMICAL_NAME),
  mlp_test_max_auc = round(max(mlp_test_auc), digits = 2)
)
mlp.test.max

In [None]:
# Combine the MLP AUC summaries (mean, min, max for train and test) into one
# table keyed by CHEMICAL_NAME, then render each split as a single
# "mean (min, max)" string.
mlp.test <- mlp.train.mean %>%
  left_join(mlp.test.mean, by = 'CHEMICAL_NAME') %>%
  left_join(mlp.train.min, by = 'CHEMICAL_NAME') %>%
  left_join(mlp.test.min, by = 'CHEMICAL_NAME') %>%
  left_join(mlp.train.max, by = 'CHEMICAL_NAME') %>%
  left_join(mlp.test.max, by = 'CHEMICAL_NAME')
# sprintf() is used instead of paste(): paste()'s default separator produced
# strings such as "0.85 ( 0.8 , 0.9 )" (stray spaces, trailing zeros dropped).
# '%.2f' keeps two decimals for every value, e.g. "0.85 (0.80, 0.90)".
mlp.test$mlp_train_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  mlp.test$mlp_train_mean_auc,
  mlp.test$mlp_train_min_auc,
  mlp.test$mlp_train_max_auc
)
mlp.test$mlp_test_auc <- sprintf(
  '%.2f (%.2f, %.2f)',
  mlp.test$mlp_test_mean_auc,
  mlp.test$mlp_test_min_auc,
  mlp.test$mlp_test_max_auc
)
# Keep only the label and the two formatted columns.
mlp.test <- mlp.test %>% select(CHEMICAL_NAME, mlp_train_auc, mlp_test_auc)
mlp.test

In [None]:
# Assemble the final table: start from the logistic-regression summary and
# left-join the SVC, forest, GBC and MLP summaries onto it, in that order,
# matching rows on CHEMICAL_NAME.
result <- Reduce(
  function(joined, next.table) left_join(joined, next.table, by = 'CHEMICAL_NAME'),
  list(svc.test, forest.test, gbc.test, mlp.test),
  log.test
)
result

In [None]:
# Supplemental Table 2: Classification ability of the three-metabolite model
# based on machine learning algorithms.
# Written to results.dir as CSV without a row-name column.
write.csv(
  x = result,
  file = file.path(
    results.dir,
    'Classification_ability_of_the_three-metabolite_model_machine_learning.csv'
  ),
  row.names = FALSE
)

In [None]:
# Read the machine-learning results for the model combining the three
# metabolites with other variables, then show its dimensions and first rows.
three.mets.other.vars <- read.csv(
  file = file.path(
    results.dir,
    'Prediction_model_result_of_validated_sig_metabolites_and_other_variables.csv'
  )
)
dim(three.mets.other.vars)
head(three.mets.other.vars)

In [None]:
# Column-wise mean of every column except the first.
# NOTE(review): the first column is presumably the seed identifier; confirm
# against the input CSV.
# drop = FALSE keeps the selection a data frame even if only one column
# remains; without it the result collapses to a bare vector and colMeans()
# fails.
mean.auc.values <- colMeans(three.mets.other.vars[, -1, drop = FALSE])

# Round the mean values to 2 decimal places
rounded.mean.auc.values <- round(mean.auc.values, digits = 2)
rounded.mean.auc.values

In [None]:
# Column-wise minimum of every column except the first.
# drop = FALSE keeps the selection a data frame even if only one column
# remains, so the function is always applied per column rather than per element.
# vapply() with numeric(1) guarantees a named numeric vector and fails loudly
# if a column does not reduce to a single number, whereas sapply()'s return
# type depends on its input.
min.auc.values <- vapply(
  three.mets.other.vars[, -1, drop = FALSE],
  min,
  numeric(1)
)
# Round the min values to 2 decimal places
rounded.min.auc.values <- round(min.auc.values, digits = 2)
rounded.min.auc.values

In [None]:
# Column-wise maximum of every column except the first.
# drop = FALSE keeps the selection a data frame even if only one column
# remains, so the function is always applied per column rather than per element.
# vapply() with numeric(1) guarantees a named numeric vector and fails loudly
# if a column does not reduce to a single number, whereas sapply()'s return
# type depends on its input.
max.auc.values <- vapply(
  three.mets.other.vars[, -1, drop = FALSE],
  max,
  numeric(1)
)
# Round the max values to 2 decimal places
rounded.max.auc.values <- round(max.auc.values, digits = 2)
rounded.max.auc.values