In [1]:
suppressMessages(library(tidyverse))
suppressMessages(library(DESeq2))
suppressMessages(library(pheatmap))
suppressMessages(library(ggrepel))
suppressMessages(library(pROC))
suppressMessages(library(caret))
suppressMessages(library(pROC))

get_auc <- function(gene,train_counts_cpm,train_meta){
        gene_counts = train_counts_cpm[gene,,drop=FALSE] %>% t() 
        meta_train_wGene <- merge(train_meta, gene_counts,by.x="sample_id",by.y=0)
        roc_auc = roc(meta_train_wGene$diagnosis ~ meta_train_wGene[[gene]], plot = FALSE, print.auc = FALSE, quiet=TRUE)
return(as.numeric(roc_auc$auc))}

source("../theme_ggplot_prevail.R")
source("../global_variables.R")

gene_name_key_file = "../0_DATA/gencode.biotype.name.key.tsv"
gene_name_key = read.delim(gene_name_key_file)

---
## Load data & split

In [2]:
## read counts
counts_train <- read.csv("../0_DATA/counts_train.602020.csv",row.names=1)
counts_val <- read.csv("../0_DATA/counts_val.602020.csv",row.names=1)
counts_test <- read.csv("../0_DATA/counts_test.602020.csv",row.names=1)

## read metadata
meta_data_train <- read.csv("../0_DATA/meta_data_train.602020.csv")
meta_data_val <- read.csv("../0_DATA/meta_data_val.602020.csv")
meta_data_test <- read.csv("../0_DATA/meta_data_test.602020.csv")


counts_train_vst <- read.csv("../0_DATA/counts_train_vst.602020.csv",row.names=1)
counts_val_vst <- read.csv("../0_DATA/counts_val_vst.602020.csv",row.names=1)
counts_test_vst <- read.csv("../0_DATA/counts_test_vst.602020.csv",row.names=1)

---
## DAA

#### Run DESeq2

In [3]:
# ##-----------------------------------
# # Contstruct DESeq Data Set
# dds <- DESeqDataSetFromMatrix(counts_train,
#                                 colData = meta_data_train,
#                                 design = ~ diagnosis + 0)

# dds_train <- DESeq(dds)
# dds_train %>% saveRDS("./data/dds_train.rds")

# ##-----------------------------------
# # Manipulate results
# res_train <- results(dds_train,alpha=0.05,contrast = c("diagnosis","MISC", "KD"))
# summary(res_train)

# res_train_sig <- res_train %>% data.frame() %>% filter(padj < 0.05) 
# res_train_sig$gene_auc <- as.numeric(lapply(rownames(res_train_sig),
#                                                         get_auc,
#                                                         counts_train_vst,
#                                                         meta_data_train))
# res_train_sig %>% write.csv("./data/deseq_results.csv")


dds_train = readRDS("./data/dds_train.rds")
res_train_sig <- read.csv("./data/deseq_results.csv",row.names=1)

In [4]:
res_train_sig %>% write.csv("./output/Supplementary-File3_pedInflam.KDvMISC-DESEQ.csv",quote=FALSE,row.names=TRUE)

In [5]:
pv_thresh = 0.01
fc_thresh = 0.25
bm_thresh = 100
au_thresh = 0.65

black_list = read.delim("../0_DATA/genes_to_exlude.tsv",header=F)



res_train_sig %>%
        mutate(gene_id_sub = gsub("\\..*","",row.names(.))) %>% 
        filter(!(gene_id_sub %in% all_of(black_list$V2))) %>% 
        filter(padj < pv_thresh) %>%
        filter(abs(log2FoldChange) > fc_thresh) %>%
        filter(baseMean > bm_thresh) %>%
        filter(gene_auc > au_thresh) %>%
        nrow()


[1m[22m[36mℹ[39m In argument: `!(gene_id_sub %in% all_of(black_list$V2))`.
[1m[22m[33m![39m Using `all_of()` outside of a selecting function was deprecated in tidyselect
  1.2.0.
[36mℹ[39m See details at
  <https://tidyselect.r-lib.org/reference/faq-selection-context.html>”


In [7]:
tmp = res_train_sig

quantile(tmp$baseMean)
quantile(tmp$log2FoldChange)
quantile(tmp$gene_auc)


#### Gene characteristics

In [101]:
PVAL = 0.01
BM = 100
L2FC = 0.25
GAUC = 0.65



options(repr.plot.height = 2, repr.plot.width = 6)
WIDTH= 2.3
WIDTH= 2.35 ### with same y axis
HEIGHT= .75

###-----------------------------------
### Plot gene characteristics
numticksX = 5
numticksY = 3


library(scales)

# padj
padj_plt <- res_train_sig %>% ggplot()+
            annotate("rect", xmin = -log10(PVAL), xmax = Inf, ymin = 0, ymax = Inf, fill = "#32CD32", alpha = 0.25) +
            geom_histogram(aes(x=-log10(padj)),bins=100) + 
            theme_prevail() + 
            geom_vline(xintercept=-log10(PVAL), color = "red", linetype ="dashed") +
            theme(axis.text = element_text(size = 8),
                axis.title.x =element_blank(),
                axis.title.y = element_blank())+
             coord_cartesian(ylim = c(0,1200), xlim = c(0,20))+
             scale_x_continuous(breaks = c(0,5,10,15,20))+
             scale_y_continuous(breaks = c(0,500,1000))

pdf(file="plots/KD-MISC_GENECHAR_padj.pdf",
        width=2.35,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(padj_plt)
dev.off()

# baseMean
bm_plt <- res_train_sig %>% ggplot(aes(x=log10(baseMean)))+
            annotate("rect", xmin = log10(BM), xmax = Inf, ymin = 0, ymax = Inf, fill = "#32CD32", alpha = 0.25) +
            geom_histogram(bins=100) + 
            theme_prevail() + 
            geom_vline(xintercept=log10(BM), color = "red", linetype ="dashed") +
        #     geom_vline(xintercept=BM, color = "red", linetype ="dashed") +
            theme(axis.text = element_text(size = 8),
                axis.title.x =element_blank(),
                axis.title.y = element_blank()) +
            coord_cartesian(ylim = c(0,1200), xlim = c(-0.5,4))+
            scale_x_continuous(breaks = c(0,1,2,3,4))+
            scale_y_continuous(breaks = c(0,500,1000))


pdf(file="plots/KD-MISC_GENECHAR_basemean.pdf",
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(bm_plt)
dev.off()

# log2FoldChange
l2fc_plt <- res_train_sig %>% ggplot(aes(x=log2FoldChange))+
            annotate("rect", xmin = L2FC, xmax = Inf, ymin = 0, ymax = Inf, fill = "#32CD32", alpha = 0.25) +
            annotate("rect", xmin = -Inf, xmax = -L2FC, ymin = 0, ymax = Inf, fill = "#32CD32", alpha = 0.25) +
            geom_histogram(bins=100) + 
            theme_prevail() + 
            geom_vline(xintercept=L2FC, color = "red", linetype ="dashed") + 
            geom_vline(xintercept=-L2FC, color = "red", linetype ="dashed") +
            theme(axis.text = element_text(size = 8),
                axis.title.x =element_blank(),
                axis.title.y = element_blank())+
            coord_cartesian(ylim = c(0,1200), xlim = c(-3,6.5))+
            scale_x_continuous(breaks = c(-3, 0, 3, 6))+
            scale_y_continuous(breaks = c(0,500,1000))

pdf(file="plots/KD-MISC_GENECHAR_log2fc.pdf",
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(l2fc_plt)
dev.off()


# gene_auc
geneauc_plt <- res_train_sig %>% ggplot(aes(x=gene_auc))+
            annotate("rect", xmin = GAUC, xmax = Inf, ymin = 0, ymax = Inf, fill = "#32CD32", alpha = 0.25) +
            geom_histogram(bins=100) + 
            theme_prevail()+
            geom_vline(xintercept=GAUC, color = "red", linetype ="dashed") +
            theme(axis.text = element_text(size = 8),
                axis.title.x =element_blank(),
                axis.title.y = element_blank())+
            coord_cartesian(ylim = c(0,1200), xlim = c(0.25,1))+
             scale_x_continuous(breaks = c(0.25, 0.5, 0.75, 1.00))+
             scale_y_continuous(breaks = c(0,500,1000))

pdf(file="plots/KD-MISC_GENECHAR_geneAUC.pdf",
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(geneauc_plt)
dev.off()

#### Volcano Plot

In [5]:
# Create a detable 
detable = res_train_sig %>% data.frame  
detable <- merge(detable,gene_name_key,by.x=0,by.y="gene_id")
detable <- detable %>%
    filter(!is.na(padj)) %>% 
    mutate(sig_dir = ifelse(padj<0.01 & log2FoldChange>0.5,"up",ifelse(padj<0.01 & log2FoldChange < -0.5,"down","notsig"))) %>%
    # mutate(sig_dir = ifelse(padj >0.05,"notsig",ifelse(log2FoldChange>0,"up","down"))) %>%
    mutate(delabel = ifelse( (sig_dir=="up" & -log(padj) > 35) | (sig_dir=="down" & -log(padj) > 28) , gene_name,""))

options(repr.plot.width = 5, repr.plot.height = 5)

volcano_plt <- detable %>% 
    filter(!is.na(padj)) %>%
    ggplot(aes(x=log2FoldChange, y=-log10(padj), color = sig_dir, label=delabel)) + #
    geom_point(size = 0.75) +
    geom_hline(yintercept = -log(0.01), size = 0.5, alpha = 0.5, linetype="dashed")+
    geom_vline(xintercept = c(-0.5, 0.5), size = 0.5, alpha = 0.5, linetype="dashed") + 
    geom_text_repel(size = 2) +  
    theme_prevail() +
    scale_color_manual(values = c("notsig"="grey","up"=INFLAMCAT_FILL_KEY[['MISC']],"down"=INFLAMCAT_FILL_KEY[['KD']]))+
    labs(x = bquote(Log[2]*"(Fold Change)"), y= bquote(-Log[10]*"(Adj p-value)"))+
    theme(axis.text = element_text(size = 8),
        axis.title = element_text(size = 8))

WIDTH= 2.4
HEIGHT= 2.15
pdf(file="plots/KD-MISC_DESEQ_volcano.pdf",
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(volcano_plt)
dev.off()

---
## Modeling Results

#### AUC Dot plots

In [5]:
path = "./ML_pipeline/output/"
comp = "MISC<>KD"

get_auc <- function(alg, comp, group, path){
    stats <- read.delim(paste0(path,comp,"/",alg,"/",comp,".",alg,".",group,".txt"))
    return(stats[1,])
}


###############################
## Get All Model AUCs
###############################

## list models
models = list.files(paste0(path,comp))
models = models[!grepl(".rds|.png|feature",models)]
models

## get AUCs
train_aucs <- lapply(models,get_auc,
                     comp=comp,
                     group="train",
                     path=path) %>%
                unlist()

val_aucs <- lapply(models,get_auc,
                     comp=comp,
                     group="val",
                     path=path) %>%
                unlist()

## Organize AUCs
all_aucs <- rbind(train_aucs,val_aucs) %>% data.frame()
colnames(all_aucs) <- models
all_aucs <- all_aucs %>% 
            t() %>% data.frame() %>% 
            rownames_to_column(.,"algorithm") %>% 
            mutate(comparison = comp) %>%
            reshape2::melt(id.vars = c("algorithm","comparison")) %>% 
            dplyr::rename(auc = value) %>% mutate(auc = as.numeric(auc)) %>% 
            filter(algorithm != "LOG") %>%
            mutate(variable = gsub("_aucs","",variable))

all_aucs$auc <- as.numeric(all_aucs$auc)


###############################
## Plot and save
###############################

WIDTH = 1.8
HEIGHT = 1.8

pdf(file=paste0("./plots/KD-MISC.auc-dotplots.pdf"),
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)

all_aucs %>%
        mutate(algorithm = factor(algorithm, levels = all_aucs %>% 
                                                      filter(variable == "val") %>%
                                                      group_by(algorithm) %>%
                                                      summarize(auc = mean(auc)) %>%
                                                      arrange(auc) %>%
                                                      pull(algorithm))) %>%
        ggplot(aes(x = auc, y = algorithm, color = variable, shape = variable)) +
        geom_point(size = 1.5, stroke = 1) +  
        geom_hline(yintercept = seq(1.5, length(unique(all_aucs$algorithm)) - 0.5), 
                    size = 0.3) +
        theme_prevail() +
        theme(legend.position = "none",
            axis.title.x = element_text(size = 6),
            axis.text.x = element_text(size = 6),
            axis.title.y = element_blank(), 
            axis.text.y = element_text(size = 6, face = "bold"),
            plot.title = element_blank()) +
        labs(title = comp,x="AUC")+
        scale_color_manual(breaks = c("train","val"), 
                            values = c("#00A1D5FF","#B24745FF"))+
        scale_shape_manual(breaks = c("train","val"), 
                            values = c(16,1)) +
        xlim(.5,1)

dev.off()

“[1m[22mUsing `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
[36mℹ[39m Please use `linewidth` instead.”
“[1m[22mRemoved 1 rows containing missing values (`geom_point()`).”


#### Wrangle GLMNET outputs

In [29]:
path = "./ML_pipeline/output/"
comp = "MISC<>KD"
alg = "GLMNETLasso"
TYPE = "prob"

group1_name = "MISC"
group2_name = "KD"

mod_data <- readRDS(paste0(path,comp,"/",alg,"/",comp,".",alg,".rds"))
youden_threshold <- read.delim(paste0(path,comp,"/",alg,"/",comp,".",alg,".train.txt"))["youden",]
MODEL_FIT <- mod_data[['model']]

##------------------------------------
## Load train and test data 
##------------------------------------

metadata_train <- mod_data[['meta_data_train']] %>%                        ## get separate metadata files and coordinate
    mutate(set = "Train") %>%
    mutate(classifier_pred = ifelse(classifier_score > youden_threshold,group1_name,group2_name)) %>% 
    select(cfrna_id,set,group,classifier_score, classifier_pred)

metadata_val <- mod_data[['meta_data_val']] %>%                        ## get separate metadata files and coordinate
    mutate(set = "Validation") %>%
    mutate(classifier_pred = ifelse(classifier_score > youden_threshold,group1_name,group2_name)) %>% 
    select(cfrna_id,set,group,classifier_score, classifier_pred)

metadata_test <- mod_data[['meta_data_test']] %>%
    mutate(set = "Test") %>%
    mutate(classifier_pred = ifelse(classifier_score > youden_threshold,group1_name,group2_name)) %>% 
    select(cfrna_id,set,group,classifier_score, classifier_pred)


##------------------------------------
## PLOT
##------------------------------------

## Combine and save
mdf <- rbind(metadata_train, rbind(metadata_test, metadata_val))%>%          ## combine metadata and plot
        mutate(set = factor(set,levels = c("Train","Validation","Test"))) %>% 
        mutate(group = ifelse(cfrna_id == "cfrna_kd_165","KD",as.character(group))) %>%    ## fix readjudicated samples
        mutate(group = factor(group,    
                                    levels = c("MISC","KD")))    


## Combine and save
mdf %>% 
    write.table("./data/glmnet_lasso_predictions.csv",sep=",",quote=FALSE,row.names=FALSE)       

#### AUC curves

In [32]:
mod_df <- read.csv("./data/glmnet_lasso_predictions.csv")

##------------------------------------
## GET TPR + FPR
##------------------------------------

train_mdf = mod_df %>% filter(set == "Train")
roc_train <- suppressMessages(roc(response = train_mdf$group, predictor = train_mdf$classifier_score,
                percent=TRUE,
                ci=TRUE, 
                boot.n=10000, boot.stratified=TRUE,
                print.auc=TRUE,
                print.thres="best",
                print.thres.best.method="youden"))

validate_mdf = mod_df %>% filter(set == "Validation")
roc_validate <- suppressMessages(roc(response = validate_mdf$group, predictor = validate_mdf$classifier_score,
                percent=TRUE,
                ci=TRUE, 
                boot.n=10000, boot.stratified=TRUE,
                print.auc=TRUE,
                print.thres="best",
                print.thres.best.method="youden"))

test_mdf = mod_df %>% filter(set == "Test")
roc_test <- suppressMessages(roc(response = test_mdf$group, predictor = test_mdf$classifier_score,
                percent=TRUE,
                ci=TRUE, 
                boot.n=10000, boot.stratified=TRUE,
                print.auc=TRUE,
                print.thres="best",
                print.thres.best.method="youden"))



##------------------------------------
## get roc values
##------------------------------------

train_roc <- data.frame("sensitivity" = roc_train$sensitivities,
                        "specificity" = roc_train$specificities,
                       "set" = "train")

validate_roc <- data.frame("sensitivity" = roc_validate$sensitivities,
                        "specificity" = roc_validate$specificities,
                       "set" = "validate")

test_roc <- data.frame("sensitivity" = roc_test$sensitivities,
                        "specificity" = roc_test$specificities,
                       "set" = "test")


all_roc <- do.call("rbind",list(train_roc, validate_roc, test_roc)) %>%
    mutate(FPR = 1-(as.numeric(specificity)/100),
          TPR = as.numeric(sensitivity)/100)

##------------------------------------
## PLOT
##------------------------------------

WIDTH = 2
HEIGHT = 2

pdf(file = paste0("./plots/KD-MISC_MODEL_AUC.pdf"),
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)

all_roc %>% 
    mutate(set = factor(set,levels = c("train","validate","test"))) %>% 
    ggplot(aes(x= FPR, y= TPR, group = set, linetype = set, color = set))+
    geom_path()+
    theme_prevail()+
    theme(aspect.ratio = 1,
        axis.title.x = element_text(size = 6),
        axis.text.x = element_text(size = 6),
        axis.title.y = element_text(size = 6), 
        axis.text.y = element_text(size = 6)
        ) +
    scale_linetype_manual(values = c("train"="dashed","validate"="solid","test"="solid"))+
    scale_color_manual(values = c("train"="blue","validate"="red","test"="dark green"))
dev.off()


print(roc_train$auc)
print(roc_validate$auc)
print(roc_test$auc)



“ci.auc() of a ROC curve with AUC == 1 is always 1-1 and can be misleading.”


Area under the curve: 100%
Area under the curve: 98.25%
Area under the curve: 98.22%


#### Violin plot

In [33]:
## MAKE PLOT 
plt <- mdf %>% 
    ggplot(aes(x=group, y=classifier_score, color = group, fill = group))+
    geom_hline(yintercept = youden_threshold, linetype = "dashed", alpha=0.75)+
    geom_violin(color = NA, alpha = 0.3) +
    geom_point(size = 0.5, position = position_jitter(width= 0.2, height = 0))+
    facet_grid(.~set,scale = "free_x", space = "free_x")+
    theme_prevail() +
    theme(legend.position = "none",
            legend.box = "vertical",
            legend.text = element_text(size = 4),
            legend.title = element_blank(),
            strip.background = element_rect(color="black", fill="white", size=.75, linetype="solid"),
            axis.title.x = element_blank(),
            axis.text.x = element_text(size = 6),
            axis.title.y = element_text(size = 6), 
            axis.text.y = element_text(size = 6),
            plot.title = element_blank(),
            strip.text = element_text(size = 8)) +
    labs(title = alg,y="Classifier Score") +
    guides(colour = guide_legend(nrow = 2)) + 
    # theme(axis.text.x=element_text(angle = 45, hjust = 1, vjust= 1),
    #         plot.title = element_blank())+
    scale_fill_manual(values = INFLAMCAT_FILL_KEY)+
    scale_color_manual(values = INFLAMCAT_FILL_KEY)


## Save
HEIGHT = 1.9
WIDTH = 3

pdf(file = paste0("./plots/KD-MISC_",alg,"_violin.comp.pdf"),
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)
print(plt)
dev.off()

#### Gene selection

In [17]:
##########################################
### get genes

coefs <-  coef(mod_data$model$finalModel, mod_data$model$bestTune$lambda) %>% as.matrix() %>% as.data.frame()
ngs <- coefs %>% filter(s1 != 0) %>% rownames()
gene_sub <- ngs[ngs != "(Intercept)"]
gene_sub %>% length()

gene_name_key %>% filter(gene_id %in% gene_sub) %>% pull(gene_name) %>% paste(collapse=", ") %>% cat()

GENE_PANEL = gene_sub

TXNIP, ASH1L, YPEL5, PCBP1, STK17B, TRAK2, PTMA, RNA5SP149, AFF1, FHDC1, ANXA6, FKBP5, HSP90AB1, SYNE1, RBM33, VIM, PDCD4, CD44, RESF1, DAZAP2, COTL1, PFN1, EEF2, TRIR, FTL

In [19]:
gene_name_key %>% filter(gene_id %in% gene_sub) %>% write.csv("./data/Supplementary_KDMISC-genePanel.csv",row.names=F,quote=F)

In [20]:
##########################################
### Functions

GLMNETLasso_TRAIN_CUSTOMLAMBDA <- function(counts,meta_data,LAMBDA){
    
    set.seed(42)
    
    metric <- "Accuracy"

    control <- trainControl(method="cv",number=5)

    tunegrid <- expand.grid( #alpha = 1,
                            alpha = 1,
                           lambda = LAMBDA)
    
    counts$y <- factor(meta_data$inflam_cat)

    glmnet_fit <- train(y~., 
                    data=counts, 
                    metric = metric,
                    method='glmnet',
                    preProcess = c("center","scale"),
                    trControl=control,
                    tuneGrid = tunegrid)
    
    return(glmnet_fit)
}

##-------------------------------------
# Set variables and load in data
##-------------------------------------

mod <- mod_data$model

moddf <- mod$results
best <- mod$bestTune


##-------------------------------------
# Get features per lambda
##-------------------------------------

MIN_LAMBDA <- -round(log(min(moddf$lambda)))

output = c()

# for (LAMBDA in moddf$lambda){
# for (LAMBDA in moddf$lambda[seq(1,length(moddf$lambda),100)]){
for (LAMBDA in -seq(from = 0, to = MIN_LAMBDA, length.out = 9) ){
    
    LAMBDA = exp(LAMBDA)

    TYPE = "prob"
    MODEL_FIT <- GLMNETLasso_TRAIN_CUSTOMLAMBDA(data.frame(t(counts_train_vst[GENE_PANEL,])),meta_data_train,LAMBDA)
    
    coefs <-  coef(MODEL_FIT$finalModel, MODEL_FIT$bestTune$lambda) %>% as.matrix() %>% as.data.frame()
    ngs <- coefs %>% filter(s1 != 0) %>% rownames()
    gene_sub <- ngs[ngs != "(Intercept)"]
    
    cnt <- length(gene_sub)
     
    output <- c(output,cnt)
    }


##-------------------------------------
# Plot
##-------------------------------------

WIDTH = 1.85
HEIGHT = 1.85

pdf(file = paste0("./plots/KD-MISC_",alg,"_lambda-accuracy.pdf"),
        width=WIDTH,height=HEIGHT, paper="special", bg="white",
        fonts="Helvetica", colormodel = "srgb", pointsize=6, useDingbats = FALSE)

# moddf[seq(1,10000,length.out = 100),] %>% 
moddf %>% 
    ggplot(aes(x=log(lambda), y = Accuracy))+
#     geom_errorbar(aes(ymin = Accuracy - AccuracySD, ymax = Accuracy + AccuracySD),width=.1, size = 0.1)+
    geom_point(size = 0.15, color = "black")+
    geom_smooth(method = "loess",span= 0.1,size = 0.25) + 
    geom_vline(xintercept=log(best$lambda), color = "red", linetype = "dashed")+
    theme_prevail() +
    theme(legend.position = "none",
          strip.background = element_rect(
     color="black", fill="white", size=.75, linetype="solid"
     ),
        axis.text.x = element_text(size = 6),
        axis.text.y = element_text(size = 6),
        axis.title.x = element_text(size = 6),
        axis.title.y = element_text(size = 6),
#         plot.title = element_text(hjust = 0.5, size = 6),
        plot.title = element_blank(),
         strip.text = element_text(size = 8)) +
    labs(x = "log(lambda)",y="Accuracy") + theme(aspect.ratio=1) +
    scale_x_continuous(sec.axis = dup_axis(breaks = -seq(0,12,1.5), 
                                          labels = output,
                                          name = "genes selected"))

dev.off()

[1m[22m`geom_smooth()` using formula = 'y ~ x'


---
## UCSD only

In [51]:
mod_df <- read.csv("./data/glmnet_lasso_predictions.csv") %>% 
    # filter(set != "Train") %>%
     filter(grepl("cfrna_kd",cfrna_id))


tmp = confusionMatrix( factor(mod_df$classifier_pred), factor(mod_df$group))
tmp

Confusion Matrix and Statistics

          Reference
Prediction KD MISC
      KD   93    2
      MISC  5   41
                                          
               Accuracy : 0.9504          
                 95% CI : (0.9004, 0.9798)
    No Information Rate : 0.695           
    P-Value [Acc > NIR] : 3.537e-14       
                                          
                  Kappa : 0.8851          
                                          
 Mcnemar's Test P-Value : 0.4497          
                                          
            Sensitivity : 0.9490          
            Specificity : 0.9535          
         Pos Pred Value : 0.9789          
         Neg Pred Value : 0.8913          
             Prevalence : 0.6950          
         Detection Rate : 0.6596          
   Detection Prevalence : 0.6738          
      Balanced Accuracy : 0.9512          
                                          
       'Positive' Class : KD              
                              

### panel with directions

In [13]:
desq = read.csv("./output/Supplementary-File3_pedInflam.KDvMISC-DESEQ.csv")
panel = read.csv("./data/Supplementary_KDMISC-genePanel.csv")

panel <- merge(panel, desq, by.x="gene_id", by.y="X", all.x=TRUE) %>% select(gene_id, gene_name, gene_type, log2FoldChange) %>% 
    mutate(elevated_in = ifelse(log2FoldChange < 0, "KD", "MIS-C"))
panel %>% write.csv("./data/Supplementary_KDMISC-genePanel-DIR.csv",row.names=FALSE, quote=FALSE)