# Assess the correlation between prediction and tumor mutational burden (EBV vs MSI vs other)

In [None]:
library(openxlsx)

# Read the prediction table from the workbook and treat the annotated
# molecular subtype as a categorical variable.
df <- read.xlsx(
  "../results/prediction/training4/tcga.xlsx",
  sheet = "with aug with tcga"
)
df$Molecular.Subtype <- factor(df$Molecular.Subtype)

# For every row, pick whichever of the EBV / MSI / other columns holds the
# largest value (presumably per-class prediction scores -- confirm against the
# workbook) and store that class name as the predicted label.
df2 <- data.frame(df$EBV, df$MSI, df$other)
class_names <- c("EBV", "MSI", "other")
predicted_label3 <- apply(df2, 1, which.max)
predicted_label3 <- factor(class_names[predicted_label3])
df$predicted_label3 <- predicted_label3
print("*** all ***")
summary(df)

# One data frame per predicted class; the later cells rely on these names.
dfebv <- subset(df, predicted_label3 == "EBV")
dfmsi <- subset(df, predicted_label3 == "MSI")
dfother <- subset(df, predicted_label3 == "other")

print("*** prediction: ebv ***")
summary(dfebv)
print("*** prediction: msi ***")
summary(dfmsi)
print("*** prediction: other ***")
summary(dfother)

## TMB and prediction

In [None]:
# Tumor mutational burden of each predicted class, in plotting order.
tmb_by_prediction <- list(
  dfebv$Tumor.Mutational.Burden,
  dfmsi$Tumor.Mutational.Burden,
  dfother$Tumor.Mutational.Burden
)
prediction_names <- c("EBV", "MSI", "other")

# Draw once on the current device (shown inline in the notebook) ...
boxplot(tmb_by_prediction, names = prediction_names)

# ... and once more into a 5 x 5 inch, 500 dpi TIFF file.
tiff(
  filename = "../results/tmb/prediction.tiff",
  width = 5, height = 5, units = "in", res = 500
)
boxplot(tmb_by_prediction, names = prediction_names)
dev.off()

In [None]:
# F tests comparing the variance of tumor mutational burden in the
# predicted-MSI group with the predicted-EBV group and with the
# predicted-other group.
# The argument expressions are left inline on purpose: the printed "data:"
# line of the test result is built from them.
var.test(dfebv$Tumor.Mutational.Burden, dfmsi$Tumor.Mutational.Burden, conf.level=0.95)
var.test(dfother$Tumor.Mutational.Burden, dfmsi$Tumor.Mutational.Burden, conf.level=0.95)

In [None]:
# Two-sample t tests of tumor mutational burden: predicted EBV vs predicted
# MSI, and predicted other vs predicted MSI.
# var.equal = FALSE selects the Welch test, which does not assume equal
# variances in the two groups.
t.test(dfebv$Tumor.Mutational.Burden, dfmsi$Tumor.Mutational.Burden, var.equal = FALSE)
t.test(dfother$Tumor.Mutational.Burden, dfmsi$Tumor.Mutational.Burden, var.equal = FALSE)

In [None]:
# Restrict to samples annotated as MSI, then split them by whether the
# predicted label is also MSI (true positive) or something else (false
# negative).
msi_cases <- subset(df, Molecular.Subtype == "MSI")
dfmsi_truepositive <- subset(msi_cases, predicted_label3 == "MSI")
dfmsi_falsenegative <- subset(msi_cases, predicted_label3 != "MSI")

# Box 1 = true positives, box 2 = false negatives.
boxplot(
  dfmsi_truepositive$Tumor.Mutational.Burden,
  dfmsi_falsenegative$Tumor.Mutational.Burden
)

In [None]:
# F test comparing the variance of tumor mutational burden between MSI
# samples predicted as MSI (true positives) and MSI samples predicted as
# another class (false negatives).
var.test(dfmsi_truepositive$Tumor.Mutational.Burden, dfmsi_falsenegative$Tumor.Mutational.Burden,
        conf.level=0.95)

In [None]:
# Two-sample t test of tumor mutational burden between MSI true positives and
# MSI false negatives.
# var.equal = TRUE pools the variance (classic Student test), unlike the other
# t tests in this file, which use var.equal = FALSE.
# NOTE(review): presumably chosen from the outcome of the F test on the same
# two groups -- confirm.
t.test(dfmsi_truepositive$Tumor.Mutational.Burden, dfmsi_falsenegative$Tumor.Mutational.Burden,
        var.equal = TRUE)

## TMB and ground truth

In [None]:
# Tumor mutational burden grouped by the annotated molecular subtype; GS and
# CIN samples are pooled into a single "other" group.
tmb_by_subtype <- list(
  subset(df, Molecular.Subtype == "EBV")$Tumor.Mutational.Burden,
  subset(df, Molecular.Subtype == "MSI")$Tumor.Mutational.Burden,
  subset(df, Molecular.Subtype %in% c("GS", "CIN"))$Tumor.Mutational.Burden
)
subtype_names <- c("EBV", "MSI", "other")

# Draw once on the current device (shown inline in the notebook) ...
boxplot(tmb_by_subtype, names = subtype_names)

# ... and once more into a 5 x 5 inch, 500 dpi TIFF file.
# NOTE(review): "ground_trugh" looks like a typo for "ground_truth"; the file
# name is kept unchanged here because other material may refer to it.
tiff(
  filename = "../results/tmb/ground_trugh.tiff",
  width = 5, height = 5, units = "in", res = 500
)
boxplot(tmb_by_subtype, names = subtype_names)

dev.off()

In [None]:
# F tests comparing the variance of tumor mutational burden between annotated
# subtypes: EBV vs MSI, then MSI vs the pooled GS + CIN samples.
# The subset expressions are left inline on purpose: the printed "data:" line
# of the test result is built from them.
var.test(subset(df, df$Molecular.Subtype == "EBV")$Tumor.Mutational.Burden,
        subset(df, df$Molecular.Subtype == "MSI")$Tumor.Mutational.Burden, conf.level=0.95)
var.test(subset(df, df$Molecular.Subtype == "MSI")$Tumor.Mutational.Burden,
        subset(df, df$Molecular.Subtype == "GS"
        | df$Molecular.Subtype == "CIN")$Tumor.Mutational.Burden, conf.level=0.95)

In [None]:
# Two-sample t tests of tumor mutational burden between annotated subtypes:
# EBV vs MSI, then MSI vs the pooled GS + CIN samples.
# var.equal = FALSE selects the Welch test, which does not assume equal
# variances in the two groups.
t.test(subset(df, df$Molecular.Subtype == "EBV")$Tumor.Mutational.Burden,
        subset(df, df$Molecular.Subtype == "MSI")$Tumor.Mutational.Burden, var.equal = FALSE)
t.test(subset(df, df$Molecular.Subtype == "MSI")$Tumor.Mutational.Burden,
        subset(df, df$Molecular.Subtype == "GS"
        | df$Molecular.Subtype == "CIN")$Tumor.Mutational.Burden, var.equal = FALSE)