## PEPs and Simulations
This is a Jupyter notebook.

To run all cells in the notebook use `Cell --> Run All`.

To run cells one at a time, click into the first code cell and press `Shift-Enter` in each cell in sequence.

More information on Jupyter notebooks can be found
[here](http://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Running%20Code.html).

In [None]:
## Configure the size (in inches) of plots rendered inline in the notebook
options(
  repr.plot.width = 6,
  repr.plot.height = 3
)

## Set up the run options (defaults standing in for command line arguments)

In [None]:
## Default run options, collected in a single named list.
##   iterations    - number of simulation iterations to run
##   datadir       - user-mounted data directory
##   countdir      - directory (inside datadir) containing counts files
##   outdir        - directory to store results
##   samplesCanine - phenotype CSV file (inside datadir)
opt <- list(
  iterations    = 300,
  datadir       = './user_data',
  countdir      = 'dexseq_count',
  outdir        = './output',
  samplesCanine = 'samples_canine_updated.csv'
)

#### Housekeeping

In [None]:
## Prefix the count directory and the phenotype file name with the data
## directory so both become paths relative to the notebook's location.
opt$countdir <- paste0(opt$datadir, '/', opt$countdir)
opt$samplesCanine <- paste0(opt$datadir, '/', opt$samplesCanine)

In [None]:
## If the output directory doesn't exist, create it (including missing parents).
## dir.create() is used rather than shelling out to `mkdir -p` so that this
## works on every platform and with paths containing spaces or shell
## metacharacters.
if (!dir.exists(opt$outdir)) {
  print(paste('Creating output directory', opt$outdir))
  created <- dir.create(opt$outdir, recursive = TRUE, showWarnings = FALSE)
  # Fail early with a clear message instead of failing later at write time
  if (!created && !dir.exists(opt$outdir)) {
    stop('Could not create output directory ', opt$outdir, call. = FALSE)
  }
}

## Load libraries
library(ggplot2)

### Functions for the script


In [None]:
## Given a patient number, return 1 randomly chosen sample ID of each
## histology type.
##
## Arguments:
##   pat.num - patient number, matched against dat$PatientNumber
##   dat     - phenotype data frame with PatientNumber, Hist and Qlabel
##             columns; defaults to the global dat.hist
## Returns:
##   list(N=, B=, M=) holding one Qlabel per histology type
## Errors:
##   stops if the patient has no sample for one of the histology types
pick.3.samples <- function(pat.num, dat = dat.hist) {
  # Pick by index: sample(x, 1) on a length-one numeric x would draw from 1:x
  # instead of returning x itself.
  pick.one <- function(candidates, hist) {
    if (length(candidates) == 0) {
      stop('No ', hist, ' sample available for patient ', pat.num, call. = FALSE)
    }
    candidates[sample.int(length(candidates), 1)]
  }

  dat.pat <- dat[dat$PatientNumber %in% pat.num, ]
  # which() drops rows whose Hist is NA rather than turning them into NA IDs
  labels.for <- function(hist) dat.pat[which(dat.pat$Hist == hist), 'Qlabel']

  list(
    N = pick.one(labels.for('N'), 'N'),
    B = pick.one(labels.for('B'), 'B'),
    M = pick.one(labels.for('M'), 'M')
  )
}

In [None]:
## Given a list of patient numbers, return 2 sample IDs for each patient.
##    The patients are shuffled and split into two disjoint halves: the first
##    half contributes 1 N and 1 B histology sample each, the second half
##    (which gets the extra patient when the count is odd) contributes 1 N and
##    1 M histology sample each.
##
## Arguments:
##   pat.num - vector of patient numbers, matched against dat$PatientNumber
##   dat     - phenotype data frame with PatientNumber, Hist and Qlabel
##             columns; defaults to the global dat.hist
## Returns:
##   flat vector of 2 * length(pat.num) Qlabel values
## Errors:
##   stops if a patient lacks a sample of a required histology type
pick.2.samples <- function(pat.num, dat = dat.hist) {

  # Pick by index: sample(x, 1) on a length-one numeric x would draw from 1:x
  pick.one <- function(candidates, hist, pat) {
    if (length(candidates) == 0) {
      stop('No ', hist, ' sample available for patient ', pat, call. = FALSE)
    }
    candidates[sample.int(length(candidates), 1)]
  }

  # One normal sample plus one sample of the requested tumor histology
  pick.pair <- function(pat, tumor.hist) {
    dat.pat <- dat[dat$PatientNumber %in% pat, ]
    list(
      pick.one(dat.pat[which(dat.pat$Hist == 'N'), 'Qlabel'], 'N', pat),
      pick.one(dat.pat[which(dat.pat$Hist == tumor.hist), 'Qlabel'], tumor.hist, pat)
    )
  }

  # Randomly sort the patient list since we use all patients every time
  n.pat <- length(pat.num)
  pat.num <- pat.num[sample.int(n.pat)]

  # Split into two non-overlapping groups. The second group previously began
  # at floor(n/2), which put one patient in both groups.
  n.nb <- floor(n.pat / 2)
  pat.num.nb <- head(pat.num, n.nb)
  pat.num.nm <- tail(pat.num, n.pat - n.nb)

  s.nb <- lapply(pat.num.nb, pick.pair, tumor.hist = 'B')
  s.nm <- lapply(pat.num.nm, pick.pair, tumor.hist = 'M')

  unlist(c(s.nb, s.nm))
}

### Begin analysis

In [None]:
## Bring in the human gene mapping data and the PEP-generation code.
##   NOTE: loading the mapping file takes some time
load(paste0(opt$datadir, '/', 'humanmapping.rda')) ## TODO
source('run_genPEPs.R')

In [None]:
## Load phenotype data for the dogs
## Make sure the required phenotype columns are in the data, stop if any are
## missing. PatientNumber is checked as well because the sampling functions
## and both simulations select rows by it.
print('Loading phenotype data...')
dat.hist <- read.csv(opt$samplesCanine)
missing.cols <- setdiff(c('Qlabel', 'Hist', 'PatientNumber'), colnames(dat.hist))
if (length(missing.cols) == 0) {
  print(paste('Success,', nrow(dat.hist), 'dog samples loaded.'))
} else {
  # stop() halts the current cell (or a non-interactive script) without
  # killing the notebook kernel the way quit() does.
  stop(
    'ERROR: required column(s) missing from the phenotype data: ',
    paste(missing.cols, collapse = ', '),
    call. = FALSE
  )
}

#### Create the 'true' PEP lists using the full data

In [None]:
## Build the reference PEP table from every sample, then derive one gene list
## per expression pattern and write the full table to the output directory.
print('Calculating PEPs.')
peps.real <- gen_PEPs(dat.hist, opt$countdir)
qlim <- 0.05 # Cutoff: genes whose pattern value is below this join the PEP list
# NOTE(review): rows whose pattern value is NA yield NA entries in these lists
peps <- list(
  Adenoma   = peps.real[peps.real$Adenoma_Expression_Pattern < qlim, 'HumanSymbol'],
  Carcinoma = peps.real[peps.real$Carcinoma_Expression_Pattern < qlim, 'HumanSymbol'],
  Tumor     = peps.real[peps.real$Tumor_Expression_Pattern < qlim, 'HumanSymbol']
)
print('Full PEP lengths:')
pep.lengths <- sapply(peps, length) # Number of genes in each PEP
print(pep.lengths)
write.table(
  peps.real,
  file = paste(opt$outdir, 'CMT_PEPs.csv', sep = '/'),
  sep = ',',
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

#### Run simulations using subsets of the data -- 2 versions of simulations will be run

In [None]:
## First approach: a total of 30 samples from 10 patients are selected at
## random, one normal (N), one adenoma (B) and one carcinoma (M) sample per
## patient. Each row of peps.cor.3hist holds the Spearman correlations between
## the subset PEPs and the full-data PEPs for one iteration.
## NOTE(review): patients are drawn from all of dat.hist; pick.3.samples will
## fail for a patient lacking one of the three histology types.
print('Running PEP simulations version 1.')
n.iters <- opt$iterations
print(paste('Running',n.iters,'iterations for each version.'))
peps.cor.3hist <- matrix(NA_real_, nrow=n.iters, ncol=3)
colnames(peps.cor.3hist) <- c('Tumor_Expression_Pattern','Carcinoma_Expression_Pattern','Adenoma_Expression_Pattern')

# seq_len() so that zero iterations runs nothing (1:0 would run twice)
for (i in seq_len(n.iters)) {
  ## Pick 10 random patients and 30 random samples (1 of each N,B,M from each patient)
  ids <- sort(sample(unique(dat.hist$PatientNumber), 10, replace=FALSE))
  s.ids <- unlist(sapply(ids, pick.3.samples))

  ## Generate the PEPs again
  peps.new <- gen_PEPs(dat.hist[dat.hist$Qlabel %in% s.ids,], opt$countdir)

  ## Calculate Spearman correlation
  ## Rows are looked up per gene with match() so both vectors are in the same
  ## gene order; two independent %in% masks do not guarantee that.
  ids.genes <- intersect( peps.new$EnsGene, peps.real$EnsGene )
  rows.new  <- match(ids.genes, peps.new$EnsGene)
  rows.real <- match(ids.genes, peps.real$EnsGene)
  peps.cor.3hist[i,] <- vapply(colnames(peps.cor.3hist), function(x) {
    cor(peps.new[rows.new,x], peps.real[rows.real,x], method='spearman', use='complete.obs')
  }, numeric(1))
  cat('.')
}
print('done!')

In [None]:
# Second approach: simulates cases where only two histologic categories can be gathered from a patient (as is typical in normal versus disease studies).
# One normal and one adenoma sample are used from a randomly selected half of the patients and
#   one normal and one carcinoma sample from each of the remaining patients (32 samples per simulation with 16 patients).
# Each row of peps.cor.2hist holds the Spearman correlations between the subset
# PEPs and the full-data PEPs for one iteration.
print('Running PEP simulations version 2.')
peps.cor.2hist <- matrix(NA_real_, nrow=n.iters, ncol=3)
colnames(peps.cor.2hist) <- c('Tumor_Expression_Pattern','Carcinoma_Expression_Pattern','Adenoma_Expression_Pattern')
# seq_len() so that zero iterations runs nothing (1:0 would run twice)
for (i in seq_len(n.iters)) {
  ## Use every patient; pick.2.samples shuffles them and draws 2 samples from each
  ids <- unique(dat.hist$PatientNumber)
  s.ids <- pick.2.samples(ids)

  ## Generate the PEPs again
  peps.new <- gen_PEPs(dat.hist[dat.hist$Qlabel %in% s.ids,], opt$countdir)

  ## Calculate Spearman correlation
  ## Rows are looked up per gene with match() so both vectors are in the same
  ## gene order; two independent %in% masks do not guarantee that.
  ids.genes <- intersect( peps.new$EnsGene, peps.real$EnsGene )
  rows.new  <- match(ids.genes, peps.new$EnsGene)
  rows.real <- match(ids.genes, peps.real$EnsGene)
  peps.cor.2hist[i,] <- vapply(colnames(peps.cor.2hist), function(x) {
    cor(peps.new[rows.new,x], peps.real[rows.real,x], method='spearman', use='complete.obs')
  }, numeric(1))
  cat('.')
}
print('done!')

#### Store the results from simulations

In [None]:
## Persist the raw correlation matrices before they are reshaped below
save(peps.cor.2hist, peps.cor.3hist, file=paste(opt$outdir,'Hist_Sims.RData',sep='/'))

## Convert to data frames and tag each row with the simulation it came from
peps.cor.2hist <- as.data.frame(peps.cor.2hist)
peps.cor.3hist <- as.data.frame(peps.cor.3hist)
peps.cor.2hist$Hist <- rep('Hist2', nrow(peps.cor.2hist))
peps.cor.3hist$Hist <- rep('Hist3', nrow(peps.cor.3hist))

# Wilcoxon rank-sum test of comparable number of samples.
# A single test pools the three correlation columns of each simulation.
# The former `na.ignore=T` argument was dropped: wilcox.test has no such
# parameter, so it was silently swallowed by `...` and had no effect.
print("Calculating Wilcoxon rank-sum tests on each pep.")
print(paste(
  'Wilcox rank-sum test across all PEPs:',
  wilcox.test(unlist(peps.cor.2hist[,1:3]), unlist(peps.cor.3hist[,1:3]))$p.value
))
flush.console()

## Generate plots with results

In [None]:
## Manuscript color palettes: the full palette, the colors reserved for
## histology and PEP categories, and the remaining general-purpose colors.
print('Setting color palettes.')
cols.palette <- c(
  '#9DC7D8', '#7FA1BE', '#EBDA8C', '#01B3CA',
  '#4F6E35', '#965354', '#7DD1B9', '#808040',
  '#C6CBCE', '#1D4E88', '#C78C6C', '#F69256',
  '#D2B29E', '#8B868C', '#E38691', '#B490B2'
)
cols.hist <- c('#7DD1B9', '#EBDA8C', '#965354') # order = healthy, benign, malignant
cols.peps <- c('#7FA1BE', '#F69256', '#E38691') # order = tumor, adenoma, carcinoma
# Don't reuse the pep/histology colors
cols <- Filter(function(col) !(col %in% c(cols.hist, cols.peps)), cols.palette)

In [None]:
## Boxplot of the per-iteration Spearman correlations for both simulations,
## shown in the notebook and saved as a PDF in the output directory.
print('Generating simulations correlation plot.')

## Reshape to long format (Hist, variable, value) using base R only: melt()
## comes from reshape/reshape2, which this notebook never loads itself.
combined <- rbind(peps.cor.2hist, peps.cor.3hist)
pep.cols <- setdiff(colnames(combined), 'Hist')
tmp <- data.frame(
  Hist     = rep(combined$Hist, times = length(pep.cols)),
  variable = factor(rep(pep.cols, each = nrow(combined)), levels = pep.cols),
  value    = unlist(combined[pep.cols], use.names = FALSE),
  stringsAsFactors = FALSE
)

## Two fill colors are drawn at random from the unreserved palette colors
pep.sim.plot <- ggplot(tmp) +
  geom_boxplot(aes(x=variable, y=value, fill=Hist)) +
  coord_flip() +
  scale_fill_manual(values=sample(cols,2)) +
  ylab('Spearman Correlation to true PEPs') +
  xlab('') +
  theme_bw(base_size = 18)
print(pep.sim.plot)

## Pass the plot explicitly rather than relying on ggplot2's last_plot()
ggsave(paste(opt$outdir,'PEP_hist_simulations.pdf',sep='/'), plot=pep.sim.plot, width=8, height=4)

print('Finished! Success!')