## PAGE analysis

This is a Jupyter notebook.

To run all cells in the notebook use `Cell --> Run All`.

To run cells one at a time, click into the first code cell and press `Shift-Enter` in each cell in sequence.

More information on Jupyter notebooks can be found
[here](http://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Running%20Code.html).

In [1]:
## Plot window size used for figures rendered in the notebook (inches)
options(list(
  repr.plot.width = 6,
  repr.plot.height = 8
))

## Please comment or uncomment the following declarations as appropriate for your run of this notebook:

In [2]:
## Default values for the optional run parameters
opt <- list(
  PEP        = 'CMT_peps.csv',  ## PEP lists file (generated by the PEP creation script/notebook)
  outdir     = './results',     ## Output directory (relative to the working directory)
  datadir    = './data',        ## Directory containing data provided with FREYA
  workingdir = './user_data'    ## Directory containing user data
)

## The output directory lives underneath the working directory
opt$outdir <- paste(opt$workingdir, opt$outdir, sep = '/')

In [3]:
## If the output directory doesn't exist, create it (together with any missing
## parent directories). dir.create() is used rather than shelling out to
## `mkdir -p` so that this works on every platform and with paths that contain
## spaces or shell metacharacters.
if (!dir.exists(opt$outdir)) {
  print(paste('Creating output directory',opt$outdir))
  if (!dir.create(opt$outdir, recursive = TRUE)) {
    ## Fail here instead of at the first write into the missing directory
    stop('Could not create output directory ', opt$outdir, call. = FALSE)
  }
}

### Functions 

In [4]:
get.labs <- function(comps) {
  ## Load the expression data and the sample labels needed for one comparison.
  ##
  ## Args:
  ##   comps: character vector naming the groups being compared. If it
  ##          contains 'Luminal', the PAM50 labels 'LumA' and 'LumB' are merged
  ##          into 'Luminal'; if it contains 'TUMOR', the tumor-vs-normal label
  ##          file is used instead of the PAM50 labels; otherwise the PAM50
  ##          labels are filtered to the values in comps.
  ##
  ## Returns:
  ##   An unnamed list of two elements:
  ##     [[1]] matrix of log2(x + 1) values, rows restricted to the labelled
  ##           samples and columns restricted to the symbols listed in the
  ##           Hum_Symb column of the gene conversion table. It is always a
  ##           matrix, even when only one row or one column remains.
  ##     [[2]] the labels for the same samples, in the same row order
  ##           (a plain vector when the label table has a single column).
  ##
  ## All files are read from opt$datadir (the notebook-level `opt` list).

  ## Small helper so every input file is resolved the same way
  read.data <- function(fn, ...) {
    read.table(paste(opt$datadir, fn, sep='/'), ...)
  }

  ## Expression table: log2(x + 1), row names truncated to 15 characters so
  ## that they can be matched against the row names of the label tables
  dat <- read.data('BRCA_rnaseq_paired_noMets.t.txt', sep='\t', header=TRUE, row.names=1, check.names=FALSE)
  dat <- log(as.matrix(dat) + 1, base=2)
  rownames(dat) <- substr(rownames(dat), 1, 15)

  labs <- read.data('BRCA_PAM50_labels.csv', sep=',', header=TRUE, row.names=1, check.names=FALSE, stringsAsFactors=FALSE)

  ## Pick / reshape the label table for this comparison
  if ('Luminal' %in% comps) {
    ## Merge both luminal subtypes into a single 'Luminal' class
    labs[which(labs$PAM50 %in% c('LumA', 'LumB')), 'PAM50'] <- 'Luminal'
    labs <- labs[which(labs$PAM50 %in% comps), , drop=FALSE]
  } else if ('TUMOR' %in% comps) {
    labs <- read.data('BRCA_tumorVSnormal_paired.txt', header=FALSE, row.names=1, check.names=FALSE)
    rownames(labs) <- substr(rownames(labs), 1, 15)
  } else {
    labs <- labs[labs[, 1] %in% comps, , drop=FALSE]
  }

  ## Keep only samples present in both tables. drop=FALSE keeps dat a matrix
  ## when a single sample is left; without it the column subsetting below
  ## fails with "incorrect number of dimensions".
  ids <- intersect(rownames(labs), rownames(dat))
  dat <- dat[ids, , drop=FALSE]
  labs <- labs[ids, ]

  ## Keep only the columns whose names appear in the conversion table
  genes.all <- read.data('Canine_Human_Gene_Conversion.txt', sep='\t', header=TRUE, stringsAsFactors=FALSE) # TODO: Update filename & location
  dat <- dat[, colnames(dat) %in% genes.all$Hum_Symb, drop=FALSE]

  list(dat, labs)
}

In [5]:
load.peps <- function(fn) {
  ## Load the PEP table and turn it into a gene -> PEP lookup.
  ##
  ## Args:
  ##   fn: path to a comma-separated file with a header row containing the
  ##       columns HumanSymbol, Adenoma_Expression_Pattern,
  ##       Carcinoma_Expression_Pattern and Tumor_Expression_Pattern.
  ##
  ## Returns:
  ##   A data.frame with character columns Gene and PEP, one row per gene whose
  ##   value in at least one of the three pattern columns is below 0.05.
  ##   A gene that passes in several patterns is labelled with the last one in
  ##   the order Adenoma, Carcinoma, Tumor (so Tumor wins over Carcinoma, which
  ##   wins over Adenoma).
  ##
  ## Each gene appears exactly once and rows with a missing pattern value or a
  ##   missing symbol are skipped. Plain logical indexing would instead emit an
  ##   NA gene row for every NA pattern value and repeat a gene once per
  ##   pattern it passes, which inflates the PEP sizes computed from this
  ##   table.
  peps <- read.table(fn, sep=',', header=TRUE, stringsAsFactors=FALSE)

  ## Listed in labelling order: later entries overwrite earlier ones
  pattern.cols <- c(
    Adenoma   = 'Adenoma_Expression_Pattern',
    Carcinoma = 'Carcinoma_Expression_Pattern',
    Tumor     = 'Tumor_Expression_Pattern'
  )

  ## Genes below the cutoff for each pattern; which() discards NA comparisons
  sig.genes <- lapply(pattern.cols, function(col) {
    hits <- peps[which(peps[[col]] < 0.05), 'HumanSymbol']
    unique(hits[!is.na(hits)])
  })

  ## Row order follows first appearance in Adenoma, Tumor, Carcinoma
  pep.genes <- as.character(unique(c(sig.genes$Adenoma, sig.genes$Tumor, sig.genes$Carcinoma)))
  pep.lists <- data.frame(Gene=pep.genes, PEP=rep(NA_character_, length(pep.genes)), stringsAsFactors=FALSE)
  for (pep.name in names(sig.genes)) {
    pep.lists[pep.lists$Gene %in% sig.genes[[pep.name]], 'PEP'] <- pep.name
  }

  pep.lists
}

### Code to run

In [6]:
## Read the PEP lists and the expression table - TODO do I need to load anything aside from the peps?
print("Loading PEPs.")
peps <- load.peps(paste0(opt$workingdir, '/', opt$PEP))

## Expression values are stored as log2(x + 1)
dat <- log(
  as.matrix(
    read.table(
      paste0(opt$datadir, '/', 'BRCA_rnaseq_paired_noMets.t.txt'),
      sep='\t', header=TRUE, row.names=1, check.names=FALSE
    )
  ) + 1,
  base=2
)

## Keep only the first 15 characters of each row name
rownames(dat) <- substr(rownames(dat), 1, 15)

[1] "Loading PEPs."


In [7]:
## Read the PAM50 label table; its first column becomes the row names
print('Loading BRCA labels and data.')
labs <- read.table(
  paste0(opt$datadir, '/', 'BRCA_PAM50_labels.csv'),
  sep=',',
  header=TRUE,
  row.names=1,
  check.names=FALSE,
  stringsAsFactors=FALSE
)

[1] "Loading BRCA labels and data."


In [8]:
## Subset to shared samples.
## drop=FALSE keeps dat a matrix even if only one sample is shared, matching
## how labs is subset on the next line.
ids <- intersect( rownames(labs), rownames(dat) )
dat <- dat[ids,,drop=FALSE]
labs <- labs[ids,,drop=FALSE]

## Subset to homologous genes: keep the columns whose names are listed in the
## Hum_Symb column of the conversion table. drop=FALSE again guards the
## single-column case so that colnames(dat) keeps working afterwards.
print("Subsetting to homologous genes.")
genes.all <- read.table(paste(opt$datadir,'Canine_Human_Gene_Conversion.txt',sep='/'), sep='\t', header=TRUE, stringsAsFactors=FALSE) # TODO: Update filename & location
dat <- dat[,colnames(dat) %in% genes.all$Hum_Symb,drop=FALSE]

[1] "Subsetting to homologous genes."


In [9]:
## Values reused by every PAGE calculation below: sd() of the expression
## matrix and the names of the columns that are not in any PEP
sd.genes <- sd(dat)
non.pep.genes <- colnames(dat)[ is.na(match(colnames(dat), peps$Gene)) ]

## Pairs of groups to compare, each given as c(first group, second group)
comparisons <- list(
  c('TUMOR', 'NORMAL'),
  c('Basal', 'Luminal'),
  c('LumA', 'Normal'),
  c('LumB', 'Normal'),
  c('Luminal', 'Normal'),
  c('Basal', 'Normal')
)

In [None]:
## For each comparison & PEP pair, calculate PAGE score.
## Results go to PAGE.txt in the output directory: a title line, then for every
## comparison a blank line, the two group names, and one "<PEP>\t<p-value>"
## line per PEP.
page.file <- paste(opt$outdir,'PAGE.txt', sep='/')
write('PAGE scores',file=page.file)
print("Calculating PAGE score for each PEP/comparison pair.")
for( comps in comparisons ) {

  ## Print the comparison we're currently doing
  write('', file=page.file, append=TRUE)
  write(paste(comps[1], comps[2]), file=page.file, append=TRUE)
  print(comps)

  ## Load the applicable data and labels for this comparison. get.labs() only
  ## depends on comps, so it is called once per comparison rather than once
  ## per PEP (it re-reads all input files on every call).
  dat.list <- get.labs(comps)
  dat <- dat.list[[1]]
  labs <- dat.list[[2]]

  ## Row masks for the two groups being compared
  in.first <- labs==comps[1]
  in.second <- labs==comps[2]

  ## For each PEP, calculate PAGE
  for( pep.name in c('Adenoma', 'Carcinoma', 'Tumor') ) {

    ## Genes in the current PEP, and the scaling factor sqrt(PEP size)
    is.pep <- peps$PEP==pep.name
    pep.genes <- as.character(peps[ is.pep, 'Gene' ])
    pep.scale <- sqrt(sum(is.pep))

    ## Score the PEP genes together with all non-PEP genes, restricted to the
    ## genes available in this comparison's data
    genes <- c(non.pep.genes, pep.genes)
    genes <- genes[ genes %in% colnames(dat) ]

    ## Per-gene score: difference of group means, scaled by sqrt(PEP size)
    ## and divided by sd.genes
    res <- vapply(genes, function(gene) {
      ( (mean( dat[in.first,gene] ) - mean( dat[in.second,gene] ) ) * pep.scale ) / sd.genes
    }, numeric(1))
    temp <- data.frame( PAGE=res, PEP=names(res) %in% pep.genes )

    ## Wilcoxon test of PEP vs non-PEP scores; run once and reuse the p-value
    ## for both the file and the console
    p.value <- signif(wilcox.test( PAGE ~ PEP, data=temp )$p.value,digits=3)
    write(paste(pep.name,p.value, sep='\t'), file=page.file, append=TRUE)
    print( paste(pep.name,p.value) )

  } # End PEPs loop
} # End comps loop

In [11]:
## Report where the PAGE results were written
print(
  paste('Results stored in', paste0(opt$outdir, '/', 'PAGE.txt'))
)
print('Success!')

[1] "Results stored in ./output/PAGE.txt"
[1] "Success!"
