## Mutation analysis

This is a Jupyter notebook.

To run all cells in the notebook use `Cell --> Run All`.

To run cells one at a time, click into the first code cell and press `Shift-Enter` in each cell in sequence.

More information on Jupyter notebooks can be found
[here](http://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Running%20Code.html).

In [None]:
## Mutation-count threshold: a gene is treated as frequently mutated only when
## it is mutated in MORE than this many samples (the analysis tests `> opt.num`)
opt.num <- 5

## File arguments
fn.hist <- './user_data/UserDogData_Phenotype.csv' # User-provided
fn.muts <- './user_data/mutations_genesOnly.csv' # Created by the mutations pipeline (see shell scripts)
fn.peps <- './data/CMT_peps.csv'  # PEP lists created in the expression pipeline. This is also Supp Table 1 in the manuscript
fn.pam50 <- NULL # Optional input: NULL skips the PAM50 steps, a filename enables them. TODO: this should point to a synthetic example

## Filenames that don't change between users
data.dir <- './data/' # Directory with the reference files; keep the trailing '/' because it is pasted directly in front of the filenames below. TODO: This will need to match the layout we give the whole pipeline
fn.cosmic       <- paste0(data.dir,'genes_COSMIC.csv') # COSMIC genes list, should download most recent version instead of using included one?
fn.pam50.genes  <- paste0(data.dir,'PAM50_genes.csv') # This list will never change, no need to have as input

## Please comment or uncomment the following declarations as appropriate for your run of this notebook:

In [None]:
# PAM50 file generated by PAM50 processing (uncomment to enable the PAM50 steps)
#fn.pam50 <- './user_data/output/PAM50_dog.csv'

# Synthetic and "canned" data for testing this notebook.
# As shipped, the phenotype and mutation inputs below point at the synthetic
# data; comment these two lines out to analyse your own files instead.
#fn.pam50 <- "./synthetic_data/User_PAM50_SampleData_dog.csv"
fn.hist <- "./synthetic_data/User_SampleData_Phenotype.csv"
fn.muts <- "./synthetic_data//User_SampleData_Mutations.csv"

# Decide whether to run the PAM50 steps only AFTER every possible fn.pam50
# assignment above, so that un-commenting either PAM50 line is honoured.
run.pam50 <- !is.null(fn.pam50)

In [None]:
# Create an output directory for artifacts (if it doesn't already exist).
# dir.create() is used rather than shelling out to `mkdir -p`, so this does not
# depend on a POSIX shell being available; recursive=TRUE also creates the
# parent directory, showWarnings=FALSE keeps it quiet when the directory exists.
dir.create('./user_data/output', recursive=TRUE, showWarnings=FALSE)

# output files
mut.rates.file <- './user_data/output/Sample_Mut_Rates.pdf'
density.plot.file <- './user_data/output/Sample_Mutation_Counts_Density.pdf'
cosmic.mutations.file <- './user_data/output/COSMIC_Genes_Mutations.pdf'
mutations.consistency.file <- './user_data/output/MutationConsistency.pdf'
freq.mutations.file <- './user_data/output/FreqMutatedGenes_ClinicalCorrelations.csv'

In [None]:

## Make sure all of the required files exist - stop with an error naming every missing file.
## stop() halts the notebook without shutting down the R session (quit() would kill the kernel).
## c() drops NULL, so the optional PAM50 file is only checked when fn.pam50 has been set.
required.files <- c(fn.hist, fn.muts, fn.peps, fn.cosmic, fn.pam50.genes, fn.pam50)
missing.files <- required.files[!file.exists(required.files)]
if (length(missing.files) > 0) {
  stop('Unable to locate required file(s): ', paste(missing.files, collapse=', '), call.=FALSE)
}


In [None]:

## Read the clinical/phenotype table; the first CSV column supplies the row names (sample IDs)
dat.hist <- read.table(fn.hist, header = TRUE, sep = ',', row.names = 1)

# TODO: Specific to our dataset only! Remove this once we're done testing (or if we switch 9A1.bam to 9A.bam)
# Rename sample '9A' to '9A1' when it is present
if (any(rownames(dat.hist) == '9A')) {
  rownames(dat.hist)[rownames(dat.hist) == '9A'] <- '9A1'
}


In [None]:

## Treat patient (dog) IDs as labels rather than numbers: they are used for
## grouping samples per patient and as plot axis labels further down.
## Generally don't need this- only CMTGA changes up the names halfway through
dat.hist$Patient <- as.character(dat.hist$Patient) # Ensure patient names are character strings for plotting consistency

In [None]:
## Optional step: read the PAM50 subtype calls and attach them to the clinical table
##   The file is created by PAM50_refactored.R, should just port straight over (don't need user to specify)
##   It is read without a header row; its first column supplies the sample IDs (row names)
if (run.pam50) {
  pam50 <- read.table(fn.pam50, sep = ',', row.names = 1)

  # Cross-tabulate subtype (first data column) against the patient of each sample
  print('PAM50 subtype counts per patient:')
  flush.console()
  print(table(pam50[, 1], dat.hist[rownames(pam50), 'Patient']))

  # Samples without a PAM50 call get NA
  dat.hist$PAM50 <- pam50[rownames(dat.hist), 1]
}


In [None]:
## COSMIC gene symbols = the first column of the COSMIC CSV (read in as row names)
genes.cosmic <- row.names(
  read.table(fn.cosmic, header = TRUE, sep = ',', row.names = 1)
)

In [None]:

## Read the per-gene mutation counts: genes in rows, samples in columns
## (check.names=FALSE keeps the column headers exactly as written in the file)
dat <- read.table(
  fn.muts,
  header = TRUE, sep = ',', row.names = 1, check.names = FALSE
)

## 0/1 calls instead of # calls per gene: every positive count becomes 1
dat.bin <- replace(dat, dat > 0, 1)


In [None]:
# Display the mutation-count table as read from fn.muts (before the 0/1 conversion), for a visual check of the input
dat

In [None]:

## Create 2 matrices (genes x patients): Benign and Malignant samples, pooled per patient

## Collapse a genes x samples table of 0/1 calls into a genes x patients 0/1 numeric matrix.
##   calls    - data frame or matrix of mutation calls, one column per sample
##   patients - patient ID of each column of `calls`, in column order
##   returns  - numeric matrix, gene names as row names, patient IDs as column names
pool.by.patient <- function(calls, patients) {
  # rowsum() adds up the sample rows that belong to the same patient
  pooled <- t(rowsum(t(as.matrix(calls)), group = as.character(patients)))
  storage.mode(pooled) <- 'double'
  # For now we don't care about # samples mutated in each gene per patient, just that at least 1 sample is mutated
  # So set >1 values to 1
  pooled[pooled > 0] <- 1
  pooled
}

## drop=FALSE keeps the genes x samples layout (and the gene names) even when only
## one sample has the given histology; which() skips samples whose Hist is NA
dat.m <- dat.bin[, rownames(dat.hist)[which(dat.hist$Hist == 'M')], drop = FALSE]
dat.b <- dat.bin[, rownames(dat.hist)[which(dat.hist$Hist == 'B')], drop = FALSE]

dat.m <- pool.by.patient(dat.m, dat.hist[colnames(dat.m), 'Patient'])
dat.b <- pool.by.patient(dat.b, dat.hist[colnames(dat.b), 'Patient'])


In [None]:

#########################################
## Color Palette for manuscript
#####################################

# All colors in palette
cols <- c(
  '#9DC7D8', '#7FA1BE', '#EBDA8C', '#01B3CA',
  '#4F6E35', '#965354', '#7DD1B9', '#808040',
  '#C6CBCE', '#1D4E88', '#C78C6C', '#F69256',
  '#D2B29E', '#8B868C', '#E38691', '#B490B2'
)

# Sub-palettes picked out of `cols` by position
cols.hist <- cols[c(7, 3, 6)]   # order = healthy, benign, malignant
cols.peps <- cols[c(2, 12, 15)] # order = tumor, adenoma, carcinoma


In [None]:
#########################################
## Figure 2b - red&blue histogram
#####################################
#print('Generating mutation histogram'); flush.console()

# library() stops right here if ggplot2 is not installed; require() would only
# warn and let the plotting cells fail later with a less obvious error
library(ggplot2)

## Per-sample summary for the bar plot: number of mutated genes (column sums of
## the 0/1 table), histology and patient of every sample in dat.bin
mut.rates <- data.frame(Muts=colSums(dat.bin),
                        Hist=dat.hist[colnames(dat.bin),'Hist'],
                        Dog=dat.hist[colnames(dat.bin),'Patient'],
                        Sample=colnames(dat.bin))

In [None]:
# Sanity check: dimensions of the phenotype table and of the 0/1 mutation table.
# The phenotype table is looked up by the column names of dat.bin elsewhere in
# this notebook, i.e. rows of dat.hist and columns of dat.bin are both samples.
dim(dat.hist)
dim(dat.bin)

In [None]:
## Plot the subfigure: one horizontal bar per sample, grouped by dog and filled by histology
# mut.rates.file <- 'Sample_Mut_Rates.pdf'
ggplot(mut.rates, aes(Dog, Muts)) +
  geom_bar(aes(fill = Hist), stat = "identity", position = "dodge") +
  scale_fill_manual(values = cols.hist[2:3]) +
  theme_minimal() +
  coord_flip() +
  theme(
    axis.text.x = element_text(angle = -325, hjust = 1),
    text = element_text(size = 30)
  )
# ggsave() without a plot argument writes the most recent plot
ggsave(mut.rates.file, width = 4, height = 10)

In [None]:
#########################################
## Figure 2a - red&blue density plot
#####################################
print('Generating density plot')
flush.console()

## Total mutation calls per sample (column sums of the raw count table) plus the sample's histology
samples.freq <- data.frame(
  Mutations = apply(dat, 2, sum),
  Hist = dat.hist[colnames(dat), 'Hist']
)

In [None]:
# density.plot.file <- 'Sample_Mutation_Counts_Density.pdf'
## One density curve of per-sample mutation totals for each histology
ggplot(samples.freq) +
  geom_density(aes(Mutations, group = Hist, col = Hist), lwd = 3) +
  scale_color_manual(values = cols.hist[2:3]) +
  theme_bw() +
  theme(text = element_text(size = 20))
ggsave(density.plot.file, width = 12, height = 4)

## Report the median per-sample mutation total for each histology
print(paste('Median mutations in benign samples:',
            with(samples.freq, median(Mutations[Hist == 'B']))))
flush.console()
print(paste('Median mutations in malignant samples:',
            with(samples.freq, median(Mutations[Hist == 'M']))))
flush.console()

In [None]:
#########################################
## Figure 2b - navy&white dot plot
#####################################
# NOTE(review): the red&blue histogram section earlier in this notebook is also
# labelled "Figure 2b" - confirm which panel label is correct.
#print('Generating pooled mutations plot'); flush.console()

# melt() used below comes from reshape2. library() stops right here if the
# package is missing; require() would only warn and fail later at melt().
library(reshape2)


In [None]:

## Calculate most frequently mutated (by % of samples of each type) to get balanced frequently mutated genes
##    Otherwise will give mostly benign mutations, since we have 2x benign samples
## which() skips samples whose Hist is NA instead of producing an NA column index
ids.benign <- rownames(dat.hist)[ which(dat.hist$Hist=='B') ]
ids.tumor <- rownames(dat.hist)[ which(dat.hist$Hist=='M') ]

## Fraction of benign / malignant samples in which each gene is mutated.
## drop=FALSE keeps the table two-dimensional when a histology has a single sample.
benign.ratios <- rowMeans(dat.bin[, ids.benign, drop=FALSE] == 1)
tumor.ratios <- rowMeans(dat.bin[, ids.tumor, drop=FALSE] == 1)

## Per gene, the larger of the two fractions (gene names are carried over from benign.ratios)
max.ratios <- pmax(benign.ratios, tumor.ratios)


In [None]:
## INPUT: This should be an optional parameter (file with list of genes OR statistic to use for picking genes) with default 30 genes w/max ratios
## Use this for the COSMIC plot: the (up to) 30 COSMIC genes mutated in the most samples.
## head() returns fewer names when fewer than 30 COSMIC genes are present, instead of padding with NA.
genes <- head(names(sort(rowSums(dat[rownames(dat) %in% genes.cosmic, , drop=FALSE] > 0), decreasing=TRUE)), 30)
#genes            <- names(max.ratios)[max.ratios>0.15]  # Another option - pick some cutoff of mutated ratios for benign/malignant
#genes <- sample(rownames(dat)[!rownames(dat) %in% genes.cosmic], 30) #Use this for the random sampling plot (randomly samples from non-cosmic genes)

## Melt the malignant sample matrix into long format (one row per gene/patient pair)
## drop=FALSE keeps the matrix layout when only one gene is selected
dat.m.melted <- melt( as.matrix(dat.m[rownames(dat.m) %in% genes, , drop=FALSE]) )
dat.m.melted$value <- as.numeric(as.character(dat.m.melted$value))
dat.m.melted$value[dat.m.melted$value>0] <- 1
dat.m.melted$value <- as.factor(dat.m.melted$value) # For color scales


In [None]:
## Melt the benign sample matrix into long format (one row per gene/patient pair)
## drop=FALSE keeps the matrix layout when only one gene is selected
dat.b.melted <- melt( as.matrix(dat.b[rownames(dat.b) %in% genes, , drop=FALSE]) )
dat.b.melted$value <- as.numeric(as.character(dat.b.melted$value))
dat.b.melted$value[dat.b.melted$value>0] <- 1
dat.b.melted$value <- as.factor(dat.b.melted$value) # For color scales


In [None]:

## Combine the 2 melted matrices.
## Rows are paired on gene + patient (columns Var1/Var2 produced by melt) rather
## than by position, so a patient that only has benign or only malignant samples
## cannot shift values onto another patient. all=TRUE keeps such patients; the
## side they have no samples for is NA.
dat.melted <- merge(dat.m.melted, dat.b.melted, by=c('Var1','Var2'), all=TRUE)
colnames(dat.melted) <- c('Gene','Dog','Tumor','Benign')
dat.melted$Dog <-  as.character(dat.melted$Dog) # So the plot sorts them alphabetically


In [None]:

## Plot the result: a large square per gene/dog coloured by the malignant call,
## with a smaller dot on top coloured by the benign call
# cosmic.mutations.file <- 'COSMIC_Genes_Mutations.pdf'
ggplot(dat.melted) +
  geom_point(aes(Gene, Dog, col = Tumor), pch = 15, size = 8) +
  geom_point(aes(Gene, Dog, col = Benign), pch = 16, size = 4) +
  theme(axis.text.x = element_text(angle = -325, hjust = 1)) +
  scale_color_manual(values = c('white', cols[10]))
ggsave(cosmic.mutations.file, width = 10, height = 5.5)


In [None]:
#########################################
## Supplemental Figure 4 - Frequently mutated genes
#####################################

print('Generating per-sample mutations plot')
flush.console()

# Genes mutated in more than 15% of the benign samples or of the malignant samples
genes <- names(max.ratios[max.ratios > 0.15])

# Number of samples in dat.bin per patient
s.counts <- table(dat.hist[colnames(dat.bin), 'Patient'])

In [None]:
# For our dataset only, reorder the names (because we used numeric patient names :/
# The counts are put in the order the patient names sort in once an 'A' is appended to each
# TODO: Nick once we've finished testing for our data, we should remove this next line
s.counts <- s.counts[order(paste0(names(s.counts), 'A'))]

In [None]:
# Long format of the 0/1 calls for the selected genes, with the sample column
# moved in front of the gene column
dat.bin.melted <- melt(as.matrix(dat.bin[genes, ]))[, c(2, 1, 3)]
names(dat.bin.melted) <- c('Sample', 'Gene', 'Alteration')
# Sample IDs as plain strings
dat.bin.melted$Sample <- as.character(dat.bin.melted$Sample)

In [None]:
# Look up each row's histology in the phenotype table by sample ID
dat.bin.melted$Hist      <- dat.hist[ dat.bin.melted$Sample, 'Hist' ]

In [None]:
# One square per sample/gene, coloured by mutation status and histology.
# The colours are named by interaction level ("<Alteration>.<Hist>") so each
# combination keeps its colour even if some combination is absent from the
# data; unnamed values would be assigned by position and could shift.
ggplot(dat.bin.melted) +
  geom_point(aes(Sample, Gene, color=interaction(factor(Alteration),Hist)),pch=15,size=3) +
  scale_color_manual(values=c('0.B'='white','1.B'=cols.hist[2],'0.M'='white','1.M'=cols.hist[3])) +
  theme_classic() +
  theme(legend.position='none',axis.text.x=element_text(angle = -325, hjust = 1)) +
  # Vertical separators after each patient's run of samples (running total of samples per patient)
  geom_vline(xintercept=cumsum(s.counts[-length(s.counts)])+0.5,col=cols[14],size=2)
# mutations.consistency.file <- 'MutationConsistency.pdf'
ggsave(mutations.consistency.file,width=13,height=7)

In [None]:
#########################################
## Added after sharing with SF
#####################################

## Do the subtypes have different numbers of mutations (total, not just in PAM50 genes)
##   For samples of each subtype, print median num mutations in the samples
num.muts <- apply(dat.bin, 2, sum)
if( run.pam50 ) {
  colnames(pam50)[1] <- 'PAM50'
  # Look the counts up by sample ID: samples without mutation data get NA and
  # no extra rows are added to pam50
  pam50$Muts <- unname(num.muts[rownames(pam50)])
  print('Median number of mutated genes per PAM50 subtype:'); flush.console()
  # split() groups by subtype whether the column is a factor or plain character
  print(vapply(split(pam50$Muts, pam50$PAM50), median, numeric(1), na.rm=TRUE)); flush.console()
}

### Are COSMIC genes more frequently mutated than non-COSMIC?
gene.muts <- apply(dat.bin, 1, sum) # Number of mutated samples per gene
is.cosmic <- factor(rownames(dat.bin) %in% genes.cosmic, levels=c(FALSE, TRUE))
if( all(table(is.cosmic) >= 2) ) {
  print(res.ttest <- t.test( gene.muts ~ is.cosmic ))
  # The test is two-sided, so also check the direction before claiming "more":
  # estimate[1] is the non-COSMIC mean (FALSE), estimate[2] the COSMIC mean (TRUE)
  cosmic.higher <- unname(res.ttest$estimate[2] > res.ttest$estimate[1])
  if( isTRUE(res.ttest$p.value < 0.05 && cosmic.higher) ) {
    print( paste('COSMIC genes are significantly more frequently mutated than non-COSMIC genes, p-value =', signif(res.ttest$p.value,digits=3)) )
  } else {
    print( paste('COSMIC genes are NOT significantly more frequently mutated than non-COSMIC genes, p-value =', signif(res.ttest$p.value,digits=3)) )
  }
  rm(res.ttest)
} else {
  # t.test() needs at least 2 genes in each group
  print('Skipping COSMIC t-test: need at least 2 COSMIC and 2 non-COSMIC genes in the mutation data.')
}
flush.console()

In [None]:
# List the phenotype columns that were loaded; the clinical-correlation step
# only tests the columns it knows about that also appear in this list
print(colnames(dat.hist))

In [None]:

### Are PAM50 genes more frequently mutated than non-PAM50?
genes.pam50 <- rownames(read.table(fn.pam50.genes, sep=',', row.names=1))
gene.muts.pam50 <- apply(dat.bin, 1, sum) # Number of mutated samples per gene
is.pam50.gene <- factor(rownames(dat.bin) %in% genes.pam50, levels=c(FALSE, TRUE))
if( all(table(is.pam50.gene) >= 2) ) {
  print( res.ttest <- t.test( gene.muts.pam50 ~ is.pam50.gene ) )
  # The test is two-sided, so also check the direction before claiming "more":
  # estimate[1] is the non-PAM50 mean (FALSE), estimate[2] the PAM50 mean (TRUE)
  pam50.higher <- unname(res.ttest$estimate[2] > res.ttest$estimate[1])
  if( isTRUE(res.ttest$p.value < 0.05 && pam50.higher) ) {
    print( paste('PAM50 genes are significantly more frequently mutated than non-PAM50 genes, p-value =', signif(res.ttest$p.value,digits=3)) )
  } else {
    print( paste('PAM50 genes are NOT significantly more frequently mutated than non-PAM50 genes, p-value =', signif(res.ttest$p.value,digits=3)) )
  }
  rm(res.ttest)
} else {
  # t.test() needs at least 2 genes in each group
  print('Skipping PAM50 gene t-test: need at least 2 PAM50 and 2 non-PAM50 genes in the mutation data.')
}
flush.console()

## Correlate mutations w/clinical factors of interest- this will return a matrix of dat.hist columns by genes, filled with corrected pvals
print(paste('Calculating correlations between mutations and phenotype data (Patient, Location, Histology, etc) in genes with >',opt.num,'mutations in the cohort.'))

## Chi-squared association between one gene's 0/1 mutation calls and each clinical factor.
##   id      - gene name (a row name of dat.bin)
##   returns - named numeric vector with one p-value per clinical factor found in
##             dat.hist, adjusted across those factors with p.adjust()'s default
##             method; NA where the test cannot be run
get.pvals <- function(id) {
  phen.cols <- c('Patient','Location','Goldschmidt','Hist','SimHist','DetHist','PAM50') # Which clinical factors we care about
  phen.cols <- phen.cols[ phen.cols %in% colnames(dat.hist) ] # Make sure these are in the provided phenotype/clinical data
  phen <- dat.hist[colnames(dat.bin), phen.cols, drop=FALSE] # One row per sample, in dat.bin column order
  mutated <- unlist(dat.bin[id,])
  raw.pvals <- vapply(phen, function(x) {
    counts <- table(factor(x), mutated)
    # With a single level on either side chisq.test() would switch to a
    # goodness-of-fit test, which is not a test of association - report NA
    if (min(dim(counts)) < 2) return(NA_real_)
    # A failed test yields NA instead of an error object mixed into the p-values
    tryCatch(chisq.test(counts)$p.value, error=function(e) NA_real_)
  }, numeric(1))
  p.adjust(raw.pvals)
}
genes       <- names(which(apply(dat.bin, 1, sum)>opt.num)) # Only care about frequently mutated genes
print(length(genes))
if(length(genes)>0) {
  genes.pvals <- sapply(genes, get.pvals) # Clinical factors in rows, genes in columns
  # freq.mutations.file <- 'FreqMutatedGenes_ClinicalCorrelations.csv'
  # Written transposed: one row per gene, one column per clinical factor
  write.table(signif(t(genes.pvals),digits=5), file=freq.mutations.file, sep=',', col.names=TRUE, row.names=TRUE, quote=FALSE)
  print('Phenotype/Clinical correlations stored to file.')
} else {
    print("No frequently mutated genes, skipping this step.")
}

In [None]:

## Are PEP list genes more frequently mutated?
## Load the PEPs & print PEP genes that are frequently mutated
print('Loading PEPs. PEP list lengths:'); flush.console()
peps <- read.table(fn.peps, sep=',', header=TRUE, stringsAsFactors=FALSE)

## One gene list per expression pattern: symbols whose pattern value is below 0.05.
## which() skips rows where that value is NA, which would otherwise add NA symbols to the list.
peps <- list( Adenoma=peps[which(peps$Adenoma_Expression_Pattern < 0.05),'HumanSymbol'],
             Carcinoma=peps[which(peps$Carcinoma_Expression_Pattern < 0.05),'HumanSymbol'],
             Tumor=peps[which(peps$Tumor_Expression_Pattern < 0.05),'HumanSymbol'])
print(sapply(peps, length)); flush.console() # Print num genes in each PEP
print(paste('Checking for frequently mutated PEP genes (>',opt.num,'mutations):'))
## Number of mutated samples for every PEP gene present in dat.bin.
## lapply always returns one named vector per PEP list (sapply could collapse them
## into a matrix when the lists happen to have equal length); drop=FALSE keeps
## the table layout when a list matches a single gene.
pep.mut.counts <- lapply(peps, function(x) { rowSums(dat.bin[rownames(dat.bin) %in% x, , drop=FALSE]) } )
print(lapply(pep.mut.counts, function(x){ names(x)[which(x>opt.num)] }))
print(sapply(pep.mut.counts, summary))
flush.console()

print('Done with mutation analysis.') 


Unless the default paths above have been changed, the
files generated by this notebook (PDF figures and a CSV table) can be downloaded using the
Jupyter directory browser interface at
[`./user_data/output`](./user_data/output).