-
Notifications
You must be signed in to change notification settings - Fork 0
/
compile.R
executable file
·106 lines (93 loc) · 4.93 KB
/
compile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#Compile MiSeq library and sample data into single phyloseq object
#Also add taxonomy and remove non-target sequences
library(tidyverse)
library(magrittr)
library(foreach)
library(dada2)
library(Biostrings)
library(phyloseq)
library(DECIPHER)
#read in denoised sequence table
seqTab <- readRDS("output/dada/seqTab.rds")
#read in sample data
meta <- read.csv("data/Sample_data.csv",as.is=T)
#merge into phyloseq object and give OTUs arbitrary names
seqs <- getSequences(seqTab) %>% DNAStringSet()
colnames(seqTab) <- paste0("OTU.",1:ncol(seqTab))
names(seqs) <- colnames(seqTab)
meta %<>% column_to_rownames("SampID")
phy <- phyloseq(otu_table(seqTab, taxa_are_rows = F),
sample_data(meta),
refseq(seqs))
#remove host contamination by aligning against genome with Bowtie2
#Requires a precompiled bowtie database file created from the Populus trichocarpa v3 genome
dir.create("output/scratch")
refseq(phy) %>% DNAStringSet %>% writeXStringSet("output/scratch/otus.fa")
system("/home/harpua/miniconda3/bin/bowtie2 --threads 10 -x data/referenceDBs/Ptri.v.3.db -f output/scratch/otus.fa --un output/scratch/noHost.fa --al output/scratch/host.fa --very-sensitive")
hostFiltered <- readDNAStringSet("output/scratch/noHost.fa") %>% names
phy %<>% prune_taxa(hostFiltered, .)
unlink("output/scratch",recursive=T)
#collapse sequences with >= 99% similarity
cluster <- function(phy.in,method="single",dissimilarity=0.01){
clusters <- DECIPHER::DistanceMatrix(refseq(phy.in), includeTerminalGaps = T, processors=NULL) %>%
DECIPHER::IdClusters(method=method, cutoff=dissimilarity, processors=NULL)
clusters[,1] %<>% as.character()
tax_table(phy.in) <- clusters %>% as.matrix %>% tax_table
phy.in %<>% speedyseq::tax_glom("cluster")
tax_table(phy.in) <- NULL
return(phy.in)
}
phy %<>% cluster
#Identify expected taxa (within 99% of expected species)
focalSeqs <- readDNAStringSet("data/SangerSeqs/dadaPriors.fasta")
seqDist <- append(focalSeqs,refseq(phy)) %>% DistanceMatrix(verbose=F, processors=NULL) %>%
.[!(rownames(.) %in% names(focalSeqs)) , (rownames(.) %in% names(focalSeqs))]
toMerge <- foreach(i=1:dim(seqDist)[1], .combine=rbind) %do% {
minDist <- min(seqDist[i,])
if(minDist > 0.02){return(c(rownames(seqDist)[i],NA))}
if(minDist <= 0.02){return(c(rownames(seqDist)[i],colnames(seqDist)[seqDist[i,]==minDist]))}
} %>% data.frame(stringsAsFactors = F) %>% drop_na()
toMerge$X2[toMerge$X2=="MelampsoraB"] <- "Melampsora"
for(i in unique(toMerge$X2)){
otus <- toMerge$X1[toMerge$X2==i]
taxa_names(phy)[taxa_names(phy)==otus[1]] <- i; otus[1] <- i
phy %<>% merge_taxa(otus)}
#assign taxonomy with database including expected taxa
tax <- assignTaxonomy(refseq(phy),"data/referenceDBs/sh_general_release_dynamic_s_04.02.2020.fasta.gz",multithread=T)
rownames(tax) <- taxa_names(phy)
tax_table(phy) <- tax_table(tax)
#remove OTUs not assigned to Order
phy %<>% subset_taxa(!is.na(Order))
#Remove putative contam (taxa more preveent or proportionally abundanct in negative control samples)
#After this filtering a number of unexpected OTUs remain. These appeaar to be primarily associated with uninoculated control plants. Their
contam.dat <- data.frame(OTU=taxa_names(phy),
prev.neg = phy %>% prune_samples(grepl("NEG",sample_names(.)),.) %>%
otu_table %>% data.frame %>% apply(2,function(x){sum(x>0)/length(x)}),
prev.samps = phy %>% prune_samples(grepl("^G",sample_names(.)),.) %>%
otu_table %>% data.frame %>% apply(2,function(x){sum(x>0)/length(x)}),
mean.neg = phy %>% prune_samples(grepl("NEG",sample_names(.)),.) %>%
otu_table %>% data.frame %>% apply(2,function(x){ifelse(sum(x)==0,0, sum(x)/sum(x>0))}),
mean.samps = phy %>% prune_samples(grepl("^G",sample_names(.)),.) %>%
otu_table %>% data.frame %>% apply(2,function(x){ifelse(sum(x)==0,0, sum(x)/sum(x>0))}))
contam.dat %<>% mutate(contam=(prev.neg>prev.samps|mean.neg>mean.samps))
phy %<>% prune_taxa(!contam.dat$contam,.)
#remove negative control samps
phy %<>% prune_samples(!grepl("NEG",sample_names(.)),.)
#' ### Look at sequencing depth
minDepth <- 5000
data.frame(SeqDepth=sample_sums(phy), TP=factor(sample_data(phy)$Timepoint),Type=sample_data(phy)$Treatment ) %>%
mutate(cutoff=SeqDepth>minDepth,neg=ifelse(Type=="Negative","neg","pos"),TP=paste(TP,neg)) %>%
ggplot(aes(x=TP, y=SeqDepth)) +
geom_violin() +
geom_point(aes(color=cutoff),position=position_jitter(width=0.1)) +
theme_classic()
#' ### Remove samples below sequencing depth cutoff
(phy %<>% prune_samples(sample_sums(.)>minDepth,.))
###############
### OUTPUTs ###
###############
#Save phyloseq object
saveRDS(phy,"output/compiled/phy.rds")
#Save components for manual inspection
otu_table(phy) %>% write.csv("output/compiled/OTU.table.csv")
tax_table(phy) %>% write.csv("output/compiled/taxonomy.table.csv")