In [1]:
%load_ext rpy2.ipython

In [3]:
%%R
library(ggplot2)
library(dplyr)
library(gridExtra)
library(grid)

In [4]:
from Bio import SeqIO
from IPython.display import FileLink

# Interleaving files

### Pyloric caeca library

```
# Interleave the R1/R2 files of the pyloric caeca library and recompress
# the stream with pigz (20 threads, fastest compression level).
!pairs join -t \
data/6110_5146_27472_AKJ5B_AFPC1_GGACTCCT_CTCTCTAT_R1.fq.gz \
data/6110_5146_27472_AKJ5B_AFPC1_GGACTCCT_CTCTCTAT_R2.fq.gz \
| pigz -p 20 --fast -c > data/AfPC.fq.gz
```

In [6]:
# Sanity check: on-disk size of the interleaved pyloric caeca library.
!du -h data/AfPC.fq.gz

5.1G	data/AfPC.fq.gz


### Epidermis library

```
# Interleave the R1/R2 files of the epidermis library and recompress
# the stream with pigz (20 threads, fastest compression level).
!pairs join -t \
data/6361_5146_27473_AKDNM_AFTF1_TAGGCATG_CTCTCTAT_R1.fq.gz \
data/6361_5146_27473_AKDNM_AFTF1_TAGGCATG_CTCTCTAT_R2.fq.gz \
| pigz -p 20 --fast -c > data/AfTF.fq.gz
```

In [7]:
# Sanity check: on-disk size of the interleaved epidermis library.
!du -h data/AfTF.fq.gz

1.6G	data/AfTF.fq.gz


### Both libraries

```
# Pool both interleaved libraries (epidermis first, then pyloric caeca).
# Concatenated gzip members are themselves a valid gzip stream, so no
# decompression/recompression is needed.
!cat data/AfTF.fq.gz data/AfPC.fq.gz > data/Af.fq.gz
```

In [9]:
# Sanity check: on-disk size of the pooled (both libraries) file.
!du -h data/Af.fq.gz

6.6G	data/Af.fq.gz


# Quality filtering, trimming and contamination removal

### Pyloric caeca library

```
# Pyloric caeca library: adapter trimming (right-side k-mer trim against the
# BBTools adapter set) plus quality trimming of both read ends.
!bbduk.sh in=data/AfPC.fq.gz \
out=data/AfPC.trimmed.fq.gz \
ref=/opt/bbmap/resources/adapters.fa \
k=23 mink=12 ktrim=r qtrim=rl tbo=t tpe=t interleaved=t
```

```
# Pyloric caeca library: drop reads matching the phiX174 reference;
# `out` receives the reads that did NOT match.
!bbduk.sh in=data/AfPC.trimmed.fq.gz \
out=data/AfPC.trimmed.contam.fq.gz \
ref=/opt/bbmap/resources/phix174_ill.ref.fa.gz \
interleaved=t
```

### Epidermis library

```
# Epidermis library: adapter trimming (right-side k-mer trim against the
# BBTools adapter set) plus quality trimming of both read ends.
!bbduk.sh in=data/AfTF.fq.gz \
out=data/AfTF.trimmed.fq.gz \
ref=/opt/bbmap/resources/adapters.fa \
k=23 mink=12 ktrim=r qtrim=rl tbo=t tpe=t interleaved=t
```

```
# Epidermis library: drop reads matching the phiX174 reference;
# `out` receives the reads that did NOT match.
!bbduk.sh in=data/AfTF.trimmed.fq.gz \
out=data/AfTF.trimmed.contam.fq.gz \
ref=/opt/bbmap/resources/phix174_ill.ref.fa.gz \
interleaved=t
```

### Both libraries

```
# Pooled libraries: adapter trimming (right-side k-mer trim against the
# BBTools adapter set) plus quality trimming of both read ends.
!bbduk.sh in=data/Af.fq.gz \
out=data/Af.trimmed.fq.gz \
ref=/opt/bbmap/resources/adapters.fa \
k=23 mink=12 ktrim=r qtrim=rl tbo=t tpe=t interleaved=t
```

```
# Pooled libraries: drop reads matching the phiX174 reference;
# `out` receives the reads that did NOT match.
!bbduk.sh in=data/Af.trimmed.fq.gz \
out=data/Af.trimmed.contam.fq.gz \
ref=/opt/bbmap/resources/phix174_ill.ref.fa.gz \
interleaved=t
```

# Merging reads

In [12]:
# Merge overlapping read pairs from the cleaned, pooled library.
# Successfully merged pairs are written to `out`; pairs that could not be
# merged are written (still interleaved) to `outu`.
!bbmerge.sh in=data/Af.trimmed.contam.fq.gz \
out=data/Af.trimmed.contam.merged.fq.gz \
outu=data/Af.trimmed.contam.unmerged.fq.gz

java -Djava.library.path=/opt/bbmap/jni/ -ea -Xmx1000m -Xmx1000m -cp /opt/bbmap/current/ jgi.BBMerge in=data/Af.trimmed.contam.fq.gz out=data/Af.trimmed.contam.merged.fq.gz outu=data/Af.trimmed.contam.unmerged.fq.gz
Executing jgi.BBMerge [in=data/Af.trimmed.contam.fq.gz, out=data/Af.trimmed.contam.merged.fq.gz, outu=data/Af.trimmed.contam.unmerged.fq.gz]

BBMerge version 37.10
Writing mergable reads merged.
Started output threads.
Total time: 84.199 seconds.

Pairs:               	15914827
Joined:              	10175386   	63.937%
Ambiguous:           	5064746   	31.824%
No Solution:         	674695   	4.239%
Too Short:           	0       	0.000%

Avg Insert:          	256.6
Standard Deviation:  	120.2
Mode:                	195

Insert range:        	35 - 493
90th percentile:     	425
75th percentile:     	354
50th percentile:     	253
25th percentile:     	158
10th percentile:     	93


# Normalization and error correction

```
# Digital normalization (target depth 100) with error correction enabled,
# applied to the merged reads.
!bbnorm.sh in=data/Af.trimmed.contam.merged.fq.gz \
out=data/Af.trimmed.contam.merged.normed.fq.gz \
target=100 mindepth=1 ecc=t threads=10
```

```
# Digital normalization (target depth 100) with error correction enabled,
# applied to the read pairs that could not be merged.
!bbnorm.sh in=data/Af.trimmed.contam.unmerged.fq.gz \
out=data/Af.trimmed.contam.unmerged.normed.fq.gz \
target=100 mindepth=1 ecc=t threads=10
```

# SPAdes assembly

```
# Assemble the normalized reads with SPAdes.
#   -s    merged reads, supplied as single-end reads
#   --12  unmerged pairs, supplied as an interleaved paired-end file
# --only-assembler skips SPAdes' own read error correction (reads were
# already error-corrected by bbnorm with ecc=t).
# Note: uses the documented `--threads` spelling; the previous `--thread`
# was only accepted through long-option prefix matching.
!spades.py --threads 20 --only-assembler \
-s data/Af.trimmed.contam.merged.normed.fq.gz \
--12 data/Af.trimmed.contam.unmerged.normed.fq.gz \
-o data/spades_AF_2
```

```
# Keep only contigs of at least 3000 bp (note: >=, despite the "gt3000"
# file name) and write them as two-line FASTA records.
!bioawk -c fastx 'length($seq) >= 3000 { print ">"$name"\n"$seq }' \
data/spades_AF_2/contigs.fasta \
> data/spades_AF_2/contigs.gt3000.fasta
```

## Annotation

```
# Translated search of the long contigs against the ssDNA virus database.
# Tabular output (outfmt 6), e-value cutoff 1e-8, at most 5 targets per query.
!tblastx -query data/spades_AF_2/contigs.gt3000.fasta \
-db data/ssDNAviruses \
-out data/spades_AF_2/DNAvirusesPO.m6 \
-outfmt 6 -evalue 0.00000001 -max_target_seqs 5 -num_threads 25
```

### Grabbing densovirus contigs

```
%%R

options(dplyr.width=Inf)

# Columns of the headerless BLAST tabular (outfmt 6) file used below:
#   V1 = query contig, V2 = subject, V3 = % identity,
#   V4 = alignment length, V11 = e-value, V12 = bit score
df = read.csv("data/spades_AF_2/DNAvirusesPO.m6", sep = "\t",
         stringsAsFactors = FALSE, header = FALSE) %>%
    group_by(V1) %>%
    # only hits to the sea star associated densovirus
    filter(V2 =="Sea_star_associated_densovirus-like_genome_fragment") %>%
    # keep exactly one best-scoring hit per contig; ties are broken by file
    # order ("first") rather than randomly, so that re-running the notebook
    # always selects the same row (and hence the same set of contigs)
    filter(rank(-V12, ties.method = "first") == 1)  %>%
    # discard contigs whose best hit is a very short alignment
    filter(V4 >=10) %>%
    select(V1, V2, V3, V4, V12, V11)

# contig names handed over to Python via %Rget
ids = df$V1
```

In [20]:
ids = %Rget ids
keepers= []

for rec in SeqIO.parse("data/spades_AF_2/contigs.gt3000.fasta", "fasta"):
    if rec.name in ids:
        print(rec.name, len(rec.seq))
        keepers.append(rec)

with open("data/spades_AF_2/seastar_hit_contigs.fa", "w") as out_handle:
    SeqIO.write(keepers, out_handle, "fasta")

NODE_319_length_6089_cov_86.6124 6089


# Read Mapping

```
# Map the cleaned reads of both libraries onto the densovirus-hit contigs
# and write per-contig, per-base and histogram coverage reports.
!bbmap.sh in=data/Af.trimmed.contam.fq.gz \
ref=data/spades_AF_2/seastar_hit_contigs.fa \
covstats=data/spades_AF_2/covstats \
basecov=data/spades_AF_2/bascov.txt \
covhist=data/spades_AF_2/hicov.txt \
minid=0.95 idfilter=0.98 mappedonly=t threads=15
```

### Mapping by library

```
# Epidermis library only: map cleaned reads onto the densovirus-hit contigs
# and write per-contig coverage statistics.
!bbmap.sh in=data/AfTF.trimmed.contam.fq.gz \
ref=data/spades_AF_2/seastar_hit_contigs.fa \
covstats=data/spades_AF_2/AfTFcovstats \
k=15 vslow=t idfilter=0.95 threads=20
```

```
# Pyloric caeca library only: map cleaned reads onto the densovirus-hit
# contigs and write per-contig coverage statistics.
!bbmap.sh in=data/AfPC.trimmed.contam.fq.gz \
ref=data/spades_AF_2/seastar_hit_contigs.fa \
covstats=data/spades_AF_2/AfPCcovstats \
k=15 vslow=t idfilter=0.95 threads=20
```