In [None]:
#This pipeline was used ONLY for Soil_1 to Soil_6 and Rhizosphere_1 to Rhizosphere_10 metagenomic samples#
#The resulting contigs were used to construct the Environmental Extended Pangenome (EEP)#

In [None]:
#Host filtering for Rhizosphere samples
#Create host genome index database
#Solanum lycopersicum (tomato) and Arabidopsis thaliana were downloaded from NCBI and renamed#
#Example#
# Example download of one SRA run (SRR23012434) with fasterq-dump, using --split-files.
$PATH/sratoolkit.2.10.8-ubuntu64/bin/fasterq-dump --split-files SRR23012434

##Script name: make_db_tomatogenome.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
# Build the Bowtie2 index named "tomato_genome" from the gzipped tomato
# reference FASTA. The index files are written to the working directory.
HOST_FASTA=$PATH/solirubrobacter/mg_hybrid_assb/tomato_genomic.fna.gz
INDEX_NAME=tomato_genome
bowtie2-build "$HOST_FASTA" "$INDEX_NAME"


##Script name: make_db_athgenome.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
# Build the Bowtie2 index named "ath_genome" from the gzipped Arabidopsis
# thaliana reference FASTA. The index files are written to the working directory.
HOST_FASTA=$PATH/solirubrobacter/mg_hybrid_assb/athaliana_genomic.fna.gz
INDEX_NAME=ath_genome
bowtie2-build "$HOST_FASTA" "$INDEX_NAME"

In [None]:
#Map tomato rhizosphere metagenomes against the host (tomato) genome with Bowtie2.
#Align against reference genome
##Script name: tmap_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (tmap_<sample>.sh) per sample.
# Samples are the unique prefixes (text before the first "_") of the
# T*.fastq.gz files in the current directory. Each job aligns the sample's
# R1/R2 reads against the tomato_genome Bowtie2 index with 20 threads and
# writes <sample>.sam.

READS_DIR=$PATH/solirubrobacter/mg_hybrid_assb

for SAMPLE in $(ls T*.fastq.gz | perl -pe 's/\_.*//g' | sort -u)
do
    JOB=tmap_$SAMPLE.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
bowtie2 -x tomato_genome -1 ${READS_DIR}/${SAMPLE}_R1.fastq.gz -2 ${READS_DIR}/${SAMPLE}_R2.fastq.gz -S ${SAMPLE}.sam -p 20
EOF
    chmod +x *.sh
done


##Script name: amap_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (amap_<sample>.sh) per sample that is
# mapped against the Arabidopsis thaliana index (ath_genome).
# Samples are the unique prefixes (text before the first "_") of the
# *.fastq.gz files whose names start with "R" and contain no "T".
# Each job aligns the sample's R1/R2 reads with Bowtie2 (20 threads) and
# writes <sample>.sam.

SEQS=$PATH/solirubrobacter/mg_hybrid_assb

# "^R" replaces the former "^\R": "\R" is an undefined escape that newer GNU
# grep versions warn about; the set of matched names is unchanged.
for FAA in $(ls *.fastq.gz | grep -v "T" | grep "^R" | perl -pe 's/\_.*//g' | sort -u)
do
    JOB=amap_$FAA.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
bowtie2 -x ath_genome -1 ${SEQS}/${FAA}_R1.fastq.gz -2 ${SEQS}/${FAA}_R2.fastq.gz -S ${FAA}.sam -p 20
EOF
    # Only the script just written needs the executable bit.
    chmod +x "$JOB"
done

In [None]:
#Convert sam to bam
##Script name: samtobam.scr ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (samtobam_<sample>.sh) for every
# <sample>.sam in the current directory. Each job converts
# $FILES/<sample>.sam to <sample>.bam with "samtools view -bS".

FILES=$PATH/solirubrobacter/mg_hybrid_assb

for SAM in *.sam
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$SAM" ] || continue
    # Strip only the trailing ".sam". The former sed 's/.sam//g' removed any
    # character followed by "sam" anywhere in the name.
    FAA=${SAM%.sam}
    JOB=samtobam_$FAA.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
samtools view -bS ${FILES}/${FAA}.sam > ${FAA}.bam
EOF
    chmod +x "$JOB"
done

In [None]:
#Get unmapped paired sequences against tomato genome (bothEndsUnmapped)
##Script name: getunmapped_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (getunmapped_<sample>.sh) for every
# <sample>.bam in the current directory. Each job keeps the read pairs in
# which both mates are unmapped against the host genome:
#   -f 12  : read unmapped (4) AND mate unmapped (8)
#   -F 256 : drop secondary alignments
# Output: <sample>_umh_bothends.bam

FILES=$PATH/solirubrobacter/mg_hybrid_assb

for BAM in *.bam
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$BAM" ] || continue
    # Skip BAMs produced by later steps so a re-run does not filter them again.
    case "$BAM" in
        *_umh_bothends.bam|*_sorted.bam) continue ;;
    esac
    # Strip only the trailing ".bam" (the former sed 's/.bam//g' was unanchored).
    FAA=${BAM%.bam}
    JOB=getunmapped_$FAA.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
samtools view -b -f 12 -F 256 ${FILES}/${FAA}.bam > ${FAA}_umh_bothends.bam
EOF
    chmod +x "$JOB"
done

#Running all the scripts# (typed in the shell, not part of the generator)
for N in getunmapped_*.sh; do [ -e "$N" ] || continue; qsub "$N"; done

In [None]:
#Obtain R1 and R2 paired sequences for host filtered samples
#sort bam file by read name (-n) to have paired reads next to each other as required by bedtools
##Script name: sortedbam_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (sortedbam_<sample>.sh) for every
# <sample>_umh_bothends.bam in the current directory. Each job sorts the BAM
# by read name (samtools sort -n) so mates are adjacent, and writes
# <sample>_sorted.bam.

FILES=$PATH/solirubrobacter/mg_hybrid_assb
SUFFIX=_umh_bothends.bam

for BAM in *"$SUFFIX"
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$BAM" ] || continue
    # Strip the exact trailing suffix (the former sed pattern was unanchored
    # and its "." matched any character).
    FAA=${BAM%"$SUFFIX"}
    JOB=sortedbam_$FAA.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
samtools sort -n ${FILES}/${FAA}_umh_bothends.bam -o ${FAA}_sorted.bam
EOF
    chmod +x "$JOB"
done

#Running all the scripts# (typed in the shell, not part of the generator)
for N in sortedbam_*.sh; do [ -e "$N" ] || continue; qsub "$N"; done

In [None]:
#Get the filtered fastq files against host genome with bedtools
##Script name: filth_maker.scr (Cuallicua)##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (filth_<sample>.sh) for every
# <sample>_sorted.bam in the current directory. Each job runs bedtools
# bamToFastq on the name-sorted BAM and writes the host-filtered mates to
# $FILES/<sample>_filtered_R1.fastq and $FILES/<sample>_filtered_R2.fastq.

FILES=$PATH/solirubrobacter/mg_hybrid_assb
BIN=$PATH/binc/bedtools2
SUFFIX=_sorted.bam

for BAM in *"$SUFFIX"
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$BAM" ] || continue
    # Strip the exact trailing suffix (the former sed pattern was unanchored
    # and its "." matched any character).
    FAA=${BAM%"$SUFFIX"}
    JOB=filth_$FAA.sh
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
${BIN}/bamToFastq -i ${FILES}/${FAA}_sorted.bam -fq ${FILES}/${FAA}_filtered_R1.fastq -fq2 ${FILES}/${FAA}_filtered_R2.fastq
EOF
    chmod +x "$JOB"
done

#Running all the scripts# (typed in the shell, not part of the generator)
for N in filth_*.sh; do [ -e "$N" ] || continue; qsub "$N"; done

In [None]:
#Quality trimming
####Obtaining quality filtered, paired and unpaired sequences using Trimmomatic.
##Script name: trimming_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (trim_<sample>.sh) per sample.
# Samples are the unique prefixes (text before the first "_") of the *.fastq
# files in the current directory. Each job runs Trimmomatic 0.36 in PE mode
# on <sample>_filtered_R1/R2.fastq and writes gzipped paired and unpaired
# outputs for each mate.

SEQS=$PATH/solirubrobacter/mg_hybrid_assb
BIN=$PATH/bin/Trimmomatic-0.36
# Adapter file taken from the same Trimmomatic directory as the jar, instead
# of a hard-coded user home path.
# NOTE(review): assumes $BIN/adapters/TruSeq3-PE.fa exists - confirm.
ADAPTERS=$BIN/adapters/TruSeq3-PE.fa

for FAA in $(ls *.fastq | perl -pe 's/\_.*//g' | sort -u)
do
    JOB=trim_$FAA.sh
    # One trim log per sample: a single shared trim.log would be overwritten
    # by every job running at the same time.
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
java -jar ${BIN}/trimmomatic-0.36.jar PE -threads 2 -phred33 -trimlog ${SEQS}/${FAA}_trim.log ${SEQS}/${FAA}_filtered_R1.fastq ${SEQS}/${FAA}_filtered_R2.fastq ${SEQS}/${FAA}_paired_R1.fastq.gz ${SEQS}/${FAA}_unpaired_R1.fastq.gz ${SEQS}/${FAA}_paired_R2.fastq.gz ${SEQS}/${FAA}_unpaired_R2.fastq.gz ILLUMINACLIP:${ADAPTERS}:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
EOF
    chmod +x "$JOB"
done

In [None]:
#Hybrid assembly
#Assembly Spades
##Script name: spades_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (<sample>.scr) per sample.
# Samples are the unique prefixes (text before the first "_") of the
# *_paired*.fastq.gz files in the current directory. Each job prints the
# date, runs spades.py --meta on the sample's paired R1/R2 reads with output
# in contigs_spades/contig_<sample>/, and prints the date again.

READS_DIR=$PATH/solirubrobacter/mg_hybrid_assb
SPADES_BIN=$PATH/binc/SPAdes-3.12.0-Linux/bin
COUNT=0   # never read in this script
for SAMPLE in $(ls *_paired*.fastq.gz | perl -pe 's/\_.*//g' | sort -u)
do
    JOB=$SAMPLE.scr
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
date
${SPADES_BIN}/spades.py --meta -1 ${READS_DIR}/${SAMPLE}_paired_R1.fastq.gz -2 ${READS_DIR}/${SAMPLE}_paired_R2.fastq.gz -o ${READS_DIR}/contigs_spades/contig_${SAMPLE}/
date
EOF
    chmod +x *.scr
done

#Running all scripts#
# Three submission groups: scripts named T*, scripts named R*, and every
# remaining *.scr whose name has no "_" and no "m".
# The R* group requests 16 slots, the other two 15.
for N in $(ls T*.scr); do qsub -pe completenode 15 -l h_vmem=130G "$N"; done
for N in $(ls R*.scr); do qsub -pe completenode 16 -l h_vmem=130G "$N"; done
# "^[RT]" replaces the former "^\R" and "^\T" filters, which used undefined
# backslash escapes; the selected scripts are the same.
for N in $(ls *.scr | grep -v "_" | grep -v "^[RT]" | grep -v "m"); do qsub -pe completenode 15 -l h_vmem=130G "$N"; done

In [None]:
#Creating a directory with links to the contig files from all metagenomes
# The directory must be the one the links below (and spades_map_maker.scr)
# use: mg_hybrid_assb/map_spades. The former mkdir created
# hybrid_assb/map_spades instead, so the ln calls had no target directory.
mkdir -p "$PATH"/solirubrobacter/mg_hybrid_assb/map_spades
# NOTE(review): the glob expects per-sample names contigs_<sample>.fasta;
# the step that produces those names is not shown in this notebook - confirm.
for N in "$PATH"/solirubrobacter/mg_hybrid_assb/contigs_spades/contig_*/contigs_*.fasta; do [ -e "$N" ] || continue; ln -s "$N" "$PATH"/solirubrobacter/mg_hybrid_assb/map_spades/; done

#Make symbolic link to paired trimmed sequences
for N in "$PATH"/solirubrobacter/mg_hybrid_assb/*_paired*; do [ -e "$N" ] || continue; ln -s "$N" "$PATH"/solirubrobacter/mg_hybrid_assb/map_spades/; done

#Map unassembled sequences in spades in order to make a second assembly using Velvet. 
##Script name: spades_map_maker.scr##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (spadesmap_<sample>.scr) for every
# <sample>_paired_R1.fastq.gz in the current directory. Each job maps the
# sample's paired reads back to that sample's SPAdes contigs with BBMap's
# bbwrap.sh and writes <sample>.sam.

CONT=$PATH/solirubrobacter/mg_hybrid_assb/map_spades
# Same "$PATH/" prefix as every other tool path in this pipeline
# (the former "/PATH/bbmap" was missing the "$").
BBWRAP=$PATH/bbmap/bbwrap.sh

COUNT=0

# The glob requires "_paired_R1", so *_unpaired_R1 files are not picked up
# (the former "ls *paired_R1*" matched them as well).
for R1 in *_paired_R1.fastq.gz
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$R1" ] || continue
    FAA=${R1%_paired_R1.fastq.gz}
    # COUNT gives every sample its own bbwrap "build" number.
    COUNT=$((COUNT + 1))
    JOB=spadesmap_$FAA.scr
    # The reference is named contigs_<sample>.fasta, matching the files
    # linked into map_spades and the names used when contigs are merged.
    # NOTE(review): this was "<sample>_contigs.fasta" before - confirm naming.
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
${BBWRAP} ref=${CONT}/contigs_${FAA}.fasta in=${CONT}/${FAA}_paired_R1.fastq.gz in2=${CONT}/${FAA}_paired_R2.fastq.gz build=${COUNT} out=${FAA}.sam kfilter=22 subfilter=15 maxindel=80
EOF
    chmod +x "$JOB"
done

#Generating scripts# (typed in the shell, not part of the generator)
bash spades_map_maker.scr

In [None]:
#Get Spades unmapped reads
##Script name: get_unspa_maker.scr (Cuallicua) ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (get_un_spa_<sample>.scr) for every
# <sample>.sam in the current directory. Each job activates the concoct_env
# conda environment, extracts the read pairs with both mates unmapped
# (-f 12, secondary alignments removed with -F 256) and converts them to
# FASTQ with samtools bam2fq, writing <sample>_ump.fastq.

SEQS=$PATH/solirubrobacter/mg_hybrid_assb/map_spades
SAMTOOLS=$PATH/anaconda3/envs/concoct_env/bin/samtools

for SAM in *.sam
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$SAM" ] || continue
    # Strip only the trailing ".sam" (the former sed 's/.sam//g' was
    # unanchored and its "." matched any character).
    FAA=${SAM%.sam}
    JOB=get_un_spa_$FAA.scr
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
source activate concoct_env
${SAMTOOLS} view -u -f 12 -F 256 ${SEQS}/${FAA}.sam | ${SAMTOOLS} bam2fq - > ${FAA}_ump.fastq
source deactivate
EOF
    chmod +x "$JOB"
done

#Separation of unmapped reads into forward and reverse
# Headers ending in "/1" go to <sample>_um_r1.fastq, "/2" to <sample>_um_r2.fastq.
# The decision is taken only on the first line of each 4-line FASTQ record,
# so a quality line that happens to start with "@" and end in "/1" or "/2"
# can no longer be mistaken for a header (possible with grep -A 3).
# Assumes strict 4-line records, one line each for header, sequence, "+", quality.
for FQ in *_ump.fastq; do
    [ -e "$FQ" ] || continue
    FAA=${FQ%_ump.fastq}
    awk 'NR % 4 == 1 { keep = /^@.*\/1$/ } keep' "$FQ" > "${FAA}_um_r1.fastq"
    awk 'NR % 4 == 1 { keep = /^@.*\/2$/ } keep' "$FQ" > "${FAA}_um_r2.fastq"
done

In [None]:
#Assembly Velvet
#Creating a symbolic link to no mapped sequences in Spades

# Make sure the target directory exists before linking the unmapped mates into it.
mkdir -p "$PATH"/solirubrobacter/mg_hybrid_assb/velvet
ln -s "$PATH"/solirubrobacter/mg_hybrid_assb/map_spades/*_um_r*.fastq "$PATH"/solirubrobacter/mg_hybrid_assb/velvet/

#Velvet assembly of reads that did not map to Spades contigs
##Script name: velvet_maker.scr (Cuallicua) ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Generator: writes one SGE job script (velvet_<sample>.scr) for every
# <sample>_um_r1.fastq in the current directory. Each job runs velveth
# (hash length 31, -shortPaired -fastq) on the sample's unmapped R1/R2 reads
# and then velvetg (-exp_cov 2 -ins_length 350) on the resulting
# <sample>_assemblyVelvet directory.

# Same directory the unmapped reads are linked into and the Velvet contigs
# are later collected from (was "hybrid_assb/velvet", which no other step uses).
SEQS=$PATH/solirubrobacter/mg_hybrid_assb/velvet
BIN=$PATH/binc/velvet

for R1 in *_um_r1.fastq
do
    # An unmatched glob stays literal; skip it so no bogus job is written.
    [ -e "$R1" ] || continue
    FAA=${R1%_um_r1.fastq}
    JOB=velvet_$FAA.scr
    # The read files are given relative to the directory the job is submitted
    # from (jobs run with -cwd).
    # NOTE(review): R1 and R2 are passed as two files without velveth's
    # "-separate" option; check against the installed Velvet version whether
    # the mates are paired as intended.
    # "\$" keeps the SGE directive prefix literal inside the here-document.
    cat >"$JOB" <<EOF
#!/bin/bash
#\$ -cwd
#\$ -j y
#\$ -S /bin/bash
${BIN}/velveth ${SEQS}/${FAA}_assemblyVelvet 31 -shortPaired -fastq ${FAA}_um_r1.fastq ${FAA}_um_r2.fastq
${BIN}/velvetg ${SEQS}/${FAA}_assemblyVelvet -exp_cov 2 -ins_length 350
EOF
    chmod +x "$JOB"
done

#Running all the scripts#
# Only velvet_<sample>.scr job scripts are submitted. The former "ls velvet_*"
# also matched the generator velvet_maker.scr and any other file starting
# with "velvet_" (e.g. job output files left by an earlier run).
for N in velvet_*.scr; do
    [ -e "$N" ] || continue
    [ "$N" = velvet_maker.scr ] && continue
    qsub -pe completenode 10 -l h_vmem=80G "$N"
done

# Collect links to the SPAdes and Velvet contig files in all_contigs/.
mkdir -p "$PATH"/solirubrobacter/mg_hybrid_assb/all_contigs
for N in "$PATH"/solirubrobacter/mg_hybrid_assb/contigs_spades/contig_*/contigs_*.fasta; do [ -e "$N" ] || continue; ln -s "$N" "$PATH"/solirubrobacter/mg_hybrid_assb/all_contigs/; done
# NOTE(review): the "*_v_*" pattern implies the Velvet contig files were
# renamed per sample beforehand; that step is not shown here - confirm.
for N in "$PATH"/solirubrobacter/mg_hybrid_assb/velvet/*_assemblyVelvet/*_v_*; do [ -e "$N" ] || continue; ln -s "$N" "$PATH"/solirubrobacter/mg_hybrid_assb/all_contigs/; done

In [None]:
#Spades and Velvet contigs were merged
#Ran in /hybrid_assb/all_contigs
# For every SPAdes file contigs_<sample>.fasta, concatenate it with the Velvet
# file contigs_v_<sample>.fasta into assh_merge/<sample>_hb_contigs.fasta.
ALL_CONTIGS=$PATH/solirubrobacter/mg_hybrid_assb/all_contigs
MERGED=$PATH/solirubrobacter/mg_hybrid_assb/assh_merge
mkdir -p "$MERGED"
for SPADES in contigs_*.fasta
do
    [ -e "$SPADES" ] || continue
    # Strip exactly the "contigs_" prefix and the ".fasta" suffix; the former
    # sed patterns were unanchored and could also cut inside the sample name.
    FAA=${SPADES#contigs_}
    FAA=${FAA%.fasta}
    # Velvet files (contigs_v_<sample>.fasta) are inputs, not samples. Only a
    # leading "v_" is excluded, so sample names that merely contain "v_" stay.
    case "$FAA" in
        v_*) continue ;;
    esac
    cat "$ALL_CONTIGS/contigs_$FAA.fasta" "$ALL_CONTIGS/contigs_v_$FAA.fasta" > "$MERGED/${FAA}_hb_contigs.fasta"
done

#Remove contigs shorter than 100 bp
#Ran in $PATH/solirubrobacter/mg_hybrid_assb/assh_merge/
# remove_small.pl is called with the threshold 100 and the merged contig
# file; its output is captured as <sample>_hb100_contigs.fasta.
# The suffix is stripped with an anchored parameter expansion instead of an
# unanchored sed pattern whose "." matched any character.
for MERGED in *_hb_contigs.fasta; do
    [ -e "$MERGED" ] || continue
    FAA=${MERGED%_hb_contigs.fasta}
    perl "$PATH"/scripts/remove_small.pl 100 "$MERGED" > "${FAA}_hb100_contigs.fasta"
done

#Hybrid contigs were indexed according to sample name
# header.fasta.numbers.pl receives the sample name and the filtered file.
# NOTE(review): where the script writes its result is not visible here.
for FILTERED in *_hb100_contigs.fasta; do
    [ -e "$FILTERED" ] || continue
    i=${FILTERED%_hb100_contigs.fasta}
    perl "$PATH"/scripts/header.fasta.numbers.pl "$i" "$FILTERED"
done