In [None]:
#Completeness evaluation for each reference genome#

In [None]:
# Run the CheckM lineage_wf workflow: the first argument is the directory holding the
# reference genomes, the second is the directory that receives the results.
# NOTE(review): $PATH looks like a placeholder for a base directory throughout this
# notebook, not the shell's executable search path - confirm and substitute the real
# location before running.
checkm lineage_wf $PATH/solirubrobacter/genomas_ncbi/genomes $PATH/solirubrobacter/genomas_ncbi/checkm_results

In [None]:
#PHYLOGENOMIC ANALYSIS#

In [None]:
# --- 16S rRNA gene search with barrnap ---
# barrnap was run once per genome (Strain1 shown as the example); its GFF output is saved.
$PATH/mbin/quast-5.0.0/quast_libs/barrnap/bin/barrnap -k bac Strain1_genomic.fna > Strain1_rRNA.gff

# For every genome, pull the sequence of each feature listed in its GFF into <strain>_16S.fasta.
for strain in $(ls *_genomic.fna | sed s'/_genomic.fna//g'); do
    $PATH/bedtools2/bin/bedtools getfasta \
        -fi "${strain}_genomic.fna" \
        -bed "${strain}_rRNA.gff" \
        -fo "${strain}_16S.fasta"
done
# The <strain>_16S.fasta files were then edited by hand to remove the 23S and 5S sequences.

# Pool the 16S sequences of all genomes into one file.
cat *_16S.fasta > soli_outgroups_16S.fasta

In [None]:
# --- 16S phylogeny ---
# Align the pooled 16S sequences, mask the alignment, then export it as aligned FASTA.
aln_dir=soli_outgroups_align
/usr/local/bin/ssu-align soli_outgroups_16S.fasta $aln_dir
/usr/local/bin/ssu-mask $aln_dir/
/usr/local/bin/ssu-mask --stk2afa $aln_dir
# The resulting *.afa file was inspected with Seaview.

# Build the tree with RAxML: 40 threads, 1000 rapid-bootstrap replicates, GTRGAMMA model,
# fixed seeds (31416) for both the bootstrap and the parsimony start.
$PATH/raxmlHPC-PTHREADS \
    -T 40 \
    -f a \
    -x 31416 \
    -p 31416 \
    -# 1000 \
    -m GTRGAMMA \
    -s soli_outgroups_align.bacteria.afa \
    -n 16Sout
# The resulting tree was visualized with iTOL.

In [None]:
# --- Genome Similarity Score (GSS) analysis ---
# An in-house script first renamed every sequence to StrainX_sequenceY, producing one
# StrainX_edited.faa file per genome. The list of those files drives every GSS step.
ls *_edited.faa > my_genomes

# Run the three GSS scripts, in order, on the same genome list.
for gss_step in gss_blast gss_bbh gss_comp; do
    $gss_step my_genomes
done

# Result: a table with the similarity value of every pairwise comparison and a newick
# file with a dendrogram.

In [None]:
# --- Average nucleotide identity (ANI) ---
# reference_list is a text file naming the files of all genomes to be compared.
# Each genome is used as the query against that list; results go to <genome file>.ani.
for genome in $(ls *.fna); do
    $PATH/fastani-1.32-he1c1bb9_0/bin/fastANI \
        -q $genome \
        --rl reference_list \
        -o $genome.ani
done

In [None]:
# --- Core genome with GET_HOMOLOGUES ---
gh_dir=$PATH/mbin/get_homologues

# Cluster the proteomes found in ./proteomas.
$gh_dir/get_homologues.pl -d ./proteomas -c -t0 -G

# Compare the resulting cluster directory and write the pangenome matrix to sample_intersection.
$gh_dir/compare_clusters.pl \
    -o sample_intersection \
    -m -T \
    -d $PATH/solirubrobacter/proteomas_homologues/Strain1protein_f0_0taxa_algCOG_e0_/

# Parse the pangenome matrix; this yields pangenome_matrix_t0__core_list.txt.
$gh_dir/parse_pangenome_matrix.pl -m sample_intersection/pangenome_matrix_t0.tab -s -x
## Using the list pangenome_matrix_t0__core_list.txt, all core sequences were moved to a new
## directory: $PATH/solirubrobacter/sample_intersection/core_seqs/ ##
# --- align_maker.scr: write one MAFFT job script per core-gene FASTA file ---

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# Every *.faa file except those whose name matches ".1.faa" gets a job script algn_<name>.sh
# that aligns <name>.faa with mafft and stores the alignment as a_<name>.
for core_gene in $(ls *.faa | grep -v ".1.faa" | sed 's/.faa//g'); do
    cat > algn_$core_gene.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
mafft $core_gene.faa > a_$core_gene
EOF
    chmod +x *.sh
done

## The resulting alignments were concatenated with Seaview under the name big_align_core.aln ##
# NOTE(review): the command below reads big_align_core.fst, not big_align_core.aln - confirm
# which file name is the real one (or whether a format conversion step is missing here).
# Same RAxML settings as the 16S tree, but with the protein model PROTGAMMALG.
$PATH/raxmlHPC-PTHREADS \
    -T 40 \
    -f a \
    -x 31416 \
    -p 31416 \
    -# 1000 \
    -m PROTGAMMALG \
    -s big_align_core.fst \
    -n output_raxml
# The resulting tree was visualized with iTOL.

In [None]:
#CONSTRUCTION OF REFERENCE PANGENOME USING ROARY#

In [None]:
# --- Annotation with Prokka (repeated for every genome; Strain1 shown) ---
prokka \
    --kingdom Bacteria \
    --outdir prokka_Strain1 \
    --genus Solirubrobacter \
    --locustag Strain1 \
    $PATH/solirubrobacter/genomas_ncbi/Strain1_genomic.fna

# --- Pangenome with Roary on all GFF files, 50% identity threshold ---
$PATH/bin/roary -f output -e -n -i 50 -v ./*.gff

# Tree from the core gene alignment, then the standard Roary plots.
# NOTE(review): Roary is told to write to "output", FastTreeMP reads the alignment from the
# current directory, and the plotting step reads from "./output_outgroup" - confirm the
# directory in which each command was run.
$PATH/anaconda3/bin/FastTreeMP -nt -gtr core_gene_alignment.aln > tree_core.nwk
python roary_plots.py \
    ./output_outgroup/tree_core.nwk \
    ./output_outgroup/gene_presence_absence.csv
Rscript create_pan_genome_plots.R

# --- Translate the pangenome so it can be uploaded to KEGG ---
pangenome_dir=$PATH/solirubrobacter/pangenome
$PATH/transeq $pangenome_dir/pan_genome_reference.fa $pangenome_dir/pan_genome_reference_prot.faa

In [None]:
#Solirubrobacter RECRUITMENT FROM METAGENOMES#

In [None]:
## Convert the metagenome reads from FASTQ to FASTA ##
seqret -sequence reads.fastq -outseq reads.fasta

## Drop the redundant ".fastq" from "<name>.fastq.fasta", then move the FASTA files ##
for i in *.fasta; do
    NEWNAME="${i/%.fastq.fasta/.fasta}"
    # Rename only when the suffix was really present; for any other *.fasta file the new
    # name equals the old one and mv would fail trying to move a file onto itself.
    if [ "$i" != "$NEWNAME" ]; then
        mv "$i" "$NEWNAME"
    fi
done

# Destination spelled "solirubrobacter" (was "solurobrobacter") so the files land in the
# tblastn directory that the database and search steps use.
mv *.fasta $PATH/solirubrobacter/tblastn

## Build one nucleotide BLAST database per metagenome so tblastn can be run against it ##
## Script name: db_maker.scr ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# For every <name>.fasta in the current directory, write the job script dbmk_<name>.sh
# containing the makeblastdb call for that file.
for metagenome in $(ls *.fasta | sed 's/.fasta//g'); do
    cat > dbmk_$metagenome.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
makeblastdb -in $metagenome.fasta -dbtype nucl
EOF
    chmod +x *.sh
done

## tblastn with each metagenome as database and the pangenome as query ##
## Script name: $PATH/solirubrobacter/tblastn/search_soli.scr ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

SEQS=$PATH/solirubrobacter/tblastn

# One job script search_<name>.sh per <name>.fasta. The tabular output (<name>.tsv) uses a
# custom column order:
#   1 qseqid  2 qlen  3 sseqid  4 slen  5 qseq  6 evalue  7 bitscore  8 pident
#   9 length  10 mismatch  11 gapopen  12 qstart  13 qend  14 sstart  15 send
for metagenome in $(ls *.fasta | sed 's/.fasta//g'); do
    cat > search_$metagenome.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
tblastn  -query $SEQS/solirubrobacter.rep.fna -db $metagenome.fasta -out $metagenome.tsv -outfmt '6 qseqid qlen sseqid slen qseq evalue bitscore pident length mismatch gapopen qstart qend sstart send'
EOF
    chmod +x *.sh
done

# Generate the job scripts ...
bash search_soli.scr
# ... and submit all of them to the queue.
for job in $(ls search_*.sh); do
    qsub $job
done

## The searches produce one <name>.tsv per metagenome with the reads that align to the
## Solirubrobacter pangenome. Column order of those files (custom -outfmt of the search):
##   1 qseqid  2 qlen  3 sseqid  4 slen  5 qseq  6 evalue  7 bitscore  8 pident
##   9 length  10 mismatch  11 gapopen  12 qstart  13 qend  14 sstart  15 send

## Step 1 - keep, for every query, the hit(s) with the highest bitscore, then one line per query ##
# Fixes relative to the earlier version of this step:
#   * perl -e instead of perl -pe: with -p the implicit read loop consumed the first hit line
#     before the explicit while(<>) started, so that line was never evaluated.
#   * score column index 6 (bitscore, 7th field) instead of 11, which is qstart in this format.
# NOTE(review): run this before any *_best.simple.tsv exists, otherwise "ls *.tsv" picks
# those files up as well.
for FAA in `ls *.tsv | sed -e 's/.tsv//g'`; do
    perl -e '
        $name_col  = 0;   # qseqid
        $score_col = 6;   # bitscore
        while (<>) {
            s/\r?\n//;
            @F = split /\t/, $_;
            ($n, $s) = @F[$name_col, $score_col];
            if (! exists($max{$n})) { push @names, $n };
            if (! exists($max{$n}) || $s > $max{$n}) { $max{$n} = $s; $best{$n} = () };
            if ($s == $max{$n}) { $best{$n} .= "$_\n" };
        }
        for $n (@names) { print $best{$n} }
    ' "$FAA".tsv > best
    # Ties on bitscore: keep only the first line seen for each query id.
    perl -e ' $column=0; $unique=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; if (! ($save{$F[$column]}++)) { print "$_\n"; $unique++ } } ' best > "$FAA"_best_uniq
    rm best
done

## Step 2 - keep only hits with more than 60% identity ##
# pident is field 8 in this output format (field 3 is the subject id).
for N in `ls *_best_uniq | sed -e 's/\_best_uniq//g'`; do
    awk -F "\t" '{if ($8>60) print $0}' "$N"_best_uniq > "$N"_pident_m5nr.bout
done

## Step 3 - simplified table: qseqid, pident, evalue, bitscore, sseqid ##
# Reads the files written by step 2 (the previous glob, *_pident_best_uniq, matched nothing).
# The field numbers are the custom-format positions of the columns that 1,3,11,12,2 select
# in the default BLAST tabular layout.
for i in `ls *_pident_m5nr.bout | sed -e 's/\_pident_m5nr.bout//g'`; do
    awk -F "\t" '{print $1"\t"$8"\t"$6"\t"$7"\t"$3}' "$i"_pident_m5nr.bout > "$i"_best.simple.tsv
done

## Extract the metagenomic reads and build a FASTA file for assembly ##
## First: collect the subject sequence ids (third field of each *_best_uniq file) ##
for sample in $(ls *_best_uniq | sed 's/_best_uniq//g'); do
    cut -f3 "${sample}_best_uniq" > "${sample}_id.txt"
done

## Pull the subject sequences listed in <name>_id.txt out of the original read files ##
## Script name: filter_soli.scr ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

SEQS=$PATH/solirubrobacter/metagenomes

# One job script filter_<name>.sh per <name>.fasta; each runs QIIME's filter_fasta.py on
# $SEQS/<name>.fastq with the id list <name>_id.txt and writes <name>_soli.fastq.
for metagenome in $(ls *.fasta | sed 's/.fasta//g'); do
    cat > filter_$metagenome.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
/home/qiime/bin/filter_fasta.py -f $SEQS/$metagenome.fastq -o ${metagenome}_soli.fastq -s ${metagenome}_id.txt
EOF
    chmod +x *.sh
done

# Generate the job scripts ...
bash filter_soli.scr

# ... and submit all of them.
for job in $(ls filter_*.sh); do
    qsub $job
done

## The resulting files contain only sequences from Solirubrobacter ##
## Rename them so the mate tag comes last: <name>_R1_soli.fastq -> <name>_soli_R1.fastq ##

for mate in R1 R2; do
    for fq in *_${mate}_soli.fastq; do
        mv "$fq" "${fq/%_${mate}_soli.fastq/_soli_${mate}.fastq}"
    done
done



# Quality filtering of the Solirubrobacter reads with Trimmomatic (paired and unpaired output).
## Script name: trimm_maker.scr ##
## Run in the tblastn directory, where the fastq files with Solirubrobacter reads are found ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

FILES=$PATH/solirubrobacter/tblastn
SEQS=$PATH/solirubrobacter/trimming
BIN=$PATH/bin/Trimmomatic-0.36

# Sample prefixes are taken from the R1 files by removing the known "_soli_R1.fastq" suffix.
# Cutting at the first underscore (as before) truncated sample names that contain one,
# e.g. "Soil_1" became "Soil", and the generated job then pointed at files that do not exist.
for FAA in `ls *_soli_R1.fastq | sed 's/_soli_R1\.fastq$//' | sort | uniq`
do
    # Each job gets its own trim log; a single shared trim.log would be overwritten by
    # jobs running at the same time. The adapter file is resolved through $BIN rather than
    # through a user-specific absolute path.
    cat > trim_$FAA.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
java -jar $BIN/trimmomatic-0.36.jar PE -threads 2 -phred33 -trimlog $SEQS/${FAA}_trim.log $FILES/${FAA}_soli_R1.fastq $FILES/${FAA}_soli_R2.fastq $SEQS/${FAA}_s_paired_R1.fastq.gz $SEQS/${FAA}_s_unpaired_R1.fastq.gz $SEQS/${FAA}_s_paired_R2.fastq.gz $SEQS/${FAA}_s_unpaired_R2.fastq.gz ILLUMINACLIP:$BIN/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
EOF
    chmod +x *.sh
done

# Submit all the generated job scripts.
for N in `ls trim_*.sh`; do qsub $N; done

# Convert the trimmed, paired fastq files to fasta.
# Script name: convert_fastqfasta.scr, located in $PATH/solirubrobacter/trimming

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

SEQS=$PATH/solirubrobacter/trimming

COUNT=0
# One job script conv_<name>.sh per <name>.fastq.gz: decompress, convert with seqret,
# then compress the fastq again.
for reads in $(ls *_s_paired_R*.fastq.gz | sed 's/.fastq.gz//g'); do
    cat > conv_$reads.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
gunzip $SEQS/$reads.fastq.gz
/usr/bin/seqret -sequence $SEQS/$reads.fastq -outseq $SEQS/$reads.fasta
gzip $SEQS/$reads.fastq
EOF
    chmod +x *.sh
done


# Generate the job scripts ...
bash convert_fastqfasta.scr

# ... and submit all of them.
for job in $(ls conv_*R*.sh); do
    qsub $job
done

# Concatenate the R1 and R2 reads of every sample into <sample>_s_reads.fasta.
# Sample names are derived from the R1 files only, so each sample is processed once
# (listing both R1 and R2 files yielded every sample twice and repeated the work).
for N in `ls *_s_paired_R1.fasta | sed 's/_s_paired_R1.fasta//g'`; do
    cat $N"_s_paired_R1.fasta" $N"_s_paired_R2.fasta" > $N"_s_reads.fasta"
done

# The *_s_reads.fasta files contain the Solirubrobacter reads, one file per metagenome.
# Collapse them by environment.
# NOTE(review): rhizo_s_reads.fasta itself matches r*_s_reads.fasta, so delete it before
# re-running this line.
cat r*_s_reads.fasta > rhizo_s_reads.fasta
cat Soil_1_s_reads.fasta Soil_2_s_reads.fasta Soil_3_s_reads.fasta Soil_4_s_reads.fasta Soil_5_s_reads.fasta Soil_6_s_reads.fasta > soil_s_reads.fasta
# The same was done for lakes, phyllospheres, foreign soils, foreign rhizospheres, and sediments.

In [None]:
# --- Finding Solirubrobacter in metagenomes ---
# Run promer with the reference pangenome against the Solirubrobacter reads of each
# soil metagenome set; promer writes out.delta, which is renamed per input.
for N in `ls soil*_s_reads.fasta | sed 's/_s_reads.fasta//g'`; do
    promer pan_genome_reference.fa $N"_s_reads.fasta"
    mv out.delta "pangenome_"$N".delta"
done

# Turn every delta file into a coordinates table and draw the edited promer graph.
# The glob is pangenome_soil*.delta so it matches everything the loop above can produce,
# including pangenome_soil.delta (the previous pattern required an underscore after "soil"
# and skipped that file).
for N in `ls pangenome_soil*.delta | sed 's/.delta//g'`; do
    show-coords -c -d -k -l -r -T $N".delta" > $N".show-coords.tsv"
    python3 promer_deid.py $N".show-coords.tsv" $N
done
# promer_deid.py is an in-house script.

In [None]:
#CREATING ENVIRONMENTAL EXTENDED PANGENOME (EEP)#

In [None]:
# Search for Solirubrobacter contigs that contain a reference sequence and, adjacent to it,
# a gene not found in the reference pangenome.
# Using the script bbmap_maker.scr

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# One job script bbmap_<sample>.scr per assembly (<sample>_hb100_contigs.fasta.numbered.fas).
# Each maps the pangenome reference sequences (in=) onto the contigs (ref=) with k=14 and
# minid=0.6 and writes coverage statistics plus mapped/unmapped alignments.
# The alignments are written as SAM text (<sample>_m.sam / <sample>_um.sam) because the
# contig-id retrieval step reads <sample>_m.sam with grep; the former .bam output was never
# read by any later step.
for FAA in `ls *_hb100_contigs.fasta.numbered.fas | sed -e 's/_hb100_contigs.fasta.numbered.fas//g'`; do
    cat > bbmap_$FAA.scr <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
$PATH/binc/bbmap/bbmap.sh ref=${FAA}_hb100_contigs.fasta.numbered.fas nodisk in=pan_genome_reference.fa covstats=${FAA}_p_constats.txt covhist=${FAA}_p_covhist.txt basecov=${FAA}_p_basecov.txt bincov=${FAA}_p_bincov.txt outm=${FAA}_m.sam outu=${FAA}_um.sam k=14 minid=0.6
EOF
    chmod +x *.scr
done

# Retrieve the ids and sequences of the contigs that received a mapping.

# From every <sample>_m.sam, take the lines that start with "S" and keep whitespace-separated
# fields 4 and 5.
# NOTE(review): which SAM columns these are depends on whether the read names contain
# spaces - confirm that fields 4 and 5 hold the contig identifier.
for sample in $(ls *_m.sam | sed 's/_m.sam//g'); do
    awk '/^S/ {print $4, $5}' "${sample}_m.sam" > "${sample}_scontigs_id.txt"
done

# Use those ids as fixed-string, whole-word patterns to pull each matching header line and
# the line after it out of the numbered contig file.
for sample in $(ls *_hb100_contigs.fasta.numbered.fas | sed 's/_hb100_contigs.fasta.numbered.fas//g'); do
    grep -A1 -Fwf "${sample}_scontigs_id.txt" "${sample}_hb100_contigs.fasta.numbered.fas" \
        > "${sample}_scontigs.fasta"
done

# --- Filtering the selected contigs ---

# Protein BLAST database of the translated reference pangenome.
makeblastdb -in $PATH/solirubrobacter/contig_search/pan_genome_reference_prot.faa -dbtype prot

#For false_postivies.scr in metagenome soli contigs:

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

SEQS=$PATH/solirubrobacter/contig_search
COUNT=0

# One job script fp_<sample>.scr per <sample>_scontigs.fasta: blastx of the contigs against
# the pangenome proteins (e-value 1e-5, tabular output with comment lines) written to
# <sample>_falsepos.bout.
for sample in $(ls *_scontigs.fasta | sed 's/_scontigs.fasta//g'); do
    COUNT=$((COUNT+1))
    cat > fp_$sample.scr <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
blastx -db $SEQS/pan_genome_reference_prot.faa  -query $SEQS/${sample}_scontigs.fasta -outfmt 7 -evalue 1e-5 -num_alignments 500 -num_threads 4 -out ${sample}_falsepos.bout
EOF

    chmod +x *.scr
done

# Number of contigs without any hit, printed once per blastx report.
for report in $(ls *_falsepos.bout); do
    grep -cw "0 hits found" $report
done

# Names of the contigs without hits: the "# Query: " comment line sits within the two
# lines before each "0 hits found" line.
for sample in $(ls *_falsepos.bout | sed 's/_falsepos.bout//g'); do
    grep -w -B2 "0 hits found" "${sample}_falsepos.bout" \
        | grep "# Query: " \
        | sed 's/# Query: //g' \
        > "${sample}_id_contig_REMOVE.txt"
done

# Each <sample>_id_contig_REMOVE.txt lists the contigs that were false positives and must
# be left out of the new pangenome.

# Drop those contigs (-n negates the id list) to obtain <sample>_scontigs_filt.fasta.
for sample in $(ls *_scontigs.fasta | sed 's/_scontigs.fasta//g'); do
    /home/qiime/bin/filter_fasta.py \
        -f "${sample}_scontigs.fasta" \
        -o "${sample}_scontigs_filt.fasta" \
        -n \
        -s "${sample}_id_contig_REMOVE.txt"
done

In [None]:
# Rename the sequences of every reference genome with header.fasta.numbers.pl (rename_seq.scr).
for strain in $(ls *_genomic.fna | sed 's/_genomic.fna//g'); do
    perl $PATH/scripts/header.fasta.numbers.pl $strain "${strain}_genomic.fna"
done

# Keep only the first word of each header line; sequence lines pass through unchanged.
for strain in $(ls *_genomic.fna.numbered.fas | sed 's/_genomic.fna.numbered.fas//g'); do
    awk '$1 ~ ">" { print $1; next } { print }' "${strain}_genomic.fna.numbered.fas" \
        > "${strain}_scontigs.fasta"
done

# A symbolic link to every *numbered* file from Solirubrobacter was created in the
# pangenome_new directory in Cuallicua.

ln -s $PATH/solirubrobacter/genomas_ncbi/*scontigs.fasta ./
ln -s $PATH/solirubrobacter/pangenome/S*.fasta ./

# Gene prediction with Prodigal in metagenomic mode on every T*_scontigs.fasta file:
# writes <name>_sgenes.fasta (-o) and the protein translations <name>_sproteins.faa (-a).
for sample in $(ls T*_scontigs.fasta | sed 's/_scontigs.fasta//g'); do
    $PATH/anaconda3/bin/prodigal \
        -i "${sample}_scontigs.fasta" \
        -o "${sample}_sgenes.fasta" \
        -a "${sample}_sproteins.faa" \
        -p meta
done

# Concatenate all predicted proteomes (*.faa) and the reference genome files with modified headers.
# File names in this block consistently use the "solirubrobacter_new_filt" prefix; earlier
# the clustering and representative-set steps read or wrote "solirubrobacter_new.*" files
# that no step in this workflow produces or consumes.
# NOTE(review): solirubrobacter_new_filt.faa matches *.faa, so delete it before re-running
# the next line.
cat *.faa Ssoli.fasta Staibaiensis.fasta SpURHD.fasta SpCPCC.fasta Spauli.fasta Sginsenosidimutans.fasta Sphytolaccae.fasta > solirubrobacter_new_filt.faa

# Clustering with CD-HIT at 50% identity using the script cdhit_50.scr

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
cd-hit -i solirubrobacter_new_filt.faa -o soli_new_50 -n 3 -c 0.50 -aS 0.7 -d 0  -M 30000 -T 10 >  soli_new_50_filt.cdhit.out


# Convert the soli_new_50.clstr file to the OTU-map layout QIIME expects
# (one line per cluster: cluster number, then the tab-separated member ids).
perl -pne 's/\t//g;s/^.*,//g;s/\.\.\..*$//g;s/\n/\t/g;s/\>Cluster\ /\n/g;s/\>//g; eof && do{chomp; print "$_ \n"; exit}' soli_new_50.clstr > solirubrobacter_new_filt.otu

# Extract the representative sequence of each cluster and count them.
pick_rep_set.py -i solirubrobacter_new_filt.otu -f solirubrobacter_new_filt.faa -o solirubrobacter_new_filt.rep.fna
grep -c ">" solirubrobacter_new_filt.rep.fna

# Obtain the ids of the representative sequences: take the header lines, keep their second
# word, and put ">" back in front of it.
grep ">" solirubrobacter_new_filt.rep.fna > identif_new_soli
awk '{if ($1~">"){print $2} else {(":");print}}' identif_new_soli > identif_new_edit
sed 's/^/>/g' identif_new_edit > id_new_edit

# Generate the count table with QIIME using the script count_table.scr
# Input: the OTU map solirubrobacter_new_filt.otu. Output: a BIOM table and its
# tab-separated conversion.

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
/home/qiime/bin/make_otu_table.py -i solirubrobacter_new_filt.otu -o solirubrobacter_new_filt.biom
/home/qiime/bin/biom convert --to-tsv -i solirubrobacter_new_filt.biom -o solirubrobacter_new_filt.tsv

# The file solirubrobacter_new_filt.tsv is analyzed further in R. PLEASE REFER TO THE CORRESPONDING FILE.

In [None]:
#GENE FAMILIES SEARCH#

In [None]:
# --- Search for gene families ---
# The file solirubrobacter_new_filt.otu was duplicated and renamed solirubrobacter_clsr.otu.
# The steps below edit that copy in place and depend on their order:
# 1. remove every underscore;
sed -i 's/_//g' solirubrobacter_clsr.otu
# 2. save the first tab-separated column (the cluster id) before digits are stripped;
cut -f1 solirubrobacter_clsr.otu > clstr_id 
# 3. remove every digit from the copy (this also blanks the cluster id column);
sed -i 's/[0-9]//g' solirubrobacter_clsr.otu
# 4. put the saved cluster ids back as a new first column;
paste -d"\t" clstr_id solirubrobacter_clsr.otu > solirubrobacter_clstr.otu
# 5. drop the first line of both the table and the id list.
#    NOTE(review): presumably that first line is empty in the OTU file - confirm.
sed -i '1d' solirubrobacter_clstr.otu
sed -i '1d' clstr_id

# solirubrobacter_clstr.otu now holds, for every cluster (id in the first column), the names
# of the metagenomes/genomes that contribute a sequence to it. The intermediate copy is removed.
rm solirubrobacter_clsr.otu

# Count, for every cluster (one line of solirubrobacter_clstr.otu), how many fields are
# exactly equal to each name listed in the file "lista".
# The name is handed to awk with -v: inside the single-quoted awk program the shell never
# expanded $N, so the comparison was against the literal text "$N" and every count was 0.
for N in `cat lista`; do
    awk -F' ' -v name="$N" '{ count=0; for (i=1; i<=NF; i++) if ($i == name) count++; print count }' solirubrobacter_clstr.otu > $N.genefam.counts.txt
done

# Add the cluster number as first column of each count file.
for N in `ls *.genefam.counts.txt | sed 's/.genefam.counts.txt//g'`; do
    paste -d"\t" clstr_id $N.genefam.counts.txt > $N.genefam.counts.tsv
done
rm *.genefam.counts.txt

# Add a header line to each table.
for N in `ls *.genefam.counts.tsv | sed 's/.genefam.counts.tsv//g'`;
do
    awk -F "\t" 'BEGIN {print "Cluster Counts"} {print $0}' $N.genefam.counts.tsv > $N.tmp
    mv $N.tmp $N.genefam.counts.tsv;
done

# In the tables whose name starts with "S", replace the word "Counts" by the table's own name.
for N in `ls S*genefam.counts.tsv | sed 's/.genefam.counts.tsv//g'`; do
    sed -i 's/Counts/'$N'/g' $N.genefam.counts.tsv
done

# Look up the annotation of each gene family in the annotation table of the pangenome.
fam_dir=$PATH/Drive/Solirubrobacter/gene_fam
annot_table=$PATH/Drive/Solirubrobacter/pangenome_cuallicua/pangenome_new/filtered/pangenome_annot.tsv

for fam in $(ls gene_*.tsv | sed 's/gene_//g' | sed 's/.tsv//g'); do
    id_list=$fam_dir/id_$fam
    # First column of gene_<fam>.tsv, wrapped as "RepSeq_<value>_" ...
    cut -f1 $fam_dir/gene_$fam.tsv > $id_list
    sed -i -e 's/^/RepSeq_/g' -e 's/$/_/g' $id_list
    # ... and used as fixed-string patterns against the annotation table.
    grep -Ff $id_list $annot_table > $fam_dir/gene${fam}_annot.tsv
done

In [None]:
#SEARCH FOR PLANT-MICROORGANISM INTERACTION GENES#

In [None]:
# Search with the Actinobacteria plant-microorganism interaction HMMs.
# A similar script was run for every file.
# hmmsearch -A <file> saves the alignment of all hits to <file>. Each job now writes
# directly to its final results_* file: previously both jobs wrote to a file named "3" in
# the working directory and relied on a later rename, so the second job could overwrite
# the first job's alignment before it was renamed.

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
$PATH/anaconda3/bin/hmmsearch -A results_one_soils $PATH/solirubrobacter/hmm_databases/orthogroups_database_actinobacteria_one.hmm $PATH/solirubrobacter/hmm_databases/soils_s_reads.fasta > $PATH/solirubrobacter/hmm_databases/soils_hmm/soils_pm


#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
$PATH/anaconda3/bin/hmmsearch -A results_two_soils $PATH/solirubrobacter/hmm_databases/orthogroups_database_actinobacteria_two.hmm $PATH/solirubrobacter/hmm_databases/soils_s_reads.fasta > $PATH/solirubrobacter/hmm_databases/soils_hmm/soils_pm2

In [None]:
#SEARCH FOR SULFUR METABOLISM PROTEINS#

In [None]:
# Build a nucleotide BLAST database from the contigs of every metagenome (script db_maker.scr).

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

# For every <name>.fasta.numbered.fas, write the job script dbmk_<name>.sh containing the
# makeblastdb call for that file.
for assembly in $(ls *.fasta.numbered.fas | sed 's/.fasta.numbered.fas//g'); do
    cat > dbmk_$assembly.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
makeblastdb -in $assembly.fasta.numbered.fas -dbtype nucl
EOF
    chmod +x *.sh
done

## tblastn with the metagenome contigs as database and the sulfur proteins as query ##
## Sulfur proteins, NCBI Reference Sequences: NP_417244.1 and WP_012265056.1 ##
## Script name: search_sulph.scr ##

#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash

SEQS=$PATH/solirubrobacter/mg_hybrid_assb/assh_merge

# One job script search_<name>.sh per <name>.fasta.numbered.fas. The tabular output
# (<name>.tsv) uses a custom column order:
#   1 qseqid  2 qlen  3 sseqid  4 slen  5 qseq  6 evalue  7 bitscore  8 pident
#   9 length  10 mismatch  11 gapopen  12 qstart  13 qend  14 sstart  15 send
for assembly in $(ls *.fasta.numbered.fas | sed 's/.fasta.numbered.fas//g'); do
    cat > search_$assembly.sh <<EOF
#!/bin/bash
#$ -cwd
#$ -j y
#$ -S /bin/bash
tblastn  -query $SEQS/sulph_prot_miss.fasta -db $assembly.fasta.numbered.fas -out $assembly.tsv -outfmt '6 qseqid qlen sseqid slen qseq evalue bitscore pident length mismatch gapopen qstart qend sstart send'
EOF
    chmod +x *.sh
done

# Generate the job scripts ...
bash search_sulph.scr

# ... and submit all of them.
for job in $(ls search_*.sh); do
    qsub $job
done

## The searches produce one <name>_hb100_contigs.tsv per metagenome with the contig
## sequences that align to the sulfur metabolism proteins. Column order of those files
## (custom -outfmt of the search):
##   1 qseqid  2 qlen  3 sseqid  4 slen  5 qseq  6 evalue  7 bitscore  8 pident
##   9 length  10 mismatch  11 gapopen  12 qstart  13 qend  14 sstart  15 send

## Step 1 - keep, for every query, the hit(s) with the highest bitscore, then one line per query ##
# Fixes relative to the earlier version of this step:
#   * perl -e instead of perl -pe: with -p the implicit read loop consumed the first hit line
#     before the explicit while(<>) started, so that line was never evaluated.
#   * score column index 6 (bitscore, 7th field) instead of 11, which is qstart in this format.
for FAA in `ls *_hb100_contigs.tsv | sed -e 's/_hb100_contigs.tsv//g'`; do
    perl -e '
        $name_col  = 0;   # qseqid
        $score_col = 6;   # bitscore
        while (<>) {
            s/\r?\n//;
            @F = split /\t/, $_;
            ($n, $s) = @F[$name_col, $score_col];
            if (! exists($max{$n})) { push @names, $n };
            if (! exists($max{$n}) || $s > $max{$n}) { $max{$n} = $s; $best{$n} = () };
            if ($s == $max{$n}) { $best{$n} .= "$_\n" };
        }
        for $n (@names) { print $best{$n} }
    ' $FAA"_hb100_contigs.tsv" > best
    # Ties on bitscore: keep only the first line seen for each query id.
    perl -e ' $column=0; $unique=0; while(<>) { s/\r?\n//; @F=split /\t/, $_; if (! ($save{$F[$column]}++)) { print "$_\n"; $unique++ } } ' best > "$FAA"_best_uniq
    rm best
done

## Step 2 - keep only hits with more than 50% identity (pident is field 8) ##
# NOTE(review): the earlier comment on this step said 60% while the command used 50 -
# confirm the intended cut-off.
for N in `ls *_best_uniq | sed -e 's/\_best_uniq//g'`; do
    awk -F "\t" '{if ($8>50) print $0}' "$N"_best_uniq > "$N"_pident.bout
done

## Step 3 - simplified table: qseqid, pident, evalue, bitscore, sseqid ##
# Reads the files written by step 2 (the previous glob, *_pident_best_uniq, matched nothing).
# The field numbers are the custom-format positions of the columns that 1,3,11,12,2 select
# in the default BLAST tabular layout.
for i in `ls *_pident.bout | sed -e 's/\_pident.bout//g'`; do
    awk -F "\t" '{print $1"\t"$8"\t"$6"\t"$7"\t"$3}' "$i"_pident.bout > "$i"_best.simple.tsv
done