In [None]:
import subprocess
import io
import gzip
import os

import pandas as pd

from collections import OrderedDict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# Get the gene list from the reference fasta file

In [None]:
# Build the gene list from the reference FASTA: one record per gene id,
# keeping the longest sequence whenever an id occurs more than once.
# NOTE: `fld` and `ref` are assigned in the following cell; on a fresh kernel
# that cell has to be run before this one.
od = OrderedDict()

# The context manager closes the gzip handle once all records are consumed.
with gzip.open(fld[ref], mode='rt') as fasta_file:
    record_iterator = SeqIO.parse(fasta_file, "fasta")
    for record in record_iterator:
        # Collapse ids to the part before the first underscore.
        record.id = str(record.id).split("_")[0]
        record.name = ""
        record.description = ""

        kept = od.get(record.id)
        if kept is None or len(record.seq) > len(kept.seq):
            od[record.id] = record

# Empty frame indexed by the collapsed ids; RBH columns are joined on later.
rbh_df = pd.DataFrame(index=list(od))

In [None]:
import os

def index_fasta_files(directory="."):
    """Map genome labels to the gzipped FASTA file names found in a directory.

    A file is picked up when its name ends with "a.gz". The label is the
    second dash-separated field of the file name.

    Parameters
    ----------
    directory : str
        Directory to scan (defaults to the current working directory).

    Returns
    -------
    dict
        ``{label: file_name}``; the file names are bare names, not joined
        with ``directory``.
    """
    index = {}
    # Sorted so the mapping (and anything derived from its order) is
    # reproducible; os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith("a.gz"):
            continue
        fields = file_name.split("-")
        if len(fields) < 2:
            # No dash-separated label in the name: skip instead of raising
            # IndexError on fields[1].
            continue
        index[fields[1]] = file_name
    return index


# make a file name dictionary
fld = index_fasta_files(".")
name_list = []

# generate a diamond db for every fasta.gz file
# Each archive is decompressed to stdout and piped into `diamond makedb`;
# the database is named after the genome label (the key in `fld`).
for fl in fld.keys():
    makedb_cmd = (
        "gunzip --keep --stdout {0} | "
        "diamond makedb --in /dev/stdin --db {1}"
    ).format(fld[fl], fl)
    # check=True raises CalledProcessError when the pipeline exits non-zero,
    # instead of silently continuing with a missing database.
    subprocess.run(makedb_cmd, shell=True, check=True)

# blast every non-reference fasta.gz file against the reference fasta.gz
ref = "B73"

# Column names given to the three diamond output fields.
headers = ["query", "subject", "bitscore"]


def run_diamond_blastp(query_gz, db):
    """Run `diamond blastp` for a gzipped FASTA against a diamond database.

    Parameters
    ----------
    query_gz : str
        Path of the gzipped FASTA file used as query.
    db : str
        Name of the diamond database to search.

    Returns
    -------
    pandas.DataFrame
        One row per reported hit with the columns listed in ``headers``;
        an empty frame with those columns when diamond reports no hits.

    Raises
    ------
    subprocess.CalledProcessError
        If the shell pipeline exits with a non-zero status.
    """
    # /dev/stdout only works if provided with permission: sudo chown -R $USER /dev
    # NOTE(review): no --max-target-seqs limit is passed, so a query may come
    # back with several hits - confirm this is intended for an RBH analysis.
    command = (
        "gunzip --keep --stdout {0} | "
        "diamond blastp --quiet -p 32 -d {1} -q /dev/stdin -o /dev/stdout "
        "--more-sensitive --outfmt 6 qseqid sseqid bitscore"
    ).format(query_gz, db)
    output = subprocess.check_output(command, shell=True, text=True)

    if not output.strip():
        # read_csv raises EmptyDataError on empty input; return an empty
        # table so a genome without hits does not abort the whole loop.
        return pd.DataFrame(columns=headers)

    hits = pd.read_csv(io.StringIO(output), sep="\t", header=None)
    hits.columns = headers
    return hits


def reciprocal_best_hits(fwd, rev, ref_label, query_label):
    """Reduce forward and reverse hit tables to reciprocal pairs.

    Parameters
    ----------
    fwd : pandas.DataFrame
        Hits of the reference sequences against the other genome
        (columns ``query``, ``subject``, ``bitscore``).
    rev : pandas.DataFrame
        Hits of the other genome against the reference (same columns).
    ref_label, query_label : str
        Names given to the reference and the other genome in the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by reference sequence id (index name ``ref_label``) with a
        single column ``query_label`` holding the paired sequence id.
    """
    # https://widdowquinn.github.io/2018-03-06-ibioic/02-sequence_databases/05-blast_for_rbh.html
    # Pair each forward hit with the reverse hits of its subject.
    merged = pd.merge(fwd, rev[["query", "subject"]],
                      left_on="subject", right_on="query", how="outer")

    # Keep only rows where the reverse hit points back at the forward query.
    reciprocal = merged.loc[merged.query_x == merged.subject_y]

    # For reference sequences with several reciprocal pairs, keep the pair
    # with the highest forward bitscore.
    # NOTE(review): duplicates are removed on the reference side only, so one
    # sequence of the other genome can still be paired with several reference
    # sequences - confirm whether a strict 1:1 mapping is required.
    reciprocal = reciprocal.sort_values(by="bitscore", axis=0, ascending=False)
    reciprocal = reciprocal.drop_duplicates(subset=["subject_y"])

    pairs = reciprocal[["query_x", "subject_x"]].rename(
        columns={"query_x": ref_label, "subject_x": query_label})
    return pairs.set_index(ref_label)


for fl in fld.keys():
    # Skip ref fasta
    if fl == ref:
        continue

    # Run Fwd comparison - ref vs. query
    fwd = run_diamond_blastp(fld[ref], fl)

    # Run Reciprocal Rev comparison - query vs ref
    rev = run_diamond_blastp(fld[fl], ref)

    rbbh = reciprocal_best_hits(fwd, rev, ref, fl)

    # join the results with the main rbh_df; a column left over from an
    # earlier run of this cell is dropped first so that re-running the cell
    # does not fail on overlapping column names.
    # NOTE(review): rbh_df is indexed by ids truncated at the first "_" while
    # rbbh is indexed by the ids diamond reports - confirm they match.
    rbh_df = rbh_df.drop(columns=[fl], errors="ignore").join(rbbh, how='outer')

In [None]:
# Display the combined table: one column per non-reference file, joined in by the loop above.
rbh_df

# Backup code

In [None]:
# To save diamond results as tsv file
# Uses `ref`, `fl` and `fld` as they were left by the cells above: the FASTA
# of `fl` is searched against the `ref` database and the hits are written to
# temp/<ref>-<fl>.tsv.
outf = "{0}-{1}.tsv".format(ref, fl)

# diamond writes into temp/; make sure the directory exists first.
os.makedirs("temp", exist_ok=True)

save_cmd = (
    "gunzip --keep --stdout {0} | "
    "diamond blastp -p 32 -d {1} -q /dev/stdin -o temp/{2} "
    "--more-sensitive --outfmt 6 qseqid sseqid bitscore"
).format(fld[fl], ref, outf)
# check=True surfaces a failing pipeline instead of ignoring its exit status.
subprocess.run(save_cmd, shell=True, check=True)

In [None]:
# Collect the names of the tsv result files stored under ./temp.
# A comprehension is used so the global `fl` (read by the cell that saves the
# diamond results) is not overwritten, and the names are sorted because
# os.listdir returns them in arbitrary order.
name_list = sorted(
    file_name
    for file_name in os.listdir("./temp")
    if file_name.split(".")[-1] == "tsv"
)
print(name_list)

In [None]:
# Read the two saved diamond result tables (tab-separated, no header row)
# into pandas dataframes.
tsv_options = {"sep": "\t", "header": None}
fwd_results = pd.read_csv('temp/Mo17-B73.tsv', **tsv_options)
rev_results = pd.read_csv('temp/B73-Mo17.tsv', **tsv_options)