# topDIAMOND
* Generate a table of the top DIAMOND BLASTP hit for each gene in a reference proteome
* These tables were generated for the Athaliana (ARAPORT) and Zmays (v4) proteomes
* For each table, 45 other reference proteomes were used to generate the top hits
* For all proteomes, the canonical representative sequences and sequence IDs were pre-generated

In [112]:
import glob
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import gzip
import os
from collections import OrderedDict
import pandas as pd

# Every gzip-compressed file in the working directory is treated as a proteome
# FASTA (the pattern is '*.gz', so any other .gz file here is picked up too).
fasta_list = glob.glob('*.gz')

# Map each FASTA file name (first CSV column) to its short name (second column).
names_dict = pd.read_csv(
    "name_mapping.csv", header=None, index_col=0, sep=","
)[1].to_dict()

# The reference proteome is used as the DIAMOND query, so take it out of the
# list of target proteomes. Swap the two assignments below to switch reference.
#ref = "Zmays_493_RefGen_V4.protein_primaryTranscriptOnly.canonical.fa.gz"
ref = "Athaliana_447_Araport11.protein_primaryTranscriptOnly.canonical.fa.gz"
fasta_list.remove(ref)

In [113]:
import subprocess

# Build one column of top DIAMOND BLASTP hits per target proteome (rows are the
# reference sequence IDs), then join all columns and write the table to CSV.
hit_columns = []
for count, fl in enumerate(fasta_list, start=1):
    print("Processing:", count, fl)

    # Argument lists (no shell) keep unusual file names from breaking the
    # command, and check=True aborts on a DIAMOND failure instead of silently
    # re-reading the temp.tsv left over from the previous proteome.
    subprocess.run(
        ["diamond", "makedb", "--in", fl, "--db", "temp"],
        check=True,
    )
    # Run blastp while returning only the top hit and report unaligned queries
    subprocess.run(
        [
            "diamond", "blastp", "-d", "temp", "-q", ref, "-o", "temp.tsv",
            "--max-target-seqs", "1", "--unal", "1", "--quiet",
        ],
        check=True,
    )

    hits = pd.read_csv("temp.tsv", sep="\t", index_col=0, header=None)
    # Some queries (about 100) come back with more than one row; keeping the
    # first row per query restores one row per reference sequence.
    # NOTE(review): presumably several HSPs against the same top target
    # (see DIAMOND --max-hsps) -- TODO confirm.
    hits = hits[~hits.index.duplicated(keep="first")]

    # Column 1 of the tabular output is the subject (hit) ID.
    column = hits[1].rename(names_dict[fl])

    # pd.concat(axis=1) outer-joins on the index and fills gaps with NaN rather
    # than raising, so the "same set of query IDs" assertion is made explicit.
    if hit_columns:
        mismatched = column.index.symmetric_difference(hit_columns[0].index)
        if len(mismatched) > 0:
            raise ValueError(
                f"{fl}: {len(mismatched)} query IDs differ from the first "
                "proteome's result"
            )
    hit_columns.append(column)

if not hit_columns:
    raise ValueError("No target proteomes to process: fasta_list is empty")

df = pd.concat(hit_columns, axis=1)
df.index.name = names_dict[ref]
df.to_csv("topDIAMOND_"+df.index.name+".csv")

Processing: 1 Zm-CML247-REFERENCE-NAM-1.0_Zm00023ab.1.protein.canonical.fa.gz
Processing: 2 Phallii_590_v3.2.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 3 Zm-CML228-REFERENCE-NAM-1.0_Zm00022ab.1.protein.canonical.fa.gz
Processing: 4 Zm-CML277-REFERENCE-NAM-1.0_Zm00024ab.1.protein.canonical.fa.gz
Processing: 5 Zmays_493_RefGen_V4.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 6 ZmaysPH207_443_v1.1.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 7 Zm-Ky21-REFERENCE-NAM-1.0_Zm00031ab.1.protein.canonical.fa.gz
Processing: 8 Zm-HP301-REFERENCE-NAM-1.0_Zm00027ab.1.protein.canonical.fa.gz
Processing: 9 Zm-Oh7B-REFERENCE-NAM-1.0_Zm00038ab.1.protein.canonical.fa.gz
Processing: 10 Gmax_508_Wm82.a4.v1.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 11 Bhybridum_463_v1.1.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 12 Slycopersicum_691_ITAG4.0.protein_primaryTranscriptOnly.canonical.fa.gz
Processing: 13 Zm-CML333-REFERENCE-NAM-1.0_Zm00026ab