# 4. Reannotate DRAM proteins based on foldseek hits

**Context**: DRAM was unable to annotate some proteins (it labels these "rank E"). This notebook reads in the foldseek annotations and uses them to replace the missing DRAM annotations.


In [197]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from os.path import exists

# ---- Configuration: output table and input files (relative to this notebook) ----
PATH_out_reannotations_table = "reannot_all_prot_from_contigs_with_rhodopsins.tsv"
PATH_dram_rhod_contigs_FAA = "../3_foldseek_on_neighbor_AAs/0_all_prot_from_contigs_with_rhodopsins.faa"
PATH_foldseek_annots = "../3_foldseek_on_neighbor_AAs/2_best_annot_foldseek_hits.tsv"

# Load the tab-separated foldseek best-hit table and discard its 'Unnamed: 0'
# column (presumably a row index written out by an earlier to_csv -- verify).
DF_fs = pd.read_csv(PATH_foldseek_annots, sep="\t").drop(columns=['Unnamed: 0'])

##### I. Derive new annots from foldseek hits
print("##### I. Derive new annots from foldseek hits")

# Format Pfam info
# e.g. "PF09140;PF01656" -> "[PF09140;PF01656]"; a missing value becomes the literal "[nan]"
DF_fs['TMP'] = "[" + DF_fs['fs0_Pfam'].astype(str) + "]"
# Blank out the "[nan]" placeholder produced by missing Pfam info.
# regex=False makes this a literal match on every pandas version: the default of
# `regex` changed from True to False in pandas 2.0, so an escaped regex pattern
# would silently stop matching there.
DF_fs['TMP'] = DF_fs['TMP'].str.replace('[nan]', '', regex=False)
# e.g. "[PF09140;PF01656]" -> "[PF09140,PF01656]"
DF_fs['TMP'] = DF_fs['TMP'].str.replace(';', ',', regex=False)

# Contextualize description with its bitscore
# A missing fs1 bitscore is reported as 0.
DF_fs['TMP2'] = (
    DF_fs['fs1_description']
    + " bit-" + DF_fs['fs1_bits'].fillna(0).astype(int).astype(str)
    + "/" + DF_fs['fs0_bits'].astype(str)
)
# ^ e.g. "Segregation protein B bit-242/264"
# where "bit-242/264" means that description is from a hit with bitscore 242, while the best hit had a bitscore of 264
# Rationale: To know whether our description is from the best hit, and IF NOT, how its bitscore differs from the best hit

# Combine description+bitscore and Pfam info -> col 'new_annot'
# The separating space is only added when there is Pfam info, so annotations
# without Pfam do not end in a stray trailing space.
SER_pfam_suffix = (" " + DF_fs['TMP']).where(DF_fs['TMP'] != "", "")
DF_fs['new_annot'] = DF_fs['TMP2'] + SER_pfam_suffix
# e.g. "Segregation protein B bit-242/264 [PF09140,PF01656]"
# NOTE: rows whose 'fs1_description' is missing end up with a missing 'new_annot'.

print(str(len(DF_fs))+ " foldseek annotations added")

# Lookup: query protein id -> new annotation string (if a query id is repeated, the last row wins)
DICT_proteinID2newAnnot = dict(zip(DF_fs['query'], DF_fs['new_annot']))
print("Loaded new annotations into dictionary")


###### II. Consider the original DRAm annotations

print("\n\n###### II. Consider the original DRAm annotations")


# Get metrics about the original Dram annotations

# Read the protein FASTA once, keeping for every record id its full header
# line ('description') and its amino-acid sequence.
DICT_seqid2description={}
DICT_seqid2AA={}
for record in SeqIO.parse(PATH_dram_rhod_contigs_FAA, "fasta"):
    DICT_seqid2description[record.id]=record.description
    DICT_seqid2AA[record.id]=str(record.seq)

# One row per protein: locus_tag | description | AA
SER = pd.Series(DICT_seqid2description, name='description')
SER.index.name = 'locus_tag'
DF_dram_annot = SER.reset_index()

DF_dram_annot['AA'] = DF_dram_annot['locus_tag'].map(DICT_seqid2AA)

# Strip everything before the first ' rank:' so the description starts with "rank: X".
# Splitting directly on ' rank:' (at most once) avoids going through a '~'
# placeholder, which would truncate any header that itself contains '~' or a
# second ' rank:'. Headers without ' rank:' become missing values.
DF_dram_annot['description'] = 'rank:' + DF_dram_annot['description'].str.split(' rank:', n=1, expand=True)[1]


# na=False: a missing description counts as "not this rank" instead of
# producing a NaN mask that .loc cannot index with.
MASK_rankC = DF_dram_annot['description'].str.startswith('rank: C', na=False)
LIST_proteins_rankC = DF_dram_annot.loc[MASK_rankC, 'locus_tag'].to_list()
print(str(len(LIST_proteins_rankC))+ " of the proteins have Dram annot rank C (Good whole-protein annot.)")

MASK_rankD = DF_dram_annot['description'].str.startswith('rank: D', na=False)
LIST_proteins_rankD = DF_dram_annot.loc[MASK_rankD, 'locus_tag'].to_list()
print(str(len(LIST_proteins_rankD))+ " of the proteins have Dram annot rank D (Meh. Domain-level)")

# Rank E is matched exactly: the description is just "rank: E" with nothing after it.
MASK_rankE = DF_dram_annot['description'] == 'rank: E'
LIST_proteins_rankE = DF_dram_annot.loc[MASK_rankE, 'locus_tag'].to_list()
print(str(len(LIST_proteins_rankE))+ " of the proteins have NO Dram annot (a.k.a. 'rank E')")



#### III. Replace the Dram annotations where necessary

print("\n\n###### III. Replace the Dram annotations where necessary")

# Work on a copy so that adding columns here does not silently modify DF_dram_annot.
DF_reannot = DF_dram_annot.copy()

# Foldseek-derived annotation per protein; missing when the protein has no usable foldseek hit.
SER_foldseek_annot = DF_reannot['locus_tag'].map(DICT_proteinID2newAnnot)
SER_rankG_description = 'rank: G; ' + SER_foldseek_annot
MASK_has_foldseek = SER_foldseek_annot.notna()

# na=False: a missing DRAM description is never treated as rank D/E.
MASK_rankE = DF_reannot['description'].str.startswith('rank: E', na=False)
MASK_rankD_or_E = DF_reannot['description'].str.startswith(('rank: D','rank: E'), na=False)

# Only replace a description when a foldseek annotation actually exists.
# Otherwise the original DRAM description is kept; without this guard, proteins
# lacking a foldseek hit would have their description overwritten with NaN.
DF_reannot['new_rankE_description'] = np.where(
    MASK_rankE & MASK_has_foldseek, SER_rankG_description, DF_reannot['description']
)
DF_reannot['new_rankD_and_rankE_description'] = np.where(
    MASK_rankD_or_E & MASK_has_foldseek, SER_rankG_description, DF_reannot['description']
)

print("Saving table to keep track of what will be reannotated")
# An existing table is deliberately never overwritten; say so explicitly so a
# stale file is not mistaken for a fresh one.
if not exists(PATH_out_reannotations_table):
    DF_reannot.to_csv(PATH_out_reannotations_table, sep="\t")
else:
    print("NOTE: " + PATH_out_reannotations_table + " already exists; it was NOT overwritten")

##### I. Derive new annots from foldseek hits
38477 foldseek annotations added
Loaded new annotations into dictionary


###### II. Consider the original DRAm annotations




19537 of the proteins have Dram annot rank C (Good whole-protein annot.)
13644 of the proteins have Dram annot rank D (Meh. Domain-level)
5951 of the proteins have NO Dram annot (a.k.a. 'rank E')


###### III. Replace the Dram annotations where necessary
Saving table to keep track of what will be reannotated


>AG-538-A02_AG-538-A02_contigs_AG-538-A02_NODE_4_6 AG-538-A02 AG-538-A02_contigs_AG-538-A02_NODE_4_6 rank: D; Serine aminopeptidase, S33 [PF12146.11]; alpha/beta hydrolase fold [PF00561.23]; Alpha/beta hydrolase family [PF12697.10]; Prolyl oligopeptidase family [PF00326.24]; TAP-like protein [PF08386.13] (db=pfam)
