# 2. Tabulate hmmscan hits

In [4]:
import pandas as pd
import numpy as np
from os.path import exists

MODE_verbose = False

bitscore_threshold = 10

PATH_out = "retinal_biosynth_gdark_rhodopsinSAGs_minbitscore_"+str(bitscore_threshold)+".csv"

# looked up ncbi Protein Family Models for 2.5.1.29
PATH_1a = "../1_hmmscan_for_carotenoids_and_flotillins/1_crtE/1a_gdark4.hmmscan.txt" # hmmsearch v. NF041003.1.HMM (ncbi accession: NF041003.1)
PATH_1b = "../1_hmmscan_for_carotenoids_and_flotillins/1_crtE/1b_gdark4.hmmscan.txt" # hmmsearch v. NF045549.1.HMM (ncbi accesion: NF045549.1)

# looked up based on ncbi HMMs for term "geranylgeranyl diphosphate synthase" (because no hits for 2.5.1.99)
PATH_2a = "../1_hmmscan_for_carotenoids_and_flotillins/2_crtB/2a_gdark4.hmmscan.txt" # hmmsearch v. NF042419.1.HMM
PATH_2b = "../1_hmmscan_for_carotenoids_and_flotillins/2_crtB/2b_gdark4.hmmscan.txt" # hmmsearch v. NF045686.1.HMM

# looked up ncbi Protein family model for "Phytoene desaturase" with " biological process:carotenoid biosynthetic process"(because no hits for 1.3.99.26)
PATH_3 = "../1_hmmscan_for_carotenoids_and_flotillins/3_crtI/3_gdark4.hmmscan.txt" # hmmsearch v. TIGR02734.1.HMM (ncbi accession: TIGR02734.1)

# Lycopene cyclase. I.e. with EC number of 
PATH_4a = "../1_hmmscan_for_carotenoids_and_flotillins/4_crtY/4a_gdark4.hmmscan.txt" # hmmsearch v. TIGR01789.1.HMM
PATH_4b = "../1_hmmscan_for_carotenoids_and_flotillins/4_crtY/4b_gdark4.hmmscan.txt" # hmmsearch v. TIGR01790.1.HMM

PATH_5 = "../1_hmmscan_for_carotenoids_and_flotillins/5_brp_blh/5_gdark4.hmmscan.txt" # hmmsearch v. TIGR03753.1.HMM

PATH_6 = "../1_hmmscan_for_carotenoids_and_flotillins/6_flotillin/6_gdark4.hmmscan.txt" # hmmsearch v. Flotillin.tagged.hmm

def hmmer_to_DF(path, program="hmmsearch", format="tblout", verbose=False):   
    if format in {"tblout","domtblout"}:
        cut_index = {"tblout":18, "domtblout":22}[format]
        data = list()
        header = list()
        with open(path) as FILE:
            for line in FILE.readlines():
                if line.startswith("#"):
                    header.append(line)
                else:
                    row = list(filter(bool, line.strip().split(" ")))
                    row = row[:cut_index] + [" ".join(row[cut_index:])]
                    data.append(row)
        DF = pd.DataFrame(data)
        if not DF.empty:
            columns = ["target_name","target_accession","query_name","query_accession","e-value","score","bias","best_domain_e-value","best_domain_score","best_domain_bias","exp","reg","clu","ov","env","dom","rep","inc","query_description"]
            DF.columns = columns
            DF['SAG'] = DF['query_name'].str.replace('SCGC_','').str[0:10]
    return DF

def filter_hits_DF(_DF):
    if MODE_verbose == True:
        print(str(len(_DF)) + " hits in hmmer")
        print(str(_DF['score'].astype(float).mean()) + " with mean bitscore")
        _DF1 = _DF.loc[_DF['score'].astype(float)>=80]
        print(str(len(list(_DF1['SAG'].unique()))) + " hits over "+str(80))

        _DF2 = _DF.loc[_DF['score'].astype(float)>=50]
        print(str(len(list(_DF2['SAG'].unique()))) + " hits over "+str(50))

        _DF3 = _DF.loc[_DF['score'].astype(float)>=30]
        print(str(len(list(_DF3['SAG'].unique()))) + " hits over "+str(30))
        
        _DF4 = _DF.loc[_DF['score'].astype(float)>=10]
        print(str(len(list(_DF4['SAG'].unique()))) + " hits over "+str(10))
        
        
        print("Using cutoff "+str(bitscore_threshold))
    
    _DF_threshold = _DF.loc[_DF['score'].astype(float)>=bitscore_threshold]
    print("\n")
    return _DF_threshold

# I. crtE search #############################

print("\n\n\n##################### CrtE #############")

DF_1a = hmmer_to_DF(PATH_1a)
DF_1b = hmmer_to_DF(PATH_1b)

DF_1a = filter_hits_DF(DF_1a)
DF_1b = filter_hits_DF(DF_1b)

print("Pooling hits to different HMMs\n")
DF_1 = pd.concat([DF_1a, DF_1b])
DF_1.insert(loc=0,column='hmmsearching_for',value='CrtE')

print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_1 = list(DF_1['SAG'].unique())
print(str(len(LIST_SAGs_with_1))+ " SAGs with crtE")

DF_1.to_csv("./output/1_crtE_hmmsearch_minbit"+str(bitscore_threshold)+".csv")

# II. crtB #############################

print("\n\n\n##################### CrtB ################")

### HMM
DF_2a = hmmer_to_DF(PATH_2a)
DF_2b = hmmer_to_DF(PATH_2b)

DF_2a = filter_hits_DF(DF_2a)
DF_2b = filter_hits_DF(DF_2b)

print("Pooling hits to different HMMs\n")
DF_2 = pd.concat([DF_2a, DF_2b])
DF_2.insert(loc=0,column='hmmsearching_for',value='CrtB')

print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_2 = list(DF_2['SAG'].unique())

print(str(len(LIST_SAGs_with_2))+ " SAGs with crtB")
DF_2.to_csv("./output/2_crtB_hmmsearch_minbit"+str(bitscore_threshold)+".csv")

# III. crtI #############################
print("\n\n\n##################### CrtI ##################")

### HMM
DF_3 = hmmer_to_DF(PATH_3)
DF_3 = filter_hits_DF(DF_3)

DF_3.insert(loc=0,column='hmmsearching_for',value='CrtI')
print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_3 = list(DF_3['SAG'].unique())

print(str(len(LIST_SAGs_with_3))+ " SAGs with crtI")
DF_3.to_csv("./output/3_crtI_hmmsearch_minbit"+str(bitscore_threshold)+".csv")

# IV. crtY #############################
print("\n\n\n##################### crtY ###################")

### HMM
DF_4a = hmmer_to_DF(PATH_4a)
DF_4b = hmmer_to_DF(PATH_2b)

DF_4a = filter_hits_DF(DF_4a)
DF_4b = filter_hits_DF(DF_4b)

print("Pooling hits to different HMMs\n")
DF_4 = pd.concat([DF_4a, DF_4b])
DF_4.insert(loc=0,column='hmmsearching_for',value='CrtY')

print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_4 = list(DF_4['SAG'].unique())
print(str(len(LIST_SAGs_with_4))+ " SAGs with crtY")
DF_4.to_csv("./output/4_crtY_hmmsearch_minbit"+str(bitscore_threshold)+".csv")

# V. brp/blh #########################
print("\n\n\n##################### blh ###################")


### HMM
DF_5 = hmmer_to_DF(PATH_5)

DF_5 = filter_hits_DF(DF_5)

DF_5.insert(loc=0,column='hmmsearching_for',value='brp/blh')

print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_5 = list(DF_5['SAG'].unique())
print(str(len(LIST_SAGs_with_5))+ " SAGs with brp/blh")
DF_5.to_csv("./output/5_brp_blh_hmmsearch_minbit"+str(bitscore_threshold)+".csv")

# V. flotillin #########################
print("\n\n\n##################### flotillin ###################")


### HMM
DF_6 = hmmer_to_DF(PATH_6)

DF_6 = filter_hits_DF(DF_6)

DF_6.insert(loc=0,column='hmmsearching_for',value='flotillin')

print("How many SAGs had hits (at this bitscore threshold?)")
LIST_SAGs_with_6 = list(DF_6['SAG'].unique())
print(str(len(LIST_SAGs_with_6))+ " SAGs with flotillin")

DF_6.to_csv("./output/6_flotillin_hmmsearch_minbit"+str(bitscore_threshold)+".csv")






##################### CrtE #############




Pooling hits to different HMMs

How many SAGs had hits (at this bitscore threshold?)
582 SAGs with crtE



##################### CrtB ################




Pooling hits to different HMMs

How many SAGs had hits (at this bitscore threshold?)
152 SAGs with crtB



##################### CrtI ##################


How many SAGs had hits (at this bitscore threshold?)
889 SAGs with crtI



##################### crtY ###################




Pooling hits to different HMMs

How many SAGs had hits (at this bitscore threshold?)
633 SAGs with crtY



##################### blh ###################


How many SAGs had hits (at this bitscore threshold?)
153 SAGs with brp/blh



##################### flotillin ###################


How many SAGs had hits (at this bitscore threshold?)
412 SAGs with flotillin


In [9]:
DF_1

Unnamed: 0,target_name,hmmsearching_for,target_accession,query_name,query_accession,e-value,score,bias,best_domain_e-value,best_domain_score,...,exp,reg,clu,ov,env,dom,rep,inc,query_description,SAG
0,GGPP_syn,CrtE,NF041003.1,AM-685-N21_AM-685-N21_contigs_SCGC_AM-685-N21_...,-,7.1e-28,83.6,0.5,8.4e-28,83.3,...,1.0,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase,AM-685-N21
1,GGPP_syn,CrtE,NF041003.1,AH-613-D20_AH-613-D20_contigs_AH-613-D20_NODE_...,-,1.4e-90,289.6,0.9,1.7e-90,289.4,...,1.0,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase,AH-613-D20
2,GGPP_syn,CrtE,NF041003.1,AM-276-J22_AM-276-J22_contigs_AM-276-J22_NODE_...,-,2.6e-56,177.0,0.1,4.2e-56,176.3,...,1.3,1,1,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase,AM-276-J22
3,GGPP_syn,CrtE,NF041003.1,AH-473-P19_AH-473-P19_contigs_AH-473-P19_NODE_2_2,-,5.2e-06,11.7,0.0,8.3e-06,11.0,...,1.2,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase,AH-473-P19
4,GGPP_syn,CrtE,NF041003.1,AH-473-P19_AH-473-P19_contigs_AH-473-P19_NODE_...,-,6.9e-54,169.0,0.0,8.8e-54,168.7,...,1.1,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase,AH-473-P19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,GGPPsyn_IdsB,CrtE,NF045549.1,AM-332-E05_AM-332-E05_contigs_AM-332-E05_NODE_...,-,1.8e-28,85.5,0.0,2.3e-28,85.2,...,1.1,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase IdsB,AM-332-E05
982,GGPPsyn_IdsB,CrtE,NF045549.1,AM-332-E05_AM-332-E05_contigs_AM-332-E05_NODE_...,-,1.7e-40,125.1,0.3,2.3e-40,124.7,...,1.1,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase IdsB,AM-332-E05
983,GGPPsyn_IdsB,CrtE,NF045549.1,AM-332-K08_AM-332-K08_contigs_AM-332-K08_NODE_7_7,-,3.8e-39,120.7,0.2,5e-39,120.2,...,1.2,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase IdsB,AM-332-K08
984,GGPPsyn_IdsB,CrtE,NF045549.1,AM-332-K08_AM-332-K08_contigs_AM-332-K08_NODE_...,-,7.6e-30,90.1,0.1,9.4e-30,89.7,...,1.1,1,0,0,1,1,1,1,NCBIFAM: geranylgeranyl diphosphate synthase IdsB,AM-332-K08


In [None]:


DF_1.['CrtE']
