In [1]:
# Make the parent directory importable so that the project-local `modules`
# package (imported in the next cell) can be resolved from this notebook.

import sys

# Guard the append so re-running this cell in a live kernel does not keep
# stacking duplicate ".." entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import os

import pandas as pd

import plotly.express as px

from modules.hmm import get_hits, get_seqs

In [3]:
# Directory of the run being analysed; the hits table and the sequence file
# loaded below are both resolved relative to this path.
DATA_DIR = "../data/runs/aquificota/2024-09-08/"

In [4]:
# Input files for this run, both located under DATA_DIR.
hits_path = os.path.join(
    DATA_DIR,
    "HighQ_Aquificota_Sequences_AA.fa_hmmer.txt"
)
seqs_path = os.path.join(
    DATA_DIR,
    "HighQ_Aquificota_Sequences_AA.fa"
)

# NOTE(review): get_hits / get_seqs come from modules.hmm; presumably they
# parse the hits table and the sequence file into DataFrames -- confirm there.
hits_df = get_hits(hits_path)
seqs_df = get_seqs(seqs_path)

# Attach the sequence columns to every hit row. The left join keeps all hits,
# including any whose target has no match in seqs_df.
hits_df = pd.merge(
    left=hits_df,
    right=seqs_df.rename(columns={"seq_id": "target_name"}),
    how="left",
    on="target_name"
)

# Create MAG and gene caller ID columns.
# A target name is "<mag>_<gene_caller_id>": everything before the LAST
# underscore is the MAG, everything after it is the gene caller ID.
# str.rpartition does that split once and vectorised (columns 0 / 1 / 2 are
# head / separator / tail), instead of splitting twice and re-joining each
# row through a Python-level apply.
target_parts = hits_df["target_name"].str.rpartition("_")
hits_df["mag"] = target_parts[0]
hits_df["gene_caller_id"] = target_parts[2]

hits_df

Unnamed: 0,target_name,target_accession,query_name,query_accession,e_value_full_seq,score_full_seq,bias_full_seq,e_value_best_dom,score_best_dom,bias_best_dom,...,clu,ov,env,dom,rep,inc,description_of_target,seq,mag,gene_caller_id
0,Persephonella_sp_M17_metabat2_scaf2bin_002_257,-,baker_rubisco_form_IV_alignment,-,8.700000e-108,364.4,0.0,9.800000e-108,364.3,0.0,...,0,0,1,1,1,1,-,MNYIEVTYLLTTKQHVDPEKKAEELAISLSIGGWGDLSENKRKNLE...,Persephonella_sp_M17_metabat2_scaf2bin_002,257
1,Persephonella_sp_A1_metabat2_scaf2bin_131_1263,-,baker_rubisco_form_IV_alignment,-,1.800000e-107,363.4,0.0,2.000000e-107,363.2,0.0,...,0,0,1,1,1,1,-,MNYIEVTYLLTSKKHIEPEKKAEELAISLSIGGWGDLPENKRKKLE...,Persephonella_sp_A1_metabat2_scaf2bin_131,1263
2,Aquificota_bacterium_L_MetaBat_11_1112,-,baker_rubisco_form_IV_alignment,-,1.700000e-104,353.6,0.0,1.900000e-104,353.4,0.0,...,0,0,1,1,1,1,-,MNYIEVTYLLTTKEEINPEEKAKEIAISLSIGGTGDLPPEKIKELE...,Aquificota_bacterium_L_MetaBat_11,1112
3,Hydrogenothermaceae_bacterium_134_614_metabat2...,-,baker_rubisco_form_IV_alignment,-,4.500000e-104,352.2,0.0,5.000000e-104,352.1,0.0,...,0,0,1,1,1,1,-,MNYINVTYLLSSNKKFNVEEKAKRLAEELTIGSENNLRFNPKLKTY...,Hydrogenothermaceae_bacterium_134_614_metabat2...,573
4,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,-,baker_rubisco_form_IV_alignment,-,4.400000e-85,289.7,0.0,4.900000e-85,289.5,0.0,...,0,0,1,1,1,1,-,MNYIEAMYLIISDRKFDIEERAEELKRDVYIWNEKNYISDKERLRN...,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,1167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150951,Aquificaceae_bacterium_T2_maxbin2_scaf2bin_342...,-,metascan_WP_013516312.1,KO:K17052,1.300000e-10,45.2,0.0,1.900000e-10,44.8,0.0,...,0,0,1,1,1,1,-,MRYIGLFLLLTLATVFSGNVENGKKIYDQWCAQCHGYEGDGMGYAE...,Aquificaceae_bacterium_T2_maxbin2_scaf2bin_342,790
1150952,Aquificaceae_bacterium_S141_87_esom_1070,-,metascan_WP_013516312.1,KO:K17052,1.100000e-09,42.3,0.4,1.300000e-09,42.0,0.4,...,0,0,1,1,1,1,-,IHPNYAYDWYPHAKPPYKYPEDWANQYALAYIGGEKVFRKNTFKTP...,Aquificaceae_bacterium_S141_87_esom,1070
1150953,Aquificaceae_bacterium_354_166_metabat1_scaf2b...,-,metascan_WP_013516312.1,KO:K17052,1.300000e-06,32.2,0.0,2.000000e-06,31.6,0.0,...,0,0,1,1,1,1,-,MKYIGLFLLLILSTVFAGNAENGKKIYDQWCAQCHGYEGEGNGYAA...,Aquificaceae_bacterium_354_166_metabat1_scaf2b...,656
1150954,Persephonella_sp_PIR_30_metabat2_scaf2bin_079_...,-,metascan_WP_013516312.1,KO:K17052,5.100000e-03,20.4,0.0,5.800000e-03,20.2,0.0,...,0,0,1,1,1,1,-,MNRGLKAGLLGLSLIAFTATAGEKEFFKYEVINGKYVEGEISADPD...,Persephonella_sp_PIR_30_metabat2_scaf2bin_079,1582


In [5]:
# For every profile (query_name) keep the single hit with the lowest
# full-sequence E-value. idxmin returns the row label of the first minimum in
# each group, so the result has one row per profile.
best_hit_labels = hits_df.groupby("query_name")["e_value_full_seq"].idxmin()

hits_df_min = hits_df.loc[best_hit_labels].reset_index(drop=True)

hits_df_min

Unnamed: 0,target_name,target_accession,query_name,query_accession,e_value_full_seq,score_full_seq,bias_full_seq,e_value_best_dom,score_best_dom,bias_best_dom,...,clu,ov,env,dom,rep,inc,description_of_target,seq,mag,gene_caller_id
0,Aquifex_aeolicus_VF5_1693,-,baker_/export/uec-gs1/pdthomas/panther/famlib/...,-,5.500000e-111,375.7,5.2,6.200000e-111,375.5,5.2,...,0,0,1,1,1,1,-,MAKHVVVIGGGVGGIATAYNLRNLMPDLKITLISDRPYFGFTPAFP...,Aquifex_aeolicus_VF5,1693
1,Aquificota_bacterium_L_MetaBat_131_330,-,baker_Hydrogenase_Vignais_Group_1_alignment,-,2.600000e-231,773.1,0.0,2.900000e-231,773.0,0.0,...,0,0,1,1,1,1,-,MEKRVVVDPITRIEGHLRIEAQLKNGKIEKAYSSGTMVRGIEIILK...,Aquificota_bacterium_L_MetaBat_131,330
2,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,-,baker_Hydrogenase_Vignais_Group_2a_alignment,-,2.700000e-233,779.3,0.0,3.100000e-233,779.0,0.0,...,0,0,1,1,1,1,-,MATKEVKTASETKELHISPVGRVEGDLDVKVIIEDGVVKDAWTEAS...,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,5
3,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,-,baker_Hydrogenase_Vignais_Group_2b_alignment,-,3.300000e-115,389.9,0.0,4.100000e-115,389.6,0.0,...,0,0,1,1,1,1,-,MATKEVKTASETKELHISPVGRVEGDLDVKVIIEDGVVKDAWTEAS...,Hydrogenothermaceae_bacterium_S141_maxbin2_sca...,5
4,Thermovibrio_guaymasensis_DSM_15521_874,-,baker_Hydrogenase_Vignais_Group_3a_alignment,-,1.300000e-112,380.7,0.0,1.500000e-112,380.6,0.0,...,0,0,1,1,1,1,-,MKKVVKIEGIPLTEGHSGLFLKVEEGVIEEGLYYALVPVRGFETLL...,Thermovibrio_guaymasensis_DSM_15521,874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3197,Hydrogenobacter_thermophilus_DRTY6_201601_bins...,-,metascan_YP_002497622,KO:K14028,8.900000e-200,668.7,16.2,1.100000e-199,668.4,16.2,...,0,0,1,1,1,1,-,MKRRAYHRLLFLLILFAVSWVGVSIANPEVEELSKKASLWPAPGRD...,Hydrogenobacter_thermophilus_DRTY6_201601_bins_11,1469
3198,Hydrogenobacter_thermophilus_TK6_390,-,metascan_YP_003205093,KO:K14028,6.200000e-241,804.9,16.2,7.100000e-241,804.7,16.2,...,0,0,1,1,1,1,-,MKRRAYHRLLFLLILSAVSWVGLSIANPQVEELSKKAGLWPAPGRD...,Hydrogenobacter_thermophilus_TK6,390
3199,Hydrogenobacter_thermophilus_RBS10_74_438,-,metascan_YP_007912793,KO:K14028,3.100000e-213,713.4,12.4,3.800000e-213,713.0,12.4,...,0,0,1,1,1,1,-,MKRRAYHRLLFLLILSAVSWVGVSIANPEVEELSKKASLWPAPGRD...,Hydrogenobacter_thermophilus_RBS10_74,438
3200,Persephonella_sp_IF05_L8_1904,-,metascan_tigr03478,KO:K17051,1.800000e-167,560.1,4.5,2.700000e-167,559.5,4.5,...,0,0,1,1,1,1,-,MAKRQLAMVMDLNKCIGCQTCTVACKTQWTNRNGREYMYWNNVETH...,Persephonella_sp_IF05_L8,1904


In [8]:
# Presence/absence matrix: number of best hits per (MAG, profile) pair.
# hits_df_min holds the per-profile best hits selected in the cell above.
heat_df = (
    hits_df_min[["mag", "query_name"]]
    .rename(columns={"query_name": "profile"})
    .value_counts()
    # Name the counts explicitly: pandas < 2.0 returns an unnamed Series here,
    # so reset_index() would create a column called 0 rather than "count" and
    # the pivot below would fail with a KeyError.
    .rename("count")
    .reset_index(drop=False)
)

# Filter by METABOLIC profiles: the text before the first underscore of the
# profile name identifies where the profile comes from.
is_metabolic = heat_df["profile"].str.split("_").str[0] == "metabolic"

# Wide layout for plotting: one row per MAG, one column per profile. Pairs
# that never occur stay NaN.
heat_df = heat_df[is_metabolic].pivot(
    index="mag",
    columns="profile",
    values="count"
)
heat_df

profile,metabolic_Bac_luciferase,metabolic_Cyc1,metabolic_Cyc2_repCluster1,metabolic_Cyc2_repCluster2,metabolic_Cyc2_repCluster3,metabolic_CymA,metabolic_Cys_Met_Meta_PP,metabolic_DFE_0448,metabolic_DFE_0449,metabolic_DFE_0450,...,metabolic_rubisco_form_II_III_alignment,metabolic_rubisco_form_II_alignment,metabolic_rubisco_form_IV_alignment,metabolic_rubisco_form_I_alignment,metabolic_soxC,metabolic_soxD,metabolic_sqr_alignment,metabolic_sulfocyanin,metabolic_sulfur_dioxygenase_sdo_alignment,metabolic_thiosulfate_reductase_phsA_alignment
mag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aquifex_aeolicus_HyVt_501,,,,,,,,,,,...,,,,,,,,,,
Aquifex_aeolicus_SZUA_1413,,,,,,,,,,,...,,,,,,,,,,
Aquifex_aeolicus_VF5,,,,,,,,,,,...,,,,,,,1.0,,,
Aquifex_sp_S012_82_esom,,,,,,,,,,,...,,,,,,,,,,
Aquificaceae_bacterium_354_166_metabat1_scaf2bin_127,,,,,,,,,,,...,,,,,,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Thermocrinis_sp_2016_B02_str_C_8,,,,,,,,,,,...,,,,,,,,,,
Thermovibrio_guaymasensis_DSM_15521,,,,,,,,,,,...,,,,,,,,,,
Thermovibrio_sp_S012_127_esom,,,,,,,,,,,...,,,,,,,,,,
Venenivibrio_stagnispumantis_DSM_18763,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Heatmap of the MAG x profile matrix. The colour scale is anchored at 0;
# zmax, width and height are not set, so plotly picks them.
fig = px.imshow(heat_df, zmin=0, template="simple_white")

# Small font so that the many MAG / profile tick labels remain legible.
fig.update_layout(font={"size": 7})

fig.show()