# 8. Finalize Table S8 by adding sequences

### Goals:
1. Get start/end coords of rhodopsins -> add to Table S8
2. Add protein (faa) and CDS sequences for rhodopsins and flanking flotillins/ferredoxins -> Table S8
3. *Almost forgot!* Add 'retinal_binding_residue' and 'has_retinal_binding' -> Table S8 

In [11]:
import pandas as pd
import numpy as np
from glob import glob
import os
from os.path import exists
import shutil
from Bio import SeqIO
import sys

# Output: path the finalized Table S8 is written to (written twice below: once
# after section VII, and again after the retinal-binding columns are merged in)
PATH_out = "../Table_S8_Rhodopsin_sequences_metadata_and_gene_neighborhoods.tsv"

# Inputs
# Rhodopsin metadata table from step 7; this is the table being extended here
PATH_rhod_meta = "../7_integrate_alphafold_assessment/GORG_dark_rhodopsin_metadata_with_phylogeny_and_alphafold3.tsv" # This is the table that we're improving here
# Retinal-binding residue calls, merged in at the very end (section VIII)
PATH_retinal_CSV = "./input_retinal_binding_residue/retinal_binding_residue.csv"
# Operon presence/absence table; a glob pattern is used because the filename
# contains a date that can change. Exactly one match is required downstream.
PATTERN_operons_CSV = "../7_integrate_alphafold_assessment/rhodopsin_operon_presence_absence*csv"  # Using a pattern because the date in the filename can change
# Re-annotated proteins from rhodopsin-bearing contigs (table + FASTA)
PATH_annot_rhodopsin_neighbors_TSV = "../4_reannotate_dramAAs_based_on_foldseek_annot/reannot_all_prot_from_contigs_with_rhodopsins.tsv"
PATH_rhod_neighbors_FAA = "../4_reannotate_dramAAs_based_on_foldseek_annot/reannot_Erank_proteins_from_rhodopsin_contigs.fasta"
# Combined DRAM output: annotation table, nucleotide CDS FASTA, protein FASTA
PATH_dram_all_annots = "../0_get_DRAM_AAs_for_SAGs_w_rhodopsins/dram_926_SAGs_combined.csv"
PATH_dram_all_CDS = "../0_get_DRAM_AAs_for_SAGs_w_rhodopsins/dram_926_SAGs_combined.fna"
PATH_dram_all_FAA = "../0_get_DRAM_AAs_for_SAGs_w_rhodopsins/dram_926_SAGs_combined.faa"

print("Starting")
### Add sequence columns to the rhodopsin metadata table
# Each call adds one column, named one of: 'rhodopsin_protein(DRAM)', 'rhodopsin_CDS(DRAM)', 'flanking_ferredoxin_protein(DRAM)', 'flanking_ferredoxin_CDS(DRAM)', 'flanking_flotillin_protein(DRAM)', or 'flanking_flotillin_CDS(DRAM)'
def add_col_with_sequence(DF_in,CDS_or_protein,rhodopsin_ferredoxin_flotillin):
    """Add one sequence column to the rhodopsin metadata table.

    The new column is named '<gene>_<CDS|protein>(DRAM)', with a 'flanking_'
    prefix for ferredoxin and flotillin. Each cell holds whatever value the
    sequence dictionary stores for that gene (the caller builds these as
    '>seqID' + newline + sequence), or NaN when no sequence applies.

    Reads these module-level globals, which must be defined before calling:
      * LIST_rhodopsin_IDs / LIST_ferredoxin_IDs / LIST_flotillin_IDs
        (sequence IDs expected to be found; used only for the progress report)
      * DICT_dramCDS_ID2string / DICT_dramFAA_ID2string (seqID -> sequence text)
      * DICT_rhodID_2_ferredoxinID / DICT_rhodID_2_flotillinID
        (rhodopsin ID -> ID of its flanking neighbor)

    Args:
        DF_in: DataFrame with a 'rhodopsin_gene' column.
        CDS_or_protein: 'CDS' or 'protein'.
        rhodopsin_ferredoxin_flotillin: 'rhodopsin', 'ferredoxin' or 'flotillin'.

    Returns:
        DF_in itself, with the new column added. NOTE: the input frame is
        modified in place (no copy is made); existing callers rely on the
        returned object only, but any alias of DF_in sees the new column too.

    Raises:
        SystemExit: if either selector argument has an unsupported value.
    """

    # Check input arguments
    if CDS_or_protein not in ['CDS','protein']:
        sys.exit("ERROR, argument 2 of add_col_with_sequence() must be 'CDS' or 'protein'")
    if rhodopsin_ferredoxin_flotillin not in ['rhodopsin', 'ferredoxin', 'flotillin']:
        sys.exit("ERROR, argument 3 of add_col_with_sequence() must be 'rhodopsin', 'ferredoxin', 'flotillin'")

    is_rhodopsin = rhodopsin_ferredoxin_flotillin == 'rhodopsin'

    # What's the new column's name?
    newColName = rhodopsin_ferredoxin_flotillin + "_" + CDS_or_protein + "(DRAM)"
    if not is_rhodopsin:
        newColName = "flanking_" + newColName

    # Pick the query IDs and, for flanking genes, the rhodopsinID -> neighborID map.
    # Globals are looked up lazily so that only the ones needed must exist.
    DICT_rhod2neighbor = None
    if is_rhodopsin:
        LIST_IDS_to_find = LIST_rhodopsin_IDs
    elif rhodopsin_ferredoxin_flotillin == 'ferredoxin':
        LIST_IDS_to_find = LIST_ferredoxin_IDs
        DICT_rhod2neighbor = DICT_rhodID_2_ferredoxinID
    else:
        LIST_IDS_to_find = LIST_flotillin_IDs
        DICT_rhod2neighbor = DICT_rhodID_2_flotillinID

    # Pick the target dictionary: seqID -> sequence text
    if CDS_or_protein == 'CDS':
        DICT_to_search = DICT_dramCDS_ID2string
    else:
        DICT_to_search = DICT_dramFAA_ID2string

    # Report how many query IDs have a matching key in the target dictionary.
    # Membership is tested on the dict itself (hash lookup) rather than on a
    # freshly built list of all its keys for every ID.
    print("\t"+str(len(LIST_IDS_to_find)) + " " + CDS_or_protein + "s to find")
    count_found = sum(1 for ID in LIST_IDS_to_find if ID in DICT_to_search)
    print("\t"+str(count_found) + " found")

    #### Finally, add the new column and populate it with the sequence:
    _DF = DF_in  # same object: the column is added to the caller's frame

    if is_rhodopsin:
        # Every rhodopsin row is expected to have a protein and CDS to retrieve
        _DF[newColName] = DF_in['rhodopsin_gene'].map(DICT_to_search)
    else:
        # Not every rhodopsin has a flanking ferredoxin/flotillin. Series.map
        # yields NaN for keys missing from a dict, so rows without a neighbor
        # (or whose neighbor has no sequence) are left empty automatically.
        _DF[newColName] = DF_in['rhodopsin_gene'].map(DICT_rhod2neighbor).map(DICT_to_search)

    # Count populated cells directly; this also works when none are populated
    count_populated = int(_DF[newColName].notna().sum())
    print("\t"+str(count_populated) + " rows of the rhodopsin metadata table now have a "+CDS_or_protein+" added for "+rhodopsin_ferredoxin_flotillin)

    return _DF



print("### I.  Subsetting dataset to work only with rhodopsin-bearing SAGs.")
print("\tOtherwise, using full GORG Dark takes 10 min for each combined dataframe to load.")

# Rhodopsin metadata table that the rest of the notebook extends
DF_rhod_meta = pd.read_csv(PATH_rhod_meta, sep="\t")

# Combined DRAM annotation table. Its gene-ID column arrives named '...1'
# (presumably an artifact of how the CSV was exported - verify upstream);
# rename it once so it can be joined on 'rhodopsin_gene' later.
DF_annot = pd.read_csv(PATH_dram_all_annots)
DF_annot = DF_annot.rename(columns={'...1':'rhodopsin_gene'})

# Keep only annotation rows for genes located on a rhodopsin-bearing contig
LIST_contigs_with_rhodopsin = DF_rhod_meta['rhodopsin_scaffold'].unique().tolist()
print("\tDropping all genes not on the " +str(len(LIST_contigs_with_rhodopsin)) +" contigs with rhodopsin")
DF_annot = DF_annot.loc[DF_annot['scaffold'].isin(LIST_contigs_with_rhodopsin)]
print("\t"+str(len(DF_annot))+ " annotation rows remaining.\n")



print("### II.  Reading tables -> dataframes. Fastas -> dictionaries.")

# Exactly one operon presence/absence CSV must match the glob pattern
LIST_operon_CSV = glob(PATTERN_operons_CSV)
if len(LIST_operon_CSV) != 1:
    sys.exit("ERROR. Expected exactly 1 file matching '" + PATTERN_operons_CSV + "' but found " + str(len(LIST_operon_CSV)))
DF_operons = pd.read_csv(LIST_operon_CSV[0])

# Rhodopsin IDs overall, and those whose operon row flags a flanking flotillin / 2Fe-2S ferredoxin
LIST_rhodopsin_geneIDs = DF_rhod_meta['rhodopsin_gene'].unique().tolist()
LIST_rhods_flanking_flotillin = list(DF_operons.loc[DF_operons["3' flotillin"]==1]['rhodopsin_gene'])
LIST_rhods_flanking_ferredoxin = list(DF_operons.loc[DF_operons["3' 2Fe-2S binding dom."]==1]['rhodopsin_gene'])
print("\t"+str(len(LIST_rhodopsin_geneIDs)) + " rhodopsins in GORG Dark")
print("\t"+str(len(LIST_rhods_flanking_ferredoxin)) + " rhods flanking ferredoxin")
print("\t"+str(len(LIST_rhods_flanking_flotillin)) + " rhods flanking flotillin")

### A DRAM annotation table with columns 'locus tag', 'description', 'new_rankE_description'
### LACKS gene start and stop positions
DF_simple_annot = pd.read_csv(PATH_annot_rhodopsin_neighbors_TSV, sep="\t")

# Indexed (lazy) access to the re-annotated proteins from rhodopsin-bearing contigs.
# The key order of this index is used later as the gene order when looking for neighbors.
DICT_proteins = SeqIO.index(PATH_rhod_neighbors_FAA, "fasta")
LIST_protIDs_from_rhod_contigs = list(DICT_proteins.keys())
print("\tLoaded "+str(len(LIST_protIDs_from_rhod_contigs))+ " prots from rhodopsin-bearing contigs")

### Load protein and CDS fastas to search for sequences
# Each record is flattened to plain text of the form ">seqID\nSEQUENCE".
# str(record.seq) is required: concatenating a str with a Biopython Seq
# returns a Seq object, not a str, which would put Seq objects into the
# dictionaries (and later into the dataframe) instead of strings.
DICT_dramCDS_ID2record = SeqIO.index(PATH_dram_all_CDS, "fasta")
DICT_dramCDS_ID2string = {}
for key in DICT_dramCDS_ID2record.keys():
    record = DICT_dramCDS_ID2record[key]
    DICT_dramCDS_ID2string[key] = ">" + record.id + "\n" + str(record.seq)
LIST_IDS_dramCDS = list(DICT_dramCDS_ID2record.keys())
print("\t"+str(len(LIST_IDS_dramCDS))+ " CDS loaded from DRAM")

DICT_dramFAA_ID2record = SeqIO.index(PATH_dram_all_FAA, "fasta")
DICT_dramFAA_ID2string = {}
for key in DICT_dramFAA_ID2record.keys():
    record = DICT_dramFAA_ID2record[key]
    DICT_dramFAA_ID2string[key] = ">" + record.id + "\n" + str(record.seq)
LIST_IDS_dramFAA = list(DICT_dramFAA_ID2record.keys())
print("\t"+str(len(LIST_IDS_dramFAA))+ " FAA loaded from DRAM")



print("\n### III.    Finding the flotillin and ferredoxin seqs that neighbored rhodopsin")

LIST_ferredoxin_records = []
DICT_rhodID_2_ferredoxinID = {} # To track which rhodopsin neighbors which ferredoxin

LIST_flotillin_records = []
DICT_rhodID_2_flotillinID = {} # To track which rhodopsin neighbors which flotillin

# Build lookups once instead of scanning the lists for every rhodopsin
DICT_protID_2_position = {protID: position for position, protID in enumerate(LIST_protIDs_from_rhod_contigs)}
SET_rhods_flanking_ferredoxin = set(LIST_rhods_flanking_ferredoxin)
SET_rhods_flanking_flotillin = set(LIST_rhods_flanking_flotillin)
COUNT_prots_from_rhod_contigs = len(LIST_protIDs_from_rhod_contigs)

for rhodID in LIST_rhodopsin_geneIDs:
    # IDs in the neighbors FASTA carry an extra copy of the first 11 characters, e.g.
    # 'AG-538-A02_contigs_AG-538-A02_NODE_4_1' -> 'AG-538-A02_AG-538-A02_contigs_AG-538-A02_NODE_4_1'
    reformat_rhodID = rhodID[0:11] + rhodID
    position_rhod = DICT_protID_2_position.get(reformat_rhodID)
    if position_rhod is None:
        continue

    expects_ferredoxin = rhodID in SET_rhods_flanking_ferredoxin
    expects_flotillin = rhodID in SET_rhods_flanking_flotillin
    if not (expects_ferredoxin or expects_flotillin):
        continue

    # Look at the protein just before (-1) and just after (+1) the rhodopsin,
    # in that order; if both neighbors match, the downstream one ends up in
    # the rhodID -> neighborID dictionary (same precedence as before).
    for offset in (-1, 1):
        position_neighbor = position_rhod + offset
        # A rhodopsin that is first/last in the FASTA has no neighbor on that
        # side: position -1 would silently wrap around to the last protein,
        # and position len(list) would raise IndexError.
        if not 0 <= position_neighbor < COUNT_prots_from_rhod_contigs:
            continue
        # NOTE(review): adjacency in the FASTA does not guarantee that the
        # neighbor is on the same contig as the rhodopsin - confirm whether a
        # same-contig check is needed here.
        seqID_neighbor = LIST_protIDs_from_rhod_contigs[position_neighbor]
        record_neighbor = DICT_proteins[seqID_neighbor]
        description_neighbor = str(record_neighbor.description)

        if expects_ferredoxin and ('ferredoxin' in description_neighbor or 'PF00111' in description_neighbor):
            LIST_ferredoxin_records.append(record_neighbor)
            DICT_rhodID_2_ferredoxinID[rhodID] = record_neighbor.id[11:]
        # 'lotillin' matches both 'Flotillin' and 'flotillin'
        if expects_flotillin and 'lotillin' in description_neighbor:
            LIST_flotillin_records.append(record_neighbor)
            DICT_rhodID_2_flotillinID[rhodID] = record_neighbor.id[11:]


LIST_rhodopsin_IDs = LIST_rhodopsin_geneIDs
# Strip the 11-character prefix again so IDs match the DRAM FASTA keys, e.g.
# 'AG-538-A02_AG-538-A02_contigs_AG-538-A02_NODE_4_9' -> 'AG-538-A02_contigs_AG-538-A02_NODE_4_9'
LIST_ferredoxin_IDs = [record.id[11:] for record in LIST_ferredoxin_records]
LIST_flotillin_IDs = [record.id[11:] for record in LIST_flotillin_records]


print("\tFound "+str(len(LIST_ferredoxin_records)) + ' proteins annotated as ferredoxin, that are next to rhodopsin')
print("\tFound "+str(len(LIST_flotillin_records)) + ' proteins annotated as flotillin, that are next to rhodopsin')



# Section label corrected from "VI" to "IV": this step runs between III and V,
# and "VI" is used again later for the operon merge.
print("\n### IV.   Adding rhodopsin, flotillin, and ferredoxin, sequences to table...")

# DF is the same object as DF_rhod_meta (no copy), and add_col_with_sequence
# adds columns in place, so DF_rhod_meta gains the new columns as well.
DF = DF_rhod_meta
for gene_family in ('rhodopsin', 'ferredoxin', 'flotillin'):
    print("\n\t" + gene_family.capitalize())
    # Protein column first, then CDS, for each gene family
    for seq_type in ('protein', 'CDS'):
        DF = add_col_with_sequence(DF, seq_type, gene_family)



print("\n#### V.    Loading start/end coordinates for rhodopsin genes\n")

# DRAM annotation columns to pull in, mapped to their names in the final table
DICT_annot_col_renames = {
    'fasta': 'sag',
    'scaffold': 'rhodopsin_scaffold',
    'start_position': 'rhodopsin_start_position(DRAM)',
    'end_position': 'rhodopsin_end_position(DRAM)',
    'strandedness': 'rhodopsin_strandedness(DRAM)',
}
_DF_annot = DF_annot[['rhodopsin_gene'] + list(DICT_annot_col_renames)]
_DF_annot = _DF_annot.rename(columns=DICT_annot_col_renames)
# DRAM's 'fasta' value carries a '_contigs' suffix; remove it to get the SAG name
_DF_annot['sag'] = _DF_annot['sag'].str.replace('_contigs','')

# These metadata columns are superseded by the DRAM-derived ones (or are a
# leftover index column), so drop them before merging
_DF = DF.drop(columns=['Unnamed: 0','sag','rhodopsin_scaffold'])

# Right merge: keep every metadata row, attaching coordinates where the gene ID matches
DF = pd.merge(_DF_annot, _DF, how='right', on='rhodopsin_gene')


print("\n### VI.    Add operon info to table")

# Operon presence/absence flags, minus the leftover index column and '8th helix'
_DF_operons = DF_operons.drop(['Unnamed: 0', '8th helix'], axis=1)
# Inner merge: only rhodopsins present in both tables are kept
DF = pd.merge(DF, _DF_operons, how='inner', on='rhodopsin_gene')



print("\n### VII.   Polishing and saving")

# Rename some columns for clarity (suffix records which tool produced the value)
DICT_clearer_col_names = {
    'carotenoid_gene_cluster': 'carotenoid_gene_cluster(HMMER3)',
    'flotillin': 'flotillin(HMMER3)',
    'neither': 'neither(HMMER3)',
    'flanking_genes': 'flanking_genes(DRAM+HMMER+Foldseek)',
    'niche': 'niched_based_on_read_recruitment',
}
DF = DF.rename(columns=DICT_clearer_col_names)

print("\tSaving...")
# The dataframe index is written as an unnamed first column; the next section
# reads this file back and drops that column ('Unnamed: 0').
DF.to_csv(PATH_out,sep="\t")
print("\tDone.")


print("\n### VIII.   Adding in the retinal_binding info that Tianyi forgot!")

# Retinal-binding calls, keyed by 'seq_id'; keep only the two columns to add
# and rename the key so it matches the main table
DF_retinal = pd.read_csv(PATH_retinal_CSV)
DF_retinal = DF_retinal.loc[:, ['seq_id','retinal_binding_residue','has_retinal_binding']]
DF_retinal = DF_retinal.rename(columns={'seq_id':'rhodopsin_gene'})

# Re-load the table that was just saved
DF_main = pd.read_csv(PATH_out,sep="\t")

# Final column order of Table S8. Selecting by this list also raises a KeyError
# if any expected column is missing.
LIST_col_order = [
    # Gene identity and location
    "rhodopsin_gene",
    'sag',
    'rhodopsin_scaffold',
    'rhodopsin_start_position(DRAM)',
    'rhodopsin_end_position(DRAM)',
    'rhodopsin_strandedness(DRAM)',
    'rhodopsin_count_in_sag',
    # Gene-neighborhood summaries
    'carotenoid_gene_cluster(HMMER3)',
    'flotillin(HMMER3)',
    'neither(HMMER3)',
    'flanking_genes(DRAM+HMMER+Foldseek)',
    # Residue-level and classification metadata
    'retinal_binding_residue',
    'has_retinal_binding',
    'color_tuning_type',
    'color_tuning_residue',
    'three_residue_motif',
    'pump_type',
    'gtdb',
    'niched_based_on_read_recruitment',
    'rhodopsin_family',
    'rhodopsin_subfamily',
    'suitable_for_phylogenetics',
    '8th_N-terminal_helix (alphafold3)',
    'incomplete_8th_helix (alphafold3)',
    # Sequences
    'rhodopsin_protein(DRAM)',
    'rhodopsin_CDS(DRAM)',
    'flanking_ferredoxin_protein(DRAM)',
    'flanking_ferredoxin_CDS(DRAM)',
    'flanking_flotillin_protein(DRAM)',
    'flanking_flotillin_CDS(DRAM)',
    # Operon presence/absence flags
    "3' 2Fe-2S binding dom.",
    "3' flotillin",
    "5' AMP-binding enz.",
    "5' VirC1",
    "5' amidohydrolase fam.",
    "5' A.A. permease",
    "5' periplasmic-binding prot.",
    "3' CrtE",
    "3' L-rhamnose isomerase",
    "5' brp/blh",
    "3' tRNA synthetase I",
    "3' haemolysin-III related",
    "5' fasciclin",
    "3' fatty acid desaturase",
    "3' brp/blh",
    "5' thiamine pyrophosphate",
    "5' TMEM205-like",
    "5' AI-2E fam. transporter",
    "5' toxin-antitoxin",
    "3' phosphotransferase fam.",
    "3' SOUL heme-binding prot.",
    "3' ABC1 kinase-like",
    "5' FGGY fam. carb. kinases",
]


print("adding 2 columns")
# Left merge keeps every row of the main table; then drop the index column
# that came back from the saved file and apply the final column order
DF_merged = DF_main.merge(DF_retinal, on="rhodopsin_gene", how="left")
DF = DF_merged.drop(columns=["Unnamed: 0"])[LIST_col_order]

print("Writing out updated table")
DF.to_csv(PATH_out, sep="\t")

Starting
### I.  Subsetting dataset to work only with rhodopsin-bearing SAGs.
	Otherwise, using full GORG Dark takes 10 min for each combined dataframe to load.
	Dropping all genes not on the 1079 contigs with rhodopsin
	39133 annotation rows remaining.

### II.  Reading tables -> dataframes. Fastas -> dictionaries.
	1093 rhodopsins in GORG Dark
	422 rhods flanking ferredoxin
	391 rhods flanking flotillin
	Loaded 38369 prots from rhodopsin-bearing contigs
	757689 CDS loaded from DRAM
	757689 FAA loaded from DRAM

### III.    Finding the flotillin and ferredoxin seqs that neighbored rhodopsin
	Found 422 proteins annotated as ferredoxin, that are next to rhodopsin
	Found 372 proteins annotated as flotillin, that are next to rhodopsin

### VI.   Adding rhodopsin, flotillin, and ferredoxin, sequences to table...

	Rhodopsin
	1093 proteins to find
	1093 found
	1093 rows of the rhodopsin metadata table now have a protein added for rhodopsin
	1093 CDSs to find
	1093 found
	1093 rows of the rh

In [10]:
# Display DF_annot for a visual sanity check
DF_annot

Unnamed: 0.1,Unnamed: 0,rhodopsin_gene,fasta,scaffold,gene_position,start_position,end_position,strandedness,rank,ko_id,...,peptidase_RBH,peptidase_identity,peptidase_bitScore,peptidase_eVal,pfam_hits,cazy_ids,cazy_hits,cazy_subfam_ec,cazy_best_hit,heme_regulatory_motif_count
314,314,AG-538-A02_contigs_AG-538-A02_NODE_4_1,AG-538-A02_contigs,AG-538-A02_NODE_4,1,3,1466,-1,D,,...,,,,,Sodium:solute symporter family [PF00474.20],,,,,0
315,315,AG-538-A02_contigs_AG-538-A02_NODE_4_2,AG-538-A02_contigs,AG-538-A02_NODE_4,2,1471,1749,-1,C,K21700,...,,,,,,,,,,0
316,316,AG-538-A02_contigs_AG-538-A02_NODE_4_3,AG-538-A02_contigs,AG-538-A02_NODE_4,3,1773,2141,-1,E,,...,,,,,,,,,,0
317,317,AG-538-A02_contigs_AG-538-A02_NODE_4_4,AG-538-A02_contigs,AG-538-A02_NODE_4,4,2141,4498,-1,D,,...,,,,,Domain of unknown function (DUF6605) [PF20254.1],,,,,0
318,318,AG-538-A02_contigs_AG-538-A02_NODE_4_5,AG-538-A02_contigs,AG-538-A02_NODE_4,5,4524,4931,-1,E,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757039,435,AM-689-M16_contigs_SCGC_AM-689-M16_contig19_18,AM-689-M16_contigs,SCGC_AM-689-M16_contig19,18,16200,17348,1,E,,...,,,,,,,,,,0
757040,436,AM-689-M16_contigs_SCGC_AM-689-M16_contig19_19,AM-689-M16_contigs,SCGC_AM-689-M16_contig19,19,17356,17619,1,E,,...,,,,,,,,,,0
757041,437,AM-689-M16_contigs_SCGC_AM-689-M16_contig19_20,AM-689-M16_contigs,SCGC_AM-689-M16_contig19,20,17680,18339,-1,C,K07056,...,,,,,Tetrapyrrole (Corrin/Porphyrin) Methylases [PF...,,,,,0
757042,438,AM-689-M16_contigs_SCGC_AM-689-M16_contig19_21,AM-689-M16_contigs,SCGC_AM-689-M16_contig19,21,18411,19577,-1,C,K06194,...,False,0.545,130.0,1.527000e-30,Peptidase family M23 [PF01551.25]; LysM domain...,,,,,0


In [5]:
# List the column names currently in DF
DF.columns

Index(['rhodopsin_gene', 'sag', 'rhodopsin_scaffold',
       'rhodopsin_start_position(DRAM)', 'rhodopsin_end_position(DRAM)',
       'rhodopsin_strandedness(DRAM)', 'rhodopsin_count_in_sag',
       'carotenoid_gene_cluster(HMMER3)', 'flotillin(HMMER3)',
       'neither(HMMER3)', 'flanking_genes(DRAM+HMMER+Foldseek)',
       'color_tuning_type', 'color_tuning_residue', 'three_residue_motif',
       'pump_type', 'gtdb', 'niched_based_on_read_recruitment',
       'rhodopsin_family', 'rhodopsin_subfamily', 'suitable_for_phylogenetics',
       '8th_N-terminal_helix (alphafold3)',
       'incomplete_8th_helix (alphafold3)', 'rhodopsin_protein(DRAM)',
       'rhodopsin_CDS(DRAM)', 'flanking_ferredoxin_protein(DRAM)',
       'flanking_ferredoxin_CDS(DRAM)', 'flanking_flotillin_protein(DRAM)',
       'flanking_flotillin_CDS(DRAM)', '3' 2Fe-2S binding dom.',
       '3' flotillin', '5' AMP-binding enz.', '5' VirC1',
       '5' amidohydrolase fam.', '5' A.A. permease',
       '5' periplasmic-bind