In [34]:
import pandas as pd
import json

def safe_parse_json(json_str):
    try:
        return json.loads(json_str.replace("'", '"'))
    except json.JSONDecodeError:
        return None  # or some default value or handling

## load data

In [52]:
metabolic_reactions = pd.read_csv('data/metabolic_reactions.tsv', sep='\t', comment='#')
complexation_reactions = pd.read_csv('data/complexation_reactions.tsv', sep='\t', comment='#')
genes = pd.read_csv('data/genes.tsv', sep='\t', comment='#')
transcripts = pd.read_csv('data/rnas.tsv', sep='\t', comment='#')

# Convert the 'catalyzed_by' column from string representations of lists to actual lists
metabolic_reactions['catalyzed_by'] = metabolic_reactions['catalyzed_by'].apply(safe_parse_json)
complexation_reactions['stoichiometry'] = complexation_reactions['stoichiometry'].apply(safe_parse_json)
transcripts['monomer_ids'] = transcripts['monomer_ids'].apply(safe_parse_json)

In [46]:
print('METABOLISM')
print(metabolic_reactions.info())
print('\nCOMPLEXATION')
print(complexation_reactions.info())
print('\nGENES')
print(genes.info())
print('\nTRANSCRIPTS')
print(transcripts.info())

METABOLISM
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6590 entries, 0 to 6589
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             6590 non-null   object
 1   stoichiometry  6590 non-null   object
 2   direction      6590 non-null   object
 3   catalyzed_by   6590 non-null   object
dtypes: object(4)
memory usage: 206.1+ KB
None

COMPLEXATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1135 entries, 0 to 1134
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             1135 non-null   object
 1   stoichiometry  1135 non-null   object
 2   common_name    904 non-null    object
 3   cofactors      1135 non-null   object
dtypes: object(4)
memory usage: 35.6+ KB
None

GENES
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4748 entries, 0 to 4747
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype

## analyze metabolism

In [32]:
# Flatten the list of lists into a single list
all_enzymes = [enzyme for sublist in metabolic_reactions['catalyzed_by'] for enzyme in sublist]

# Extract unique enzymes
unique_enzymes = list(set(all_enzymes))

len(unique_enzymes)

1563

In [33]:
print(unique_enzymes)

['CPLX-7524', 'CPLX0-8200', 'G6653-MONOMER', 'URACIL-PRIBOSYLTRANS-CPLX', 'CPLX0-254', 'CPLX0-8010', 'G7103-MONOMER', 'CPLX0-3021', 'PDXK-CPLX', 'EG12267-MONOMER', 'ISOCIT-LYASE', 'DIHYDROFOLATEREDUCT-MONOMER', 'EG11846-MONOMER', 'CPLX0-7997', 'CPLX0-8122', 'G6474-MONOMER', 'EG11486-MONOMER', 'ABC-49-CPLX', 'YFEP-MONOMER', 'EG11921-MONOMER', 'FABZ-CPLX', 'CPLX0-7992', 'DIHYDROXYACIDDEHYDRAT-CPLX', 'PSERPHOSPHA-MONOMER', 'FGAMSYN-MONOMER', 'CPLX0-322', 'BIOTIN-CARBOXYL-CPLX', 'CPLX0-8614', 'CPLX0-8032', 'CPLX0-301', 'G7800-MONOMER', 'G7410-MONOMER', 'CPLX0-7652', 'EG11383-MONOMER', 'YHHO-MONOMER', 'O-SUCCINYLBENZOATE-COA-SYN-MONOMER', 'UDP-NACMURALGLDAPAALIG-MONOMER', 'EG10119-MONOMER', 'CPLX0-8167', 'CADB-MONOMER', 'CPLX0-7974', 'EG11080-MONOMER', 'EG11938-MONOMER', 'MONOMER0-1241', 'CPLX0-245', 'CPLX0-8112', 'G6532-MONOMER', 'EG11932-MONOMER', 'UDPNACETYLGLUCOSAMENOLPYRTRANS-MONOMER', 'METHGLYSYN-CPLX', 'G6892-MONOMER', 'EG10853-MONOMER', 'G7106-MONOMER', 'G7449-MONOMER', 'CPLX0-3950'

## analyze complexation

In [39]:
# Initialize a dictionary to hold the mapping of complexes to their components
complex_to_components = {}

# Iterate over each row in the DataFrame
for index, row in complexation_reactions.iterrows():
    stoichiometry = row['stoichiometry']
    # Iterate over each item in the stoichiometry dictionary
    for component, count in stoichiometry.items():
        # Skip if count is None or not an integer
        if count is None or not isinstance(count, int):
            continue
        # If the count is positive, it's a complex; if negative, it's a component
        if count > 0:
            # Initialize the complex in the dictionary if not already present
            if component not in complex_to_components:
                complex_to_components[component] = []
            # Add the components (with negative counts) to the complex's list
            complex_to_components[component].extend([comp for comp, comp_count in stoichiometry.items() if comp_count is not None and comp_count < 0])

# print the mapping
for complex, components in complex_to_components.items():
    print(f"{complex}: {components}")

1-PFK: ['1-PFK-MONOMER']
2OXOGLUTARATEDEH-CPLX: ['E1O', 'E2O', 'E3-CPLX']
3-ISOPROPYLMALDEHYDROG-CPLX: ['3-ISOPROPYLMALDEHYDROG-MONOMER']
3-ISOPROPYLMALISOM-CPLX: ['LEUC-MONOMER', 'LEUD-MONOMER']
3-METHYL-2-OXOBUT-OHCH3XFER-CPLX: ['3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER']
3-OXOACYL-ACP-SYNTHII-CPLX: ['3-OXOACYL-ACP-SYNTHII-MONOMER']
6PFK-1-CPX: ['6PFK-1-MONOMER']
6PFK-2-CPX: ['6PFK-2-MONOMER']
6PGLUCONDEHYDROG-CPLX: ['6PGLUCONDEHYDROG-MONOMER']
7-ALPHA-HYDROXYSTEROID-DEH-CPLX: ['7-ALPHA-HYDROXYSTEROID-DEH-MONOMER']
7KAPSYN-CPLX: ['7KAPSYN-MONOMER']
ABC-10-CPLX: ['FEPC-MONOMER', 'FEPD-MONOMER', 'FEPG-MONOMER', 'FEPB-MONOMER']
ABC-11-CPLX: ['FHUC-MONOMER', 'FHUB-MONOMER', 'FHUD-MONOMER']
ABC-12-CPLX: ['GLNQ-MONOMER', 'GLNP-MONOMER', 'GLNH-MONOMER']
ABC-13-CPLX: ['G6359-MONOMER', 'GLTK-MONOMER', 'GLTJ-MONOMER', 'GLTL-MONOMER']
ABC-14-CPLX: ['HISP-MONOMER', 'HISM-MONOMER', 'HISQ-MONOMER', 'HISJ-MONOMER']
ABC-15-CPLX: ['LIVF-MONOMER', 'LIVG-MONOMER', 'LIVH-MONOMER', 'LIVM-MONOMER', 'LIVJ-

## get expanded enzymes to gene products

In [41]:
# Initialize an empty list to hold both individual gene products and components of complexes
expanded_enzymes = []

for enzyme in unique_enzymes:
    # Check if the enzyme is a complex and has a mapping
    if enzyme in complex_to_components:
        # If it's a complex, extend the list with its components
        expanded_enzymes.extend(complex_to_components[enzyme])
    else:
        # If it's not a complex, just add the enzyme itself
        expanded_enzymes.append(enzyme)

# Remove duplicates by converting the list to a set, then back to a list
unique_gene_products = list(set(expanded_enzymes))

len(unique_gene_products)

# # Now, unique_gene_products contains only unique gene products, with complexes replaced by their components
# print(unique_gene_products)

1746

## analyze transcription/translation

In [53]:
genes

Unnamed: 0,id,symbol,synonyms,left_end_pos,right_end_pos,direction,rna_ids
0,EG10001,alr,"[""alr"", ""alr5"", ""b4053"", ""ECK4045""]",4265782.0,4266861.0,+,"[""EG10001_RNA""]"
1,EG10002,modB,"[""modB"", ""chlJ"", ""tslJ"", ""b0764"", ""ECK0753""]",795862.0,796551.0,+,"[""EG10002_RNA""]"
2,EG10003,cysZ,"[""cysZ"", ""b2413"", ""ECK2408""]",2531463.0,2532224.0,+,"[""EG10003_RNA""]"
3,EG10004,dfp,"[""dfp"", ""coaBC"", ""b3639"", ""ECK3629""]",3812731.0,3813951.0,+,"[""EG10004_RNA""]"
4,EG10006,dcuB,"[""dcuB"", ""genF"", ""b4123"", ""ECK4116""]",4347404.0,4348744.0,-,"[""EG10006_RNA""]"
...,...,...,...,...,...,...,...
4743,M013,mhpD,"[""mhpD"", ""mhpS"", ""b0350"", ""ECK0347""]",372115.0,372924.0,+,"[""M013_RNA""]"
4744,M014,mhpF,"[""mhpF"", ""b0351"", ""ECK0348""]",372921.0,373871.0,+,"[""M014_RNA""]"
4745,M015,hcaF,"[""hcaF"", ""yfhV"", ""hcaA"", ""phdC2"", ""digB"", ""hca...",2670390.0,2670908.0,+,"[""M015_RNA""]"
4746,RUVA,ruvA,"[""ruvA"", ""b1861"", ""ECK1862""]",1945365.0,1945976.0,-,"[""RUVA_RNA""]"


In [54]:
transcripts

Unnamed: 0,id,common_name,synonyms,type,modified_forms,gene_id,monomer_ids,anticodon,coding_segments
0,EG10001_RNA,alr,"[""alr"", ""alr5"", ""b4053"", ""ECK4045""]",mRNA,[],EG10001,[ALARACEBIOSYN-MONOMER],,[]
1,EG10002_RNA,modB,"[""modB"", ""chlJ"", ""tslJ"", ""b0764"", ""ECK0753""]",mRNA,[],EG10002,[MODB-MONOMER],,[]
2,EG10003_RNA,cysZ,"[""cysZ"", ""b2413"", ""ECK2408""]",mRNA,[],EG10003,[EG10003-MONOMER],,[]
3,EG10004_RNA,dfp,"[""dfp"", ""coaBC"", ""b3639"", ""ECK3629""]",mRNA,[],EG10004,[EG10004-MONOMER],,[]
4,EG10006_RNA,dcuB,"[""dcuB"", ""genF"", ""b4123"", ""ECK4116""]",mRNA,[],EG10006,[DCUB-MONOMER],,[]
...,...,...,...,...,...,...,...,...,...
4743,M013_RNA,mhpD,"[""mhpD"", ""mhpS"", ""b0350"", ""ECK0347""]",mRNA,[],M013,[MHPDHYDROL-MONOMER],,[]
4744,M014_RNA,mhpF,"[""mhpF"", ""b0351"", ""ECK0348""]",mRNA,[],M014,[MHPF-MONOMER],,[]
4745,M015_RNA,hcaF,"[""hcaF"", ""yfhV"", ""hcaA"", ""phdC2"", ""digB"", ""hca...",mRNA,[],M015,[HCAA2-MONOMER],,[]
4746,RUVA_RNA,ruvA,"[""ruvA"", ""b1861"", ""ECK1862""]",mRNA,[],RUVA,[EG10923-MONOMER],,[]


In [59]:
# Create a mapping from gene IDs to RNA IDs
gene_to_rna_mapping = pd.Series(transcripts.id.values, index=transcripts.gene_id).to_dict()

# Create a mapping from RNA IDs to Monomer IDs
rna_to_monomer_mapping = pd.Series(transcripts.monomer_ids.values, index=transcripts.id).to_dict()

# Initialize an empty dictionary for the gene-to-monomer mapping
gene_to_monomer_mapping = {}

# Iterate through the gene_to_rna_mapping to populate the gene_to_monomer_mapping
for gene_id, rna_id in gene_to_rna_mapping.items():
    # Check if the RNA ID has a corresponding entry in the rna_to_monomer_mapping
    if rna_id in rna_to_monomer_mapping:
        # Add the mapping from gene ID to monomer IDs
        gene_to_monomer_mapping[gene_id] = rna_to_monomer_mapping[rna_id]

# Using a set to ensure uniqueness, and a list comprehension to flatten the list of lists
unique_monomers_set = {monomer for monomers_list in rna_to_monomer_mapping.values() for monomer in monomers_list}

# Convert the set back to a list if you need a list format
unique_monomers_list = list(unique_monomers_set)

len(unique_monomers_list)
# print(unique_monomers_list)
# print(gene_to_monomer_mapping)
# print(rna_to_monomer_mapping)
# print(gene_to_rna_mapping)

4434