Import dependencies

In [2]:
import pandas as pd

#Komodo tsv's with all information from the komodo.modelseed.org webpage
komodo_media = pd.read_csv('komodo_media_original.txt', sep="\t") #media ID and media name
komodo_taxa = pd.read_csv('komodo_taxa_original.txt', sep="\t") #taxon ID and media name

1.1 Format media list

In [4]:
# Formatting komodo media list to only include IDs for complex=false, aerobic=true, submedia=false
complex = komodo_media[(komodo_media['Complex']==False)] #remove complex media
aerobic = complex[(complex['Aerobic']==True)] #remove anaerobic media
submedia = aerobic[(aerobic['SubMedium']==False)] #remove submedia
kom_med = submedia[['ID','Name','PH']]
kom_med = kom_med.rename(columns={'Name': 'Media list'}) #rename the media column to merge with taxa later
len(kom_med)

171

1.2 Format taxon list

In [6]:
# Formatting komodo taxon list to not include NaN values for 'Taxon ID', as this is what we use in the functions notebook
kom_tax = komodo_taxa[['Taxon ID', 'Organism Name', 'Media list']].dropna(subset=['Taxon ID'])
kom_tax['Taxon ID'] = kom_tax['Taxon ID'].astype(str).str.replace(".0", "") #make back into strings, dropna adds a '.0' value
kom_tax = kom_tax.dropna(subset=['Media list']) #also dropping taxa with no associated media name
len(kom_tax)

7627

1.3 Merge our lists

In [8]:
merged_df = kom_tax.merge(kom_med, on='Media list', how='left') #merged dataframe with media info for ALL taxon IDs
onlyIDs = merged_df.dropna(subset=['ID']) #dropping all rows where there's no media ID
onlyIDs = onlyIDs.rename(columns={'ID': 'Media ID'})
results_df = onlyIDs[['Taxon ID','Organism Name','Media ID','Media list']]
results_df

Unnamed: 0,Taxon ID,Organism Name,Media ID,Media list
3,35830,Acetivibrio cellulolyticus,165,ACETIVIBRIO CELLULOLYTICUS MEDIUM
420,192843,Albidiferax ferrireducens,1001,BASAL MEDIUM
512,61596,Allochromatium minutissimum,28,PFENNIG'S MEDIUM I
515,1049,Allochromatium vinosum,28,PFENNIG'S MEDIUM I
516,61595,Allochromatium warmingii,28,PFENNIG'S MEDIUM I
...,...,...,...,...
7291,381300,Thiohalospira halophila,1058.1,For DSM 15070 and DSM 15071
7297,373391,Thiomicrospira halophila,1058,THIOHALOPHILUS MEDIUM
7301,265883,Thiomicrospira thermophila,1011,MJ MEDIUM
7308,394805,Thioprofundum hispidum,1011,MJ MEDIUM


2.1 [media_functions.ipynb] extracting ec numbers for each 'Taxon ID'

In [10]:
# Packages for KEGG compound ID extraction with a media component input
from io import StringIO
from Bio.KEGG import REST
import re
import requests
from requests.adapters import HTTPAdapter, Retry

# Initializing retry configuration for HTTP requests
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504]
)
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# Initializing pagination (used for gathering ec numbers from Uniprot)
def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url):
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)

2.2 Format input for our function

In [12]:
taxonomy_ids = results_df['Taxon ID'].tolist()
len(taxonomy_ids)

193

2.3 Retrieve ec numbers for our list of taxon IDs

In [19]:
# Initialize an empty list to store DataFrame rows
def taxa2ec():
    taxa2ec_df = []

    # REST API base URL
    base_url = 'https://rest.uniprot.org/uniprotkb/search?fields=organism_id%2Cec&format=tsv&size=500'

    for taxonomy_id in taxonomy_ids:
        #url = f'{base_url}&query=organism_id%3A{taxonomy_id}'
        url = f'{base_url}&query=taxonomy_id%3A{taxonomy_id}'
        session = requests.Session()  # Initialize session
        response = session.post(url)
        response.raise_for_status()
        lines = response.text.splitlines()
    
        # Iterate through lines to extract EC numbers
        for line in lines[1:]:  # Skip header line
            ec_number = line.split('\t')[1]  # Assuming EC number is the second column
            taxa2ec_df.append({"Taxa ID": taxonomy_id, "Enzyme": ec_number})

    # Convert the list of dictionaries into a DataFrame
    taxa2ec_df = pd.DataFrame(taxa2ec_df)
    
    return taxa2ec_df

2.3.1 Formatting and output

In [22]:
df = taxa2ec()

# NaN values are actually empty strings ' '; need to replace these and remove NaN
df['Enzyme'].replace('', pd.NA, inplace=True)
df.dropna(subset=['Enzyme'], inplace=True)

# Due to syntax, I needed to group the enzymes together before re-exploding them to remove duplicates
df = df.groupby('Taxa ID')['Enzyme'].agg(';'.join).reset_index()
# Exploding 'Enzyme' column
df['Enzyme'] = df['Enzyme'].str.split(';')
df = df.explode('Enzyme')
# Removing duplicates
df = df.drop_duplicates()

# Re-grouping the list by 'Taxa ID'
df = df.groupby('Taxa ID')['Enzyme'].agg(list).reset_index()
df

Unnamed: 0,Taxa ID,Enzyme
0,101192,"[1.2.1.11, 6.3.4.2, 4.2.1.136, 5.1.99.6, 3.5...."
1,1049,"[4.1.1.39, 2.3.1.-, 2.8.5.2, 1.8.2.2, 1.8.5.6,..."
2,1058,"[4.2.1.131, 1.8.1.2, 1.8.4.8, 2.7.11.33, 2.7...."
3,109258,"[4.1.1.36, 6.3.2.5, 4.2.1.136, 5.1.99.6, 3.4..."
4,115544,"[4.1.1.36, 6.3.2.5, 2.5.1.19, 4.2.3.4, 1.1.1..."
...,...,...
124,879,"[1.17.1.9, 1.8.99.2, 1.-.-.-, 1.12.2.1, 1.2.99..."
125,90732,"[6.3.2.10, 6.3.2.13, 1.2.1.11, 6.3.4.16, 6.3..."
126,936138,"[2.3.1.157, 2.7.7.23, 1.2.1.11, 4.1.1.36, 6...."
127,936456,"[3.5.4.25, 4.1.99.12, 2.3.1.157, 2.7.7.23, 2..."


In [23]:
df2 = df.rename(columns={'Taxa ID': 'Taxon ID'})
merged = results_df.merge(df2, on='Taxon ID', how='left')
merged

Unnamed: 0,Taxon ID,Organism Name,Media ID,Media list,Enzyme
0,35830,Acetivibrio cellulolyticus,165,ACETIVIBRIO CELLULOLYTICUS MEDIUM,[3.2.1.4]
1,192843,Albidiferax ferrireducens,1001,BASAL MEDIUM,"[6.3.4.2, 2.3.1.157, 2.7.7.23, 2.4.2.1, 2.4...."
2,61596,Allochromatium minutissimum,28,PFENNIG'S MEDIUM I,[5.6.2.2]
3,1049,Allochromatium vinosum,28,PFENNIG'S MEDIUM I,"[4.1.1.39, 2.3.1.-, 2.8.5.2, 1.8.2.2, 1.8.5.6,..."
4,61595,Allochromatium warmingii,28,PFENNIG'S MEDIUM I,"[2.4.2.1, 2.4.2.2, 2.1.1.163, 2.1.1.201, 3.1..."
...,...,...,...,...,...
188,381300,Thiohalospira halophila,1058.1,For DSM 15070 and DSM 15071,"[4.2.1.136, 5.1.99.6, 6.3.4.2, 3.1.26.12, 2.1..."
189,373391,Thiomicrospira halophila,1058,THIOHALOPHILUS MEDIUM,
190,265883,Thiomicrospira thermophila,1011,MJ MEDIUM,"[1.3.1.76, 2.1.1.107, 4.99.1.4, 2.7.1.167, ..."
191,394805,Thioprofundum hispidum,1011,MJ MEDIUM,


NaN values in 'Enzyme' implies that there's no taxonomy_id listing for the given 'Taxon ID' within UniProtkb's database. 

Other considerations for this table: only Komodo entries with valid (no NaN) 'Taxon ID' AND 'Media ID' values were retained. The 'Media list' was filtered to only show [complex, aerobic, submedia] = [false, true, false], respectively.

NEXT: write a function to pull compounds associated with these ec numbers (substrates, metal/ion transporters, etc)

In [27]:
#merged.to_csv('komodo-uniprot-media-ec.csv')

In [29]:
merged.explode("Enzyme").value_counts("Taxon ID")


Taxon ID
345632    662
40983     656
225194    642
60035     622
28232     576
         ... 
198347      1
646273      1
392844      1
241365      1
35830       1
Name: count, Length: 185, dtype: int64