https://bacdive.dsmz.de/tutorials

0.1 Import packages, link to Bacdive Client (set up your own log-in from the bacdive website)

In [2]:
# Import packages
import pandas as pd
import bacdive
# Initialize client
client = bacdive.BacdiveClient('wjlowe03@gmail.com', 'UNINA_Bacdive')

-- Authentication successful --


1.1 Example on how to dump information from the api using the client

In [4]:
client.search(id='5621')

for strain in client.retrieve():
    print(strain['Culture and growth conditions'])

for strain in client.retrieve():
    print(strain['Physiology and metabolism'])

{'culture medium': [{'@ref': 1231, 'name': 'NUTRIENT AGAR (DSMZ Medium 1)', 'growth': 'yes', 'link': 'https://mediadive.dsmz.de/medium/1', 'composition': 'Name: NUTRIENT AGAR (DSMZ Medium 1)\nComposition:\nAgar 15.0 g/l\nPeptone 5.0 g/l\nMeat extract 3.0 g/l\nDistilled water'}, {'@ref': 1231, 'name': 'CASO AGAR (MERCK 105458) (DSMZ Medium 220)', 'growth': 'yes', 'link': 'https://mediadive.dsmz.de/medium/220', 'composition': 'Name: CASO AGAR (Merck 105458) (DSMZ Medium 220)\nComposition:\nAgar 15.0 g/l\nCasein peptone 15.0 g/l\nNaCl 5.0 g/l\nSoy peptone 5.0 g/l\nDistilled water'}, {'@ref': 34912, 'name': 'MEDIUM 3 - Columbia agar', 'growth': 'yes', 'composition': 'Columbia agar (39.000 g);distilled water (1000.000 ml)'}, {'@ref': 121392, 'name': 'CIP Medium 3', 'growth': 'yes', 'link': 'https://catalogue-crbip.pasteur.fr/fiche_milieu.xhtml?crbip=3'}, {'@ref': 121392, 'name': 'CIP Medium 72', 'growth': 'yes', 'link': 'https://catalogue-crbip.pasteur.fr/fiche_milieu.xhtml?crbip=72'}], 'cu

1.2 [Metabolite utilization ] Example on how to specify search fields (may be a better way to parse, but this is what GPT suggested)

In [6]:
client.search(id='5621')

metu = [] #metabolite utilization
# Extract 'Chebi-ID' and 'metabolite'
for strain in client.retrieve(['metabolite utilization']):
    metu.append(strain[list(strain)[0]][0]['metabolite utilization'])
# Extract metabolite and Chebi-ID from the extracted information
util = []
for metabolite_utilization in metu:
    for metabolite_info in metabolite_utilization:
        metabolite = metabolite_info.get('metabolite')
        chebi = metabolite_info.get('Chebi-ID')
        util.append({'Metabolite': metabolite, 'Chebi-ID': chebi})
# Create DataFrame from the list of dictionaries
util_df = pd.DataFrame(util)
#util_df

1.3 [Media information] Example on how to extract media information from bacdive (MediaDive api is preferred for this)

In [8]:
client.search(id='5621')

media = []
for strain in client.retrieve(['culture medium']):
    media.append(strain[list(strain)[0]][0]['culture medium'])
    
media_data = []
for medium_info in media[0]:  # Assuming the output is a list containing one nested list
    if 'composition' in medium_info:  # Check if 'composition' key exists
        name = medium_info['name']
        composition = medium_info['composition']
        media_data.append({'Name': name, 'Composition': composition})

media_df = pd.DataFrame(media_data)
#media_df

2.1 [Metabolism information] Extracting information from the 'Physiology and metabolism' field into a dataframe

In [10]:
import pandas as pd

# List of bacdive IDs
strain_ids = ['5621', '139709','1000']

ec = []
for strain_id in strain_ids:
    # Retrieve data for the current strain ID
    client.search(id=strain_id)
  
    try:
        retrieved_data = client.retrieve(['enzymes'])
        if not retrieved_data:
            print(f"No data returned for strain ID {strain_id}")
            continue

        for strain in retrieved_data:
            strain_key = list(strain)[0]
            if 'enzymes' in strain[strain_key][0]:
                enzymes = strain[strain_key][0]['enzymes']
                
                # Flatten the list of enzyme information and add strain_id
                if enzymes:  # Check if the enzymes list is not empty
                    for enzyme_info in enzymes:
                        enzyme_info['bacdive_id'] = strain_id
                        ec.append(enzyme_info)
            else:
                print(f"No enzymes data for strain ID {strain_id}")

    except IndexError as e:
        print(f"Error processing strain ID {strain_id}: {e}")
    except Exception as e:
        print(f"Unexpected error processing strain ID {strain_id}: {e}")

# Create DataFrame from enzyme information
if ec:  # Check if there are any enzyme entries
    ec_df = pd.DataFrame(ec)
    # Display the DataFrame
    print(ec_df)
else:
    print("No enzyme data found for the provided strain IDs.")


Error processing strain ID 1000: list index out of range
      @ref                            value activity        ec bacdive_id
0    68382                 alpha-fucosidase        -  3.2.1.51       5621
1    68382                alpha-mannosidase        -  3.2.1.24       5621
2    68382    N-acetyl-beta-glucosaminidase        -  3.2.1.52       5621
3    68382                 beta-glucosidase        -  3.2.1.21       5621
4    68382                alpha-glucosidase        -  3.2.1.20       5621
5    68382               beta-glucuronidase        -  3.2.1.31       5621
6    68382               beta-galactosidase        -  3.2.1.23       5621
7    68382              alpha-galactosidase        -  3.2.1.22       5621
8    68382  naphthol-AS-BI-phosphohydrolase        +       NaN       5621
9    68382                 acid phosphatase        +   3.1.3.2       5621
10   68382               alpha-chymotrypsin        -  3.4.21.1       5621
11   68382                          trypsin        +  3

2.2 Formatting our dataframe for ec info

In [16]:
# Reformatting the retrieved data into a format that can be merged with the master table (MediaDive output)
ec_subset = ec_df[['bacdive_id','ec']]
ec_subset = ec_subset.dropna()
ec_grouped = ec_subset.groupby('bacdive_id').agg(lambda x: list(x)).reset_index()
ec_grouped

Unnamed: 0,bacdive_id,ec
0,139709,"[3.2.1.21, 3.5.1.5, 3.5.3.6]"
1,5621,"[3.2.1.51, 3.2.1.24, 3.2.1.52, 3.2.1.21, 3.2.1..."
