In [None]:
# STEP 1: USE CHATGPT TO PARSE THIS WEBSITE AND GENERATE A CSV FILE OF THE CONTENTS (GARDP_parsed.csv)
# STEP 2: MANUALLY ANNOTATE GARDP ANTIBIOTIC CLASSES WITH CHEBI IDS (GARDP_edited.csv)
# STEP 3: DOWNLOAD THE CHEBI ONTOLOGY FILE (chebi_lite.json.gz) FROM: https://ftp.ebi.ac.uk/pub/databases/chebi-2/ontology/ | USED FOR MAPPING CHEBI IDS TO LABELS/NAMES
# STEP 4: DOWNLOAD SMILES FROM ANTIBIOTIC DB - Got them from eos6ojg
# STEP 5: USING ERSILIA, CHEBIFY ANTIBIOTIC DB (eos6tpo)
# STEP 6: MERGE GARDP_edited.csv WITH CHEBIFIED ANTIBIOTIC DB TO DEFINE A FINAL SET OF FEATURES AND RULES (Final_column_criteria.csv and notes.txt)

In [1]:
from collections import Counter
import pandas as pd
import gzip
import json 
import tqdm

In [2]:
# Load the gzip-compressed JSON file, decoding it as UTF-8 text
file_path = "../data/chebi_lite.json.gz"
with gzip.open(file_path, mode='rt', encoding='utf-8') as f:
    data = json.loads(f.read())

# Keep only the first entry under 'graphs' and pull out its node and edge lists
data = data['graphs'][0]
nodes = data['nodes']
edges = data['edges']

# Report the size of the graph
print(f"Number of edges: {len(edges)}")
print(f"Number of nodes: {len(nodes)}")

Number of edges: 377879
Number of nodes: 223061


In [3]:
# Map each ChEBI identifier to its label. A node is kept when its id contains
# "CHEBI" and its metadata either has no 'deprecated' entry or has it set to False.
id_to_lbl = {}
for node in tqdm.tqdm(nodes):
    if "CHEBI" not in node['id']:
        continue  # not a ChEBI entry
    meta = node['meta']
    if "deprecated" in meta and meta['deprecated'] is not False:
        continue  # flagged as deprecated
    # Key = last "/"-separated segment of the node id, with "_" replaced by ":"
    chebi_id = node['id'].split("/")[-1].replace("_", ":")
    id_to_lbl[chebi_id] = node['lbl']

100%|██████████| 223061/223061 [00:00<00:00, 1648338.94it/s]


In [4]:
# Number of ChEBI entries kept in the id -> label mapping
len(id_to_lbl)

203671

In [55]:
# CHEBIFIED ANTIBIOTICDB

In [5]:
# Get all chebis from antibioticdb_chebified.csv
adb = pd.read_csv("../results/antibioticdb_chebified.csv")

# Each cell of 'chebi_predicted_parents' is a ";"-separated list of ChEBI IDs.
# Cells that are empty in the CSV are read by pandas as NaN (a float), which has
# no .strip(), so drop them before splitting.
parent_lists = adb["chebi_predicted_parents"].dropna().tolist()

# Count how often each ID occurs. Every ID is stripped individually and empty
# fragments (e.g. from a trailing ";") are skipped, so stray whitespace cannot
# create spurious keys that would fail the label lookup below.
chebi_counts = Counter(
    chebi_id.strip()
    for parents in parent_lists
    for chebi_id in parents.split(";")
    if chebi_id.strip()
)

# One row per ChEBI ID, most frequent first
chebis = pd.DataFrame(
    list(chebi_counts.items()), columns=['ChEBI ID', 'Count']
).sort_values(by='Count', ascending=False)

# Add labels
# NOTE: this lookup is intentionally strict; it raises KeyError for any ID that is
# absent from id_to_lbl (which only holds non-deprecated ChEBI entries).
chebis['label'] = [id_to_lbl[i] for i in chebis['ChEBI ID']]

# chebis.to_csv("../results/antibioticdb_chebified_grouped.csv", index=False)