In [49]:
# STEP 1: USE CHATGPT TO PARSE THE SOURCE WEBSITE (TODO: RECORD ITS URL HERE) AND GENERATE A CSV FILE OF ITS CONTENTS
# STEP 2: MANUALLY ANNOTATE GARDP ANTIBIOTIC CLASSES WITH CHEBI IDS
# STEP 3: DOWNLOAD THE CHEBI ONTOLOGY FILE chebi_lite.json.gz FROM https://ftp.ebi.ac.uk/pub/databases/chebi-2/ontology/ AND SAVE IT AS ../data/chebi_lite.json.gz

In [50]:
from collections import Counter
import pandas as pd
import gzip
import json 
import tqdm

In [51]:
# Load the gzipped ChEBI ontology JSON into memory
file_path = "../data/chebi_lite.json.gz"
with gzip.open(file_path, mode='rt', encoding='utf-8') as handle:
    data = json.load(handle)

# Keep only the first graph of the document, then pull out its node and edge lists
data = data['graphs'][0]
nodes = data['nodes']
edges = data['edges']

# Report the size of the graph (edges first, then nodes)
for kind, items in (("edges", edges), ("nodes", nodes)):
    print(f"Number of {kind}: {len(items)}")

Number of edges: 377879
Number of nodes: 223061


In [52]:
# Tally how many nodes there are of each `type`
Counter(entry['type'] for entry in nodes)

Counter({'CLASS': 223037, 'PROPERTY': 24})

In [53]:
# Map each non-deprecated ChEBI identifier, in "CHEBI:<number>" form, to its label.
id_to_lbl = {}
for node in tqdm.tqdm(nodes):
    # Only nodes whose ID mentions CHEBI are of interest (same filter as before).
    if "CHEBI" not in node['id']:
        continue
    # `meta` and `lbl` are read defensively: a node lacking either key is handled
    # here instead of aborting the whole pass with a KeyError.
    meta = node.get('meta') or {}
    # Keep a node when it carries no `deprecated` flag, or the flag is exactly False.
    if "deprecated" in meta and meta['deprecated'] is not False:
        continue
    label = node.get('lbl')
    if label is None:
        # Nothing to record for a node without a label.
        continue
    # The ID's last path segment uses "_" where the lookup key uses ":".
    id_to_lbl[node['id'].split("/")[-1].replace("_", ":")] = label

100%|██████████| 223061/223061 [00:00<00:00, 1822290.18it/s]


In [54]:
# Number of non-deprecated ChEBI entries collected in the lookup
len(id_to_lbl)

203671

In [55]:
# CHEBIFIED ANTIBIOTICDB

In [70]:
# Count how often each ChEBI ID occurs in the `chebi_predicted_parents` column
# of antibioticdb_chebified.csv (one cell holds several IDs separated by ";").
adb = pd.read_csv("../results/antibioticdb_chebified.csv")

# Empty cells are read as NaN (a float), which has no string methods; drop them
# so a row without predicted parents cannot crash the split below.
parent_cells = adb["chebi_predicted_parents"].dropna().astype(str)

# Strip each ID individually and skip empty fragments, so "A; B" or a trailing ";"
# does not produce keys such as " B" or "".
parent_counts = Counter(
    token.strip()
    for cell in parent_cells
    for token in cell.split(";")
    if token.strip()
)

# One row per ChEBI ID, most frequent first.
chebis = (
    pd.DataFrame(list(parent_counts.items()), columns=['ChEBI ID', 'Count'])
    .sort_values(by='Count', ascending=False)
)

In [72]:
# Add labels
# Series.map with a dict leaves NaN where an ID has no entry in id_to_lbl, instead of
# raising KeyError on the first miss and leaving the table without any labels.
chebis['label'] = chebis['ChEBI ID'].map(id_to_lbl)

# Surface every unmatched ID so a blank label is never silent.
unlabelled_ids = chebis.loc[chebis['label'].isna(), 'ChEBI ID'].tolist()
if unlabelled_ids:
    print(f"WARNING: {len(unlabelled_ids)} ChEBI ID(s) have no label in id_to_lbl: {unlabelled_ids}")

In [73]:
# Write the per-ID counts and labels to disk, leaving out the DataFrame index
chebis.to_csv(
    "../results/antibioticdb_chebified_grouped.csv",
    index=False,
)

In [74]:
# Show the first 50 rows of the table
chebis.head(50)

Unnamed: 0,ChEBI ID,Count,label
4,CHEBI:24431,2138,chemical entity
3,CHEBI:23367,2138,molecular entity
14,CHEBI:33579,2136,main group molecular entity
21,CHEBI:33675,2136,p-block molecular entity
30,CHEBI:50860,2136,organic molecular entity
15,CHEBI:33582,2136,carbon group molecular entity
11,CHEBI:33285,2114,heteroorganic entity
13,CHEBI:33304,1980,chalcogen molecular entity
25,CHEBI:36357,1962,polyatomic entity
8,CHEBI:25806,1889,oxygen molecular entity
