In [27]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import ollama
import os

In [28]:
# Read the preprocessed ChEMBL activities table and discard rows without a value.
print("Loading ChEMBL preprocessed data...")
activities_csv = "../config/chembl_processed/activities_preprocessed.csv"
ChEMBL = pd.read_csv(activities_csv, low_memory=False)
print(f"Original size: {len(ChEMBL)}")

print("Filtering out nan values...")
has_value = ChEMBL['value'].notna()
ChEMBL = ChEMBL.loc[has_value].reset_index(drop=True)
print(f"Size after filtering nan values: {len(ChEMBL)}")

Loading ChEMBL preprocessed data...
Original size: 24267312
Filtering out nan values...
Size after filtering nan values: 20911360


In [29]:
# Keep only the activities whose target organism or assay organism mentions the pathogen.
# Exactly one pathogen is active per run; the previous cell assigned
# "Mycobacterium tuberculosis" and immediately overwrote it, so it is kept here as an
# explicit, commented-out alternative.
# pathogen = "Mycobacterium tuberculosis"
pathogen = "plasmodium falciparum"

# regex=False: the pathogen name is a literal substring, not a pattern, so names containing
# characters such as '(' or '.' are matched verbatim. case=False keeps the match case-insensitive.
organism_mask = (
    ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False, regex=False)
    | ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False, regex=False)
)
ChEMBL = ChEMBL[organism_mask].reset_index(drop=True)

print(f"Number of activities: {len(ChEMBL)}")

# Activities per target organism, most frequent first (missing organisms are counted too).
df = pd.DataFrame(Counter(ChEMBL['target_organism']).most_common(), columns=['organism', 'count'])
df

Number of activities: 1071815


Unnamed: 0,organism,count
0,Plasmodium falciparum,986730
1,Plasmodium falciparum (isolate 3D7),71250
2,,7339
3,Plasmodium falciparum 3D7,2911
4,Plasmodium falciparum K1,2596
5,Plasmodium falciparum FcB1/Columbia,899
6,Plasmodium falciparum (isolate FCR-3 / Gambia),30
7,Plasmodium falciparum (isolate K1 / Thailand),18
8,Rattus norvegicus,15
9,Homo sapiens,12


In [30]:
# (number of activity rows, number of distinct compound ids)
n_activities = len(ChEMBL)
n_compounds = len(set(ChEMBL['compound_chembl_id']))
n_activities, n_compounds

(1071815, 497202)

In [31]:
# Number of activities per assay type code
assay_type_counts = Counter(ChEMBL['assay_type'])
assay_type_counts

Counter({'F': 1055951, 'B': 15696, 'A': 122, 'T': 33, 'P': 9, 'U': 4})

In [32]:
# # Get activities data
# df = pd.read_csv("../config/chembl_activities/activities.csv", low_memory=False)

# # Assay id to doc id
# assayid_to_docid = {i: j for i,j in zip(df['assay_id'], df['doc_id'])}

# # Load dict target
# df = pd.read_csv("../config/chembl_activities/target_dictionary.csv", low_memory=False)

# # ChEMBL ID to name
# target_chemblid_to_name = {i: j for i,j in zip(df['chembl_id'], df['pref_name'])}

In [33]:
# Helper function - is there only a single value?
def only_one(values, name):
    """Return the sole element of `values`.

    Parameters
    ----------
    values : sequence supporting len() and indexing
        Candidate values, expected to hold exactly one element.
    name : str
        Label of the field being checked; only used in the error message.

    Raises
    ------
    ValueError
        If `values` is empty or holds more than one element.
    """
    if len(values) == 1:
        return values[0]
    raise ValueError(f"Expected exactly one {name}, found {values}")

In [34]:
# Distinct assay ChEMBL ids, in ascending order
assays = list(set(ChEMBL['assay_chembl_id']))
assays.sort()

In [35]:
# Build ASSAYS_INFO: one row per (assay, activity type, unit) combination, holding the
# assay-level annotations plus the number of activities and of distinct compounds.
ASSAYS_INFO_COLUMNS = ["assay_id", "assay_type", "assay_organism", "target_type", "target_chembl_id",
                       "target_organism", "activity_type", "unit", "activities", "cpds"]
records = []

# Split the table once per assay instead of re-filtering the whole frame for every assay id.
# Groups come out sorted by assay id; rows with a missing assay id are left out by groupby.
activities_by_assay = ChEMBL.groupby("assay_chembl_id", sort=True)

# For each assay
for assay, df_ in tqdm(activities_by_assay, total=activities_by_assay.ngroups):

    # Check coherence: each assay-level annotation must take a single value within the assay
    # (a missing value counts as a value); only_one raises ValueError otherwise.
    assay_type = only_one(list(set(df_['assay_type'])), "assay_type")
    target_type = only_one(list(set(df_['target_type'])), "target_type")
    target_chembl_id = only_one(list(set(df_['target_chembl_id'])), "target_chembl_id")
    target_organism = only_one(list(set(df_['target_organism'])), "target_organism")
    assay_organism = only_one(list(set(df_['assay_organism'])), "assay_organism")

    # For each activity type (unique() keeps first-appearance order, so runs are reproducible)
    for act_type in df_['activity_type'].unique():

        df__ = df_[df_["activity_type"] == act_type]
        # A missing activity type never compares equal, which leaves an empty subset
        # that only_one rejects.
        activity_type = only_one(list(set(df__['activity_type'])), 'activity_type')

        # For each unit; a missing unit is not a str and has to be selected with isna()
        for u in df__['unit'].unique():
            if isinstance(u, str):
                df___ = df__[df__["unit"] == u]
            else:
                df___ = df__[df__["unit"].isna()]
            unit = only_one(list(set(df___['unit'])), "unit")
            activities = len(df___)
            cpds = len(set(df___['compound_chembl_id']))
            records.append([assay, assay_type, assay_organism, target_type, target_chembl_id,
                            target_organism, activity_type, unit, activities, cpds])

ASSAYS_INFO = pd.DataFrame(records, columns=ASSAYS_INFO_COLUMNS)
# Largest assays first; a stable sort keeps the order of rows with equal compound counts deterministic.
ASSAYS_INFO = ASSAYS_INFO.sort_values('cpds', ascending=False, kind="stable").reset_index(drop=True)

  0%|          | 0/9151 [00:00<?, ?it/s]

100%|██████████| 9151/9151 [03:58<00:00, 38.40it/s] 


In [36]:
# Keep only the rows backed by more than 100 distinct compounds
enough_compounds = ASSAYS_INFO['cpds'].gt(100)
ASSAYS_INFO = ASSAYS_INFO.loc[enough_compounds].reset_index(drop=True)

In [37]:
# target_dict = pd.read_csv("../config/chembl_activities/target_dictionary.csv", low_memory=False)
# chembl_id_to_pref_name = {i: j for i,j in zip(target_dict['chembl_id'], target_dict['pref_name'])}
# ASSAYS_INFO['pref_name'] = [chembl_id_to_pref_name[i] if i in chembl_id_to_pref_name else np.nan for i in ASSAYS_INFO['target_chembl_id']]
# chembl_uniprot_mapping = open("/home/acomajuncosa/Downloads/chembl_uniprot_mapping.txt").readlines()[1:]
# chembl_uniprot_mapping = {i.split("\t")[1]: i.split("\t")[0] for i in chembl_uniprot_mapping}
# ASSAYS_INFO['uniprot'] = [chembl_uniprot_mapping[i] if i in chembl_uniprot_mapping else np.nan for i in ASSAYS_INFO['target_chembl_id']]
# ASSAYS_INFO[ASSAYS_INFO['cpds'] > 100].reset_index(drop=True).to_csv("/home/acomajuncosa/Desktop/assays_info_falciparum.csv", index=False)

In [38]:
# Short pathogen code: first letter of the first word + the second word, lower-cased
pathogen_words = pathogen.split()
pathogen_code = (pathogen_words[0][0] + pathogen_words[1]).lower()

# Output folder for this pathogen; the per-assay descriptions go in a subfolder
root = "."
PATH_TO_OUTPUT = os.path.join(root, "..", "output", pathogen_code)
descriptions_dir = os.path.join(PATH_TO_OUTPUT, "descriptions")
os.makedirs(descriptions_dir, exist_ok=True)

# Persist the assay summary table
assays_csv = os.path.join(PATH_TO_OUTPUT, 'assays.csv')
ASSAYS_INFO.to_csv(assays_csv, index=False)

In [39]:
# Load the ChEMBL assay and document metadata tables
assays = pd.read_csv("../config/chembl_activities/assays.csv", low_memory=False)
docs = pd.read_csv("../config/chembl_activities/docs.csv", low_memory=False)

# Human-readable label for each ChEMBL assay type code
assay_type_map = dict(
    F="Functional",
    B="Binding",
    T="Toxicity",
    A="ADME",
    P="Physicochemical",
    U="Uncategorized",
)

In [40]:
# for i in ASSAYS_INFO[['assay_type', 'assay_organism', 'target_type', 'target_organism', 'activity_type', 'unit', 'activities', 'cpds', 'assay_id']].values:
    
#     print(f"Assay ChEMBL ID: {i[8]}")
#     print(f"Assay type: {assay_type_map[i[0]]}")
#     print(f"Assay organism: {i[1]}")
#     print(f"Assay description: {assays[assays['chembl_id'] == i[8]]['description'].tolist()[0]}")
#     print(f"Assay strain: {assays[assays['chembl_id'] == i[8]]['assay_strain'].tolist()[0]}")
#     print(f"Assay category: {assays[assays['chembl_id'] == i[8]]['assay_category'].tolist()[0]}")
#     print(f"Assay test type: {assays[assays['chembl_id'] == i[8]]['assay_test_type'].tolist()[0]}")
#     print(f"Assay cell type: {assays[assays['chembl_id'] == i[8]]['assay_cell_type'].tolist()[0]}")

#     print(" ")
#     doc_id = assays[assays['chembl_id'] == i[8]]['doc_id'].tolist()[0]
#     print(f"Document title: {docs[docs['doc_id'] == doc_id]['title'].tolist()[0]}")
#     print(f"Document abstract: {docs[docs['doc_id'] == doc_id]['abstract'].tolist()[0]}")
#     print(f"Document journal: {docs[docs['doc_id'] == doc_id]['journal'].tolist()[0]}")
#     print(f"Document PubMed ID: {docs[docs['doc_id'] == doc_id]['pubmed_id'].tolist()[0]}")
#     print(f"Document DOI: {docs[docs['doc_id'] == doc_id]['doi'].tolist()[0]}")

#     print(" ")
#     print(f"Target type: {i[2]}")
#     print(f"Target Organism: {i[3]}")

#     print(" ")
#     print(f"Activity Type: {i[4]}")
#     print(f"Unit: {i[5]}")
#     print(f"Number of activities: {i[6]}")
#     print(f"Number of compounds: {i[7]}")

#     print(" ")
#     assay_activities = ChEMBL[(ChEMBL['assay_chembl_id'] == i[8]) & (ChEMBL['activity_type'] == i[4]) & (ChEMBL['unit'] == i[5])]
#     assay_activities = assay_activities["value"].astype(float).tolist()
#     print(f"Percentile 1: {round(np.percentile(assay_activities, 1), 3)}")
#     print(f"Percentile 25: {round(np.percentile(assay_activities, 25), 3)}")
#     print(f"Mean: {round(np.mean(assay_activities), 3)}")
#     print(f"Median: {round(np.percentile(assay_activities, 50), 3)}")
#     print(f"Percentile 75: {round(np.percentile(assay_activities, 75), 3)}")
#     print(f"Percentile 99: {round(np.percentile(assay_activities, 99), 3)}")

#     break

In [41]:
# System prompt for the LLM: sets the role (ChEMBL biodata curator) and the required
# three-paragraph output format. It is concatenated in front of the per-assay USER message
# when the generation request is built in the description loop.
# NOTE(review): the last instruction asks the model not to use thinking mode, while the
# generation call passes think=True - confirm which of the two is intended.
# NOTE(review): the prompt asks for plain ASCII output but itself contains curly quotes;
# consider normalizing them if the model echoes them back.
SYSTEM = """
You are a ChEMBL biodata curator. Your task is to write a complete, accurate and standardized description of a given biological assay.

Formatting instructions:
- The description must be structured into three paragraphs (enumerated below), each of 80-120 words.
- Each paragraph must begin with a bold markdown title in the exact format:
  **1. Assay description**\n
  **2. Outcome interpretation**\n
  **3. Results and insights**\n
- The assay description paragraph must explain the objective of the assay, the experimental system, and methodology. Specify the biological target, assay format (e.g., cell-based, binding), detection method, and any relevant experimental conditions (e.g., temperature, compound concentration).
- The outcome interpretation paragraph must describe how assay outputs are measured and interpreted. Specify how results relate to biological activity or target modulation, the direction of the biological activity (-1 if lower values lead to higher activity e.g., IC50; +1 if higher values result in higher activity e.g., percent. inhibition or effect; 0 if it’s inconclusive, e.g., clearance or solubility), controls, reference compounds, signal thresholds and normalization steps.
- The results and insights paragraph must summarize typical activity ranges, notable behaviors (e.g., agonists, inhibitors), data quality and curation notes that support integration and reproducibility. Highlight meaningful observations from the distribution of activity data. It must be coherent with the outcome interpretation paragraph.
- Separate paragraphs with a single blank line.
- Use only standard ASCII spacing for all numbers, units, and symbols.
- Insert commas in numbers when necessary (e.g., 1,000; 100,000).
- Do not insert non-breaking spaces, narrow spaces, or special typographic characters.
- Do not use tables or hidden formatting.
- Use scientific and formal language.
- Avoid speculation, informal expressions or fabricated information.
- If any relevant data is missing (reported as ‘nan’), state “not reported” rather than inventing details.
- Do NOT write Q&A, lists, bullets, or add any heading besides the three above.
- Do not use thinking mode.

"""

In [51]:
# Display the per-assay summary table (one row per assay / activity type / unit combination)
ASSAYS_INFO

Unnamed: 0,assay_id,assay_type,assay_organism,target_type,target_chembl_id,target_organism,activity_type,unit,activities,cpds
0,CHEMBL1794345,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,POTENCY,umol.L-1,170462,169987
1,CHEMBL4888485,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,ZSCORE,,147592,147430
2,CHEMBL4888485,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,INHIBITION,%,147592,147430
3,CHEMBL1794580,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,POTENCY,umol.L-1,131175,130746
4,CHEMBL4649943,F,Plasmodium falciparum,SINGLE PROTEIN,CHEMBL3301561,Plasmodium falciparum (isolate 3D7),PERCENTEFFECT,%,68620,68614
...,...,...,...,...,...,...,...,...,...,...
93,CHEMBL4879872,B,Plasmodium falciparum,SINGLE PROTEIN,CHEMBL1908388,Plasmodium falciparum (isolate 3D7),%CONTROL,%,108,108
94,CHEMBL4812655,B,Plasmodium falciparum,UNCHECKED,CHEMBL612545,,IC50,umol.L-1,163,107
95,CHEMBL4812658,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,EC50,umol.L-1,161,107
96,CHEMBL769481,F,Plasmodium falciparum,ORGANISM,CHEMBL364,Plasmodium falciparum,IC50,umol.L-1,102,102


In [54]:
# Ad-hoc check: is element 5 of the row currently bound to `i` missing (NaN)?
# NOTE(review): no earlier cell in file order defines `i`; it appears to be a loop variable
# left in the kernel by a previous run, so this cell fails after Restart & Run All.
# Looks like a debugging leftover - confirm whether it can be removed.
np.isnan(i[5])

True

In [56]:
# Generate an LLM-written description for the first rows of ASSAYS_INFO.
# For every row, the annotations sent to the model are saved as "<stem>_input.txt" and the
# model answer as "<stem>_output.txt" under PATH_TO_OUTPUT/descriptions.
N_ASSAYS_TO_DESCRIBE = 10  # number of leading ASSAYS_INFO rows processed in this run

# The same assay can appear in several rows (one per activity type / unit pair). Naming the
# files after the assay id alone made a later row overwrite the files of an earlier one, so
# those rows get an "<activity type>_<unit>" suffix; assays with a single row keep the plain
# "<assay_id>" stem.
rows_per_assay = ASSAYS_INFO['assay_id'].value_counts()

for row in ASSAYS_INFO.head(N_ASSAYS_TO_DESCRIBE).itertuples(index=False):

    assay_id = row.assay_id

    # Activity values behind this row; a missing unit is NaN and has to be matched with isna()
    if isinstance(row.unit, str):
        unit_mask = ChEMBL['unit'] == row.unit
    else:
        unit_mask = ChEMBL['unit'].isna()
    row_mask = (ChEMBL['assay_chembl_id'] == assay_id) & (ChEMBL['activity_type'] == row.activity_type) & unit_mask
    assay_activities = ChEMBL.loc[row_mask, "value"].astype(float).tolist()

    # Percentiles of an empty list are undefined; this only happens when ChEMBL no longer
    # matches the table ASSAYS_INFO was built from (e.g. cells run out of order).
    if not assay_activities:
        print(f"✗ Skipped {assay_id}: no activity values found for {row.activity_type} / {row.unit}")
        continue

    # Look up the assay and document metadata once instead of re-filtering per field
    assay_meta = assays.loc[assays['chembl_id'] == assay_id].iloc[0]
    doc_id = assay_meta['doc_id']
    doc_meta = docs.loc[docs['doc_id'] == doc_id].iloc[0]

    # Cast to built-in float: the nested dict is rendered with repr(), and NumPy >= 2 renders
    # its scalars as "np.float64(...)", which would leak into the prompt.
    pct1, pct25, median, pct75, pct99 = (
        round(float(p), 3) for p in np.percentile(assay_activities, [1, 25, 50, 75, 99])
    )

    result = {
        "Assay ChEMBL ID": assay_id,
        "Assay type": assay_type_map[row.assay_type],
        "Assay organism": row.assay_organism,
        "Assay description": assay_meta['description'],
        "Assay strain": assay_meta['assay_strain'],
        "Assay category": assay_meta['assay_category'],
        "Assay test type": assay_meta['assay_test_type'],
        "Assay cell type": assay_meta['assay_cell_type'],
        "Document title": doc_meta['title'],
        "Document abstract": doc_meta['abstract'],
        "Document journal": doc_meta['journal'],
        "Document PubMed ID": doc_meta['pubmed_id'],
        "Document DOI": doc_meta['doi'],
        "Target type": row.target_type,
        "Target organism": row.target_organism,
        "Activity type": row.activity_type,
        "Unit": row.unit,
        "Number of activities": row.activities,
        "Number of compounds": row.cpds,
        "Stats": {
            "Percentile 1": pct1,
            "Percentile 25": pct25,
            "Mean": round(float(np.mean(assay_activities)), 3),
            "Median": median,
            "Percentile 75": pct75,
            "Percentile 99": pct99
        }
    }

    result = "\n".join([key + ": " + str(value) for key, value in result.items()])
    USER = f"""Below you will find enumerated annotations from the assay under study.\n\n{result}\n\nUsing the information provided, return a standardized description for the assay."""

    # File stem: unique per row, unchanged for assays that only have one row
    file_stem = assay_id
    if rows_per_assay[assay_id] > 1:
        unit_label = row.unit if isinstance(row.unit, str) else "nounit"
        suffix = f"{row.activity_type}_{unit_label}"
        # Replace characters such as '%', '.' or '/' so the name is safe on any filesystem
        file_stem += "_" + "".join(ch if ch.isalnum() or ch == "_" else "-" for ch in suffix)

    # Print data
    with open(os.path.join(PATH_TO_OUTPUT, "descriptions", f"{file_stem}_input.txt"), "w", encoding="utf-8") as f:
        f.write(USER)

    # # Non streaming call
    # import ollama
    # response = ollama.chat(model='gpt-oss:20b',
    #                        messages=[{"role": "system", "content": SYSTEM}, {"role": "user", "content": USER}],
    #                        options={"temperature": 0.2, "num_ctx": 4096, "num_predict": 5000})
    # NOTE(review): think=True is passed although SYSTEM ends with "Do not use thinking mode." - confirm intent.
    response = ollama.generate(model='gpt-oss:20b', prompt=SYSTEM + USER, stream=False, think=True)

    # Print response
    with open(os.path.join(PATH_TO_OUTPUT, "descriptions", f"{file_stem}_output.txt"), "w", encoding="utf-8") as f:
        f.write(response.response)

    print(f"✓ Completed {file_stem}")


✓ Completed CHEMBL1794345
✓ Completed CHEMBL4888485
✓ Completed CHEMBL4888485
✓ Completed CHEMBL1794580
✓ Completed CHEMBL4649943
✓ Completed CHEMBL4513221
✓ Completed CHEMBL4513220
✓ Completed CHEMBL4649964
✓ Completed CHEMBL4649945
✓ Completed CHEMBL1054502


In [57]:
# Inspect every activity recorded for assay CHEMBL1794345
ChEMBL.loc[ChEMBL['assay_chembl_id'].eq("CHEMBL1794345")]

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,tid,target_type,target_organism,target_chembl_id,...,canonical_smiles,MW,pchembl,activity_comment,standard_text,value,unit,activity_type,relation,pchembl_calculated
110820,6522534,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,CCN1CCCC1CNC(=O)CCNC(=O)c1cc(OC)c(OC)c(OC)c1,393.484,,1,0,2.6169,umol.L-1,POTENCY,=,5.582213
110821,6479107,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,O=C(Nc1ccc(C(=O)N2CCC(N3CCCC3)CC2)cc1)c1ccc(Br...,560.367,,-1,0,5.2213,umol.L-1,POTENCY,=,5.282221
110822,6479108,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,C[C@@]12CCC3[C@@](CC[C@@H]4[C@](C)(C(=O)O)CCC[...,320.473,,0,0,9.2850,umol.L-1,POTENCY,=,5.032218
110823,6479109,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,O=C1c2ccccc2C(=O)N1CCSc1nc2ccccc2c(=O)n1CCCN1C...,478.574,,0,0,18.5260,umol.L-1,POTENCY,=,4.732218
110824,6479110,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,COc1ccc(/C=C(\NC(=O)c2ccc(OC)cc2)C(=O)Nc2ccc(C...,446.459,,0,0,10.4179,umol.L-1,POTENCY,=,4.982220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281782,6718549,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,CC(C)(O)CC[C@@H](O)[C@](C)(O)[C@H]1CC[C@@]2(O)...,480.642,,-1,0,11.6891,umol.L-1,POTENCY,=,4.932219
281783,6718550,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,O=C(C1=C(N2CCCC2)CCc2ccccc21)c1ccccc1Cl,337.850,,0,0,11.6891,umol.L-1,POTENCY,=,4.932219
281784,6718551,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,CCOC(=O)/C(C#N)=C/c1ccc(OCC(=O)Nc2ccc(OC)cc2)c...,410.426,,0,0,0.1853,umol.L-1,POTENCY,=,6.732125
281785,6718552,752407,CHEMBL1794345,F,1,Plasmodium falciparum,50425,ORGANISM,Plasmodium falciparum,CHEMBL364,...,CCn1ncc(Br)c1C(=O)Nc1cc([N+](=O)[O-])ccc1C,353.176,,-1,0,13.1154,umol.L-1,POTENCY,=,4.882218


In [43]:
# Inspect the activity values left in `assay_activities` by the description loop.
# NOTE(review): this relies on a loop-local variable leaking into the notebook namespace;
# looks like a debugging leftover - confirm whether it is still needed.
assay_activities

[]