In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the preprocessed activities table; the path is resolved relative to the
# current working directory (one level up, then config/chembl_processed).
activities_all_raw = pd.read_csv(
    os.path.join(".", "..", "config", 'chembl_processed', 'activities_preprocessed.csv'),
    low_memory=False,
)

In [3]:
def create_text_flag(df):
    """Collapse ``activity_comment`` and ``standard_text`` into one ``text_flag`` column.

    Both source columns are compared against the codes 1, -1 and 0 (the
    meaning of these codes is defined upstream of this function). Per row:

    * ``text_flag = 1``   if either source column equals 1
    * ``text_flag = -1``  if either source column equals -1
    * ``text_flag = 0``   if both source columns equal 0
    * ``text_flag = NaN`` otherwise (e.g. one column is 0 and the other is
      missing or holds any other value)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``activity_comment`` and ``standard_text``.
        ``compound_chembl_id`` is additionally read when a conflict is
        reported. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``activity_comment`` / ``standard_text`` and with
        a float ``text_flag`` column.

    Raises
    ------
    ValueError
        If any row has 1 in one source column and -1 in the other. The
        message lists up to 20 offending rows.
    """
    comment = df['activity_comment']
    text = df['standard_text']

    cond_nan = (comment == 0) & (text == 0)
    cond_pos = (comment == 1) | (text == 1)
    cond_neg = (comment == -1) | (text == -1)

    # Detect row-level conflicts
    conflict = cond_pos & cond_neg
    if conflict.any():
        raise ValueError(
            "Conflicting labels (contains both 1 and -1):\n"
            + df.loc[conflict, ["compound_chembl_id", "activity_comment", "standard_text"]].head(20).to_string())

    # Remove the original fields first: drop() returns a new frame, so the
    # label below is written to the result and the caller's frame stays untouched.
    result = df.drop(columns=['activity_comment', 'standard_text'])

    # Assign row-level label. The three masks are mutually exclusive once the
    # conflict check has passed, so assignment order does not matter.
    result["text_flag"] = np.nan
    result.loc[cond_pos, "text_flag"] = 1
    result.loc[cond_neg, "text_flag"] = -1
    result.loc[cond_nan, "text_flag"] = 0

    return result

In [4]:
# Replace the two text columns with the merged text_flag column.
# NOTE: this rebinds the same name, so re-running this cell without reloading
# the CSV raises KeyError (the source columns are already dropped).
activities_all_raw = activities_all_raw.pipe(create_text_flag)

In [5]:
# Display the processed frame; the new text_flag column appears last.
activities_all_raw

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,doc_chembl_id,tid,target_type,target_organism,...,canonical_smiles,MW,pchembl,bao_endpoint,value,unit,activity_type,relation,pchembl_calculated,text_flag
0,31863,54505,CHEMBL663853,B,8,,CHEMBL1137930,63,SINGLE PROTEIN,Homo sapiens,...,c1ccc(-c2nc3c(-c4nc5ccccc5o4)cccc3o2)cc1,312.328,,BAO_0000190,100.000,umol.L-1,IC50,>,4.000000,0.0
1,31864,83907,CHEMBL872937,B,8,,CHEMBL1146658,11653,SINGLE PROTEIN,Homo sapiens,...,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,398.374,5.60,BAO_0000190,2.500,umol.L-1,IC50,=,5.602060,0.0
2,31865,88152,CHEMBL693237,F,1,Homo sapiens,CHEMBL1146658,22221,NON-MOLECULAR,,...,Cc1ccc2oc(-c3cccc(N4C(=O)c5ccc(C(=O)O)cc5C4=O)...,398.374,,BAO_0000190,50.000,umol.L-1,IC50,>,4.301030,0.0
3,31866,83907,CHEMBL872937,B,8,,CHEMBL1146658,11653,SINGLE PROTEIN,Homo sapiens,...,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,520.497,5.05,BAO_0000190,9.000,umol.L-1,IC50,=,5.045757,0.0
4,31867,88153,CHEMBL693238,F,1,Homo sapiens,CHEMBL1146658,22221,NON-MOLECULAR,,...,COc1ccccc1-c1ccc2oc(-c3ccc(OC)c(N4C(=O)c5ccc(C...,520.497,,BAO_0000190,,umol.L-1,IC50,=,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24040982,29054631,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,CC(C)Oc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(-c3cccc(=O...,436.468,,BAO_0000480,,,KON,=,,0.0
24040983,29054632,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,CC(C)Oc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(-c3cccc(=O...,436.468,,BAO_0000179,,s-1,KOFF,=,,0.0
24040984,29054633,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,Cc1nccc(-c2cnc(OC[C@@H]3CCC(=O)N3)c3cc(OC(C)C)...,435.484,6.30,BAO_0000190,0.503,umol.L-1,IC50,=,6.298432,0.0
24040985,29054634,2596842,CHEMBL5739541,B,9,Homo sapiens,CHEMBL5729811,19639,SINGLE PROTEIN,Homo sapiens,...,Cc1nccc(-c2cnc(OC[C@@H]3CCC(=O)N3)c3cc(OC(C)C)...,435.484,,BAO_0000480,,,KON,=,,0.0


In [6]:
# Cast the three grouping columns to string and show missing values as "" so
# that they form their own group instead of being dropped.
s = activities_all_raw[["activity_type", "unit", 'text_flag']]
s = s.astype("string").fillna("")

# Count each (activity_type, unit, text_flag) combination, most frequent first.
out = s.value_counts(subset=["activity_type", "unit", 'text_flag'], dropna=False)
out = out.reset_index(name="count")
out = out.sort_values("count", ascending=False, ignore_index=True)

# Running share of all rows covered by the combinations seen so far.
total_count = out['count'].sum()
running_count = out['count'].cumsum()
out['cumulative_prop'] = (running_count / total_count).round(3)
del running_count

In [7]:
# Display the (activity_type, unit, text_flag) frequency table, sorted by count descending.
out

Unnamed: 0,activity_type,unit,text_flag,count,cumulative_prop
0,IC50,umol.L-1,0.0,3207559,0.133
1,POTENCY,umol.L-1,0.0,2875824,0.253
2,GI50,umol.L-1,-1.0,2220580,0.345
3,INHIBITION,%,0.0,1708901,0.416
4,PERCENTEFFECT,%,0.0,1328350,0.472
...,...,...,...,...,...
13544,VMAX,pM 0.5hr-1,0.0,1,1.000
13545,KD,umol.L-1.s-1,0.0,1,1.000
13546,IZAARAARLV,,-1.0,1,1.000
13547,IZAARAARLV,%,0.0,1,1.000
