In [22]:
import pandas as pd
import text2term as t2t
from text2term import *
import seaborn as sns
import matplotlib.pyplot as plt
import os

### Load all assay descriptions in ChEMBL 35 annotated with NER method

In [2]:
chembl_assays_all = pd.read_csv('../1_NER_method/Results/ner_chembl_35.tsv', sep='\t')
chembl_assays_all.head()

Unnamed: 0,assay_id,year,description,assay_type,method
0,89,1994.0,Evaluated for its activity to inhibit rat live...,B,
1,128,2002.0,Tested in vitro for inhibition of chymotrypsin...,B,
2,248,1982.0,"Percent inhibition was measured against 5,10-M...",B,
3,235,2003.0,Inhibitory activity against inosine 5'-inosine...,B,
4,336,1989.0,Binding affinity to the receptor was determine...,F,


In [3]:
len(chembl_assays_all)

1169293

In [4]:
# Some assays don't have a method identified by NER
len(chembl_assays_all.loc[chembl_assays_all['method'].isnull()])

506618

In [5]:
# Discard assays without a method identified by NER
chembl_assays = chembl_assays_all.dropna(subset='method', axis='index')

In [6]:
len(chembl_assays)

662675

In [7]:
# Some assays have multiple methods
chembl_assays.loc[chembl_assays['method'].str.contains('|', regex=False)]

Unnamed: 0,assay_id,year,description,assay_type,method
17,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,"the ability to inhibit [35S]GTP-gamma-S, bindi..."
28,949,2003.0,In vitro binding affinity by radioligand bindi...,B,In vitro binding affinity | radioligand bindin...
30,963,2003.0,In vitro binding affinity was determined by ra...,B,In vitro binding affinity | radioligand bindin...
48,1689,1997.0,"Maximum stimulation of [35S]GTP-gamma-S, bindi...",F,maximal effect produced | 5-hydroxytryptamine ...
89,3807,1992.0,In vitro inhibition of leukotriene B4 synthesi...,B,In vitro inhibition of leukotriene | inhibitin...
...,...,...,...,...,...
1169163,2366440,2023.0,Inhibition of STAT1 phosphorylation in human S...,B,IL-6 stimulation | Western blot analysis
1169199,2311423,2023.0,Binding affinity to HspA5 (unknown origin) ass...,B,dissociation constant | FP assay
1169239,2357032,2023.0,Binding affinity to PBRM1 bromodomain 4 (unkno...,B,dissociation constant | isothermal titration c...
1169263,2349703,2022.0,Binding affinity to TEAD 4 (unknown origin) as...,B,dissociation constant | fluorescence polarisat...


In [8]:
# Split the 'method' column on ' | ' 
chembl_assays['split_method'] = chembl_assays['method'].str.split(' | ', regex=False)

# Explode the list into separate rows
chembl_assays = chembl_assays.explode('split_method')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_assays['split_method'] = chembl_assays['method'].str.split(' | ', regex=False)


In [9]:
chembl_assays.head()

Unnamed: 0,assay_id,year,description,assay_type,method,split_method
10,410,2001.0,accumulation of compound in 3LL cells was meas...,F,HPLC,HPLC
12,471,1993.0,Inhibitory concentration of compound was calcu...,F,clonal assay,clonal assay
14,816,2000.0,Percent of maximum response by 5-HT when teste...,F,5-HT when tested for inhibition of vasoactive ...,5-HT when tested for inhibition of vasoactive ...
15,630,1998.0,Antagonistic efficacy was evaluated by adenyly...,F,adenylyl cyclase assay,adenylyl cyclase assay
17,667,2003.0,5-hydroxytryptamine 1A receptor antagonism was...,F,"the ability to inhibit [35S]GTP-gamma-S, bindi...","the ability to inhibit [35S]GTP-gamma-S, binding"


In [10]:
chembl_assays.loc[chembl_assays['assay_id']==2357032]

Unnamed: 0,assay_id,year,description,assay_type,method,split_method
1169239,2357032,2023.0,Binding affinity to PBRM1 bromodomain 4 (unkno...,B,dissociation constant | isothermal titration c...,dissociation constant
1169239,2357032,2023.0,Binding affinity to PBRM1 bromodomain 4 (unkno...,B,dissociation constant | isothermal titration c...,isothermal titration calorimetric analysis


In [11]:
len(chembl_assays)

681476

In [12]:
chembl_assays['split_method'].nunique()

42349

In [13]:
unique_terms = list(chembl_assays['split_method'].drop_duplicates())
unique_terms[:20]

['HPLC',
 'clonal assay',
 '5-HT when tested for inhibition of vasoactive intestinal polypeptide-induced',
 'adenylyl cyclase assay',
 'the ability to inhibit [35S]GTP-gamma-S, binding',
 '5-HT; Emin (%): not determined',
 'radioligand binding technique',
 'In vitro binding affinity',
 'radioligand binding assay',
 'In vitro inhibition of',
 'maximal effect produced',
 '5-hydroxytryptamine 1D receptor',
 'In vitro binding affinity towards the',
 'In vitro mean growth lethal concentration',
 '5-HETE production',
 'In vitro by displacement of [3H]LY-278584 from 5-hydroxytryptamine 3',
 'In vitro inhibition of leukotriene',
 'inhibiting 5-lipoxygenase',
 'In vitro antifungal activity against',
 'FACS flow cytometry']

In [27]:
len(unique_terms)

42349

### Annotate with text2term BAO ontology

In [14]:
# Load the BAO ontology
# Caches the ontology in memory
cache_ontology("data/bao_complete_2.8.12.owl", ontology_acronym="bao", base_iris=())

2025-06-24 09:24:40 INFO [text2term.term_collector]: Loading ontology data/bao_complete_2.8.12.owl...
2025-06-24 09:25:03 INFO [text2term.term_collector]: ...done (ontology loading time: 22.37s)
2025-06-24 09:25:03 INFO [text2term.term_collector]: Collecting ontology term details...
2025-06-24 09:25:03 INFO [text2term.term_collector]: ...done: collected 802 ontology terms (collection time: 0.10s)
2025-06-24 09:25:03 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.ANY
2025-06-24 09:25:03 INFO [text2term.t2t]: Caching ontology data/bao_complete_2.8.12.owl to: cache/bao


<text2term.onto_cache.OntologyCache at 0x7fe13f06c090>

In [45]:
list_to_process = unique_terms
chunk_size = 500
chunks = []
for i in range(0, len(list_to_process), chunk_size):
    chunk = list_to_process[i:i + chunk_size]
    chunks.append(chunk)

Process the assays in chunks because without doing so it requires a lot of memory to store the resulting text2term annotations in memory for all the assays. Processing in chunks worked and appends to a list after each text2term call of 500. 

#### Following cell appends to file - make sure no file with the filename exists already!

In [46]:
#%%time
output_path='./results/chembl35_assays_bao_annotations.tsv'

for chunk in chunks:
    #executes text2term in the dataset with BAO annotates assays 
    assays_ann_data = t2t.map_terms(source_terms = chunk,
                         target_ontology = 'bao',
                         mapper = Mapper.TFIDF,
                         #base_iris = 'http://www.bioassayontology.org/bao#',
                         # Set the minimum Mapping Score here
                         min_score=0.6,
                         max_mappings=50,
                         term_type=OntologyTermType.CLASS,
                         use_cache=True)
    # Merge text2term results back to the assay method (to get back assay id)
    annotated_chembl_assays =  chembl_assays.merge(assays_ann_data, left_on='split_method', right_on='Source Term', how='right')
    # Use append to the file (to reduce memory use)
    annotated_chembl_assays.to_csv(output_path, sep='\t', mode='a', header=not os.path.exists(output_path), index=False)

2025-06-24 09:49:55 INFO [text2term.t2t]: Loading cached ontology from: cache/bao/bao-term-details.pickle
2025-06-24 09:49:55 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2025-06-24 09:49:55 INFO [text2term.t2t]: Mapping 3000 source terms to bao
2025-06-24 09:49:55 INFO [text2term.t2t]: ...done (mapping time: 0.17s seconds)
2025-06-24 09:49:59 INFO [text2term.t2t]: Loading cached ontology from: cache/bao/bao-term-details.pickle
2025-06-24 09:49:59 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2025-06-24 09:49:59 INFO [text2term.t2t]: Mapping 3000 source terms to bao
2025-06-24 09:49:59 INFO [text2term.t2t]: ...done (mapping time: 0.18s seconds)
2025-06-24 09:50:00 INFO [text2term.t2t]: Loading cached ontology from: cache/bao/bao-term-details.pickle
2025-06-24 09:50:00 INFO [text2term.t2t]: Filtered ontology terms to those of type: OntologyTermType.CLASS
2025-06-24 09:50:00 INFO [text2term.t2t]: Mapping 3