## NER model for method extraction in assay description - Results
This notebook analyses the statistics of the annotated data and the predictions of the NER model.

In [None]:
import pandas as pd
import os
import numpy as np
# Show complete DataFrames in the notebook output: no cap on rows, columns or column width
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# Define input/output paths
dpath = "data"                         # directory the input tables are read from
rpath = "Results"                      # directory holding the NER predictions and the result tables
spath = os.path.join(rpath, "Sample")  # sub-directory of rpath holding the sample subsets (derived, so it follows rpath)

In [None]:
# Load the ChEMBL 35 dataset without annotations.
# See assays_description/2_broad_assay_category/chembl_35_broad_predictions.ipynb
# for details on how this file is generated.
chembl_assays = (
    pd.read_csv(os.path.join(dpath, 'chembl_35_all_BF_assays.txt'), sep='\t')
    .drop(columns=['Unnamed: 0'])  # column named 'Unnamed: 0' in the file is not needed
)

In [None]:
# Load the training set (manually annotated methods) ...
training_set_path = os.path.join(dpath, 'assays_data.csv')
ann_assays = pd.read_csv(training_set_path, sep='\t')

# ... and the ChEMBL 35 dataset with the methods annotated by the NER model
ner_predictions_path = os.path.join(rpath, 'ner_chembl_35.tsv')
ner_assays = pd.read_csv(ner_predictions_path, sep='\t')

### General numbers and commonly identified methods

In [None]:
# Overview of the methods annotated in the training set

# Number of distinct method strings
print(ann_assays['method'].nunique())

# Frequency of each method string across assays (20 most frequent shown)
value_counts = ann_assays['method'].value_counts()
print(value_counts.nlargest(20))

# Number of descriptions without an annotated method
int(ann_assays['method'].isna().sum())

In [None]:
# Percentage of type-B and type-F assays in the full (non annotated) ChEMBL 35 data
allassays = len(chembl_assays)  # total number of assays in the ChEMBL 35 dataset

is_type_b = chembl_assays['assay_type'] == 'B'
is_type_f = chembl_assays['assay_type'] == 'F'
all_B = int(is_type_b.sum())    # assays of type B
all_F = int(is_type_f.sum())    # assays of type F

bprop = all_B * 100 / allassays  # percentage of assays of type B
fprop = all_F * 100 / allassays  # percentage of assays of type F

# One figure per line: total, B count, F count, B percentage, F percentage
print(allassays, all_B, all_F, bprop, fprop, sep='\n')

In [None]:
# Percentage of type-B and type-F assays for which the NER model has found a method
ner_found = ner_assays[ner_assays['method'].notna()]  # assays with a method found by the NER model

ann_B = int((ner_found['assay_type'] == 'B').sum())   # assays of type B with a method
ann_F = int((ner_found['assay_type'] == 'F').sum())   # assays of type F with a method

annbprop = ann_B * 100 / all_B                        # percentage of type-B assays with a method
annfprop = ann_F * 100 / all_F                        # percentage of type-F assays with a method

# One figure per line: assays with a method, B count, F count, B percentage, F percentage
print(len(ner_found), ann_B, ann_F, annbprop, annfprop, sep='\n')


In [None]:
# Check the methods predicted by the NER model
print(ner_assays.method.nunique())

# Method frequency among assay descriptions
value_counts = ner_assays.method.value_counts()
print(value_counts.nlargest(20))

# Name both columns explicitly: the column names produced by a bare reset_index()
# on a value_counts() result depend on the pandas version, and the sort below
# needs a column called 'count'.
fullset = (
    value_counts.rename_axis('method')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
fullset.to_csv(os.path.join(rpath, 'ner_chembl_35_methods.tsv'), sep='\t', index=False)

# Count assays without and with a predicted method
print(len(ner_assays[ner_assays.method.isnull()]))
print(len(ner_assays[~ner_assays.method.isnull()]))

### Sampling data and performance

#### Sample data 1

In [None]:
# Draw a sample, stratified by assay type, from the NER-annotated assays
# that are not part of the training set
display(ner_assays.columns)

training_assays = ann_assays.assay_id.unique()
not_in_training = ~ner_assays['assay_id'].isin(training_assays)
ner_b = ner_assays[(ner_assays['assay_type'] == 'B') & not_in_training]
ner_f = ner_assays[(ner_assays['assay_type'] == 'F') & not_in_training]

# The number of rows drawn per type is the rounded percentage of that type;
# random_state is fixed for reproducibility
n_type_b = int(round(bprop, 0))
n_type_f = int(round(fprop, 0))
sample_b = ner_b.sample(n=n_type_b, random_state=412)
sample_f = ner_f.sample(n=n_type_f, random_state=412)

sample_columns = ['assay_id', 'assay_type', 'description', 'method']
method_sample = pd.concat([sample_b, sample_f])[sample_columns]

method_sample.head(2)


In [None]:
# Write the sample (including the NER-predicted method) to a tab-separated file
sample_path = os.path.join(spath, 'sample_subset_chembl35.tsv')
method_sample.to_csv(sample_path, sep='\t', index=False)
print(method_sample.groupby('assay_type').count())

# Blind version for curation: the same rows without the NER-predicted method
blind_method_sample = method_sample.drop(columns=['method'])

# Write the blind sample to a tab-separated file
blind_sample_path = os.path.join(spath, 'sample_subset_chembl35_blind.tsv')
blind_method_sample.to_csv(blind_sample_path, sep='\t', index=False)

In [None]:
# Load the blind sample annotated by the curators (the first spreadsheet row is skipped)
annotated_path = os.path.join(spath, 'sample_subset_chembl35_blind_ann.xlsx')
blind_method_sample_ann = pd.read_excel(annotated_path, skiprows=[0])
blind_method_sample_ann = blind_method_sample_ann.rename(columns={'extracted method':'annotated_method'})

# Merge the NER annotations with the curators' annotations. Columns present in both
# tables get the '_ann' suffix on the curators' side and are dropped afterwards.
method_sample = method_sample.merge(blind_method_sample_ann, on='assay_id', how='left', suffixes=('', '_ann'))
duplicated_columns = [name for name in method_sample.columns if name.endswith('_ann')]
method_sample = method_sample.drop(columns=duplicated_columns)

#### Sample data 2

In [None]:
# Draw a second sample, stratified by assay type, from the NER-annotated assays
# that are not part of the training set
display(ner_assays.columns)

training_assays = ann_assays.assay_id.unique()
not_in_training = ~ner_assays['assay_id'].isin(training_assays)
# NOTE: assay_ids already used in sample data 1 are not excluded here; adding that
# filter now could change which rows are drawn with this random_state.
ner_b = ner_assays[(ner_assays['assay_type'] == 'B') & not_in_training]
ner_f = ner_assays[(ner_assays['assay_type'] == 'F') & not_in_training]

# The number of rows drawn per type is the rounded percentage of that type;
# random_state is fixed for reproducibility
n_type_b = int(round(bprop, 0))
n_type_f = int(round(fprop, 0))
sample_b = ner_b.sample(n=n_type_b, random_state=266)
sample_f = ner_f.sample(n=n_type_f, random_state=266)

sample_columns = ['assay_id', 'assay_type', 'description', 'method']
method_sample2 = pd.concat([sample_b, sample_f])[sample_columns]

# Overlap with sample data 1 can be inspected by testing method_sample2.assay_id
# against the assay_ids of method_sample with isin().


In [None]:
# Write sample dataset 2 (including the NER-predicted method) to a tab-separated file
sample2_path = os.path.join(spath, 'sample_subset2_chembl35.tsv')
method_sample2.to_csv(sample2_path, sep='\t', index=False)
print(method_sample2.groupby('assay_type').count())

# Blind version for curation: the same rows without the NER-predicted method,
# written to its own tab-separated file
blind_method_sample2 = method_sample2.drop(columns=['method'])
blind_sample2_path = os.path.join(spath, 'sample_subset2_chembl35_blind.tsv')
blind_method_sample2.to_csv(blind_sample2_path, sep='\t', index=False)

In [None]:
# Load blind sample data 2 annotated by the curators (the first spreadsheet row is skipped)
annotated2_path = os.path.join(spath, 'sample_subset2_chembl35_blind_ann.xlsx')
blind_method_sample2_ann = pd.read_excel(annotated2_path, skiprows=[0])
blind_method_sample2_ann = blind_method_sample2_ann.rename(columns={'method':'annotated_method'})

# Merge the NER annotations with the curators' annotations. Columns present in both
# tables get the '_ann' suffix on the curators' side and are dropped afterwards.
method_sample2 = method_sample2.merge(blind_method_sample2_ann, on='assay_id', how='left', suffixes=('', '_ann'))
duplicated_columns = [name for name in method_sample2.columns if name.endswith('_ann')]
method_sample2 = method_sample2.drop(columns=duplicated_columns)
method_sample2

#### Confusion matrix in both sample datasets
Both sample datasets are concatenated and evaluated together. If the curator annotations for the second sample are not available yet, evaluate the first sample only.

In [None]:
# Concatenate the two sample datasets. ignore_index=True gives every row a unique
# label; without it the row labels of the two frames would be repeated.
final_sample = pd.concat([method_sample, method_sample2], ignore_index=True)

# Label each prediction against the curators' annotation.
# np.select uses the first condition that holds, so the order below matters.
final_sample['evaluation'] = np.select(
    [
        final_sample.method == final_sample.annotated_method,                   # TP: prediction equals annotation (NaN never equals NaN)
        (final_sample.method.isna()) & (final_sample.annotated_method.isna()),  # TN: nothing predicted, nothing annotated
        (final_sample.method.isna()) & (final_sample.annotated_method.notna()),  # FN: annotated method not predicted
        (final_sample.method.notna()) & (final_sample.annotated_method.isna()),  # FP: method predicted where none was annotated
    ],
    ['TP', 'TN', 'FN', 'FP'],
    default='PM',  # partial match: both present but not identical
)

# Values for the confusion matrix (printed explicitly: only the last expression
# of a cell is displayed automatically)
print(final_sample.evaluation.value_counts())
final_sample.to_csv(os.path.join(rpath, "ner_chembl_35_sample_subset_evaluation.tsv"), sep='\t', index=False)

In [None]:
# Show the assays evaluated as FP, FN or partial match (PM)
not_exact = final_sample['evaluation'].isin(['FP', 'FN', 'PM'])
display(final_sample[not_exact])

In [None]:
# Select the partial matches; their Levenshtein distance is calculated further down
is_partial_match = final_sample['evaluation'] == 'PM'
sample_pm = final_sample[is_partial_match]

def levenshtein_distance(str1, str2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.
    The comparison is case-sensitive.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    # previous[col] is the distance between the prefix of str1 handled so far
    # and str2[:col]; for the empty prefix that is simply col insertions.
    previous = list(range(len(str2) + 1))
    for row, char1 in enumerate(str1, start=1):
        # Turning str1[:row] into the empty string takes `row` deletions.
        current = [row]
        for col, char2 in enumerate(str2, start=1):
            if char1 == char2:
                # Same character: no extra edit on top of the diagonal cell.
                current.append(previous[col - 1])
            else:
                deletion = previous[col] + 1
                insertion = current[col - 1] + 1
                substitution = previous[col - 1] + 1
                current.append(min(deletion, insertion, substitution))
        previous = current
    return previous[-1]

# Edit distance between the predicted and the curated method of each partial match.
# The distances are collected in a plain list so that an empty selection simply
# gives an empty column, and assign() returns a new DataFrame, so nothing is
# written into a filtered slice of another frame.
sample_pm = sample_pm.assign(
    lev_distance=[
        levenshtein_distance(predicted, annotated)
        for predicted, annotated in zip(sample_pm['method'], sample_pm['annotated_method'])
    ]
)
sample_pm