# Order-free matching performance

In [1]:
# Generic imports

import glob
import os
import pandas as pd
from os import listdir
from os.path import isfile, join
import ast
import json


# Sklearn imports
from sklearn.metrics import f1_score, recall_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, auc, roc_curve

from tqdm import tqdm

## Read order-free candidates

In [2]:
# Run configuration. Both values are interpolated into the result paths below.
#   picos       : one of ['P', 'I', 'O', 'S']
#   match_level : one of ['doc', 'sent', 'win_5', 'para']
picos, match_level = 'I', 'doc'

In [3]:
# Directory holding the order-free match files for the configured match level and PICOS entity
results_root = '/mnt/nas2/results/Results/systematicReview/order_free_matching/EBM_PICO_training_matches'
order_free_dir = f'{results_root}/order_free/{match_level}/{picos}'

# Every entry in the directory is listed as-is
order_free_files = os.listdir(order_free_dir)
print('Files: ', order_free_files)

Files:  ['Biomedical_or_Dental_Material.json', '.nfs0000000011200a4200000003', 'Classification.json', 'Intellectual_Product.json', 'Biologically_Active_Substance.json', 'Diagnostic_Procedure.json', 'Gene_or_Genome.json', 'Finding.json', 'Functional_Concept.json', 'Medical_Device.json', 'Organic_Chemical.json', 'Laboratory__Procedure.json', 'Manufactured_Object.json', 'train_ebm_intervention.json', 'Professional_Society.json', 'Pharmacologic_Substance.json', 'Therapeutic_or_Preventive_Procedure.json', 'train_ebm_intervention_syn.json', 'Biomedical_Occupation_or_Discipline.json', 'Health_Care_Activity.json', 'Idea_or_Concept.json', 'Temporal_Concept.json']


In [4]:
# Keep only the JSON match files. The directory listing can contain transient
# NFS placeholder entries ('.nfs...') whose names differ between runs, so
# removing one hard-coded name raises ValueError as soon as it is absent.
order_free_files = [name for name in order_free_files if name.endswith('.json')]
#order_free_files.remove('Finding.json')

In [5]:
# Parsed JSON content of every order-free match file, keyed by file name
orf_loaded_files = {}

for file_name in tqdm(order_free_files):
    print('Loading file...', file_name)
    with open(f'{order_free_dir}/{file_name}', 'r') as handle:
        orf_loaded_files[file_name] = json.load(handle)

  0%|          | 0/21 [00:00<?, ?it/s]

Loading file... Biomedical_or_Dental_Material.json


  5%|▍         | 1/21 [00:00<00:15,  1.30it/s]

Loading file... Classification.json


 10%|▉         | 2/21 [00:01<00:11,  1.62it/s]

Loading file... Intellectual_Product.json


 14%|█▍        | 3/21 [00:16<02:09,  7.18s/it]

Loading file... Biologically_Active_Substance.json


 19%|█▉        | 4/21 [00:20<01:41,  5.95s/it]

Loading file... Diagnostic_Procedure.json


 24%|██▍       | 5/21 [00:44<03:20, 12.50s/it]

Loading file... Gene_or_Genome.json


 29%|██▊       | 6/21 [00:45<02:08,  8.56s/it]

Loading file... Finding.json


 33%|███▎      | 7/21 [02:17<08:22, 35.88s/it]

Loading file... Functional_Concept.json


 38%|███▊      | 8/21 [02:17<05:19, 24.60s/it]

Loading file... Medical_Device.json


 43%|████▎     | 9/21 [02:20<03:33, 17.75s/it]

Loading file... Organic_Chemical.json


 48%|████▊     | 10/21 [02:22<02:19, 12.71s/it]

Loading file... Laboratory__Procedure.json


 52%|█████▏    | 11/21 [02:51<02:58, 17.88s/it]

Loading file... Manufactured_Object.json


 57%|█████▋    | 12/21 [02:52<01:52, 12.55s/it]

Loading file... train_ebm_intervention.json


 62%|██████▏   | 13/21 [03:00<01:31, 11.43s/it]

Loading file... Professional_Society.json
Loading file... Pharmacologic_Substance.json


 71%|███████▏  | 15/21 [03:04<00:41,  7.00s/it]

Loading file... Therapeutic_or_Preventive_Procedure.json


 76%|███████▌  | 16/21 [06:17<04:25, 53.18s/it]

Loading file... train_ebm_intervention_syn.json


 81%|████████  | 17/21 [06:19<02:39, 39.90s/it]

Loading file... Biomedical_Occupation_or_Discipline.json
Loading file... Health_Care_Activity.json


 95%|█████████▌| 20/21 [07:34<00:29, 29.73s/it]

Loading file... Idea_or_Concept.json
Loading file... Temporal_Concept.json


100%|██████████| 21/21 [07:35<00:00, 21.67s/it]


## Load order-free candidates

In [49]:
def are_n_consecutive(numbers, n=2):
    """
    Check whether any run of `n` adjacent list elements forms a set of consecutive integers

    The elements inside a window may appear in any order; only their values
    have to cover a span of exactly `n` distinct, consecutive numbers.

    Args:
        numbers (list[int]): A list of integers, scanned in list order
        n (int): Window length. Defaults to 2.

    Returns:
        bool: True if at least one window of length `n` is consecutive, False otherwise

    Raises:
        ValueError: If `n` is smaller than 1
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    if len(numbers) < n:
        # Fewer than n elements: no window of length n exists
        return False

    for start in range(len(numbers) - n + 1):
        window = numbers[start:start + n]
        # n distinct values spanning exactly n-1 are necessarily consecutive
        if max(window) - min(window) == n - 1 and len(set(window)) == n:
            return True

    return False

In [2]:
def are_numbers_consecutive(numbers):
    """
    Tell whether the given integers, once sorted, form an unbroken +1 sequence

    Args:
        numbers (list[int]): A list of integers (order does not matter)

    Returns:
        bool: True if every sorted value is exactly one more than its
            predecessor; False otherwise, and also False for fewer than two values
    """
    # Zero or one value is never treated as consecutive
    if len(numbers) < 2:
        return False

    ordered = sorted(numbers)

    # Pair every value with its successor and require a step of exactly one
    return all(nxt == prev + 1 for prev, nxt in zip(ordered, ordered[1:]))

In [92]:
# set variables

# set_partial_matches    -> passed to get_orf as `par`
# set_actual_orf_matches -> passed to get_orf as `actual_orf`
set_partial_matches, set_actual_orf_matches = True, True

In [93]:
def _extend_orf_offsets(orfs, inters, actual_orf):
    """Append the word offsets of the qualifying matches in `inters` to `orfs`, in place."""
    for pmid, matches in inters.items():
        # Only matches made of more than one character offset are considered
        if len(matches['char offs.']) <= 1:
            continue

        word_offsets = matches['word offs.']

        if actual_orf == True:
            # NOTE(review): the original call passed a second positional argument (2),
            # which the one-argument are_numbers_consecutive defined in this notebook
            # rejects with a TypeError. If are_n_consecutive(word_offsets, 2) was the
            # intended check, swap it in here - confirm against the recorded results.
            keep = not are_numbers_consecutive(word_offsets)
        elif actual_orf == False:
            keep = True
        else:
            # Any other value selects nothing
            keep = False

        if keep:
            orfs.setdefault(pmid, []).extend(word_offsets)


def get_orf(v, par, actual_orf):
    """
    Collect the word offsets of order-free matches, grouped by PMID

    Args:
        v (dict): Content of one loaded match file. Each value is a mapping that
            may hold 'Inters. (full)' and/or 'Inters. (partial)' entries; each of
            those maps a PMID to a dict with 'char offs.' and 'word offs.' lists.
        par (bool | str): Partial intersections are included when this is True or 'both'
        actual_orf (bool): When True, a match is kept only if its word offsets
            are not consecutive; when False, every multi-offset match is kept

    Returns:
        dict: PMID -> flat list of word offsets accumulated over all kept matches
    """
    orfs = dict()
    include_partial = (par == 'both' or par == True)

    for v_i in v.values():

        # Full intersections are always processed first
        if 'Inters. (full)' in v_i and len(v_i['Inters. (full)']) > 0:
            _extend_orf_offsets(orfs, v_i['Inters. (full)'], actual_orf)

        if 'Inters. (partial)' in v_i and len(v_i['Inters. (partial)']) > 0 and include_partial:
            _extend_orf_offsets(orfs, v_i['Inters. (partial)'], actual_orf)

    return orfs

In [94]:
# Per-file order-free word offsets (PMID -> offsets), extracted with the options set above
orfs_dict = {
    file_name: get_orf(contents, par = set_partial_matches, actual_orf = set_actual_orf_matches)
    for file_name, contents in orf_loaded_files.items()
}

In [95]:
# Count the order-free offsets contributed by each loaded file; the counts are
# used afterwards to order the files from fewest to most matches.

order_offset_dict = dict()

for k, v in orfs_dict.items():

    # Sum the per-PMID lengths directly instead of concatenating every offset
    # list into one throw-away list just to measure it.
    n_offsets = sum(len(v_i) for v_i in v.values())

    order_offset_dict[k] = n_offsets

    print( f'Offsets in {k}: ', n_offsets )

Offsets in Biomedical_or_Dental_Material.json:  269764
Offsets in Classification.json:  221903
Offsets in Intellectual_Product.json:  6384570
Offsets in Biologically_Active_Substance.json:  1747134
Offsets in Diagnostic_Procedure.json:  8650791
Offsets in Gene_or_Genome.json:  707175
Offsets in Finding.json:  34806379
Offsets in Functional_Concept.json:  438148
Offsets in Medical_Device.json:  2408193
Offsets in Organic_Chemical.json:  1226365
Offsets in Laboratory__Procedure.json:  9350649
Offsets in Manufactured_Object.json:  351652
Offsets in train_ebm_intervention.json:  20569
Offsets in Professional_Society.json:  1014
Offsets in Pharmacologic_Substance.json:  3235747
Offsets in Therapeutic_or_Preventive_Procedure.json:  80134307
Offsets in train_ebm_intervention_syn.json:  74471
Offsets in Biomedical_Occupation_or_Discipline.json:  29481
Offsets in Health_Care_Activity.json:  13611371
Offsets in Idea_or_Concept.json:  168142
Offsets in Temporal_Concept.json:  656504


In [96]:
# Rebuild the counts dict with its keys in ascending order of offset count
order_offset_sorteddict = {
    name: order_offset_dict[name]
    for name in sorted(order_offset_dict, key=order_offset_dict.get)
}

In [119]:
# Print every file name without its '.json' part, quoted and comma-terminated
for file_name in order_offset_sorteddict:
    stem = str(file_name).replace('.json', '')
    print( f"'{stem}'," )

'Professional_Society',
'train_ebm_intervention',
'Biomedical_Occupation_or_Discipline',
'train_ebm_intervention_syn',
'Idea_or_Concept',
'Classification',
'Biomedical_or_Dental_Material',
'Manufactured_Object',
'Functional_Concept',
'Temporal_Concept',
'Gene_or_Genome',
'Organic_Chemical',
'Biologically_Active_Substance',
'Medical_Device',
'Pharmacologic_Substance',
'Intellectual_Product',
'Diagnostic_Procedure',
'Laboratory__Procedure',
'Health_Care_Activity',
'Finding',
'Therapeutic_or_Preventive_Procedure',


In [97]:
n_dictionaries = len(order_offset_sorteddict)
print( 'Total number of dictionaries loaded: ', n_dictionaries )

Total number of dictionaries loaded:  21


## Load order-bound matches

In [98]:
def order_free_matches(x, orf_offsets):
    """
    Set the label to 1 at every position whose offset appears in the order-free matches

    Args:
        x: Object exposing parallel `pmid`, `offsets` and `labels` iterables;
            `offsets` and `labels` hold Python-literal strings (a list and a dict)
        orf_offsets (dict): str(PMID) -> offsets of the order-free matches

    Returns:
        list[list]: One label list per row; rows without an entry in
            `orf_offsets` keep their parsed labels unchanged
    """
    labs_modified = []

    for identifier, offs, labs in zip(x.pmid, x.offsets, x.labels):

        lab_val = list(ast.literal_eval(labs).values())
        off_val = ast.literal_eval(offs)

        key = str(identifier)
        if key in orf_offsets:
            # Translate each matched offset into its position within the row
            match_indices = [off_val.index(m) for m in orf_offsets[key]]
            for position in match_indices:
                # Positions beyond the label list are ignored
                if position < len(lab_val):
                    lab_val[position] = 1

        labs_modified.append(lab_val)

    return labs_modified

In [99]:
# Order-bound (direct) match files for the configured PICOS entity.
# NOTE(review): the file names look crossed relative to the variable names
# (`ob_int` reads the '*_syn' file, `ob_int_syn` reads the other one), and
# 'intervetion' may be a typo unless the file on disk is really spelled that way - confirm.
direct_dir = f'/mnt/nas2/results/Results/systematicReview/order_free_matching/EBM_PICO_training_matches/direct/{picos}'
ob_int = f'{direct_dir}/lf_ds_intervention_syn.tsv'
ob_int_syn = f'{direct_dir}/lf_ds_intervetion.tsv'

In [100]:
# Read both tab-separated files (first row is the header), then stack them row-wise
ob_int_df, ob_int_syn_df = (
    pd.read_csv(tsv_path, sep='\t', header=0) for tsv_path in (ob_int, ob_int_syn)
)
ob_merged_df = pd.concat([ob_int_df, ob_int_syn_df])

In [101]:
def process_gt(l):
    """
    Binarise a ground-truth label sequence into '0' / '1' strings

    A label whose string form is '0' stays '0'; every other label becomes '1'.
    The comparison is done on the string form so that integer labels are
    handled the same way as string labels (an integer 0 must not become '1').

    Args:
        l (str | list): Label sequence, or its Python-literal string representation

    Returns:
        list[str]: One '0' or '1' per input label
    """
    labels = l

    if isinstance(labels, str):
        labels = ast.literal_eval(labels)

    # convert every non-zero (fine-grained) label to '1'
    return ['0' if str(n) == '0' else '1' for n in labels]

# Binarise the `i` and `i_f` columns of both frames
for frame in (ob_int_df, ob_int_syn_df):
    for column in ('i', 'i_f'):
        frame[column] = frame[column].apply(process_gt)

In [102]:
# Ground-truth label sequences keyed by PMID, read from the `i` and `i_f`
# columns of the direct-matching frame

gt_pmids = ob_int_df['pmid']
coarse_int_gt = {pmid: labels for pmid, labels in zip(gt_pmids, ob_int_df['i'])}
fine_int_gt = {pmid: labels for pmid, labels in zip(gt_pmids, ob_int_df['i_f'])}

In [103]:
# Preprocess order-bound labels

def process_ob_labs(l):
    """
    Convert order-bound labels into a list of label strings

    -1 is mapped to '0'; every other label is converted with `str`.

    Args:
        l (str | dict | list): Labels as a mapping (only its values are used),
            as the Python-literal string of such a mapping, or as an already
            flattened sequence. Accepting a sequence keeps the function safe
            to re-apply to a column it has already converted.

    Returns:
        list[str]: The label strings, in input order
    """
    labels = l

    if isinstance( labels, str ):
        labels = ast.literal_eval(labels)

    if isinstance( labels, dict ):
        labels = list( labels.values() )

    return ['0' if n == -1 else str(n) for n in labels]

# Convert the order-bound matching labels of both sources (int, then int_syn)
for frame in (ob_int_df, ob_int_syn_df):
    frame['labels'] = frame.labels.apply(process_ob_labs)

In [104]:
# Build the merged order-bound predictions, starting with the first source

ob_preds_merged = dict()

# PMID -> label list for each of the two sources
ob_int_dict = dict(zip(ob_int_df['pmid'], ob_int_df['labels']))
ob_int_syn_dict = dict(zip(ob_int_syn_df['pmid'], ob_int_syn_df['labels']))

for pmid, new_pred in ob_int_dict.items():

    if pmid in ob_preds_merged:
        old_pred = ob_preds_merged[pmid]

        # position-wise maximum of the stored and the incoming labels
        merged_predictions = [ max(pair) for pair in zip( old_pred, new_pred ) ]
        assert len( old_pred ) == len( new_pred ) == len( merged_predictions )
        ob_preds_merged[pmid] = merged_predictions

    else:
        # first time this PMID is seen: store its labels as they are
        ob_preds_merged[pmid] = new_pred

In [105]:
len( ob_preds_merged )

4802

In [106]:
# Fold the second source into the merged order-bound predictions
for pmid, new_pred in ob_int_syn_dict.items():

    if pmid in ob_preds_merged:
        old_pred = ob_preds_merged[pmid]

        # position-wise maximum of the stored and the incoming labels
        merged_predictions = [ max(pair) for pair in zip( old_pred, new_pred ) ]
        assert len( old_pred ) == len( new_pred ) == len( merged_predictions )
        ob_preds_merged[pmid] = merged_predictions

    else:
        # PMID only present in this source: store its labels as they are
        ob_preds_merged[pmid] = new_pred

## Merge and calculate evaluation metrics

In [107]:
def flatten(d):
    """
    Concatenate all value sequences of a dict into one flat list of ints

    Args:
        d (dict): Mapping whose values are sequences of int-convertible items

    Returns:
        list[int]: Every item converted with `int`, in dict iteration order
    """
    return [int(item) for sequence in d.values() for item in sequence]

In [108]:
# Flatten, in this order:
#   ob/direct matching preds : ob_preds_merged
#   ground truth             : coarse_int_gt, fine_int_gt

order_bound_preds, picos_coarse, picos_fine = map(
    flatten, (ob_preds_merged, coarse_int_gt, fine_int_gt)
)

In [109]:
def merge_preds_ordered( ob_preds, orf_offsets ):
    """
    Overlay order-free matches on top of existing predictions

    Args:
        ob_preds (dict): Identifier -> sequence of int-convertible labels
        orf_offsets (dict): str(identifier) -> positions to set to 1

    Returns:
        dict: str(identifier) -> list of int labels, where every position
            listed in `orf_offsets` for that identifier is 1
    """
    merged_predictions = dict()

    for identifier, preds in ob_preds.items():

        key = str(identifier)
        updated = [ int(p) for p in preds ]

        # mark every matched position as positive
        if key in orf_offsets:
            for position in orf_offsets[key]:
                updated[position] = 1

        merged_predictions[ key ] = updated

    return merged_predictions

In [110]:
# Evaluation order: the order-bound baseline first, then the files by ascending offset count
all_orf_ordered = ['order bound'] + list(order_offset_sorteddict.keys())

In [111]:
# (row, metric) pairs printed for every report, in this order
_REPORT_FIELDS = (
    ('macro avg', 'recall'), ('macro avg', 'f1-score'),
    ('0', 'precision'), ('0', 'recall'), ('0', 'f1-score'),
    ('1', 'precision'), ('1', 'recall'), ('1', 'f1-score'),
)


def _report_row(cr):
    """Format the selected metrics of a classification-report dict as one comma-separated line."""
    return ', '.join('{:.4f}'.format(cr[row][metric]) for row, metric in _REPORT_FIELDS)


base_preds = dict()
base_preds_flattened = []

for count, i in enumerate(all_orf_ordered):

    if count > 0:
        # Layer the matches of the next file on top of everything merged so far
        base_preds = merge_preds_ordered( base_preds, orfs_dict[i] )
    else:
        # First entry is the plain order-bound baseline
        base_preds = ob_preds_merged

    base_preds_flattened = flatten( base_preds )

    # Classification report against the coarse-grained ground truth
    cr_order_bound_coarse = classification_report( picos_coarse, base_preds_flattened, digits=4, output_dict=True)
    print(f'Confusion matrix for coarse-grained ground truth and {i} matches')
    print( _report_row(cr_order_bound_coarse) )

    # Classification report against the fine-grained ground truth
    cr_order_bound_fine = classification_report( picos_fine, base_preds_flattened, digits=4, output_dict=True)
    print(f'\n\nConfusion matrix for fine-grained ground truth and {i} matches')
    print( _report_row(cr_order_bound_fine) )

    print( '--------------------------------------------------------------------------' )

Confusion matrix for coarse-grained ground truth and order bound matches
0.6077, 0.5335, 0.9295, 0.7394, 0.8236, 0.1635, 0.4761, 0.2434


Confusion matrix for fine-grained ground truth and order bound matches
0.6438, 0.5252, 0.9565, 0.7385, 0.8335, 0.1352, 0.5492, 0.2169
--------------------------------------------------------------------------
Confusion matrix for coarse-grained ground truth and Professional_Society.json matches
0.6077, 0.5334, 0.9295, 0.7391, 0.8235, 0.1634, 0.4762, 0.2433


Confusion matrix for fine-grained ground truth and Professional_Society.json matches
0.6438, 0.5251, 0.9565, 0.7382, 0.8333, 0.1351, 0.5494, 0.2169
--------------------------------------------------------------------------
Confusion matrix for coarse-grained ground truth and train_ebm_intervention.json matches
0.6073, 0.5324, 0.9295, 0.7369, 0.8221, 0.1627, 0.4778, 0.2427


Confusion matrix for fine-grained ground truth and train_ebm_intervention.json matches
0.6435, 0.5240, 0.9566, 0.7360, 0.831