In [None]:
# Potentially useful NER source
# GENIA corpus of MEDLINE abstracts
# http://www.aclweb.org/anthology/W04-1213

In [1]:
import os
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz

## Setup
Setup directories, and read in annotated labels (ground truth)

In [3]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

extracted_dir = parent_dir + '/data/extracted_txt/'

df = pd.read_csv(parent_dir + '/data/100annotations.csv')

df = df.drop(df.columns[0], axis=1)
df = df.fillna('none')

targets = list(df.columns)[2:]
annotated_uuids = df.iloc[:, 0].unique().tolist()
print(f"Targets: {targets}")
print(f"Working with data for {len(annotated_uuids)} annotated uuids")

Targets: ['Cancer Type', 'Cancer Location']
Working with data for 100 annotated uuids


## Standardization Functions

1. Standardize Location: Given cancer location, put variable in std. format - direction + location + extra info (e.g. left + lung + lower lobe)

&nbsp;
2. Standardize Cancer Type: Given cancer location, put variable in std. format (lowercase, strip punctuation)


In [5]:
def standardize_location(desc):
    if desc == 'none' or desc is None:
        return 'none'
    
    desc = re.sub(r'(of|the)', '', desc)
    desc = re.sub(r'\s*,\s*|\s+', ' ', desc).strip().lower()
    
    if 'lll' in desc:
        return 'left lung lower lobe'
    if 'rll' in desc:
         return 'right lung lower lobe'
    if 'lul' in desc:
         return 'right lung upper lobe'
    if 'rul' in desc:
         return 'right lung upper lobe'
        
        
    direction = ''
    if 'left and right' in desc or 'right and left' in desc:
        direction = 'left and right'
    if 'left' in desc:
        direction = 'left'
        desc = re.sub(r'left', '', desc) 
    if 'right' in desc:
        direction = 'right'
        desc = re.sub(r'right', '', desc) 
    
    location = ''
    if 'lung' in desc:
        location = 'lung'
        desc = re.sub(r'lung', '', desc) 
    if 'breast' in desc:
        location = 'breast'
        desc = re.sub(r'breast', '', desc)
        
    extra_info = desc
    
    std_desc = ' '.join([direction, location, extra_info])
    std_desc = re.sub(r'\s+', ' ', std_desc).strip()

    return std_desc
    

In [8]:
def standardize_cancer_type(desc):
    std_desc = desc.lower()
    std_desc = re.sub(r'[^\w\s]', '', std_desc)
    return std_desc
    

In [36]:
df['Cancer Location'] = df['Cancer Location'].apply(standardize_location)
df['Cancer Type'] = df['Cancer Type'].apply(standardize_cancer_type)

In [16]:
cancer_type_freq = (df['Cancer Type'].value_counts(normalize=True) * 100).round(2)
print(cancer_type_freq.astype(str) + '%')

Cancer Type
infiltrating ductal carcinoma                                                             18.58%
adenocarcinoma                                                                            18.58%
invasive ductal carcinoma                                                                  9.73%
ductal carcinoma                                                                           8.85%
infiltrating lobular carcinoma                                                             7.08%
invasive lobular carcinoma                                                                 5.31%
metastatic carcinoma                                                                       3.54%
lobular carcinoma                                                                          1.77%
esophagogastric adenocarcinoma                                                             1.77%
metastatic adenocarcinoma                                                                  1.77%
micrometastatic ca

In [38]:
cancer_location_freq = (df['Cancer Location'].value_counts(normalize=True) * 100).round(2)
print(cancer_location_freq.astype(str) + '%')


Cancer Location
left breast                             24.78%
right breast                            20.35%
right lung upper lobe                    7.96%
breast                                   7.96%
left lung lower lobe                     6.19%
left lung upper lobe                     5.31%
right lung lower lobe                    4.42%
lymph node                               4.42%
none                                     1.77%
right lung                               1.77%
left breast upper outer quadrant         1.77%
anterior lymph node hilar lymph node     0.88%
right lower inner quadrant               0.88%
left lateral segment upper lobe          0.88%
left breast axillary lymph nodes         0.88%
right sentinel lymph node                0.88%
right lung -upper                        0.88%
proximal stomach                         0.88%
right and axilla                         0.88%
breast central portion                   0.88%
right breast upper inner quadrant        0.8

## Gather Data
For documents with ground truth, read in model output

In [20]:
extracted_docs = {}

for root, dirs, files in os.walk(extracted_dir):
    for name in files:
          if name.endswith('.txt'):
                uuid = name.rstrip(".txt")
                if uuid not in annotated_uuids:
                    continue

                file_path = os.path.join(extracted_dir, name)
                with open(file_path, 'r') as file:
                    extracted_docs[uuid] = file.read()
                    
print(f"We have {len(extracted_docs)} extracted docs")

We have 64 extracted docs


In [21]:
list(extracted_docs.values())[0]

'Cancer Type: Invasive Ductal Carcinoma\nCancer Location: Left Breast<|im_end|>'

## Helper Functions

1. Extract Info: Given some model output and some target, find the target in the output and get the models 'answer'. Relies heavily on model output coming out standardised in format - 'Target: Answer\n'

In [19]:
def extract_info(doc, targets):
    extracted_info = {}
    
    for i, target in enumerate(targets):
        pattern = rf"{target.lower()}: (.+?)(?=\n|$)"
        match = re.search(pattern, doc, re.IGNORECASE)

        if match:
            extracted_info[target] = match.group(1).rstrip("<|im_end|>")
        else:
            extracted_info[target] = 'none'
    
    return extracted_info
        

## Measure results
Compare model output to ground truth, assigning a '1' if the model output cintains the right information.
We measure 'containing the right information' as any of
- if ground truth in output
- if output in ground truth
- if output and ground truth are 'close' in fuzzy string matching

In [22]:
results = {}
num_targets = 0
model_output_none = 0
for uuid in extracted_docs:
    results[uuid] = np.zeros(2)
    doc = extracted_docs[uuid].lower()
    model_output = extract_info(doc, targets)
    
    ground_truth = df[df["UUID"] == uuid][targets]
    num_targets += ground_truth.shape[0]

    
    for i, target in enumerate(targets):
        for j in range(len(ground_truth)):
            t = ground_truth.iloc[j][target].lower()
            t_hat = model_output[target]
            
            if target == "Cancer Location":
                t = standardize_location(t)
                t_hat = standardize_location(t_hat)
            
            similarity_score = fuzz.ratio(t.lower(), t_hat.lower())
            
            if (t in t_hat) or (t_hat in t) or (similarity_score > 80):
                results[uuid][i] += 1
            
            if t_hat == 'none':
                model_output_none += 1

results

{'A8980EE8-52DE-4DA5-84DD-F8E86EBF6A9B': array([1., 1.]),
 '217E5C9A-950F-46C0-A022-5ECE5205D9D4': array([1., 1.]),
 'AE348902-0573-4A57-BBD6-037FA2AF1872': array([0., 1.]),
 'E5C54019-F404-4F8D-847A-80CF65F65E19': array([1., 0.]),
 'B0E8F3C1-E996-4E70-9630-AC95AF6E4EDC': array([1., 0.]),
 'A4BFA94D-166F-43D4-90B5-3C547F4026A0': array([0., 0.]),
 '1FE4EB4C-FEF8-486A-B39D-6F9F023FDAC4': array([0., 1.]),
 'A1D72B00-7384-4A8F-B335-5BEB9AE078FC': array([1., 1.]),
 '29EDE507-90BA-4ABE-B3EF-559EBA4CCD56': array([1., 1.]),
 '9A8F7726-D97C-44DE-B72F-6BF0FC2078F8': array([0., 0.]),
 '4FDBA110-6E14-428C-BF82-168AE28A14E4': array([0., 1.]),
 'AB083371-A2AB-4F57-8FBF-0DB2C91CEDDE': array([1., 1.]),
 'E5011BD1-E1DE-4A47-BF3C-66BA1E0B84DC': array([0., 1.]),
 '5D93AE27-79A0-4809-AE7C-54814CED0851': array([0., 1.]),
 'DD5295D4-3896-4AA5-AF75-30AEA45D1652': array([1., 1.]),
 '13C40632-79E2-485C-9EF2-DA6D075B054C': array([0., 0.]),
 '9A5C8EDF-9243-4F55-9036-A400CF3F4CC1': array([1., 1.]),
 'E1EA2600-CEE

## Accuracy

Accuracy Scores for Cancer Type, Cancer Location.

Percentage of model output where regex failed to strip. (Model output not in the expected format of "{var}: \n".)

In [35]:
result = np.sum(list(results.values()), axis=0) / num_targets

for i, target in enumerate(targets):
    print(f"{target} - Accuracy: {100*round(result[i], 3)}%")
    
print(f"\nRegex failed to strip: {round(model_output_none / (num_targets*2),3)*100}%")

Cancer Type - Accuracy: 47.9%
Cancer Location - Accuracy: 61.6%

Regex failed to strip: 20.5%
