In [None]:
!pip install scikit-learn &> /dev/null
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy
from sklearn.metrics import f1_score,precision_score,recall_score

In [None]:
import os
from typing import List, Dict, Tuple
from pprint import pprint

# Task Overview

Multi-evidence Natural Language Inference for Clinical Trial Data (NLI4CT)

Data used: breast cancer CTRs, statements, explanations, labels -> see exploration section

Task name: Textual Entailment

Todo: Definition

Useful reference:
https://app.luminpdf.com/viewer/650b5e628177bb9e3155248e
https://github.com/armanaydemir/CU_COLIEE_2020/blob/main/FormalRunDatasetCOLIEE2020.ipynb


##1) Load project

Add the shared project folder to your Google Drive by right-clicking the folder and selecting "Add to my Drive".

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive root (IPython line magic).
%cd /content/drive/MyDrive
# Root of the project folder; later cells build their data paths from it.
PROJECT_DIR = '/content/drive/MyDrive/NLP_Project'
# Fail fast when the project folder is not present at the expected location.
if not os.path.isdir(PROJECT_DIR):
  raise Exception("Project Directory not found.")

/content/drive/MyDrive


##2) Load Data

In [None]:
# Unzip Training data (if not present or need to rewrite)
# !unzip /content/drive/MyDrive/NLP_Project/training_data.zip -d /content/drive/MyDrive/NLP_Project/ &> /dev/null

In [None]:
from pprint import pprint

In [None]:
# Dev set: `dev` maps each instance uuid to its annotated instance.
import json

dev_path = os.path.join(PROJECT_DIR, "training_data/dev.json")
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding.
with open(dev_path, encoding="utf-8") as json_file:
    dev = json.load(json_file)

# Example instance (the second entry of the file)
# key: 9cbc00e9-3a2d-4471-a93e-72c95132fb6a
pprint(dev[list(dev.keys())[1]])

In [None]:
# Instance ids of the development set, in file order.
uuid_list = list(dev.keys())

# Containers reserved for gold evidence; nothing is stored in them in this cell.
gold_dev_primary_evidence = []
gold_dev_secondary_evidence = []

# Retrieve all statements from the development set, aligned with uuid_list.
# statement example:
#. (single): there is a 13.2% difference between the results from the two the primary trial cohorts
#. (comparison): Patients with significantly elevated ejection fraction are excluded from the primary
#        trial, but can still be eligible for the secondary trial if they are 55 years of age or over
statements = [dev[uuid]["Statement"] for uuid in uuid_list]

# len(uuid_list) = len(statements)

### Explore: CT Reports (inside CT JSON/{Nxxxxx}.json)

Clinical trial report JSON. It contains information such as trial methods and results for each clinical trial record. Each trial report is divided into four sections: Intervention, Eligibility, Results, and Adverse Events. In the JSON, each section is recorded as a list of strings, with some strings simply being a "header" for each instance of the section.

In [None]:
# get CTR folder
import glob
import pickle
import json
# Directory that holds the per-trial report JSON files (note the space in the folder name).
ct_path = os.path.join(PROJECT_DIR, "training_data/CT json")

In [None]:
# Load every clinical trial report (CTR) into `ct_jsons`, a list of parsed JSON objects.
# A pickle cache saves re-parsing the individual files; when the cache is missing
# it is rebuilt from the JSON files under `ct_path` and written back, so the
# notebook also runs on a fresh copy of the project folder.
# NOTE(security): pickle.load can execute arbitrary code -- only load a cache file
# that was produced by this notebook.
ct_cache_path = os.path.join(PROJECT_DIR, 'training_data', 'loaded_ct_jsons.pickle')

if os.path.isfile(ct_cache_path):
  with open(ct_cache_path, 'rb') as f:
    ct_jsons = pickle.load(f)
else:
  ct_jsons = []
  # glob returns paths in arbitrary order; sorting keeps positional look-ups
  # such as ct_jsons[20] reproducible across rebuilds.
  # NOTE(review): the order of an already existing cache may differ from this one.
  for ct_file_path in sorted(glob.glob(os.path.join(ct_path, "*.json"))):
    with open(ct_file_path, 'r') as f:
      ct_jsons.append(json.load(f))
  with open(ct_cache_path, 'wb') as f:
    pickle.dump(ct_jsons, f)

In [None]:
# dump loaded json for fast reference
#with open(os.path.join(PROJECT_DIR, 'training_data', 'loaded_ct_jsons.pickle'), 'wb') as f:
#  pickle.dump(ct_jsons, f)

We will do some data exploration for each section.

In [None]:
# Explore the Intervention section of every report.
# result: intervention texts are separated by "INTERVENTION {#}" into sublists of sentences.
# Each sublist can be combined to build a complete intervention description.
#
# `crit_groups` receives the trial id once for every Intervention line that is
# neither indented nor an "INTERVENTION" header, i.e. a line that does not fit
# the layout described above.
crit_groups = []
for ct in ct_jsons:
  # Named `trial_id` rather than `id`: a cell-level `id` would shadow the
  # builtin for the rest of the notebook.
  trial_id = ct.get('Clinical Trial ID')
  # A report without an Intervention section contributes nothing instead of
  # raising TypeError when iterating over None.
  for s in ct.get('Intervention') or []:
    # check if sentences without leading whitespace have any special meaning
    if not s.startswith((" ", "INTERVENTION")):
      crit_groups.append(trial_id)

## examine one report in full
pprint([ct for ct in ct_jsons if ct['Clinical Trial ID'] == 'NCT01806259'][0])

In [None]:
# Display the raw Intervention section of one report (position 20 in ct_jsons) as an example.
ct_jsons[20]['Intervention']

['INTERVENTION 1: ', '  Palbociclib + Letrozole or Fulvestrant', '  Palbociclib + Letrozole or Fulvestrant: Palbociclib x 21 days with a 7 day rest plus 2.5 mg Letrozole QD (no break) or Fulvestrant 500mg IM every 2 weeks for 3 doses and then every 4 weeks until progression or maximum of 12 months']

#### Intervention

Task Definition: Information concerning the type, dosage, frequency, and duration of treatments being studied.

What we found: All intervention arrays are separated into intervention groups by a header line "INTERVENTION {count}". Subarrays should be concatenated into 1 string as description for the respective intervention. We do not dive into sub-categories for dosage, frequency, duration, etc.

In [None]:
# Examine CT reports whose Eligibility text contains neither the
# "Inclusion Criteria" nor the "Exclusion Criteria" header (first five trial ids only).
[
  ct['Clinical Trial ID']
  for ct in ct_jsons
  if not any(header in ''.join(ct['Eligibility'])
             for header in ("Inclusion Criteria", "Exclusion Criteria"))
][:5]

['NCT01105312', 'NCT01385137', 'NCT00436566', 'NCT00711529', 'NCT00080301']

In [None]:
# Display the raw Eligibility section of one report (position 20 in ct_jsons) as an example.
ct_jsons[20]['Eligibility']

['Inclusion Criteria:', '  Self-identified Black, African or African American women of  18 years of age with proven diagnosis of advanced adenocarcinoma of the breast (locoregionally recurrent or metastatic disease)', '  ER-positive and/or PgR-positive tumor based on local laboratory results', '  HER2-negative breast cancer based on local laboratory results (test to be used as per local practice)', '  Patients must be appropriate candidates for letrozole or fulvestrant therapy', '  Eastern Cooperative Oncology Group (ECOG) performance status 0-2', '  Adequate bone marrow function:', '  Absolute Neutrophil Count (ANC)  1,000/mm3 (1.0 x 109/L);', '  Platelets 100,000/mm3 (100 x 109/L);', '  Hemoglobin 9 g/dL (90 g/L).', 'Exclusion Criteria:', '  Current use of food or drugs known to be potent inhibitors or inducers of CYP3A4', "  Active uncontrolled or symptomatic brain metastases. Previously treated and clinically stable, as per Investigator's judgment, brain metastases are permitted.",

In [None]:
# Pretty-print the Eligibility section of trial NCT01105312, one of the reports
# found to contain neither criteria header.
pprint([ct for ct in ct_jsons if ct['Clinical Trial ID'] == 'NCT01105312'][0]['Eligibility'])

#### Eligibility

Task Definition: A set of conditions for patients to be allowed to take part in the clinical trial.

What we found: some reports' eligibility contains "Inclusion Criteria" and "Exclusion Criteria" header lines. Others may contain only one of the header lines. Others do not contain either of them.

*Assumption: maybe we should assume anything not outlined specifically by header lines should be considered "Inclusion Criteria". Only when we see "Exclusion Criteria" header, we treat all lines following it as "Exclusion Criteria".*

In [None]:
# Examine headers: the distinct non-indented lines of each report's Results section.
# we got: Outcome Measurement, Results 1, Results 2
[{line for line in ct["Results"] if not line.startswith(" ")} for ct in ct_jsons]

In [None]:
# Display the raw Results section of one report (position 20 in ct_jsons) as an example.
ct_jsons[20]['Results']

['Outcome Measurement: ', '  Number of Patients Who Complete Planned Oncologic Therapy Without the Development of a Hematological Event', '  For study purpose febrile neutropenia will be defined according to the National Cancer Institute (NCI) Common Terminology Criteria for Adverse Events (CTCAE) v4.0: "ANC less than 1000/mm3 with a single temperature of >38.3 degrees Celsius (101 degrees Fahrenheit) or a sustained temperature of 38 degrees Celsius (100.4 degrees Fahrenheit) for more than one hour."', '  Planned oncology therapy is defined as completion of one year of therapy for advanced breast cancer in the absence of disease progression or cessation of study drug due to progressive disease or non-hematological toxicity.', '  Time frame: 12 months', 'Results 1: ', '  Arm/Group Title: Palbociclib + Letrozole or Fulvestrant', '  Arm/Group Description: Palbociclib + Letrozole or Fulvestrant: Palbociclib x 21 days with a 7 day rest plus 2.5 mg Letrozole QD (no break) or Fulvestrant 500m

#### Results

Task Definition: Number of participants in the trial, outcome measures, units, and the results.

What we observed: Unique header lines are ["Outcome Measurement", "Results 1", "Results 2"]

Outcome measures: describes outcome / metric definition

Results 1/2: records outcome.

Results attributes: "Overall Number of Participants Analyzed", "Measure Type", "Unit of Measure", "Complete Response", "Arm/Group Title", "Arm/Group Description", some other non-uniform fields

In [None]:
# Examine header lines: per report, keep the lines that are neither indented nor
# an "Adverse Events #" header (the only meaningful header).
[[line for line in ct["Adverse Events"]
  if not line.startswith((" ", "Adverse Events "))]
 for ct in ct_jsons]

In [None]:
# Display the raw Adverse Events section of one report (position 20 in ct_jsons) as an example.
ct_jsons[20]['Adverse Events']

['Adverse Events 1:', '  Total: 8/35 (22.86%)', '  Heart failure 1/35 (2.86%)', '  Colitis 1/35 (2.86%)', '  Diarrhea 1/35 (2.86%)', '  Fever 1/35 (2.86%)', '  Upper respiratory infection 1/35 (2.86%)', '  Urinary tract infection 1/35 (2.86%)', '  Neutrophil count decreased 1/35 (2.86%)', '  Anorexia 1/35 (2.86%)', '  Suicidal ideation 1/35 (2.86%)', '  Acute kidney injury 1/35 (2.86%)']

#### Adverse Events

Task Definition: These are signs and symptoms observed in patients during the clinical trial.

What we found: all adverse events arrays are separated into subarrays describing each observed event for each trial. Separation string is "Adverse Events {#}". For each described event, each line seems to be of format "{sideeffect name} {numerical measure}". Example: "Hemoglobin decreased/Anemia  1/3 (33.33%)"

Summary: CTR JSON files are mostly clean. There are some irregularities, such as missing "Inclusion Criteria"/"Exclusion Criteria" headers, that might need special treatment. Each section can be viewed as a sequence of section instances (subarrays) concatenated together. The content / medical meaning of each sentence or instance, however, needs standardization if we want to use its semantics.

### Explore: Training Labels (train.json, dev.json)

If we look at dev.json/train.json, we see something like:

{
  "uuid": {
    "Type": "",
    "Section_id": ""
    ...,
    "Statement": ""
    ...,
  }

}

The "Statement" field is what we look at.

Task Definition: Statements make some type of claim about the information contained in one of the sections in the CTR premise

### Explore: train.json

##3) TF-IDF Entailment prediction baseline

In [None]:
# Toy corpus for trying out TF-IDF: two "Adverse Events {#}:" groups, each
# header followed by the same ten event lines.
text = [
    entry
    for header in ("Adverse Events 1:", "Adverse Events 2:")
    for entry in (
        header,
        "  Total: 0/3 (0.00%)",
        "  Lymphocyte count decreased 0/3 (0.00%)",
        "  Neutrophil count decreased 0/3 (0.00%)",
        "  Thrombotic microangiopathy 0/3 (0.00%)",
        "  Disseminated intravascular coagulation 0/3 (0.00%)",
        "  Sinus tachycardia 0/3 (0.00%)",
        "  Hypotension 0/3 (0.00%)",
        "  left ventricular dysfunction 0/3 (0.00%)",
        "  Vision blurred 0/3 (0.00%)",
        "  General symptom 0/3 (0.00%)",
    )
]

In [None]:
# Fit a TF-IDF vectorizer on the example lines in `text` (each line is treated as one document).
vectorizer = TfidfVectorizer().fit(text)

In [None]:
# Inspect the feature names (vocabulary terms) of the fitted vectorizer.
vectorizer.get_feature_names_out()

array(['00', 'adverse', 'blurred', 'coagulation', 'count', 'decreased',
       'disseminated', 'dysfunction', 'events', 'general', 'hypotension',
       'intravascular', 'left', 'lymphocyte', 'microangiopathy',
       'neutrophil', 'sinus', 'symptom', 'tachycardia', 'thrombotic',
       'total', 'ventricular', 'vision'], dtype=object)

In [None]:
# Predictions of the TF-IDF baseline, keyed by instance uuid.
Results = {}

# Folder with the individual trial report JSON files (full text)
CT_JSON_PATH = os.path.join(PROJECT_DIR, "training_data/CT json")

# Decision threshold on the mean cosine DISTANCE between a statement and the
# section entries: above it the prediction is "Contradiction", otherwise "Entailment".
CONTRADICTION_MIN_DISTANCE = 0.9


def _load_section(trial_id, section_id):
  """Read `<trial_id>.json` from CT_JSON_PATH and return its `section_id` entry.

  trial_id example: NCT00066573 (file name without extension)
  section_id example: Eligibility, Intervention, Adverse Events, Results
  The section is expected to be a list of strings.
  """
  ctr_path = os.path.join(CT_JSON_PATH, trial_id + ".json")
  with open(ctr_path, encoding="utf-8") as json_file:
    return json.load(json_file)[section_id]


def _section_distances(statement, section):
  """Return the cosine distance between `statement` and each entry of `section`.

  The TF-IDF vectorizer is fitted on the section entries only, so the section
  is the corpus and the statement is projected onto its vocabulary.
  The result is a 1-D array with one distance per section entry.
  """
  vectorizer = TfidfVectorizer().fit(section)
  statement_vec = vectorizer.transform([statement])  # statement vector
  section_vecs = vectorizer.transform(section)       # one row per section entry
  return cosine_distances(statement_vec, section_vecs)[0]


for uuid, statement in zip(uuid_list, statements):
  instance = dev[uuid]
  section_id = instance["Section_id"]

  # Distances to the entries of the relevant section of the primary trial
  distances = _section_distances(
      statement, _load_section(instance["Primary_id"], section_id))

  # Comparison statements also involve a secondary trial: pool the distances of
  # both sections (secondary first) so the mean is taken over all entries.
  if instance["Type"] == "Comparison":
    secondary_distances = _section_distances(
        statement, _load_section(instance["Secondary_id"], section_id))
    distances = numpy.concatenate([secondary_distances, distances])

  score = numpy.average(distances)
  Prediction = "Contradiction" if score > CONTRADICTION_MIN_DISTANCE else "Entailment"
  Results[str(uuid)] = {"Prediction": Prediction}


##3.1) Transformer-based classification

Large model

Pretraining with clinical text

GPT embedding

In [None]:
# NOTE(review): despite the section heading, this cell contains no transformer
# model; it scores statements with TF-IDF vectors and cosine distance.
Results = {}

CT_JSON_PATH = os.path.join(PROJECT_DIR, "training_data/CT json")

for idx, uuid in enumerate(uuid_list):
  record = dev[uuid]
  section_name = record["Section_id"]

  # Primary trial: load the relevant section and measure the cosine distance
  # between the statement and every entry of that section.
  with open(os.path.join(CT_JSON_PATH, record["Primary_id"] + ".json")) as fh:
    primary_lines = json.load(fh)[section_name]
  tfidf = TfidfVectorizer().fit(primary_lines)
  primary_dist = cosine_distances(tfidf.transform([statements[idx]]),
                                  tfidf.transform(primary_lines))

  if record["Type"] != "Comparison":
    # Single-trial statement: average over the primary entries only.
    mean_dist = numpy.average(primary_dist)
  else:
    # Comparison statement: repeat for the secondary trial, then average the
    # distances of all entries from both sections (secondary first).
    with open(os.path.join(CT_JSON_PATH, record["Secondary_id"] + ".json")) as fh:
      secondary_lines = json.load(fh)[section_name]
    tfidf = TfidfVectorizer().fit(secondary_lines)
    secondary_dist = cosine_distances(tfidf.transform([statements[idx]]),
                                      tfidf.transform(secondary_lines))
    mean_dist = numpy.average([*secondary_dist[0], *primary_dist[0]])

  # A mean cosine distance above 0.9 is labelled Contradiction, otherwise Entailment.
  label = "Contradiction" if mean_dist > 0.9 else "Entailment"
  Results[str(uuid)] = {"Prediction": label}


Save the results in the submission format.

In [None]:
from itertools import islice
def take(n: int, iterable: Dict) -> Dict:
    """Return a new dict holding the first n key/value pairs of `iterable`.

    Keys are taken in the mapping's iteration order; `n` is applied as a
    slice bound on the list of keys.
    """
    selected = {}
    for key in list(iterable)[:n]:
        selected[key] = iterable[key]
    return selected

In [None]:
# Show a handful of predictions, then write all of them to results-baseline.json.
print('example prediction: ')
preview = take(5, Results)
pprint(preview)

results_path = os.path.join(PROJECT_DIR, "results-baseline.json")
with open(results_path, 'w') as jsonFile:
    json.dump(Results, jsonFile, indent=4)

example prediction: 
{'0b6cc8e3-69ee-4a91-b93d-2ad3fddce65f': {'Prediction': 'Contradiction'},
 '1adc970c-d433-44d0-aa09-d3834986f7a2': {'Prediction': 'Contradiction'},
 '6b9162d0-0816-46d4-81af-c60028dcc63b': {'Prediction': 'Entailment'},
 '904061c0-14fa-4f13-9118-9a41e24fa8eb': {'Prediction': 'Entailment'},
 'cc1f712a-2116-4e40-9810-f315e3fa5ff8': {'Prediction': 'Entailment'}}


##3.2) LLM (base or fine-tune)

base

fewshot learning

##4) Evaluation

Compute F1 score, Precision, and Recall. Note that in the final evaluation systems will be ranked by Faithfulness and Consistency, which cannot be computed on the training and development set.

In [None]:
def main():
    """Print F1, precision and recall of `Results` against the labels in `dev`.

    Predictions and gold labels are binarised first: "Entailment" becomes 1
    and every other value becomes 0. Only uuids present in `Results` are scored.
    """
    gold = dev
    results = Results
    uuids = list(results.keys())

    # Binary vectors aligned on the same uuid order
    results_pred = [int(results[uuid]["Prediction"] == "Entailment") for uuid in uuids]
    gold_labels = [int(gold[uuid]["Label"] == "Entailment") for uuid in uuids]

    # Compute all three metrics before printing anything
    f_score, p_score, r_score = (
        metric(gold_labels, results_pred)
        for metric in (f1_score, precision_score, recall_score)
    )

    print('F1:{:f}'.format(f_score))
    print('precision_score:{:f}'.format(p_score))
    print('recall_score:{:f}'.format(r_score))


if __name__ == '__main__':
    main()

F1:0.502415
precision_score:0.485981
recall_score:0.520000


Baseline Solution:

F1:0.502415

precision_score:0.485981

recall_score:0.520000
