# Filter ClinicalTrials.gov dataset for keywords

This script filters the protocols for key terms relating to sample size, so that the whole protocol doesn't need to be passed to spaCy.

In [None]:
import pandas as pd
import re
import os
import sys
import pickle as pkl
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import json

In [None]:
# Load the annotations table; the cells below read its `file` and `phase` columns.
annotations_path = "../data/ctgov/annotations/all_annotations.csv"
df_annotations = pd.read_csv(annotations_path)

# Get data

In [None]:
import gzip

# Mapping from protocol file name to its list of page texts (indexed by
# file name and iterated page by page in the filtering cell).
protocols_path = "../data/ctgov/protocols.pkl.gz"

# The file name advertises gzip compression, but a plain open() would hand
# compressed bytes straight to pickle. Sniff the two-byte gzip magic number so
# both a real .gz file and an uncompressed pickle saved under this name load.
with open(protocols_path, "rb") as f:
    is_gzipped = f.read(2) == b"\x1f\x8b"

opener = gzip.open if is_gzipped else open
with opener(protocols_path, "rb") as f:
    # NOTE: unpickling can execute arbitrary code; only load a trusted file.
    file_to_pages = pkl.load(f)

In [None]:
import spacy
# Blank English pipeline, created without loading a trained model. The
# filtering cell calls it on each page and reads token attributes
# (text, i, like_num, whitespace_) from the result.
nlp = spacy.blank("en")

In [None]:
import operator
# Show the (keyword, count) pairs from `ctr`, ordered by ascending count.
# NOTE(review): `ctr` is only created by the filtering cell further down, so
# this cell must be run after that one; a top-to-bottom run raises NameError here.
sorted(ctr.items(), key=operator.itemgetter(1))

In [None]:
# Number words (plus the quantifiers "both" and "single") mapped to their
# numeric value. The filtering cell only tests membership of a token in this
# mapping, to decide whether a population word such as "patients" is preceded
# by a quantity.
word2num = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'both': 2,
    # "single" denotes one item; it was previously mapped to 2.
    'single': 1,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
}

In [None]:
from collections import Counter

# Number of accepted hits per (lower-cased) keyword, summed over every page of
# every protocol processed by this cell.
ctr = Counter()

# Context kept around each accepted keyword, in tokens. The kept span is
# [i - 15, i + 15), i.e. 15 tokens before the keyword and 14 after it.
CONTEXT_WINDOW = 15

# A bare "n" is only accepted when the next token is one of these ("n = 40").
N_COMPARATORS = frozenset({"=", ">", "<", "≥"})

# Population words: accepted only when one of the two preceding tokens is a
# number (digits or a word from `word2num`).
POPULATION_TERMS = frozenset({
    "participants", "subjects", "people", "persons", "healthy", "infected",
    "pairs", "individuals", "women", "men", "patients", "males", "females", "male", "female",
    "cases", "pts",
})

# Every trigger word. Built once here instead of once per token.
KEYWORDS = POPULATION_TERMS | frozenset({
    "phase", "phases", "arm", "arms", "armed", "cohort", "cohorts", "group", "groups",
    "sample", "sampling", "sampled", "samples",
    "enroll", "enrol", "enrols", "enrolled", "enrolling", "enrolment",
    "recruit", "recruiting", "recruited", "recruits", "recruitment",
    "target", "accrual", "accruing", "accrue", "accrued",
    "power", "powered",
    "simulate", "simulates", "simulated", "simulating", "simulation", "simulations",
    "scenarios",
    "n",
    "overall", "total",
    "select", "selection", "approximately", "achieve",
})


def _preceded_by_number(doc, index):
    """Return True if either of the two tokens before ``index`` is a number.

    A token counts as a number when spaCy flags it ``like_num`` or when its
    lower-cased text is a key of ``word2num``. Callers must pass ``index >= 2``.
    """
    return any(
        neighbour.like_num or neighbour.text.lower() in word2num
        for neighbour in (doc[index - 1], doc[index - 2])
    )


def _extract_keyword_context(page):
    """Return the text of ``page`` that lies inside the window of an accepted keyword.

    Tokens are emitted in document order with their original trailing
    whitespace; text outside every window is dropped without a separator.
    Side effect: increments ``ctr`` once per accepted keyword occurrence.
    """
    doc = nlp(page)
    n_tokens = len(doc)
    keep = [False] * n_tokens

    for tok in doc:
        word = tok.text.lower()
        if word not in KEYWORDS:
            continue

        # "n" must be followed by a comparator. An "n" that is the very last
        # token of the page has no follower to test and is accepted.
        if word == "n" and tok.i + 1 < n_tokens and doc[tok.i + 1].text not in N_COMPARATORS:
            continue

        # Population words need a preceding number. The check is skipped for
        # the first two tokens of a page (fewer than two predecessors), so
        # those occurrences are always accepted.
        if tok.i > 1 and word in POPULATION_TERMS and not _preceded_by_number(doc, tok.i):
            continue

        ctr[word] += 1
        start = max(tok.i - CONTEXT_WINDOW, 0)
        stop = min(tok.i + CONTEXT_WINDOW, n_tokens)
        keep[start:stop] = [True] * (stop - start)

    return "".join(
        token.text + token.whitespace_
        for token, is_kept in zip(doc, keep)
        if is_kept
    )


# One filtered text per annotation row, in row order.
texts = []

for file_name, phase in zip(df_annotations.file, df_annotations.phase):
    pages = file_to_pages[file_name]

    # Pages are concatenated directly, with no separator between them.
    text = "".join(_extract_keyword_context(page) for page in pages)

    if text:
        print("found", file_name, phase, len(text))
    else:
        # No keyword matched anywhere: fall back to the whole protocol,
        # capped at 100000 characters.
        print("nothing found", file_name, phase)
        text = " ".join(pages)[:100000]

    texts.append(text)

df_annotations["text"] = texts

In [None]:
# Save the annotations table, now including the filtered `text` column.
output_path = "filtered_for_phase_arms_subjects_02.csv.bz2"
df_annotations.to_csv(output_path)