In [1]:
#imports
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
from collections import Counter
import os
import docx
import nltk 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet 
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
from gensim.models.fasttext import FastText
from joblib import load

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Sam/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/Sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/Sam/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#object blueprint
class gearboxNLP():

    """
    A Python class for automated matching of patients to clinical trials.


    Attributes
    -------------------
    embedding_model : gensim model object
        - loaded upon instantiation after user specification of model path (string)

    classifer_model : string
        - user-specified filepath (string) for folder containing multiple binary classifiers
        - classifiers are sklearn model objects stored as .joblib files
        - classifiers are loaded sequentially when ClassifyCriteria() method is called
        
    trial_info : dictionary
        - empty upon initialization
        - will contain dictionary of trial information dictionaries with trial ID
        as keys
        - in addition to trial ID, age, and condition, contains full free-text eligibility
        string, raw criteria, cleaned criteria, embeddings, classifications, and match
        scores


    Methods
    -------------------   
    GetDocx(filepath):
        - accepts a .docx filepath (string) and retrieves full text as string
        - used within ExtractTrialInfo() method for full protocol documents
        - uses 'python-docx' library
        
    RepeatRegexFinder(text, regex):
        - given a string, returns a list of substrings that intersperse a 
        given, repeated regular expression
        - used within ExtractCriteria() method
        
    MultiRegexFinder(text, regex_list):
        - given a string, returns a list of substrings that intersperse a 
        given list of unique regular expressions in order of appearance
        - used within ExtractCriteria() method
        
    ExtractTrialInfo(path):
        - accepts either an NCT ID or .docx filepath and returns a dictionary
        whose contents include the trial ID, age requirements, diagnoses, and
        free-text eligibility criteria
        - if an NCT ID is passed, the information is automatically retrieved
        from clinicaltrials.gov
        - if a .docx filepath is passed, the free text is automatically retrieved
        and other dictionary contents are left blank until ComputeMatchScore()
        method is called
        - used within Match() method
        
    ExtractCriteria(text, mode):
        - given eligibility criteria free text, returns a list of discretized eligibility
        criteria based on the text format detected
        - for text from clinicaltrials.gov, mode = 'ctgov'
        - for text from .docx trial protocols, mode = 'docx'
        - used within Match() method
        
    pos_tagger(nltk_tag):
        - given a part-of-speech tag from nltk library, return wordnet tag
        - used within CleanCriteria() method for lemmatization of text
        
    CleanCriteria(ExtractedCriteria):
        - given list of raw criteria created by ExtractCriteria(), return list of
        cleaned and pre-processed text for NLP (lowercasing, removal of single letter
        words, removal of various special characters, strip leading and ending whitespace,
        lemmatization, etc.)
        
    sent_vectorizer(sent, model):
        - given a sentence and embedding model (specified as attribute above),
        returns a sentence embedding using centroid method
        - used within EmbedCriteria() method
        
    EmbedCriteria(CleanedCriteria):
        - given list of cleaned criteria created by CleanCriteria(), return a pandas
        dataframe of sentence embeddings with their original sentence using the 
        embedding_model loaded at instantiation
        
    ClassifyCriteria(criteria, embeddings, model_folder_path):
        - given embeddings created by EmbedCriteria() and SVM model path (specified as
        attribute above), return pandas dataframe of labeled criteria alongside original
        criterion
        
    ComputeMatchScore(patient, ExtractedCriteria, trialinfo, classified_df):
        - given dictionary of patient info, list from ExtractCriteria, trialinfo dict,
        and dataframe from ClassifyCriteria(), returns a match score
        - match score = (% of matches out out potential matches) * (% of criteria not classified
        as "Other")
        
    Match(patient, docx_trials, ctgov_trials):
        - given dictionary of patient info, list of filepaths for protocol word documents, and list
        of NCT ID's for trials found on clinicaltrials.gov, returns a ranked list of trials, sorted by
        highest likelihood of match (as a pandas dataframe)
        
    """    
        
    #initialization and definition of attributes
    def __init__(self, embedding_model, classifier_model):
        
        #attributes
        self.embedding_model = FastText.load(embedding_model)
        self.classifier_model = classifier_model
        self.trial_info = {}
        
    #GetDocx() method
    def GetDocx(self, filepath):
        
        #read document using docx
        doc = docx.Document(filepath)
        
        #store paragraphs in list
        fullText = []
        for para in doc.paragraphs:
            fullText.append(para.text)
        
        #join paragraphs into single string
        return '\n'.join(fullText)

    #define function to grab fulltext from word doc or ctgov
    def ExtractTrialInfo(self, path):

        #create list of desired variables to extract from xml
        variables = ["NCT_id", "condition", "./eligibility/minimum_age", "./eligibility/maximum_age", "./eligibility/criteria/textblock"]

        #create dict in which each key is one of the variables
        variables_dict = {}.fromkeys(variables)

        #if path ends in .docx
        if path.endswith('.docx'):

            #assign first 9 letters of path as trial name (e.e. APAL2020D)
            variables_dict["NCT_id"] = path.split("/")[-1][:9]

            #get text from word doc and add to dict
            text = self.GetDocx(path)
            variables_dict["./eligibility/criteria/textblock"] = text

        #otherwise, NCT ID is being used
        else:

            #server, trial ID, and ext to return xml for trial
            server = "https://clinicaltrials.gov/ct2/show/"
            trial = path
            ext = "?displayxml=true"

            #make request
            response = requests.get(server + trial + ext)

            #format response as xml
            tree = ET.fromstring(response.content)

            #iterate through variables to extract
            for variable in variables_dict.keys():
                if variable == "condition":
                    variables_dict[variable] = []
                    for each in tree.findall(variable):
                        variables_dict[variable].append(each.text)
                elif variable == "NCT_id":
                    variables_dict[variable] = path
                else:
                    variables_dict[variable] = tree.findtext(variable)        

        #add to trial_info attribute and return dict of trial info
        self.trial_info[variables_dict['NCT_id']] = variables_dict
        return variables_dict
    
    #RepeatRegexFinder() method
    def RepeatRegexFinder(self, text, regex):

        #create empty lists in which to store starts and stops
        starts = []
        stops = []

        #iterate over text to find indices matching regex and its start and stops
        for match in re.finditer(regex, text):

            #note start and stop of matching sequence
            starts.append(match.span()[0])
            stops.append(match.span()[1])

        #create empty list in which to store subcontents
        subcontents = []

        #loop through starts and stops and add desired slices to list--leaving last subcontent off
        for x in range(len(starts) - 1):

            #store each mainbullet slice
            subcontent = text[stops[x]:starts[x + 1]]
            subcontents.append(subcontent)

        #add last subcontent as last slice from 'text'
        lastsubcontent = text[stops[-1]:]
        subcontents.append(lastsubcontent)

        #return list of subcontents
        return subcontents
    
    #MultiRegexFinder() method
    def MultiRegexFinder(self, text, regex_list):

        #create empty list in which to store indices for starts of regex's
        starts = []

        #iterate through regex's
        for regex in regex_list:

            #compile each regex
            re_compiled = re.compile(regex)

            #search text for regex
            re_search = re_compiled.search(text)

            #if search is not empty, add start to above empty list
            if re_search != None:
                start = re_search.span()[0]
                starts.append(start)

            #otherwise, do nothing
            else:
                pass

        #sort starts in case out of order    
        starts.sort()

        #create empty list in which to store subcontents attached to each regex
        subcontents = []

        #loop through starts and add desired content slices to list--leaving last bullet off
        for x in range(len(starts) - 1):

            #add individual content slice of text to above list
            subcontent = text[starts[x]:starts[x + 1]]
            subcontents.append(subcontent)

        #add last header content slice to list
        lastsubcontent = text[starts[-1]:]
        subcontents.append(lastsubcontent)

        #return subcontents
        return subcontents
    
    #define ExtractCriteria()
    def ExtractCriteria(self, text, mode):

        #parse text from clinicaltrials.gov
        if mode == 'ctgov':

            #empty output_list to store criteria in
            output_list = []

            #create lists of headers for later use in format detection
            majorheaders = [r'DISEASE CHARACTERISTICS', r'PATIENT CHARACTERISTICS', r'PRIOR CONCURRENT THERAPY', r'DONOR CHARACTERISTICS']
            patientchars = [r'Age', r'Performance status', r'Life expectancy', r'Hematopoietic', r'Hepatic', r'Renal', r'Cardiovascular', r'Pulmonary', r'Other']
            priortherapy = [r'Biologic therapy', r'Chemotherapy', r'Endocrine therapy', r'Radiotherapy', r'Surgery', r'Other']

            #extraction for: Inclusion and Exclusion Criteria/No Nested Subbullets
            #if inclusion/exclusion criteria followed by any multiple charcters then ':', AND no subbullets
            if ((re.search(r"inclusion criteria(.*):", text.lower()) != None) or (re.search(r"exclusion criteria(.*):", text.lower()) != None)) and (re.search(r'\r\n\r\n {15}\S', text) == None):

                #split text at at each bullet/number, almost always noted by double carriage return, and add to output list
                text = text.split("\r\n\r\n")

                #add criteria to output list
                for each in text:
                    output_list.append(each)      

            #extraction for: Inclusion and Exclusion Criteria/Nested Subbullets
            #if inclusion/exclusion criteria followed by any multiple charcters then ':', AND contains subbullets
            elif ((re.search(r"inclusion criteria(.*):", text.lower()) != None) or (re.search(r"exclusion criteria(.*):", text.lower()) != None)) and (re.search(r'\r\n\r\n {15}\S', text) != None):

                #use RepeatRegexFinder() to return list of mainbullets and their subcontents
                #mainbullets are denoted by the pattern of double carriage return followed by exactly 10 spaces
                mainbullets = self.RepeatRegexFinder(text = text, regex = r'\r\n\r\n {10}\S')

                #set regex as raw string for subbullets - line break followed by 3 or more spaces, then nonwhitespace character
                regex = r'\r\n\r\n {3,}\S'

                #compile each regex
                re_compiled = re.compile(regex)

                #add criteria to output list
                for each in mainbullets:

                    #search text for regex
                    re_search = re_compiled.search(each)

                    #if the first subbullet regex is found
                    if re_search != None:

                        #create empty lists in which to store starts of subbullets
                        starts = []

                        #iterate over text to find indices matching regex and its start
                        for match in re.finditer(regex, each):

                            #note start and stop of matching sequence
                            starts.append(match.span()[0])

                        #for each subbullet detected
                        for x in range(len(starts) - 1):

                            #concatenate subbullet to base_statement and add to new list
                            new_criterion = each[starts[x]:starts[x + 1]]
                            output_list.append(new_criterion)

                    #otherwise, ignore
                    else:
                        output_list.append(each)

            #extraction for: Major Header/Subbullets
            #if contains major head and 0 or 1 capitalized subheaders
            elif (any(term in text for term in majorheaders) and (np.count_nonzero([term in text for term in patientchars]) < 4)):

                #use MultiRegexFinder to return header subcontents
                headercontents = self.MultiRegexFinder(text = text, regex_list = majorheaders)

                #deal with subbullet handling in the same way used above
                #iterate through each header content
                for subtext in headercontents:

                    #if carriage returns exist
                    if re.search(r'\r\n\r\n {10}\S', subtext) != None:

                        #use RepeatRegexFinder() to return list of mainbullets and their subcontents
                        #mainbullets are denoted by the pattern of double carriage return followed by exactly 10 spaces
                        subtexts = self.RepeatRegexFinder(text = subtext, regex = r'\r\n\r\n {10}\S')

                        #set regex as raw string for subbullets - line break followed by 3 or more spaces, then nonwhitespace character
                        regex = r'\r\n\r\n {3,}\S'

                        #compile each regex
                        re_compiled = re.compile(regex)

                        #add criteria to output list
                        for each in subtexts:

                            #search text for regex
                            re_search = re_compiled.search(each)

                            #if the first subbullet regex is found
                            if re_search != None:

                                #create empty lists in which to store starts of subbullets
                                starts = []

                                #iterate over text to find indices matching regex and its start
                                for match in re.finditer(regex, each):

                                    #note start and stop of matching sequence
                                    starts.append(match.span()[0])

                                #for each subbullet detected
                                for x in range(len(starts) - 1):

                                    #concatenate subbullet to base_statement and add to new list
                                    new_criterion = each[starts[x]:starts[x + 1]]
                                    output_list.append(new_criterion)

                            #otherwise, ignore
                            else:
                                output_list.append(each)

                    #otherwise has no bullets - split on period, colon, or semicolon followed by at least one space
                    else:
                        try:
                            subtexts = self.RepeatRegexFinder(text = subtext, regex =  r"[\.;] {1,}")
                            for each in subtexts:
                                output_list.append(each)
                        except:
                            output_list.append(subtext)

            #extraction for last remaining Format ID: Major Header/Subheader/Subbullets
            #this process will be the most complex, as it has the most heterogeneity
            #if contains major head and >=2 capitalized subheaders
            elif (any(term in text for term in majorheaders)) and (np.count_nonzero([term in text for term in patientchars]) >= 4):

                #use MultiRegexFinder to return major header subcontents
                headercontents = self.MultiRegexFinder(text = text, regex_list = majorheaders)

                #deal with subbullet/subheader handling for each major header section
                #iterate through each header content        
                for subtext in headercontents:

                    #if patient characteristics in subtext (regardless of capitalization pattern), handle as list with subheaders/bullets from patientchars
                    if "patient characteristics" in subtext.lower():

                        #use MultiRegexFinder() to return subcontents of patientchars headers
                        subcontents = self.MultiRegexFinder(text = subtext, regex_list = patientchars)

                        #set regex as raw string for subbullets - line break followed by 3 or more spaces, then nonwhitespace character
                        regex = r'\r\n\r\n {3,}\S'

                        #compile each regex
                        re_compiled = re.compile(regex)

                        #add to raw criteria
                        for each in subcontents:

                            #search text for regex
                            re_search = re_compiled.search(each)

                            #if the first subbullet regex is found
                            if re_search != None:

                                #create empty lists in which to store starts of subbullets
                                starts = []

                                #iterate over text to find indices matching regex and its start
                                for match in re.finditer(regex, each):

                                    #note start and stop of matching sequence
                                    starts.append(match.span()[0])

                                #for each subbullet detected
                                for x in range(len(starts) - 1):

                                    #concatenate subbullet to base_statement and add to new list
                                    new_criterion = each[starts[x]:starts[x + 1]]
                                    output_list.append(new_criterion)

                            #otherwise, ignore
                            else:
                                output_list.append(each)


                    #if prior concurrent therapy in subtext (regardless of capitalization pattern), handle as bulleted list with subheaders listed above
                    #the "chemotherapy" specification was added to ensure this section occurs in subheader format
                    #when "chemotherapy" is not present, the format does not have subheaders and will be handled otherwise
                    elif ("prior concurrent therapy" in subtext.lower()) and ("chemotherapy" in subtext.lower()):

                        #use MultiRegexFinder() to return subcontents of priortherapy headers
                        subcontents = self.MultiRegexFinder(text = subtext, regex_list = priortherapy)

                        #set regex as raw string for subbullets - line break followed by 3 or more spaces, then nonwhitespace character
                        regex = r'\r\n\r\n {3,}\S'

                        #compile each regex
                        re_compiled = re.compile(regex)

                        #add to raw criteria
                        for each in subcontents:

                            #search text for regex
                            re_search = re_compiled.search(each)

                            #if the first subbullet regex is found
                            if re_search != None:

                                #create empty lists in which to store starts of subbullets
                                starts = []

                                #iterate over text to find indices matching regex and its start
                                for match in re.finditer(regex, each):

                                    #note start and stop of matching sequence
                                    starts.append(match.span()[0])

                                #for each subbullet detected
                                for x in range(len(starts) - 1):

                                    #concatenate subbullet to base_statement and add to new list
                                    new_criterion = each[starts[x]:starts[x + 1]]
                                    output_list.append(new_criterion)

                            #otherwise, ignore
                            else:
                                output_list.append(each)

                    #otherwise, most can be treated as bulleted list with potential subbullets as with other format IDs
                    elif re.search(r'\r\n\r\n {10}\S', subtext) != None:

                        #use RepeatRegexFinder() to return list of mainbullets and their subcontents
                        #mainbullets are denoted by the pattern of double carriage return followed by exactly 10 spaces
                        mainbullets = self.RepeatRegexFinder(text = subtext, regex = r'\r\n\r\n {10}\S')

                        #set regex as raw string for subbullets - line break followed by 3 or more spaces, then nonwhitespace character
                        regex = r'\r\n\r\n {3,}\S'

                        #compile each regex
                        re_compiled = re.compile(regex)

                        #add to raw criteria
                        for each in mainbullets:

                            #search text for regex
                            re_search = re_compiled.search(each)

                            #if the first subbullet regex is found
                            if re_search != None:

                                #create empty lists in which to store starts of subbullets
                                starts = []

                                #iterate over text to find indices matching regex and its start
                                for match in re.finditer(regex, each):

                                    #note start and stop of matching sequence
                                    starts.append(match.span()[0])

                                #for each subbullet detected
                                for x in range(len(starts) - 1):

                                    #concatenate subbullet to base_statement and add to new list
                                    new_criterion = each[starts[x]:starts[x + 1]]
                                    output_list.append(new_criterion)

                            #otherwise, ignore
                            else:
                                output_list.append(each)

                    #otherwise, subtext is a full paragraph without demarcation between individual criteria
                    else:
                        try:
                            subtexts = self.RepeatRegexFinder(text = subtext, regex =  r"[\.;] {1,}")
                            for each in subtexts:
                                output_list.append(each)
                        except:
                            output_list.append(subtext)


            #otherwise, treat as bulleted list and split bullets on sentences
            else:

                #split text at at each bullet/number, almost always noted by double carriage return, and add to output list
                text = text.split("\r\n\r\n")

                #add criteria to output list
                for subtext in text:
                    try:
                        subtexts = self.RepeatRegexFinder(text = subtext, regex =  r"[\.;] {1,}")
                        for each in subtexts:
                            output_list.append(each)
                    except:
                        output_list.append(subtext)

            #return separated inclusion/exclusion criteria if possible, otherwise return list
            exc_start = 0
            for each in output_list:
                if "exclusion criteria" in each.lower():
                    exc_start = output_list.index(each)
                else:
                    pass
            if exc_start != 0:
                inclusioncriteria = output_list[:exc_start]
                exclusioncriteria = output_list[exc_start:]
                return [inclusioncriteria, exclusioncriteria]
            else:
                return output_list

        #parse text from trial protocol word document 
        elif mode == 'docx':

            #define output list to store criteria
            output_list = []

            #find eligibility criteria text between Eligibility and Arms/Regimens
            ec_window = [r'\n\nEligibility', 
                         r'\n\nArms/Regimens']
            fullcriteria = self.MultiRegexFinder(text = text, regex_list = ec_window)[0]

            #define possible subheaders to detect
            subheader_windows = [r'\nAge',
                                 r' Age:',
                                 r'\nWeight',
                                 r'\nDiagnosis',
                                 r'\nDisease Status',
                                 r'\nPerformance Status', 
                                 r'\nPrior Therapy',
                                 r'\nOrgan function criteria',
                                 r'\nOrgan Function Requirements',
                                 r'\nExclusion Criteria',
                                 r'\nExclusion criteria',
                                 r'\tExclusion Criteria',
                                 r'\nConcomitant Medications', 
                                 r'\nPregnancy or Breast-Feeding', 
                                 r'\nInfection',
                                 r'\nSystemic Diseases']

            #detect subheaders and store in list
            splitcriteria = self.MultiRegexFinder(text = fullcriteria, regex_list = subheader_windows)

            #make lists to store inclusion/exclusion criteria
            inclusioncriteria = []
            exclusioncriteria = []

            #split criteria on line breaks if many bullets (>1), else split on sentences if many sentences (>1)
            for each in splitcriteria:
                if each.startswith("\nAge") or each.startswith("Age"):
                    output_list.append(each)
                elif each.startswith("\nDiagnosis") or each.startswith("\nDisease Status"):
                    output_list.append(each)
                    for criterion in each.split("\n"):
                        if "CNS" in criterion:
                            inclusioncriteria.append(criterion)
                        else:
                            pass
                elif "exclusion criteria" not in each.lower():
                    if len(each.split("\n")) > 4:
                        for subeach in each.split("\n"):
                            inclusioncriteria.append(subeach)
                    else:
                        regex = r"[\.:;] {1,}"
                        if len([x for x in re.finditer(regex, each)]) > 1:
                            for subeach in self.RepeatRegexFinder(text = each, regex = regex):
                                inclusioncriteria.append(subeach)
                        else:
                            inclusioncriteria.append(each)
                else:
                    if len(each.split("\n")) > 4:
                        for subeach in each.split("\n"):
                            exclusioncriteria.append(subeach)
                    else:
                        regex = r"[\.:;] {1,}"
                        if len([x for x in re.finditer(regex, each)]) > 1:
                            for subeach in self.RepeatRegexFinder(text = each, regex = regex):
                                exclusioncriteria.append(subeach)
                        else:
                            exclusioncriteria.append(each)

            #add to output list
            output_list.append(inclusioncriteria)
            output_list.append(exclusioncriteria)

            #return output list
            return output_list

        #must specify mode
        else:
            print("You must specify mode as 'ctgov' or 'docx'.")
            
    #define pos_tagger()
    def pos_tagger(self, nltk_tag): 
        
        #if tagged as J, mark as adjective
        if nltk_tag.startswith('J'): 
            return wordnet.ADJ    
        
        #if tagged as V, mark as verb
        elif nltk_tag.startswith('V'): 
            return wordnet.VERB 

        #if tagged as N, mark as noun
        elif nltk_tag.startswith('N'): 
            return wordnet.NOUN 
        
        #if tagged as R, mark as adverb
        elif nltk_tag.startswith('R'): 
            return wordnet.ADV     
        
        #else, don't mark
        else:           
            return None
        
    #define CleanCriteria()
    def CleanCriteria(self, ExtractedCriteria):

        #create empty list in which to store cleaned text with the following changes
        final_criteria = []

        #compile regular expression to detect strings of all caps (i.e. abbreviations)
        regex = r"\b[A-Z]{2,}\b"
        re_compiled = re.compile(regex)

        #list of common all caps non-abbreviation words (appear > 1x)
        non_abrv = ["DONOR", "DISEASE", "CHARACTERISTICS", "AND", "DONORS", "RELATED", "OR", "INCLUSION", "CRITERIA", "EXCLUSION", "PRIOR", "CONCURRENT", "THERAPY", "NOTE", "BEFORE", "PATIENTS", "MATCHED", "UNRELATED", "MUST", "REAL", "TRANSPLANT", "PATIENT", "ELIGIBILITY", "ALLOWED", "ADULT", "PEDIATRIC", "ORGAN", "DYSFUNCTION", "EXCEPT", "STRATUM", "STRATA", "GROUP", "AGED"]

        #list of custom stop words based on top 100 terms, many removed for semantic significance
        custom_stops = ['or','of','the','patients','to','for','with','no','and','at','not','must','be','have','in',
                        'are','than','as', 'by','is','study','other','on', 'who','if', 'will','any', 'criteria','patient',
                        'from','this','that','allowed','an','may','all','known']

        #list of additional suffixes to be removed
        suffix_list = ["tion", "ical", "ious", "ance"]

        #conduct pre-processing steps for each criterion in list
        for each in ExtractedCriteria:

            #break criterion into single words
            word_list = []
            for word in each.split():

                #search word for abbreviations
                re_search = re_compiled.search(word)

                #if search is not empty and word isn't a commonly all-caps non abbreviation
                #keep abbreviation as is, otherwise lowercase
                #get rid of stop words as well
                if (re_search != None) & (not any(term in word for term in non_abrv)):
                    word_list.append(word) 
                elif word.lower() not in custom_stops:
                    word = word.lower()
                    word_list.append(word)
                else:
                    pass

            #reassign "each" to sentence that is lowercased except for abbreviations
            sentence = " ".join(word_list)

            #remove all special characters except numbers and ';' (often used in genetic mutations)
            sentence = re.sub(r'[^A-z0-9 ;]', "", sentence)

            #remove all single characters
            sentence = re.sub(r'\s+[a-zA-Z0-9]\s+', "", sentence)

            #tokenize the sentence and find the POS tag for each token 
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))   

            #use previously defined function to fix tags
            #reference: https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/#:~:text=Wordnet%20Lemmatizer%20(with%20POS%20tag)&text=This%20is%20because%20these%20words,%2C%20noun%2C%20adjective%20etc).
            wordnet_tagged = list(map(lambda x: (x[0], self.pos_tagger(x[1])), pos_tagged)) 

            #instantiate lemmatizer object to be used with CleanCriteria
            lemmatizer = WordNetLemmatizer() 

            #create empty list in which to store lemmatized sentence
            lemmatized_sentence = [] 

            #iterate through mapped list with wordnet tags
            for word, tag in wordnet_tagged: 
                #if there is no available tag, append the token as is 
                if tag is None: 
                    lemmatized_sentence.append(word) 

                # else use the tag to lemmatize the token         
                else:         
                    lemmatized_sentence.append(lemmatizer.lemmatize(word, tag)) 

            #remove selected suffixes from words that are poorly handled by automatic lemmatizer
            for index in range(len(lemmatized_sentence)):
                if lemmatized_sentence[index][-4:] in suffix_list:
                    lemmatized_sentence[index] = lemmatized_sentence[index][:-4]

            #join previously created list into sentence (i.e. single string)
            lemmatized_sentence = " ".join(lemmatized_sentence) 

            #replace multiple whitespace with single whitespace
            lemmatized_sentence = re.sub(" +", " ", lemmatized_sentence)

            #strip leading and ending whitespace
            lemmatized_sentence = lemmatized_sentence.strip()

            #add to new empty list
            final_criteria.append(lemmatized_sentence)

        #make df with original and final
        df = pd.DataFrame({'Original':ExtractedCriteria, 'Final':final_criteria})
        df = df[df['Original'] != '']
        return df
    
    #define sent_vectorizer()
    def sent_vectorizer(self, sent, model):

        #empty list in which to store sentence vectors
        sent_vec =[]

        #keeps track of total number of words in sentence
        numw = 0

        #for each word in a sentence
        for w in sent:

            #if this is the first word, sentence vector starts out with single embedding
            #if not the first word, add word embedding to previous embeddings as part of cumulative sentence vector
            try:
                if numw == 0:
                    sent_vec = model.wv[w]
                else:
                    sent_vec = np.add(sent_vec, model.wv[w])

                #add 1 to word counter for each iteration
                numw+=1

            #if there's an error, do nothing
            except:
                pass

        #when finished, return the overall sentence vector divided by the number of words 
        return np.asarray(sent_vec) / numw

    #def EmbedCriteria()
    def EmbedCriteria(self, CleanedCriteria):

        #set embedding model
        ft_model = self.embedding_model

        #tokenize criteria
        tokenized = [nltk.word_tokenize(criterion) for criterion in CleanedCriteria]

        #create empty list in which to store sentence embeddings
        X = []

        #for each criterion in overall data list, vectorize the sentence and append to X
        for sentence in tokenized:
            X.append(self.sent_vectorizer(sentence, ft_model))

        #make df with final and embedding
        df = pd.DataFrame({'Final':CleanedCriteria, 'Embedding':X})
        return df

    #define ClassifyCriteria()
    def ClassifyCriteria(self, criteria, embeddings, model_folder_path):

        #create empty dict in which to store various class probabilities from SVM
        probabilities = {}

        #iterate through models and obtain predictions on input data
        for each in os.listdir(model_folder_path):
            if each.endswith(".joblib"):
                model = load(model_folder_path + each)
                two_sided_prob = model.predict_proba(list(embeddings))
                prob_outcome = two_sided_prob[:,1]
                probabilities[each.strip(".joblib")] = list(prob_outcome)

        #cast probability dict into dataframe
        prob_df = pd.DataFrame(probabilities)

        #choose most likely class as highest probability across models (i.e. one vs. all)
        mostlikelyclass = prob_df.idxmax(axis = 1)

        #make dataframe with individual criterion alongside prediction
        pred_df = pd.DataFrame({'Criterion': list(criteria), 'Prediction': mostlikelyclass})

        #gather associated probabilities from each prediction "winner" and add to list
        winners = []
        for x in range(len(pred_df)):
            label = pred_df['Prediction'][x]
            winner = prob_df[label][x]
            winners.append(winner)

        #for each prediction, if confidence less than 20%, assign as other
        threshold = 0.2
        for x in range(len(pred_df)):
            if winners[x] < threshold:
                pred_df["Prediction"][x] = "Other"
            else:
                pass

        #for CNS involvement, drop predictions if CNS not in criterion (high sensitivity, low specificity)   
        for x in range(len(pred_df)):
            if (pred_df["Prediction"][x] == "CNSInvolvement") and ("CNS" not in pred_df['Criterion'][x]):
                pred_df["Prediction"][x] = "Other"
            else:
                pass

        #return dataframe of original criteria with associated predictions
        return pred_df
    
    #define ComputeMatchScore()
    def ComputeMatchScore(self, patient, ExtractedCriteria, trialinfo, classified_df):

        #counters to keep track of matches vs. potential matches
        potentials = 0
        matches = 0

        #Age handler
        #if age already specified in info dictionary, add as potential match
        if (trialinfo['./eligibility/minimum_age'] != None) and (trialinfo['./eligibility/maximum_age'] != None):
            potentials += 1

        #otherwise, go looking to find age elsewhere
        else:

            #if age found in list of extracted criteria
            for each in ExtractedCriteria:
                if (type(each) == str) and (each.startswith("\nAge")):

                    #regex to match comparator, number, and units
                    regex = r'([<>≤≥]|greater than|less than|greater than or equal to|less than or equal to|over|under) {0,}[=]{0,} {0,}[0-9]{1,} {1,}(years|months|days)'


                    #if regex match found, add as a potential match
                    if re.search(regex, each) != None:
                        potentials += 1
                        for found in re.finditer(regex, each):

                            #grab matching slice from criterion string
                            age_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')

                            #if greater than symbol, add as minimum age to trial info dict
                            if (">" in age_slice) or ("greater" in age_slice) or ("over" in age_slice):
                                trialinfo['./eligibility/minimum_age'] = age_slice

                            #otherwise, add as maximum age to trial info dict
                            else:
                                trialinfo['./eligibility/maximum_age'] = age_slice
                                
                    #look for "and under" syntax
                    else:
                        
                        #regex search to match number, units, comparator
                        regex2 = r'[0-9]{1,} {1,}(years|months|days){,1} {0,}and (under|over)'
                        found = re.search(regex2, each)
                        
                        #if match, grab age slice and add as min/max age
                        if found != None:
                            potentials += 1
                            age_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')                         
                            if "under" in age_slice:
                                trialinfo['./eligibility/maximum_age'] = age_slice
                            else:
                                trialinfo['./eligibility/minimum_age'] = age_slice
                            


            #if age is still empty, go looking for age based on NLP-classifier 
            if (trialinfo['./eligibility/minimum_age'] == None) and (trialinfo['./eligibility/maximum_age'] == None):               

                #if at least one criterion classified as "Age", add as a potential match
                if len(classified_df[classified_df['Prediction'] == 'Age']) > 0:
                    potentials += 1

                    #for each potential match
                    for each in classified_df[classified_df['Prediction'] == 'Age']['Criterion']:

                        #regex to match comparator, number, and units
                        regex = r'([<>≤≥]|greater than|less than|greater than or equal to|less than or equal to|over|under) {0,}[=]{0,} {0,}[0-9]{1,} {1,}(years|months|days)'

                        #if regex match found
                        if re.search(regex, each) != None:
                            for found in re.finditer(regex, each):

                                #grab matching slice from criterion string
                                age_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')

                                #if greater than symbol, add as minimum age to trial info dict
                                if (">" in age_slice) or ("greater" in age_slice) or ("over" in age_slice):
                                    trialinfo['./eligibility/minimum_age'] = age_slice

                                #otherwise, add as maximum age to trial info dict
                                else:
                                    trialinfo['./eligibility/maximum_age'] = age_slice

                        #look for "and under" syntax
                        else:

                            #regex search to match number, units, comparator
                            regex2 = r'[0-9]{1,} {1,}(years|months|days){,1} {0,}and (under|over)'
                            found = re.search(regex2, each)

                            #if match, grab age slice and add as min/max age
                            if found != None:
                                age_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')
                                if "under" in age_slice:
                                    trialinfo['./eligibility/maximum_age'] = age_slice
                                else:
                                    trialinfo['./eligibility/minimum_age'] = age_slice

        #minimum age
        min_age_original = trialinfo["./eligibility/minimum_age"]
        
        #if n/a or empty, min age is 0
        if min_age_original == "N/A" or min_age_original == None:
            min_age = 0.00
        
        #otherwise, detect units and convert to days
        else:
            if "year" in min_age_original.lower():
                min_age = float(re.sub(r"[^0-9]", "", min_age_original))
                min_age = min_age * 365
            elif "month" in min_age_original.lower():
                min_age = float(re.sub(r"[^0-9]", "", min_age_original))
                min_age = min_age * 30
            elif "day" not in min_age_original.lower():
                min_age = float(re.sub(r"[^0-9]", "", min_age_original))
                min_age = min_age * 365                 
            else:
                min_age = float(re.sub(r"[^0-9]", "", min_age_original))


        #maximum age
        max_age_original = trialinfo["./eligibility/maximum_age"]
        
        #if n/a or empty, max age is 200 yrs
        if max_age_original == "N/A" or max_age_original == None:
            max_age = 200.00 * 365
        
        #otherwise, detect units and convert to days
        else:
            if "year" in max_age_original.lower():
                max_age = float(re.sub(r"[^0-9]", "", max_age_original))
                max_age = max_age * 365
            elif "month" in max_age_original.lower():
                max_age = float(re.sub(r"[^0-9]", "", max_age_original))
                max_age = max_age * 30
            elif "day" not in max_age_original.lower():
                max_age = float(re.sub(r"[^0-9]", "", max_age_original))
                max_age = max_age * 365                
            else:
                max_age = float(re.sub(r"[^0-9]", "", max_age_original))  

        #compute if age match
        if min_age <= patient["Age (Days)"] <= max_age:
            matches +=1

        #Renal Function handler
        for each in classified_df[classified_df['Prediction'] == 'RenalFunction']['Criterion']:
            regex = r'(Glomerular filtration rate|glomerular filtration rate|GFR|\(GFR\)|Creatinine clearance|creatinine clearance|Creatinine clearance of|creatinine clearance of|creatinine-clearance|GRF|CrCl) {0,}(must be){,1} {0,}([<>≤≥]|>=|<=|greater than|greater than or equal to|at least){,1} {0,}[0-9]{2,3}'
            if re.search(regex, each) != None:
                potentials += 1

                #GFR calculation

                #for children less than 18
                if patient["Age (Days)"] < (18*365):

                    #calculate GFR using Schwartz equation        
                    GFR = (0.41 * patient["Height (cm)"]) / patient["Creatinine (mg/dL)"]

                #for children greater than 18
                else:

                    #calculate GFR using MDRD equation        
                    GFR = 175 * (patient["Creatinine (mg/dL)"]**(-1.154)) * (patient["Age (Days)"]**(-0.203))
                    if patient["Female"] == True:
                        GFR = GFR * 0.742
                    if patient["African-American"] == True:
                        GFR = GFR * 1.212

                #find GFR requirement
                found = re.search(regex, each)
                gfr_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')
                gfr_slice = float(re.sub(r"[^0-9]", "", gfr_slice))

                #if GFR greater than or equal to requirement, match is found
                if GFR >= gfr_slice:
                    matches += 1
                else:
                    pass
                
            #else look for serum creatinine
            else:
                regex2 = r'(Creatinine|creatinine) {0,}(must be){,1} {0,}([<>≤≥]|>=|<=|no greater than|less than|less than or equal to){,1} {0,}[0-9]\.{,1}[0-9]{,1}'
                found = re.search(regex2, each)
                
                #if found, add a potential match
                if found != None:
                    potentials += 1
                    
                    #if creatinine matches, add match
                    cr_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')
                    cr_slice = float(re.sub(r"[^0-9.]", "", cr_slice))
                    if patient["Creatinine (mg/dL)"] <= cr_slice:
                        matches += 1
                        
                #lastly, look for "normal" (assuming ULN serum Cr is 1)
                else:
                    
                    #if normal in phrase add potential match
                    if ("normal" in each) or ("ULN" in each):
                        potentials += 1
                        
                        #if creatinine <= 1, add match
                        if patient["Creatinine (mg/dL)"] <= 1.0:
                            matches += 1


        #performance status handler
        for each in classified_df[classified_df["Prediction"] == "PerformanceStatus"]['Criterion']:
            
            #look for first possible syntax
            if re.search(r'(Karnofsky|karnofsky|Karnofsky \(adult\)|Lansky|lanksy|Lanksy Play|Lanksy \(pediatric\)) {0,}(index|Index|score|scores|performance status|performance status \(PS\)|Performance status|Performance status \(PS\)|performance score|performance score \(PS\)|Performance score|Performance Score|Performance Level|PS|Performance Status \(KPS\)|performance scale|activity assessment){,1} {0,}(of){,1} {0,}([<>≤≥]|greater than|greater than or equal to|above){,1}[=]{,1} {0,}[0-9]{2,3}', each) != None:
                regex = r'(Karnofsky|karnofsky|Karnofsky \(adult\)|Lansky|lanksy|Lanksy Play|Lanksy \(pediatric\)) {0,}(index|Index|score|scores|performance status|performance status \(PS\)|Performance status|Performance status \(PS\)|performance score|performance score \(PS\)|Performance score|Performance Score|Performance Level|PS|Performance Status \(KPS\)|performance scale|activity assessment){,1} {0,}(of){,1} {0,}([<>≤≥]|greater than|greater than or equal to|above){,1}[=]{,1} {0,}[0-9]{2,3}'
                potentials += 1

                #find performance status requirement
                for found in re.finditer(regex, each):
                    ps_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')
                    ps_slice = float(re.sub(r"[^0-9]", "", ps_slice))
                    if patient["Performance Status (Lanksy/Karnofsky)"] >= ps_slice:
                        matches += 1
                        break
                    else:
                        break
            
            #check other syntax
            elif re.search(r'[<>≤≥]{0,1}[=]{0,1} {0,}[0-9]{2}%{0,1} {0,1}(Karnofsky|karnofsky|Lansky|lanksy)', each) != None:
                regex = r'[<>≤≥]{0,1}[=]{0,1} {0,}[0-9]{2}%{0,1} {0,1}(Karnofsky|karnofsky|Lansky|lanksy)'
                potentials += 1

                #find performance status requirement
                for found in re.finditer(regex, each):
                    ps_slice = each[found.span()[0]:found.span()[1]].replace(' ', '')
                    ps_slice = float(re.sub(r"[^0-9]", "", ps_slice))
                    if patient["Performance Status (Lanksy/Karnofsky)"] >= ps_slice:
                        matches += 1
                        break
                    else:
                        break
            
            #otherwise if ECOG score instead of karnofsky/lansky
            else:
                regex = r'(ECOG|Zubrod|zubrod|\(ECOG\)) {0,}(score|scores|performance status|PS|PS of){,1} {0,}([0-9]-[0-9]|of 0, 1, or 2|categories 0, 1, or 2|<= 2|of 0 or 1|of 0, 1 or 2)'
                
                #if ECOG found, add to potentials
                if re.search(regex, each) != None:
                    potentials += 1

                    #ECOG score
                    if patient["Performance Status (Lanksy/Karnofsky)"] in (10,20):
                        ECOG = 4
                    elif patient["Performance Status (Lanksy/Karnofsky)"] in (30,40):
                        ECOG = 3
                    elif patient["Performance Status (Lanksy/Karnofsky)"] in (50,60):
                        ECOG = 2
                    elif patient["Performance Status (Lanksy/Karnofsky)"] in (70,80):
                        ECOG = 1
                    elif patient["Performance Status (Lanksy/Karnofsky)"] in (90,100):
                        ECOG = 0

                    #find performance status requirement
                    found = re.search(regex, each)
                    ps_slice = each[found.span()[0]:found.span()[1]]
                    ps_slice = re.sub(r"[^0-9]", "", ps_slice)
                    max_ecog = max([int(number) for number in ps_slice])
                    
                    #if patients ECOG matches, add match
                    if ECOG <= max_ecog:
                        matches += 1
                        
        #diagnosis handler
        
        #if conditions are known (from ct.gov), compare patient condition to known
        if trialinfo['condition'] != None:
            potentials += 1
            if patient['Diagnosis'] in trialinfo['condition']:
                matches += 1
        
        #otherwise, find conditions in study from diagnosis or disease status sections of protocol
        else:
            
            #empty list in which to store study conditions (condition = diagnosis)
            trialinfo['condition'] = []
            
            #look in Diagnosis section
            for each in ExtractedCriteria:
                if (type(each)) == str and each.startswith("\nDiagnosis"):
                    
                    #split on line breaks
                    for diagnosis in each.split("\n"):
                        
                        #search for diagnosis keywords and add to above list
                        search = re.search(r'(relapsed|Relapsed|refractory|Refractory|resistant|Resistant) {0,}([A-Z]{2,5}|disease|leukemia)', diagnosis)
                        if search != None:
                            trialinfo['condition'].append(diagnosis[search.span()[0]:search.span()[1]])
                        else:
                            search = re.search(r'([A-Z]{2,5}|disease|leukemia) {0,}(relapsed|refractory|resistant)', diagnosis)
                            if search != None:
                                trialinfo['condition'].append(diagnosis[search.span()[0]:search.span()[1]])
                            else:
                                pass
                
                #look in Disease Status Section
                elif (type(each)) == str and each.startswith("\nDisease Status"):
                    
                    #split on line breaks
                    for diagnosis in each.split("\n"):
                        
                        #search for diagnosis keywords and add to above list
                        search = re.search(r'(relapsed|Relapsed|refractory|Refractory|resistant|Resistant) {0,}([A-Z]{2,5}|disease|leukemia)', diagnosis)
                        if search != None:
                            trialinfo['condition'].append(diagnosis[search.span()[0]:search.span()[1]])
                        else:
                            search = re.search(r'([A-Z]{2,5}|disease|leukemia) {0,}(relapsed|refractory|resistant)', diagnosis)
                            if search != None:
                                trialinfo['condition'].append(diagnosis[search.span()[0]:search.span()[1]])
                            else:
                                pass
                
                #otherwise, pass
                else:
                    pass   
            
            #now that diagnoses have been searched for 
            if trialinfo['condition'] != None:
                potentials += 1
                
                #if patient diagnosis in list, add match
                if patient['Diagnosis'] in trialinfo['condition']:
                    matches += 1

        #CNS Involvement Handler
        #for each classified as CNSInvolvement
        for each in classified_df[classified_df["Prediction"] == "CNSInvolvement"]['Criterion']:
            
            #look for CNS numeric status and compare if detected
            search = re.search(r'CNS(-| ){,1}(1|2|3)', each)
            if search != None:
                potentials += 1
                if patient["CNS Involvement (1/2/3)"] <= int(re.sub(r'[^0-9]', '', each[search.span()[0]:search.span()[1]])):
                    matches += 1
            
            #otherwise look for ineligibility in regards to isolated CNS disease
            else:
                search = re.search(r'(isolated|Isolated) {0,}(CNS|central)', each)
                if search != None:
                    potentials += 1
                    if patient['Isolated CNS Disease'] == False:
                        matches += 1
                
                #otherwise look for ineligibility regarding general CNS involvement
                else:
                    search = re.search(r'(no|No|No known){,1} {0,}(signs of|clinical signs of){,1} {0,}(uncontrolled|active|current){,1} {0,}(CNS|central nervous system|central nervous system \(CNS\)) {0,}(disease|involvement) {0,}(eligible|ineligible|not eligibile|excluded|allowed|not allowed|are not eligible){,1}', each)
                    if search != None:
                        potentials += 1
                        cns_slice = each[search.span()[0]:search.span()[1]]
                        
                        #if CNS involvement excluded
                        if re.search(r'(not eligibile|ineligible|excluded|not allowed|are not eligible)', cns_slice) != None:
                            if patient["CNS Involvement (1/2/3)"] == 1:
                                matches += 1
                        
                        #if CNS involvement allowed
                        elif re.search(r'(eligibile|allowed)', cns_slice) != None:
                            matches += 1
                        
                        #otherwise, assume CNS criterion is exclusionary for involvement
                        else:
                            if patient["CNS Involvement (1/2/3)"] == 1:
                                matches += 1
                            else:
                                pass


        #prior therapy handler
        #for each criterion classified as prior therapy
        for each in classified_df[classified_df["Prediction"] == "PriorAntileukemicTherapy"]['Criterion']:
            
            #define two flexible regular expressions to detect comparators/time
            regex = r'(within|until|at least|greater than|more than|up to|within the past|has been|[>≥]|>=) {0,}([0-9]{1,3}|one|two|three|four|five|six) {0,}(day|week|month|hour)'            
            regex2 = r'([0-9]{1,3}|one|two|three|four|five|six) {0,}(day|week|month|hour|days|weeks|months|hours) {0,}(before|prior|must have elapsed|have elapsed)'
            
            #define lists of keywords to identify each subcategory of antileukemic therapy
            chemotherapy = ['chemotherapy', 'cytotoxic', 'myelosuppressive', 'retinoid', 'bine ']
            biologic = ['antibody', 'biologic', 'immunotherapy', 'immunomodulat', 'mab ', 'anticd', 'cellular therapy', 'interferon']
            growthfactor = ['growth factor', 'gmcsf', 'gcsf', 'filgrastim', 'sargramostim', 'epoetin alfa']
            radiotherapy = ['radio', 'radia', 'xrt', 'cgy']
            steroids = ['corticosteroid', 'isone', 'asone']
            
            #search text using first regular expression for time/comparator patterns
            search = re.search(regex, each.lower())
            
            #if match detected
            if search != None:
                
                #scan text and extract number - convert to days if necessary
                extracted_text = each[search.span()[0]:search.span()[1]]
                if "day" in extracted_text:
                    extracted_num = int(re.sub(r'[^0-9]', '', extracted_text))
                elif "week" in extracted_text:
                    extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) * 7
                elif "month" in extracted_text:
                    extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) * 30
                elif "hour" in extracted_text:
                    extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) / 24
                
                #if subcategory is chemotherapy, compare to patient characteristics and compute match
                if any(term in each.lower() for term in chemotherapy):
                    potentials += 1
                    if (patient["Days Since Cytotoxic Chemotherapy"] >= extracted_num) or (patient["Days Since Cytotoxic Chemotherapy"] == False):
                        matches += 1

                #if subcategory is biologics, compare to patient characteristics and compute match
                elif any(term in each.lower() for term in biologic):
                    potentials += 1
                    if (patient["Days Since Biologic Therapy"] >= extracted_num) or (patient["Days Since Biologic Therapy"] == False):
                        matches += 1

                #if subcategory is growth factors, cimpare to patient characteristics and compute match
                elif any(term in each.lower() for term in growthfactor):
                    potentials += 1
                    if (patient["Days Since Growth Factor Therapy"] >= extracted_num) or (patient["Days Since Growth Factor Therapy"] == False):
                        matches += 1

                #if subcategory is radiotherapy, cimpare to patient characteristics and compute match
                elif any(term in each.lower() for term in radiotherapy):
                    potentials += 1
                    if (patient["Days Since Prior Radiotherapy"] >= extracted_num) or (patient["Days Since Prior Radiotherapy"] == False):
                        matches += 1

                #if subcategory is steroids, cimpare to patient characteristics and compute match
                elif any(term in each.lower() for term in steroids):
                    potentials += 1
                    if (patient["Days Since Corticosteroids"] >= extracted_num) or (patient["Days Since Corticosteroids"] == False):
                        matches += 1
            
            #otherwise check second regex
            else:
                
                #search text using second regular expression for time/comparator patterns
                search = re.search(regex2, each.lower())
                if search != None:
                    
                    #scan text and extract number - convert to days if necessary
                    extracted_text = each[search.span()[0]:search.span()[1]]
                    if "day" in extracted_text:
                        extracted_num = int(re.sub(r'[^0-9]', '', extracted_text))
                    elif "week" in extracted_text:
                        extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) * 7
                    elif "month" in extracted_text:
                        extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) * 30
                    elif "hour" in extracted_text:
                        extracted_num = int(re.sub(r'[^0-9]', '', extracted_text)) / 24

                    #if subcategory is chemotherapy, compare to patient characteristics and compute match
                    if any(term in each.lower() for term in chemotherapy):
                        potentials += 1
                        if (patient["Days Since Cytotoxic Chemotherapy"] >= extracted_num) or (patient["Days Since Cytotoxic Chemotherapy"] == False):
                            matches += 1

                    #if subcategory is biologics, compare to patient characteristics and compute match
                    elif any(term in each.lower() for term in biologic):
                        potentials += 1
                        if (patient["Days Since Biologic Therapy"] >= extracted_num) or (patient["Days Since Biologic Therapy"] == False):
                            matches += 1

                    #if subcategory is growth factors, cimpare to patient characteristics and compute match
                    elif any(term in each.lower() for term in growthfactor):
                        potentials += 1
                        if (patient["Days Since Growth Factor Therapy"] >= extracted_num) or (patient["Days Since Growth Factor Therapy"] == False):
                            matches += 1

                    #if subcategory is radiotherapy, cimpare to patient characteristics and compute match
                    elif any(term in each.lower() for term in radiotherapy):
                        potentials += 1
                        if (patient["Days Since Prior Radiotherapy"] >= extracted_num) or (patient["Days Since Prior Radiotherapy"] == False):
                            matches += 1

                    #if subcategory is steroids, cimpare to patient characteristics and compute match
                    elif any(term in each.lower() for term in steroids):
                        potentials += 1
                        if (patient["Days Since Corticosteroids"] >= extracted_num) or (patient["Days Since Corticosteroids"] == False):
                            matches += 1

        #hepatic function handler
        #for each classified as hepatic function
        for each in classified_df[classified_df["Prediction"] == "HepaticFunction"]['Criterion']:
            
            #search first for direct bilirubin in units of X times upper limit of normal
            regex = r'(direct|conjugated){,1} {0,}bilirubin {0,}(must be){,1} {0,}([<≤≥>]|<=|>=|greater than|greater than or equal to|less than|less than or equal to|below|above){,1} {0,}[0-9].{,1}[0-9]{0,}(x|X|times){0,} {0,}(x|X|times){0,} {0,}(upper limit of normal|uln)'
            search = re.search(regex, each.lower())
            
            #if match detected, add to potentials and compute if match
            if search != None:
                potentials += 1
                if patient["Direct Bilirubin Times ULN"] <= float(re.sub(r'[^0-9.]', '', each[search.span()[0]:search.span()[1]])):
                    matches += 1    
            
            #else, search for AST/ALT in units of X times upper limit of normal
            else:
                regex2 = r'(alt|ast|sgot|sgpt|transaminase|transaminases)[^a-zA-Z]{,1} {0,}(must be){,1} {0,}([<≤≥>]|<=|>=|greater than|greater than or equal to|less than|less than or equal to|below|above){,1} {0,}[0-9].{,1}[0-9]{0,}(x|X|times){0,} {0,}(x|X|times){0,} {0,}(upper limit of normal|uln)'
                search = re.search(regex2, each.lower())
                
                #if match detected, add to potentials and compute if match     
                if search != None:
                    potentials += 1
                    if patient["AST/ALT Times ULN"] <= float(re.sub(r'[^0-9.]', '', each[search.span()[0]:search.span()[1]])):
                        matches += 1  
                
                #else, search for direct bilirubin or AST/ALT in original units (i.e. mg/dL)    
                else:
                    regex3 = r'(direct bilirubin|conjugated bilirubin|alt|ast|sgot|sgpt|transaminase|transaminases)[^a-zA-Z]{,1} {0,}([<≤≥>]|<=|>=|greater than|greater than or equal to|less than|less than or equal to|below|above){,1} {0,}[0-9]{1,3}.{,1}[0-9]{0,} {0,}(mg/dl|u/l|mgdl|ul)'
                    search = re.search(regex3, each.lower())
                    
                    #if search is not empty
                    if search != None:
                        
                        #if bilirubin detected, add potential
                        if "bilirubin" in each[search.span()[0]:search.span()[1]]:
                            potentials += 1
                            
                            #multiply patient input by 0.4 (cutoff for ULN) and compare to trial criteria
                            if (patient["Direct Bilirubin Times ULN"] * 0.4) <= float(re.sub(r'[^0-9.]', '', each[search.span()[0]:search.span()[1]])):
                                matches += 1
                        
                        #otherwise is AST/ALT, add potential
                        else:
                            potentials += 1
                            
                            #multiply patient input by 40 (cutoff for ULN) and compare to trial criteria
                            if (patient["AST/ALT Times ULN"] * 40) <= float(re.sub(r'[^0-9.]', '', each[search.span()[0]:search.span()[1]])):
                                matches += 1

        #fertility pregnancy and contraception handler
        #set initial presence of fertility/pregnancy/contraception exclusion as false
        fert_preg_contra = False
        fert_excl_keywords = ["not eligible", "ineligible", "not participate", "exclude", "pregnancy test", "agree", "require", "unwilling", "effective", "risk", "hcg", "adequate"]
        
        #for each criterion classified into this category
        for each in classified_df[classified_df["Prediction"] == "FertilityPregnancyContraception"]['Criterion']:
            
            #if exclusionary keywords present, add potential and change above presence variable to True
            if any(term in each.lower() for term in fert_excl_keywords):
                if fert_preg_contra == False:
                    potentials += 1
                    fert_preg_contra = True
        
        #if there is an exclusionary criterion and the patient characteristic is false, add a match
        if (fert_preg_contra == True) and (patient["Pregnant, Nursing, or Fertile and Unwilling to Use Contraception"] == False):
            matches += 1


        #active infection handler
        #set initial presence of active infection to false
        active_infection = False
        
        #for each classified into this category
        for each in classified_df[classified_df["Prediction"] == "ActiveInfection"]['Criterion']:
            
            #if exclusionary keywords present, add potential
            if ("not eligible" in each.lower()) or ("ineligible" in each.lower()) or ("not participate" in each.lower()) or ("exclude" in each.lower()):
                if active_infection == False:
                    potentials += 1
                    active_infection = True
            
            #if blood culture pattern present, add potential
            elif re.search(r'positive {0,}(.*){,1} {0,}blood culture', each.lower()) != None:
                if active_infection == False:
                    potentials += 1
                    active_infection = True 
            
            #otherwise if "infection" keyword is present without inclusionary words, assume this is exclusionary
            elif ("infection" in each.lower()) and ("include" not in each.lower()) and ("eligible" not in each.lower()):
                if active_infection == False:
                    potentials += 1
                    active_infection = True                    
        
        #if there is an infection exclusion and patient characteristic is false, add match
        if (active_infection == True) and (patient["Active and/or Uncontrolled Viral, Bacterial, or Fungal Infection"] == False):
            matches += 1

        #cardiac function handler
        #set initial cardiac function exclusion requirement to false
        cv_dysfunction = False
        
        #for each criterion classified into this category
        for each in classified_df[classified_df["Prediction"] == "CardiovascularFunction"]['Criterion']:
            
            #check for anthracycline exposure and add as potential match
            if ("lifetime exposure" in each.lower()) and ("rubicin" in each.lower()):
                if cv_dysfunction == False:
                    potentials += 1
                    cv_dysfunction = True
                    
            #check for exclusionary keywords and add as potential match
            elif ("not eligible" in each.lower()) or ("ineligible" in each.lower()) or ("not participate" in each.lower()) or ("exclude" in each.lower()):
                if cv_dysfunction == False:
                    potentials += 1
                    cv_dysfunction = True
                    
            #check for adequate cardiac function pattern and add as potential match
            elif re.search(r'adequate {0,}(.*){,1} {0,}function', each.lower()) != None:
                if cv_dysfunction == False:
                    potentials += 1
                    cv_dysfunction = True 
            
            #check for ejection fraction pattern and add as potential match
            elif re.search(r'(reduc|shorten|low|poor)[a-z]{,5} {0,}(.*){,1} {0,}fraction', each.lower()) != None:
                if cv_dysfunction == False:
                    potentials += 1
                    cv_dysfunction = True     
            
            #check for final keywords and add as potential match
            elif ("ejection fraction" in each.lower()) or ("stolic dysfunction" in each.lower()):
                if cv_dysfunction == False:
                    potentials += 1
                    cv_dysfunction = True                    
        
        #if there is a cardiac exclusion and patient characteristic is false, add match
        if (cv_dysfunction == True) and (patient["Impaired Cardiovascular Function/Cardiotoxicity from Chemotherapy"] == False):
            matches += 1

        #return match score - % of potential matches * % of criteria not classified as "Other"
        match_score = (matches/potentials)*(len(classified_df[classified_df['Prediction'] != "Other"]) / len(classified_df))
        return round(match_score, 2)
    
    #define Match()
    def Match(self, patient, docx_trials, ctgov_trials):
        
        #empty lists in which to store trial names and scores
        trials = []
        scores = []

        #if at least one .docx trial
        if len(docx_trials) > 0:

            #for each .docx trial
            for trial in docx_trials:

                #extract trial info
                trialinfo = self.ExtractTrialInfo(path = trial)
                
                #extract raw criteria
                ExtractedCriteria = self.ExtractCriteria(text = trialinfo['./eligibility/criteria/textblock'], mode = 'docx')
                
                #extract clean inclusion and exclusion criteria, dropping any rows with empty values
                cleaninclusion = self.CleanCriteria(ExtractedCriteria[-2])
                cleaninclusion = cleaninclusion[cleaninclusion['Final'] != '']
                cleanexclusion = self.CleanCriteria(ExtractedCriteria[-1])
                cleanexclusion = cleanexclusion[cleanexclusion['Final'] != '']
                
                #fit clean criteria into FastText embeddings
                embedded_inclusion = self.EmbedCriteria(CleanedCriteria = cleaninclusion['Final'])
                embedded_exclusion = self.EmbedCriteria(CleanedCriteria = cleanexclusion['Final'])
                
                #classify criteria and combine into single df
                classified_df_in = self.ClassifyCriteria(criteria = cleaninclusion['Original'], embeddings = embedded_inclusion['Embedding'], model_folder_path = match_instance.classifier_model)
                classified_df_ex = self.ClassifyCriteria(criteria = cleanexclusion['Original'], embeddings = embedded_exclusion['Embedding'], model_folder_path = match_instance.classifier_model)
                classified_df = pd.concat([classified_df_in, classified_df_ex])
                
                #append trial name and match score to lists
                trials.append(trialinfo['NCT_id'])
                scores.append(self.ComputeMatchScore(patient = patient, trialinfo = trialinfo, ExtractedCriteria = ExtractedCriteria, classified_df = classified_df))

        #if at least one ctgov trial
        if len(ctgov_trials) > 0:

            #for each ctgov trial
            for trial in ctgov_trials:

                #extract trial info
                trialinfo = self.ExtractTrialInfo(path = trial)
                
                #extract raw criteria
                ExtractedCriteria = self.ExtractCriteria(text = trialinfo['./eligibility/criteria/textblock'], mode = 'ctgov')
                
                #if lists in returned list, treat as inclusion/exclusion criteria
                if True in [type(each) == list for each in ExtractedCriteria]:
                
                    #extract clean inclusion and exclusion criteria, dropping any rows with empty values
                    cleaninclusion = self.CleanCriteria(ExtractedCriteria[-2])
                    cleaninclusion = cleaninclusion[cleaninclusion['Final'] != '']
                    cleanexclusion = self.CleanCriteria(ExtractedCriteria[-1])
                    cleanexclusion = cleanexclusion[cleanexclusion['Final'] != '']

                    #fit clean criteria into FastText embeddings
                    embedded_inclusion = self.EmbedCriteria(CleanedCriteria = cleaninclusion['Final'])
                    embedded_exclusion = self.EmbedCriteria(CleanedCriteria = cleanexclusion['Final'])

                    #classify criteria and combine into single df
                    classified_df_in = self.ClassifyCriteria(criteria = cleaninclusion['Original'], embeddings = embedded_inclusion['Embedding'], model_folder_path = self.classifier_model)
                    classified_df_ex = self.ClassifyCriteria(criteria = cleanexclusion['Original'], embeddings = embedded_exclusion['Embedding'], model_folder_path = self.classifier_model)
                    classified_df = pd.concat([classified_df_in, classified_df_ex])
                    
                #otherwise, treat as single list of criteria
                else:
                    
                    #extract clean criteria, dropping rows with empty criteria
                    cleancriteria = self.CleanCriteria(ExtractedCriteria)
                    cleancriteria = cleancriteria[cleancriteria['Final'] != '']
                    
                    #fit clean criteria into FastText embeddings
                    embedded = self.EmbedCriteria(CleanedCriteria = cleancriteria['Final'])

                    #classify criteria and combine into single df
                    classified_df = self.ClassifyCriteria(criteria = cleancriteria['Original'], embeddings = embedded['Embedding'], model_folder_path = self.classifier_model)

                    
                #append trial name and match score to lists
                trials.append(trialinfo['NCT_id'])
                scores.append(self.ComputeMatchScore(patient = patient, trialinfo = trialinfo, ExtractedCriteria = ExtractedCriteria, classified_df = classified_df))

        #assemble matching score dataframe with ranked list of clinical trials        
        match_df = pd.DataFrame({"Trial":trials, "Match Score":scores})
        match_df = match_df.sort_values(by = ['Match Score'], ascending = False)
        match_df = match_df.reset_index(drop = True)
        return match_df

In [3]:
%%time

#instantiate match instance with pretrained embedding and classifier models
match_instance = gearboxNLP(embedding_model = "/Users/Sam/Dropbox/Capstone/jupyter_notebooks/ft_embedding_size256_window5.model",
                            classifier_model = "/Users/Sam/Dropbox/Capstone/classifier_models/")

CPU times: user 37.9 ms, sys: 4.77 s, total: 4.81 s
Wall time: 8.51 s


In [4]:
#dict for input patient characteristics
patient = {
           #patient's age in days
           "Age (Days)": 7000,
           
           #patient's height in cm
           "Height (cm)": 122,
           
           #patient's gender (female == True)
           "Female": True,
           
           #patient's self-identified race (included to calculate eGFR using MDRD equation)
           "African-American": False,
    
           #Lanksy or Karnofsky performance status scale (depending on age)
           "Performance Status (Lanksy/Karnofsky)": 50,
           
           #patient's diagnosis 
           "Diagnosis": 'Refractory AML',
           
           #CNS involvement scale
           "CNS Involvement (1/2/3)": 1,
           
           #presence of isolated CNS disease
           "Isolated CNS Disease": False,
           
           #days since prior therapies as integer - input False for treatments the patients has not had
           "Days Since Cytotoxic Chemotherapy": False,
           "Days Since Biologic Therapy": 50,
           "Days Since Growth Factor Therapy": 60,
           "Days Since Corticosteroids": 40,
           "Days Since Prior Radiotherapy": 10,
           
           #creatinine in mg/dL
           "Creatinine (mg/dL)": 1.0,
           
           #hepatic function times ULN - e.g. if AST and/or ALT is 2x ULN, input 2
           "AST/ALT Times ULN": 2,
           "Direct Bilirubin Times ULN": 2,
           
           #true if reduced EF, high anthracycline exposure, etc.
           "Impaired Cardiovascular Function/Cardiotoxicity from Chemotherapy": True,
           
           #true if patient current has an active infection (including HIV)
           "Active and/or Uncontrolled Viral, Bacterial, or Fungal Infection": True,
           
           #true if patient is pregnant, nursing, or fertile and unwilling to use contraception during the study
           "Pregnant, Nursing, or Fertile and Unwilling to Use Contraception": True}

#file paths for trial protocols
docx_trials = ['/Users/Sam/Downloads/apal/APAL2020D_concept_venetoclax_clean_03-17-2021  eak.docx',
               '/Users/Sam/Downloads/apal/APAL2020E_Full Concept_Trametinib_210127.docx',
               '/Users/Sam/Downloads/apal/APAL2020F_Concept_Flotetuzumab_11.30.20.docx']

#NCT ID's for clinicaltrials.gov trials
ctgov_trials = ['NCT03817320', 'NCT04726241']

In [5]:
%%time

#use Match() method to return ranked list of trials for this patient
match_instance.Match(patient = patient,
                     docx_trials = docx_trials,
                     ctgov_trials = ctgov_trials)

CPU times: user 4.17 s, sys: 296 ms, total: 4.47 s
Wall time: 5.81 s


Unnamed: 0,Trial,Match Score
0,APAL2020E,0.49
1,APAL2020F,0.43
2,APAL2020D,0.42
3,NCT03817320,0.31
4,NCT04726241,0.19
