# Auto-labeling of a corpus of clinical trials for later use in retrieving search results

In [110]:
import csv
import gensim
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import os
import pandas as pd
import re
import seaborn as sns
import sys

from collections import OrderedDict
import spacy
from gensim import corpora
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from multiprocessing import Pool
nlp = spacy.load("en_core_web_lg")

Add a ton of stopwords relating to procedural things to make room for the medical conditions we're looking to classify with.

In [316]:
STOPWORDS = set(STOPWORDS).union(set(['patient', 'patients', 'study', 'disease',
                                      'treatment', 'randomized', 'statistical', 
                                      'analysis', 'group', 'trial', 'clinical', 
                                      'controlled', 'safety', 'associated', 'risk', 
                                      'intervention', 'care', 'health', 'therapy', 
                                      'participants', 'method', 'monitor', 'studies',
                                      'cohorts', 'percent', 'prospective', 'efficacy', 
                                      'days', 'months', 'evaluate', 'subjects', 
                                      'data', 'outcomes', 'research', 'function', 
                                      'effects', 'investigators', 'use', 'population', 
                                      'compared', 'quality', 'results', 'improve',
                                      'term', 'groups', 'weeks', 'week', 'groups', 
                                      'test', 'control', 'time', 'period', 'placebo', 
                                      'stimulation', 'symptoms', 'mortality', 'failure',
                                      'non', 'interventional', 'observational', 'assess', 
                                      'relative', 'positive', 'develop', 'signs', 
                                      'enrolled', 'randomly', 'assigned', 'ratio', 
                                      'stratified', 'multicenter', 'open', 'phase', 
                                      'stage', 'iiib', 'eligible', 'criteria', 
                                      'inclusion', 'exclustion', 'year', 'years',
                                      'before', 'after', 'therapy', 'therapies',
                                      'interventions', 'controlling', 'terms', 'risks',
                                      'enroll', 'studies', 'diseases', 'enrolling',
                                      'evaluating', 'evaluated', 'evaluates', 'effect',
                                      'invesetigate', 'investigator', 'investigation',
                                      'investigations', 'studying', 'eligibility', "dose", 
                                      "screening", "history", "drug", "active", "including",
                                      "significant", "day", 'days', "potential", "female", 
                                      "mg", "hepatitis", "investigational", 'prior', 'known',
                                      'clinically', 'clincal', 'clinic', 'period', 'following',
                                      'subject', 'visit', 'subjects', 'willing', 
                                      'participation', 'lab', 'laboratory', 'medical',
#                                       'health', 'healthy', 
                                      'response', 'diagnosis', 'stages', 'staged',
                                      'diagnoses', 'treat', 'treats', 'treated', 'program',
                                      'self', 'based', 'life', 'participate', 'english',
                                      'participates', 'able', 'community', 'support',
                                      'ability', 'coummunities', 'supports', 'supported',
                                      'consent', 'age', 'informed', 'consents', 'consented',
                                      'provide', 'provided', 'condition', 'conditions',
                                      'conditioned', 
#                                       'volunteers', 'volunteer', 
                                      'accept', 'accepted', 'accepting'
                                      'accepts', 'current', 'currently', 'controlled',
                                      'uncontrolled', 'status', 'recieved', 'times', 
                                      'limit', 'count', 'disorder', 'disorders', 'follow',
                                      'follows', 'followed', 'participant', 'related',
                                      'probability', 'probabilities', 'sample', 'samples',
                                      'practice', 'individual', 'individuals', 
                                      'individually', 'specific', 'specify', 'prevalence', 
                                      'limited', 'procedure', 'procedures', 'write',
                                      'obtain', 'practice', 'practices', 'practicing',
                                      'diagnostic']))


Import subset of data.

In [112]:
df = pd.read_csv('../frontend_dataset_final.csv')

In [115]:
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,agency,brief_title,brief_summary,city,condition,country,detailed_description,eligibility,gender,keyword,mesh_term,official_title,overall_status,phase,state,url
0,Fraunhofer Institute for Molecular Biology and...,PREVALENCE OF Anti-CCP POSITIVITY AND SUBCLINI...,"Non-interventional, prospective, observational...",Frankfurt am Main,Rheumatoid Arthritis,Germany,Studies of early arthritis cohorts have shown ...,population without pre-classified RA but new o...,All,Anti-CCP,Arthritis,PREVALENCE OF ANTI-CYCLIC CITRULLINATED PEPTID...,Recruiting,Missing,Hessia,https://clinicaltrials.gov/show/NCT03267147
1,University of Washington,Aldesleukin and Pembrolizumab in Treating Pati...,This phase I trial studies the side effects an...,Seattle,Clear Cell Renal Cell Carcinoma,United States,PRIMARY OBJECTIVES: I. To evaluate the safety ...,Inclusion Criteria: - Be willing and able to ...,All,Missing,Carcinoma,A Phase I Trial of Interleukin-2 (Aldesleukin)...,Recruiting,Phase 1,Washington,https://clinicaltrials.gov/show/NCT03260504
2,"CSPC ZhongQi Pharmaceutical Technology Co., Ltd.",Nab-Paclitaxel Versus Paclitaxel Plus Carbopla...,"This is a randomized, multicenter, open, contr...",Missing,Non-small Cell Lung Cancer,Missing,Subjects will receive one of two treatment reg...,Inclusion Criteria: - Accepted the purpose of...,All,Missing,Lung Neoplasms,"The Randomized,Open, Multicenter Phase III Stu...",Not yet recruiting,Phase 3,Missing,https://clinicaltrials.gov/show/NCT03262948
3,Fraunhofer Institute for Molecular Biology and...,PREVALENCE OF Anti-CCP POSITIVITY AND SUBCLINI...,"Non-interventional, prospective, observational...",Frankfurt am Main,Rheumatoid Arthritis,Germany,Studies of early arthritis cohorts have shown ...,population without pre-classified RA but new o...,All,Anti-CCP,Arthritis,PREVALENCE OF ANTI-CYCLIC CITRULLINATED PEPTID...,Recruiting,Missing,Hessia,https://clinicaltrials.gov/show/NCT03267147
4,Chang Gung Memorial Hospital,Analgesic Effect of Low Level Laser for Proced...,Neonates undergo several painful procedures an...,Kaohsiung,Procedural Pain,Taiwan,"This open-label, randomized controlled trial i...",Inclusion Criteria: - healthy fullterm neonat...,All,neonates,"Pain, Procedural",Analgesic Effect of Low Level Laser for Proced...,Recruiting,Missing,Missing,https://clinicaltrials.gov/show/NCT03268148


Choose colums to pull text from.

In [116]:
df['search_terms'] = (df['mesh_term'].str.lower()
                      + ' ' + df['condition'].str.lower() 
                      + ' ' + df['keyword'].str.lower()
                      + ' ' + df['brief_summary'].str.lower()
                      + ' ' + df['official_title'].str.lower()
#                       + ' ' + df['detailed_description'].str.lower()
                      + ' ' + df['eligibility'].str.lower()
                      )

In [117]:
df['search_terms'].iloc[0]

'arthritis rheumatoid arthritis anti-ccp non-interventional, prospective, observational study to assess the relative risk of anti-ccp positive patients to develop (subclinical) signs of inflammation in accordance with early rheumatoid arthritis (ra) in a population without pre-classified ra but new1 onset of non-specific musculoskeletal (msk) symptoms in general practices in germany and subsequent 36 months follow-up by rheumatologists prevalence of anti-cyclic citrullinated peptide (anti-ccp) positivity and subclinical signs of inflammation in patients with new onset of non-specific musculoskeletal symptoms possibly related to early rheumatoid arthritis in general practices in germany population without pre-classified ra but new onset of non-specific musculoskeletal (msk)  symptoms non-probability sample  inclusion criteria:  - new onset of non-specific msk symptoms, including, but not limited to, arthralgia of  the hands and the large joints such as wrists, knees, and shoulders  - wr

Remove some puctuation and 'missing' from any column without info.

In [119]:
def replace_missing(string):
    new = re.sub('missing', '', string)
    new = re.sub('-', ' ', new)
    new = re.sub('\(\S*\)', '', new)
    return u"{}".format(new)

In [120]:
search_terms = df['search_terms'].apply(replace_missing).values
len(search_terms)

22471

Split into training and test sets.

In [121]:
test_terms = search_terms.copy()[-2471:]
search_terms = search_terms[:20000]

In [122]:
len(search_terms), len(test_terms)

(20000, 2471)

In [123]:
test_terms[0]

' upper gastrointestinal subepithelial tumors stomach neoplasm to increase the diagnostic accuracy of subepithelial tumors, larger tissue samples are required. it is difficult to obtain adequate tissue samples. there were several biopsy methods to obtain tissue samples. pathological examination would include mitosis counts, particularly in hypoechoic subepithelial tumors located in the 4th layer of the gastric wall, where differentiation between leiomyoma of benign nature and gastrointestinal stromal tumor  of malignant potential is essential. so we hypothesize that unroofing biopsy is an more appropriate method than eus fnb(endoscopic ultrasonography guided fine needle biopsy). we will compare diagnostic accuracy and complications between eus fnb & unroofing biopsy.  inclusion criteria:    older than 19 years old and younger than 80 years old    suspected upper gastrointestinal subepithelial tumors(≥15mm, ≤35mm) that were  originated from muscularis propria layer on endoscopic ultraso

In [124]:
search_terms[0]

'arthritis rheumatoid arthritis anti ccp non interventional, prospective, observational study to assess the relative risk of anti ccp positive patients to develop  signs of inflammation in accordance with early rheumatoid arthritis  in a population without pre classified ra but new1 onset of non specific musculoskeletal  symptoms in general practices in germany and subsequent 36 months follow up by rheumatologists prevalence of anti cyclic citrullinated peptide (anti ccp) positivity and subclinical signs of inflammation in patients with new onset of non specific musculoskeletal symptoms possibly related to early rheumatoid arthritis in general practices in germany population without pre classified ra but new onset of non specific musculoskeletal   symptoms non probability sample  inclusion criteria:    new onset of non specific msk symptoms, including, but not limited to, arthralgia of  the hands and the large joints such as wrists, knees, and shoulders    written informed consent obta

Create lemmatization and tokenization functions.

In [126]:
def lemmatize(text):
    """Return new string of lemmatized words from text."""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.text not in STOPWORDS])

In [127]:
def tokenize(text):
    """Return list of tokens from text."""
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

Apply functions element-wise with multiprocessing.

In [128]:
pool = Pool(processes=multiprocessing.cpu_count())
lemma = pool.map(lemmatize, search_terms)
keyword_tokens = pool.map(tokenize, lemma)

Apply Phraser to detect bi-grams and more-grams.

In [132]:
# token_ = [doc.split(" ") for doc in keyword_tokens]
bigram = Phrases(keyword_tokens,
                 min_count=10,
                 threshold=5,
                 delimiter=b' ')

bigram_phraser = Phraser(bigram)

bigram_tokens = []
for sent in keyword_tokens:
    bigram_tokens.append(bigram_phraser[sent])

Check bigrams for meaningfullness.

In [133]:
bigram_tokens[0]

['arthritis rheumatoid',
 'arthritis',
 'anti ccp',
 'anti ccp',
 'inflammation',
 'accordance',
 'early',
 'rheumatoid arthritis',
 'pre',
 'classify',
 'ra',
 'new onset',
 'specific',
 'musculoskeletal',
 'general practice',
 'germany',
 'subsequent',
 'rheumatologist',
 'prevalence',
 'anti',
 'cyclic',
 'citrullinate',
 'peptide',
 'anti ccp',
 'positivity',
 'subclinical',
 'inflammation',
 'new onset',
 'specific',
 'musculoskeletal',
 'possibly',
 'early',
 'rheumatoid arthritis',
 'general practice',
 'germany',
 'pre',
 'classify',
 'ra',
 'new onset',
 'specific',
 'musculoskeletal',
 'probability sample',
 'new onset',
 'specific',
 'msk',
 'limited',
 'arthralgia',
 'hand',
 'large',
 'joint',
 'wrist',
 'knee',
 'shoulder',
 'write obtain',
 'initiation',
 'protocol',
 'require',
 'procedure',
 'general',
 'understanding',
 'procedure',
 'exclusion',
 'ra',
 'diagnose',
 'accord modify',
 'eular',
 'acr',
 'american college',
 'rheumatology',
 'arthritis',
 'reason',
 'ms

Looks nice.

Create a gensim dictionary of our bigram tokens.

In [134]:
id2word = corpora.Dictionary(bigram_tokens)

In [135]:
len(id2word.keys())

59225

Filter our dictionary to, hopefully, more useful words.

In [136]:
id2word.filter_extremes(no_below=5, no_above=0.75)
len(id2word.keys())

26780

Create corpus for modeling.

In [137]:
corpus = [id2word.doc2bow(text) for text in bigram_tokens]

Fit an LDA model.

In [143]:
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
#                    random_state=42,
                   chunksize=10000,
#                    num_topics=85,
                   passes=15,
                   workers=multiprocessing.cpu_count(),
                   )

  diff = np.log(self.expElogbeta)


Test out our model.

In [279]:
def get_search_terms(index):
    """Return topics for a given index in the test_tems list."""
    # Preprocess new document.
    lemma = list(map(lemmatize, [test_terms[index]]))
    other_texts = list(map(tokenize, lemma))
    other_corpus = []
    for sent in other_texts:
        other_corpus.append(bigram_phraser[sent])
    corpus = [id2word.doc2bow(text) for text in other_corpus]
    unseen_doc = corpus[0]
    # Feed processed document to the model.
    vector = lda[unseen_doc]  # get topic probability distribution for a document
    # Choose the set of topics the model gives the highest probability to.
    terms = [topic for topic in lda.print_topic(vector[np.argmax(vector, axis=0)[1]][0], 20).split(' + ')
             # Set a model probability cut-off for topic inclusion.
             if float(topic[:5]) >= .005]
    return terms

In [280]:
index_to_search = 949

In [281]:
get_search_terms(index_to_search)

['0.024*"pediatric"',
 '0.020*"child"',
 '0.017*"hiv"',
 '0.014*"center"',
 '0.012*"parent"',
 '0.010*"adolescent"',
 '0.009*"probability sample"',
 '0.007*"old"',
 '0.006*"protocol"',
 '0.006*"parent guardian"',
 '0.006*"hiv infection"',
 '0.005*"adherence"',
 '0.005*"survey"',
 '0.005*"art"',
 '0.005*"hospital"',
 '0.005*"cohort"']

In [282]:
test_terms[index_to_search]

'hiv infections hiv infections adherence, medication youth living with hiv  often face unique challenges achieving high and sustained rates of adherence to their antiretroviral therapy . poor adherence can lead to unsuppressed virus, more advanced hiv disease and poorer health outcomes, eventually exhausting treatment options. to date however, there are few demonstrated interventions for youth failing first line therapy. this study will evaluate a novel intervention that uses remote coaching through video enabled counseling sessions, a `smart` pill bottle that notifies an adherence coach when youth fail to open/close the device around dose time, and problem solving outreach by the coach when and as needed. this intensive `boot camp` strategy is implemented for 12 weeks followed by observation through 48 weeks. triggered escalating real time adherence intervention to promote rapid hiv viral suppression among youth living with hiv failing antiretroviral therapy: the tera study inclusion 

Pretty good.

In [317]:
index_to_search = 2344

In [318]:
get_search_terms(index_to_search)

['0.039*"lung cancer"',
 '0.028*"small cell"',
 '0.014*"ptsd"',
 '0.012*"nsclc"',
 '0.010*"tumor"',
 '0.010*"cancer"',
 '0.009*"cell lung"',
 '0.008*"lung neoplasm"',
 '0.007*"radiation"',
 '0.006*"surgery"',
 '0.006*"lung"',
 '0.005*"primary"',
 '0.005*"registration"',
 '0.005*"level"',
 '0.005*"endometrial cancer"',
 '0.005*"radiotherapy"']

In [319]:
test_terms[index_to_search]

' healthy mri the purpose of this study is to assess the image quality improvement provided by a custom mr spine coil in comparison with the fda approved coil when used for radiotherapy treatment planning. a custom designed mr coil for spine radiotherapy treatment planning enrollment in this study will be open to all mskcc employees to act as healthy volunteers  provided they meet the eligibility criteria for the protocol. employees who provide consent  willfully and voluntarily may participate as a healthy volunteer once. employees who are  under direct supervision of any investigators on the study will not be eligible to  participate. an irb approved flyer with pertinent contact information will be distributed  throughout the institution to inform all mskcc employees of the possibility of  participating in this study. no preferences will be given by race or gender for the  selection of participants. probability sample  inclusion criteria:    healthy volunteers age 18 or older  exclus

Might want to reconsider using 'healthy' and 'volunteer' in STOPWORDS

In [286]:
index_to_search = 244

In [287]:
get_search_terms(index_to_search)

['0.025*"pd"',
 '0.024*"parkinson"',
 '0.008*"motor"',
 '0.007*"gait"',
 '0.007*"parkinson parkinson"',
 '0.007*"vestibular"']

In [288]:
test_terms[index_to_search]

'parkinson disease parkinson`s disease parkinson disease parkinson`s disease  is a common neurodegenerative disorder affecting approximately 80,000 veterans, representing a priority area for va research. current medicines for pd only improve symptoms, treatments that slow disease progression are needed, and earlier diagnosis of pd may be the key to their development. pd symptoms can be mimicked by medicines (most commonly antipsychotic drugs that block dopamine), and some of these patients actually have underlying "prodromal" pd that was "unmasked" years before it would have caused symptoms. this problem is increasing as these medicines are now used for common conditions including post traumatic stress disorder and depression. the investigators will identify prodromal pd in patients with drug induced symptoms using brain scans. these patients will be enrolled in a randomized clinical trial of aerobic exercise which slows progression in animal models of pd and has other health benefits.

Also pretty good.

In [289]:
index_to_search = 1249

In [290]:
get_search_terms(index_to_search)

['0.013*"surgery"',
 '0.011*"vitamin"',
 '0.010*"undergo"',
 '0.010*"lung"',
 '0.008*"anesthesia"',
 '0.008*"procedure"',
 '0.006*"level"',
 '0.006*"general anesthesia"',
 '0.006*"require"',
 '0.006*"volume"',
 '0.005*"airway"',
 '0.005*"surgical"',
 '0.005*"measurement"',
 '0.005*"tissue"',
 '0.005*"cancer"']

In [291]:
test_terms[index_to_search]

'wounds and injuries critical illness trauma the aim of this study is to determine if the incidence of post operative complications can be decreased by the implementation of intra operative, minimally invasive hemodynamic monitoring  via flotrac™ and ev1000™ in trauma patients. fluid resuscitation optimization in surgical trauma patients  inclusion criteria:  1. 18 years of age or older  2. injury severity score > 15 (indicator of anticipated trauma mortality)  3. admission to surgical trauma icu   4. anticipated surgery within 72 hours of admission  5. american society of anesthesiology patient classification status  2 5  6. lactic acid > 2.5 within 24 hours of surgical procedure or base deficit ≥   5 mmol/l,  or persistent requirement for vasopressor support within 24 hours of surgical  procedure  7. patient requires mechanical ventilation prior to consenting surgery  8. vascular devices that include a minimum of an arterial line  9. minimally invasive hemodynamic monitoring initiate

Misses 'trauma', the major topic. It's going to need to be more consistent. 

I've requested access to a huge corpus of medical text from anonomized discharge notes as our set is relatively small. I'll also move over to sagemaker as it's getting really slow locally already.

One more...

In [304]:
index_to_search = 2255

In [305]:
get_search_terms(index_to_search)

['0.037*"leukemia"',
 '0.013*"aml"',
 '0.012*"relapse"',
 '0.010*"acute myeloid"',
 '0.009*"high"',
 '0.009*"bone marrow"',
 '0.008*"define"',
 '0.008*"cell"',
 '0.008*"receive"',
 '0.007*"chemotherapy"',
 '0.007*"syndrome"',
 '0.007*"enrollment"',
 '0.005*"relapse refractory"',
 '0.005*"myelodysplastic syndrome"',
 '0.005*"stem cell"',
 '0.005*"remission"',
 '0.005*"blast"',
 '0.005*"acute lymphoblastic"',
 '0.005*"mds"',
 '0.005*"infection"']

In [306]:
test_terms[index_to_search]

'leukemia leukemia, myelogenous, chronic leukemia, myelogenous, chronic the purpose of this study is to show that myeloablative hematopoietic progenitor cell transplantation  continues to offer acceptable disease free survival for select patients requiring hpct. myeloablative hematopoietic progenitor cell transplantation  for pediatric malignancies inclusion criteria:    malignant disease    chronic myleogenous leukemia in chronic or accelerated phase    acute lymphoblastic leukemia      first remission high risk all (ph+, t( 4 11) infants).     second remission all, after a short first remission (<36 mos from dx).     3rd or greater remission all.    acute myelogenous leukemia      first remission high risk acute nonlymphoblastic  (as defined by   cytogenetics), if a matched sibling donor is available.     initial partial remission aml (<20 percent blasts in the bone marrow).     aml that is refractory to two cycles of induction therapy.     second or greater remission aml    myelodys