Study categorization with scispacy

In [1]:
import csv
import gensim
import multiprocessing
import numpy as np
import os
import pandas as pd
import re
import spacy
import scispacy
import sys

from gensim import corpora
from gensim.models.phrases import Phrases, Phraser
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from multiprocessing import Pool
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker




In [None]:
# Load the scispaCy "en_core_sci_lg" model; every later cell that calls
# `nlp(...)` depends on this object.
# NOTE(review): the calls below pass component *objects* to nlp.add_pipe, which
# looks like the spaCy v2 API, and import UmlsEntityLinker from
# scispacy.umls_linking; newer spaCy/scispaCy releases register components by
# string name instead -- confirm the pinned versions before re-running.
nlp = spacy.load("en_core_sci_lg")

# Register the abbreviation detector first: the linker below is created with
# resolve_abbreviations=True, so it is added to the pipeline after this one.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

# UMLS entity linker. `linker` is also used directly later in the notebook
# (linker.umls.cui_to_entity) to look up concept details by CUI.
# presumably k is the number of candidate neighbours searched per mention and
# max_entities_per_mention caps the candidates kept -- TODO confirm against
# the scispaCy documentation.
linker = UmlsEntityLinker(resolve_abbreviations=True,
                         k=10,
                         max_entities_per_mention=3)
nlp.add_pipe(linker)

Add a large set of stopwords covering procedural and administrative vocabulary (trial design, enrollment, dosing, and so on), so that the terms that remain are dominated by the medical conditions we want to classify.

In [28]:
# Procedural / administrative vocabulary that should not count as a search
# term: trial-design words, enrollment and consent language, dosing
# abbreviations, time units, and generic verbs.
#
# Changes relative to the earlier version of this list:
#   * 'facility' and 'implementation' are now separate entries (a missing
#     comma previously fused them into the single string
#     'facilityimplementation', so 'facility' was never filtered).
#   * Misspelled entries were corrected to the words they were meant to match
#     ('exclusion', 'investigate', 'clinical', 'communities', 'received').
#   * Exact duplicates were removed; the set contents are otherwise unchanged.
#
# Entries are compared against lower-cased entity lemmas, so the multi-word
# entry 'clinical trial' only matches an entity whose whole lemma is exactly
# that phrase.
# NOTE(review): 'hepatitis' is a medical condition rather than a procedural
# word -- confirm it is meant to be filtered out.
PROCEDURAL_STOPWORDS = {
    'patient', 'patients', 'study', 'disease', 'treatment', 'randomized',
    'statistical', 'analysis', 'group', 'trial', 'clinical', 'controlled',
    'safety', 'associated', 'risk', 'intervention', 'care', 'health',
    'therapy', 'participants', 'method', 'monitor', 'studies', 'cohorts',
    'percent', 'prospective', 'efficacy', 'days', 'months', 'evaluate',
    'subjects', 'data', 'outcomes', 'research', 'function', 'effects',
    'investigators', 'use', 'population', 'compared', 'quality', 'results',
    'improve', 'term', 'groups', 'weeks', 'week', 'test', 'control', 'time',
    'period', 'placebo', 'stimulation', 'symptoms', 'mortality', 'failure',
    'non', 'interventional', 'observational', 'assess', 'relative',
    'positive', 'develop', 'signs', 'enrolled', 'randomly', 'assigned',
    'ratio', 'stratified', 'multicenter', 'open', 'phase', 'stage', 'iiib',
    'eligible', 'criteria', 'inclusion', 'exclusion', 'year', 'years',
    'before', 'after', 'therapies', 'interventions', 'controlling', 'terms',
    'risks', 'enroll', 'diseases', 'enrolling', 'evaluating', 'evaluated',
    'evaluates', 'effect', 'investigate', 'investigator', 'investigation',
    'investigations', 'studying', 'eligibility', 'dose', 'screening',
    'history', 'drug', 'active', 'including', 'significant', 'day',
    'potential', 'female', 'mg', 'hepatitis', 'investigational', 'prior',
    'known', 'clinically', 'clinic', 'following', 'subject', 'visit',
    'willing', 'participation', 'lab', 'laboratory', 'medical', 'response',
    'diagnosis', 'stages', 'staged', 'diagnoses', 'treat', 'treats',
    'treated', 'program', 'self', 'based', 'life', 'participate', 'english',
    'participates', 'able', 'community', 'support', 'ability', 'communities',
    'supports', 'supported', 'consent', 'age', 'informed', 'consents',
    'consented', 'provide', 'provided', 'condition', 'conditions',
    'conditioned', 'compliance', 'enrollment', 'accept', 'accepted',
    'accepting', 'enrolment', 'accepts', 'current', 'currently',
    'uncontrolled', 'status', 'received', 'times', 'limit', 'count',
    'disorder', 'disorders', 'follow', 'follows', 'followed', 'participant',
    'related', 'probability', 'probabilities', 'sample', 'samples',
    'practice', 'individual', 'individuals', 'individually', 'specific',
    'specify', 'prevalence', 'limited', 'procedure', 'procedures', 'write',
    'obtain', 'practices', 'practicing', 'diagnostic', 'mcg', 'ml', 'qday',
    'tid', 'qid', 'bid', 'po', 'pr', 'ac', 'prn', 'am', 'pm', 'market',
    'receive', 'receives', 'require', 'requires', 'required', 'start', 'end',
    'starts', 'ends', 'starting', 'ending', 'allow', 'allows', 'allowed',
    'define', 'defines', 'defined', 'evaluation', 'ongoing', 'examination',
    'evaluations', 'examinations', 'evidence', 'upper', 'lower', 'normal',
    'people', 'person', 'hour', 'hours', 'hr', 'hrs', 'min', 'minute',
    'minutes', 'include', 'equal', 'equals', 'double', 'undergo', 'level',
    'dl', 'diagnose', 'increase', 'increases', 'decrease', 'decreases',
    'facility', 'implementation', 'center', 'training', 'provider',
    'providers', 'centers', 'facilities', 'trainings', 'previous',
    'assesses', 'assessment', 'assessments', 'assessing', 'cause', 'service',
    'services', 'project', 'projects', 'identify', 'live', 'design',
    'designs', 'survey', 'surveys', 'surveying', 'implementations',
    'decision', 'information', 'conclusion', 'qualify', 'quantify',
    'researcher', 'researching', 'outcome', 'prognosis', 'positively',
    'objective', 'compare', 'randomize', 'induce', 'change', 'changes',
    'labs', 'laboratories', 'clinical trial',
}

# Extend (and shadow) the STOPWORDS collection imported from gensim. Building a
# new set from the current value keeps this cell safe to re-run.
STOPWORDS = set(STOPWORDS) | PROCEDURAL_STOPWORDS


In [29]:
def replace_csv_missing(row):
    """Flatten the text columns of one CSV row into a single cleaned string.

    Columns 0, 1, 2, 3, 6, 7 and 8 of ``row`` are joined with single spaces;
    columns 4 and 5 are deliberately left out (the original note says the
    eligibility text tends to be exclusion criteria when it is long).
    NOTE(review): the original inline labels for these columns
    ('brief_title', 'condition', 'brief_summary', 'keyword', and
    'official_title' repeated for 6-8) look copy-pasted and do not obviously
    match the sample output -- confirm against the CSV header.

    Cleaning steps, in order:
      1. remove the placeholder word 'missing' (whole word only, so words
         such as 'dismissing' are no longer mangled);
      2. turn hyphens into spaces;
      3. replace parenthesised tokens that contain no whitespace, e.g.
         '(AVPC)', with a space;
      4. collapse a space followed by further whitespace into one space.

    Args:
        row: sequence of at least 9 strings (one parsed CSV record).

    Returns:
        The cleaned text as a ``str``. Leading/trailing spaces are not
        stripped.

    Raises:
        IndexError: if ``row`` has fewer than 9 columns.
    """
    text_columns = (0, 1, 2, 3, 6, 7, 8)
    text = ' '.join(row[i] for i in text_columns)

    # Whole-word match: the placeholder is removed wherever it stands alone,
    # but it is no longer cut out of the middle of longer words. A genuine
    # standalone "missing" in the prose is still removed, as before.
    new = re.sub(r'\bmissing\b', '', text)
    new = re.sub('-', ' ', new)
    # Raw strings avoid the invalid-escape-sequence warnings that the plain
    # '\(' and '\s' literals trigger on recent Python versions.
    new = re.sub(r'\(\S*\)', ' ', new)
    new = re.sub(r' \s+', ' ', new)
    return new

In [30]:
def import_csv_files(file):
    """Read a CSV file and convert every row to a cleaned text string.

    All rows (including any header row) are read into memory and then passed
    through ``replace_csv_missing`` in parallel, using one worker process per
    CPU reported by ``multiprocessing.cpu_count()``.

    Args:
        file: path of the comma-delimited CSV file to read.

    Returns:
        A list with one cleaned string per CSV row, in file order. The caller
        is responsible for dropping the header entry.
    """
    # newline='' is what the csv module documentation asks for: it lets the
    # reader handle line endings itself, so newlines embedded in quoted fields
    # are parsed correctly.
    with open(file, 'r', newline='') as csvinfile:
        all_rows = list(csv.reader(csvinfile, delimiter=','))

    # NOTE(review): the worker function is defined in the notebook itself;
    # presumably this relies on a fork-based start method -- confirm on
    # platforms that spawn worker processes.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(replace_csv_missing, all_rows)
    return transformed_rows
    

Import data.

In [31]:
# Load every trial record as one cleaned text string. import_csv_files also
# transforms the CSV header row, so the first element is sliced off here.
search_terms = import_csv_files('../all_trials_text.csv')[1:]  # Discard header row.

In [32]:
# Number of cleaned text records held in search_terms.
len(search_terms)

316684

In [33]:
# Display one cleaned record to sanity-check the preprocessing.
search_terms[8]

' The goal of this clinical research study is to learn if olaparib, when given after treatment with cabazitaxel, carboplatin, and prednisone, can help to control aggressive variant prostate cancer . The safety of these drugs will also be studied. This is an investigational study. Cabazitaxel and carboplatin are FDA approved and commercially available for the treatment of certain types of prostate cancer. Prednisone is FDA approved and commercially available as a corticosteroid. Olaparib is FDA approved and commercially available for the treatment of certain types of ovarian cancer. The combination of cabazitaxel and carboplatin followed by olaparib in this study is investigational. The study doctor can describe how the study drugs are designed to work. Up to 96 participants will be enrolled on this study. All will take part at MD Anderson. Study of Olaparib Maintenance Following Cabazitaxel Carbo in Men With AVPC Prostate Cancer Aggressiveness Carcinoma Prostatic Neoplasms prostate can

In [34]:
# Run the full pipeline (model plus the added abbreviation and UMLS-linking
# components) on the sample record.
doc = nlp(search_terms[8])

In [35]:
# Pick a single detected entity from the sample document so its linked UMLS
# candidates can be examined in the next cell.
entity = doc.ents[8]
entity

aggressive

In [36]:
# Print the UMLS knowledge-base entry for every candidate linked to `entity`.
# Element 0 of each candidate is used as the key into linker.umls.cui_to_entity.
# To inspect every entity in the document instead, wrap this loop in
# `for entity in doc.ents:`.
for ent in entity._.umls_ents:
    print(linker.umls.cui_to_entity[ent[0]])

CUI: C0580822, Name: Aggressive course
Definition: In medicine, describes a tumor or disease that forms, grows, or spreads quickly. It may also describe treatment that is more severe or intense than usual.
TUI(s): T079
Aliases: (total: 2): 
	 Aggressive, aggressive
CUI: C1547300, Name: Precaution Code - Aggressive
Definition: None
TUI(s): T170
Aliases: (total: 1): 
	 Aggressive
CUI: C1548760, Name: Risk Codes - Aggressive
Definition: None
TUI(s): T170
Aliases: (total: 1): 
	 Aggressive


In [37]:
def get_search_terms(index):
    """Return the relative frequency of each entity term in one trial record.

    The record at ``search_terms[index]`` is run through ``nlp``; the
    lower-cased lemma of every detected entity is kept unless it appears in
    ``STOPWORDS``. The record text is printed as a side effect.

    Args:
        index: position of the record in the module-level ``search_terms``.

    Returns:
        A list of ``(term, probability)`` tuples in the order produced by
        ``Series.value_counts(normalize=True)``, with each probability
        rounded to 4 decimal places.
    """
    record_text = search_terms[index]
    parsed = nlp(record_text)

    # Collect the normalised lemma of each entity that is not a stopword.
    kept_lemmas = []
    for entity_span in parsed.ents:
        lemma = entity_span.lemma_.lower()
        if lemma not in STOPWORDS:
            kept_lemmas.append(lemma)

    frequencies = pd.Series(kept_lemmas).value_counts(normalize=True)
    table = pd.DataFrame({
        'terms': frequencies.index,
        'probs': np.round(frequencies.values, 4),
    })

    print(record_text)
    return list(table.itertuples(index=False, name=None))

In [38]:
# Spot-check: print record 20 and list its entity terms with relative frequencies.
get_search_terms(20)

 Electromagnetic field versus diclofenac drugs on primary dysmenorrhea: Arandomized controlled trial in the Egyptian women Electromagnetic Field Versus Diclofenac Drugs on Primary Dysmenorrhea Electromagnetic Field Dysmenorrhea Dysmenorrhea Dysmenorrhea Electromagnetic Field Versus Diclofenac Drugs on Primary Dysmenorrhea: Arandomized Controlled Trial in the Egyptian Women


[('primary dysmenorrhea', 0.2),
 ('electromagnetic field', 0.2),
 ('diclofenac drugs', 0.1333),
 ('arandomized controlled trial', 0.1333),
 ('electromagnetic field dysmenorrhea dysmenorrhea dysmenorrhea', 0.0667),
 ('women', 0.0667),
 ('diclofenac', 0.0667),
 ('egyptian', 0.0667),
 ('egyptian woman', 0.0667)]

In [39]:
# Spot-check: print record 15 and list its entity terms with relative frequencies.
get_search_terms(15)

 A split mouth design study will be performed regarding the use of platelet concentrates on ridge preservation: L PRF vs A PRF vs control. Patient needing multiple teeth extractions in the upper jaw (single rooted teeth) will be recruited. The use of each platelet concentrate or control will be randomized by means of a computer program. The results will be analysed clinical and radiographically . When the subject will choose for implant rehabilitation, a biopsy will be taken in the site of the preserved sockets. The region will be localized with a customized stent, fabricated with the position of the extracted teeth. VAS scales will be provided to evaluate the post operative discomfort. Effect of L PRF and A PRF in Ridge Preservation Atrophic Maxilla Atrophy leucocyte and platelet rich fibrin Atrophy The Use of L PRF and A PRF in Ridge Preservation: a Randomized Controlled Clinical Trial


[('prf', 0.1053),
 ('l prf', 0.0526),
 ('platelet concentrate', 0.0526),
 ('ridge preservation', 0.0526),
 ('single rooted tooth', 0.0263),
 ('radiographically', 0.0263),
 ('atrophic maxilla', 0.0263),
 ('customize', 0.0263),
 ('position', 0.0263),
 ('vas', 0.0263),
 ('post', 0.0263),
 ('tooth', 0.0263),
 ('platelet rich fibrin', 0.0263),
 ('socket', 0.0263),
 ('fabricate', 0.0263),
 ('biopsy', 0.0263),
 ('localized', 0.0263),
 ('site', 0.0263),
 ('multiple tooth', 0.0263),
 ('ridge', 0.0263),
 ('region', 0.0263),
 ('implant rehabilitation', 0.0263),
 ('atrophy', 0.0263),
 ('upper jaw', 0.0263),
 ('split mouth design', 0.0263),
 ('leucocyte', 0.0263),
 ('l', 0.0263),
 ('stent', 0.0263),
 ('result', 0.0263),
 ('randomized controlled clinical trial', 0.0263),
 ('analyse', 0.0263),
 ('computer program', 0.0263)]

In [40]:
# Spot-check: print record 11 and list its entity terms with relative frequencies.
get_search_terms(11)

 This clinical trial will evaluate safety, immunogenicity, and efficacy (prevention of Mtb infection as measured by IGRA conversions) of H56:IC31 in remotely BCG vaccinated adolescents. A Phase II Study of H56:IC31 in Healthy Adolescents Tuberculosis Infection Infection Tuberculosis Latent Tuberculosis Infection A Randomized, Placebo Controlled, Double Blind Phase II Study to Evaluate Safety, Immunogenicity, and Prevention of Infection With Mycobacterium Tuberculosis of H56:IC31 in Healthy Adolescents


[('h56:ic31', 0.1304),
 ('prevention', 0.087),
 ('healthy', 0.087),
 ('infection', 0.087),
 ('adolescent', 0.087),
 ('immunogenicity', 0.087),
 ('mtb infection', 0.0435),
 ('evaluate safety', 0.0435),
 ('igra', 0.0435),
 ('mycobacterium tuberculosis', 0.0435),
 ('remotely bcg', 0.0435),
 ('conversion', 0.0435),
 ('phase ii study', 0.0435),
 ('vaccinated', 0.0435),
 ('double blind phase ii study', 0.0435),
 ('latent tuberculosis', 0.0435)]

In [41]:
# Spot-check: print record 23455 and list its entity terms with relative frequencies.
get_search_terms(23455)

 This study evaluates the utility and reliability of Somnocheck micro Weinmann for obstructive sleep apnea syndrome screening in patients affected by resistant systemic arterial hypertension. Results are compared with a modified portable sleep apnea testing (type III portable monitoring: Somnocheck 2 Weinmann). Screening of Obstructive Sleep Apnea Syndrome in Patients With Resistant Systemic Arterial Hypertension: Pilot Study Sleep Apnea, Obstructive Apnea Sleep Apnea Syndromes Sleep Apnea, Obstructive Hypertension Apnea Screening of Obstructive Sleep Apnea Syndrome in Patients With Resistant Systemic Arterial Hypertension: Pilot Study


[('resistant', 0.1304),
 ('obstructive sleep apnea syndrome', 0.1304),
 ('systemic', 0.1304),
 ('utility', 0.0435),
 ('somnocheck micro', 0.0435),
 ('affect', 0.0435),
 ('screen', 0.0435),
 ('modify', 0.0435),
 ('obstructive apnea sleep apnea syndromes sleep apnea', 0.0435),
 ('pilot study', 0.0435),
 ('pilot study sleep apnea', 0.0435),
 ('arterial hypertension', 0.0435),
 ('obstructive hypertension', 0.0435),
 ('somnocheck 2 weinmann', 0.0435),
 ('type iii portable', 0.0435),
 ('reliability', 0.0435),
 ('apnea screening', 0.0435)]

In [42]:
# Spot-check: print record 31984 and list its entity terms with relative frequencies.
get_search_terms(31984)

 To test the hypothesis that nebulized Pulmicort Respules could reduce post operation pulmonary complication incidence in primary lung cancer patients with COPD. Compare post operation pulmonary complication incidence in two treatment arms among primary lung cancer patients with COPD required single lobectomy from 3days before operation to 7 days after operation. Efficacy of Nebulized Pulmicort Respules in Primary Lung Cancer Patients With COPD Post Operative Complication, Pulmonary Lung Neoplasms Postoperative Complications primary lung cancer patients Lung Neoplasms Efficacy of Nebulized Pulmicort Respules on Post Operation Pulmonary Complication During Pulmonary Single Lobectomy Surgery in Primary Lung Cancer Patients With COPD


[('copd', 0.1111),
 ('primary lung cancer', 0.1111),
 ('post', 0.1111),
 ('operation', 0.0833),
 ('pulmonary complication', 0.0833),
 ('respules', 0.0833),
 ('nebulized pulmicort', 0.0556),
 ('incidence', 0.0556),
 ('pulmonary single lobectomy surgery', 0.0278),
 ('reduce', 0.0278),
 ('nebulize', 0.0278),
 ('single lobectomy', 0.0278),
 ('pulmonary lung neoplasms', 0.0278),
 ('postoperative', 0.0278),
 ('lung neoplasm', 0.0278),
 ('complications', 0.0278),
 ('pulmicort', 0.0278),
 ('hypothesis', 0.0278),
 ('operative complication', 0.0278)]