In [1]:
import os
import requests
import json
import pandas as pd
from pandas import read_csv
import re
from collections import OrderedDict
import pickle
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Functions for fetching relevant metadata

In [2]:
## Pull all Clinical Trials data 

def fetch_src_size():
    pubmeta = requests.get("https://api.outbreak.info/resources/query?q=@type:ClinicalTrial&size=0&aggs=@type")
    pubjson = json.loads(pubmeta.text)
    pubcount = int(pubjson["facets"]["@type"]["total"])
    return(pubcount)


#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    idlist = []
    for eachhit in jsonfile["hits"]:
        if eachhit["_id"] not in idlist:
            idlist.append(eachhit["_id"])
    return(idlist)


#### Ping the API and get all the ids for a clinical trials and scroll through the source until number of ids matches meta
def get_source_ids():
    source_size = fetch_src_size()
    r = requests.get("https://api.outbreak.info/resources/resource/query?q=@type:ClinicalTrial&fields=_id&fetch_all=true")
    response = json.loads(r.text)
    idlist = get_ids_from_json(response)
    try:
        scroll_id = response["_scroll_id"]
        while len(idlist) < source_size:
            r2 = requests.get("https://api.outbreak.info/resources/resource/query?q=@type:ClinicalTrial&fields=_id&fetch_all=true&scroll_id="+scroll_id)
            response2 = json.loads(r2.text)
            idlist2 = set(get_ids_from_json(response2))
            tmpset = set(idlist)
            idlist = list(tmpset.union(idlist2))
            try:
                scroll_id = response2["_scroll_id"]
            except:
                print("no new scroll id")
        return(idlist)
    except:
        return(idlist)
 

In [3]:
#### Get the key metadata for all clinical trials
#### Get the metadata for each list
#### Note, I've tried batches of 1000, and the post request has failed, so this uses a batch size that's less likely to fail
def batch_fetch_clin_meta(idlist):
    ## Break the list of ids into smaller chunks so the API doesn't fail the post request
    runs = round((len(idlist))/100,0)
    i=0 
    separator = ','
    ## Create dummy dataframe to store the meta data
    textdf = pd.DataFrame(columns = ['_id','abstract','trialName','trialDescription',
                                     'designPrimaryPurpose','studyType',
                                     'interventionCategory','interventionName'])
    while i < runs+1:
        if len(idlist)<100:
            sample = idlist
        elif i == 0:
            sample = idlist[i:(i+1)*100]
        elif i == runs:
            sample = idlist[i*100:len(idlist)]
        else:
            sample = idlist[i*100:(i+1)*100]
        sample_ids = separator.join(sample)
        ## Get the text-based metadata (abstract, title) and save it
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,description,interventions,studyDesign'})
        if r.status_code == 200:
            rawresult = json.loads(r.text)
            structuredresult = pd.json_normalize(rawresult)
            structuredresult.drop(columns=['studyDesign.@type','studyDesign.designModel',
                                           'studyDesign.phaseNumber','studyDesign.phase',
                                           'studyDesign.designAllocation','studyDesign.studyDesignText'],inplace=True)
            structuredresult.rename(columns={'name':'trialName', 'description':'trialDescription',
                                             'studyDesign.designPrimaryPurpose':'designPrimaryPurpose',
                                             'studyDesign.studyType':'studyType'},inplace=True)
            exploded = structuredresult.explode('interventions')
            no_interventions = exploded.loc[exploded['interventions'].isna()].copy()
            no_interventions_clean = no_interventions[['_id','abstract','trialName','trialDescription',
                                                       'designPrimaryPurpose','studyType']].copy()
            has_interventions = exploded.loc[~exploded['interventions'].isna()].copy()
            interventions = pd.concat([has_interventions.drop(['interventions'], axis=1), has_interventions['interventions'].apply(pd.Series)], axis=1)
            clean_interventions = interventions[['_id','abstract','trialName','trialDescription',
                                                 'designPrimaryPurpose','studyType',
                                                 'category','name']].copy()
            clean_interventions.rename(columns={'name':'interventionName','category':'interventionCategory'},inplace=True)
            textdf = pd.concat((textdf,clean_interventions,no_interventions_clean),ignore_index=True)
        i=i+1
    textdf.rename(columns={'trialName':'name','trialDescription':'description'},inplace=True)
    return(textdf)
        

## Functions for transforming the metadata

In [4]:
#### Merge text from the name, abstract, and description
#### Clean up up the text

def merge_texts(df):
    df.fillna('',inplace=True)
    df['text'] = df['name'].astype(str).str.cat(df['abstract'].astype(str).str.cat(df['description'],sep=' '),sep=' ')
    df['text'] = df['text'].str.replace(r'\W', ' ')
    df['text'] = df['text'].str.replace(r'\s+[a-zA-Z]\s+', ' ')
    df['text'] = df['text'].str.replace(r'\^[a-zA-Z]\s+', ' ')
    df['text'] = df['text'].str.lower()   
    return(df)
    

## Functions for training a classifier

In [5]:
def generate_training_df(pos_id,maybe_neg_id,original_data):
    neg_id = [item  for item in maybe_neg if item not in pos_id]
    original_data = merge_texts(original_data)
    training_set_pos = original_data[['_id','text']].loc[(original_data['_id'].isin(pos_id))&
                                                         (~original_data['text'].isna())]
    training_set_pos['target']='in category'
    training_set_neg = original_data[['_id','text']].loc[(original_data['_id'].isin(neg_id))&
                                                         (~original_data['text'].isna())]
    training_set_neg['target']='not in category'
    training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)
    return(training_set)


### Fetch metadata

In [6]:
%%time
idlist = get_source_ids()
print(len(idlist))

9918
Wall time: 8.96 s


In [7]:
%%time
clin_meta = batch_fetch_clin_meta(idlist)
print(len(clin_meta))

14811
Wall time: 1min 30s


In [8]:
%%time
clin_meta = merge_texts(clin_meta)

Wall time: 1.86 s


### Split clinical trials up into specific topicCategories based on available Meta data
The clinical trials meta contains enough information to readily classify the records into broad topicCategories, but requires more work in order to map the data to specific topicCategories
Fields which contain valuable sorting information include:
1. designPrimaryPurpose
2. intervention category
3. intervention name (but this one is highly variable, and will need a lot of work to map)

#### Mapping the records based on designPrimaryPurpose

In [9]:
#print(clin_meta['designPrimaryPurpose'].unique().tolist())
designpurpose = clin_meta.groupby('designPrimaryPurpose').size().reset_index(name='counts')
designpurpose.sort_values('counts',ascending=False,inplace=True)
print(designpurpose)

                                 designPrimaryPurpose  counts
0                                                        7623
22                                          treatment    4342
13                                         prevention    1315
12                                              other     363
20                                    supportive care     331
5                                          diagnostic     249
10                           health services research     248
2                                       basic science     104
19                                          screening      91
17                                          prognosis      46
11                                    natural history      22
1                  basic research/physiological study      19
6                                  education/guidance      14
3                                  device feasibility      10
18                                       psychosocial       9
9       

The categories above may reasonably be mapped as follows:
* treatment: Treatment
* prevention: Prevention
* ~other~
* supportive care: Medical Care, Behavioral Research
* diagnostic: Diagnosis
* health services research: Medical Care
* ~basic science~
* screening: Diagnosis
* prognosis: Prognosis (should be a subcategory for Treatment)
* natural history: Case Descriptions
* ~basic research/physiological study~
* education/guidance: Behavioral Research
* ~device feasibility~
* psychosocial: Behavioral Research
* ~health service research~
* ~educational / counselling / training~

In [10]:
print(clin_meta.loc[clin_meta['designPrimaryPurpose']=='screening'].head(n=2))

              _id                                           abstract  \
1055  NCT04710316  The new coronavirus known as SARS-Cov-2 (sever...   
1056  NCT04710316  The new coronavirus known as SARS-Cov-2 (sever...   

                                                   name description  \
1055  Étude de l'épidémie de SARS-CoV-2 Dans Les Ser...               
1056  Étude de l'épidémie de SARS-CoV-2 Dans Les Ser...               

     designPrimaryPurpose       studyType interventionCategory  \
1055            screening  interventional      diagnostic test   
1056            screening  interventional      diagnostic test   

                               interventionName  \
1055  SARS-CoV-2 screening by molecular biology   
1056                      Serological screening   

                                                   text  
1055  étude de épidémie de sars cov 2 dans les servi...  
1056  étude de épidémie de sars cov 2 dans les servi...  


#### Export the records which cannot be mapped based on designPrimaryPurpose
Check if they have an intervention category. If they don't have any of these, they are candidates to be classified by the algorithm

In [22]:
DATAPATH = 'data/'
nopurpose = clin_meta.loc[clin_meta['designPrimaryPurpose']==""]
## Note, the interventionCategory is used to identify drugs and supplements downstream
## Hence, even if some entries have drugs under interventionName, they may be neglected 
no_int = nopurpose.loc[nopurpose['interventionCategory']==""]
with open(os.path.join(DATAPATH,'blank_entries.pickle'),'wb') as uncategorized:
    pickle.dump(no_int,uncategorized)

#### Mapping the records based on intervention category
The intervention categories are primarily used only in NCT records and generally fall under the following categories which can potentially be mapped as follows:
* 'biological': Biologics
* 'diagnostic test': Diagnosis
    * Note that the Intervention name can give insight into subcategories for Diagnosis
* 'drug': Treatment
    * This can be subdivided into Pharmaceutical Treatments or Repurposing
    * To determine which, query the drug in Wikidata and if it has an NDF-RT ID it's FDA approved, therefore Repurpose
    * Note that NDF-RT is obsolete, so if it was FDA approved after the fact, it will not appear as such
* 'behavioral': Behavioral Research
* ~'other'~
* ~'combination product'~ (some standard of care, some pharmaceutical
* ~'device'~
* 'procedure': Medical Care
* 'dietary supplement': Repurposing
* 'radiation': Medical Care
* 'genetic': Host Factors

#### Generate keyword lists based on term frequencies for interventionName in various categories for subclassification

In [10]:
## Diagnostics subclassifiecation
diagnostics = clin_meta[['_id','interventionName']].loc[((clin_meta['interventionCategory'].astype(str).str.contains('diagnostic test'))|
                                                         (clin_meta['designPrimaryPurpose'].astype(str).str.contains('screening'))|
                                                         (clin_meta['designPrimaryPurpose'].astype(str).str.contains('diagnosis')))]
diag_word_freq = diagnostics.groupby('interventionName').size().reset_index(name='counts')
diag_word_freq.sort_values('counts',ascending=False,inplace=True)
print(diag_word_freq)

                                      interventionName  counts
287                                    Lung ultrasound       8
318                                Nasopharyngeal swab       8
15                             Anti-SARS-CoV2 Serology       4
87                                     COVID-19 RT-PCR       4
47                                        Blood sample       4
..                                                 ...     ...
234                               Heterologous stimuli       1
235  Home Sleep Apnea Testing or In-hospital Polyso...       1
236                  Hospital Anxiety Depression Scale       1
237                           Human biological samples       1
672                              visual analogue scale       1

[673 rows x 2 columns]


In [11]:
## Prevention subclassification
prevention = clin_meta.loc[clin_meta['designPrimaryPurpose']=='prevention']
prev_word_freq = prevention.groupby('interventionName').size().reset_index(name='counts')
prev_word_freq.sort_values('counts',ascending=False,inplace=True)
print(prev_word_freq.head(n=4))
#print(prev_word_freq['interventionName'].tolist())

        interventionName  counts
581              Placebo     116
361   Hydroxychloroquine      27
604  Placebo oral tablet      11
893            mRNA-1273       8


In [8]:
#### Create the maps and search terms

intervention_map={'genetic':'Host Factors',
                  'biological':'Biologics',
                  'behavioral':'Behavioral Research',
                  'radiation':'Medical Care',
                  'procedure': 'Medical Care',
                  'dietary supplement': 'Repurposing',
                  'diagnostic test': 'Diagnosis'}


diagnostickeywords = {'Pathology/Radiology':['graphy','ultrasound','ECG','Pulmonary Function Test','Spirometry','biopsy'],
                      'Rapid Diagnostics':['rapid','Rapid'],
                      'Virus Detection':['RT-PCR','PCR'],
                      #'Antibody Detection':['IgG','IgM','IgE','IgA','antibod','Antibod','antigen','Antigen','ELISA','ELISPOT'],
                      'Antibody Detection':['antibod','Antibod','antigen','Anti-SARS-CoV2','Antigen','ELISA','ELISPOT'],
                      'Symptoms':['symptom','clinical sign','presenting with','clinical presentation']}

treatmentkeywords = {'Vaccines':['vaccin','Vaccin','inactivated virus'],
                     'Medical Care':['Ventilat','ventilat','standard of care','soc','s.o.c.']}

preventionkeywords = {'Public Health Interventions':['policy','travel restriction','lockdown','quarantine','campaign','closures'],
                      'Individual Prevention':['counsel','training','education','awareness','PPE','face mask','face covering','device'],
                      'Vaccines':['vaccin','Vaccin','inactivated virus']}

designpurposemap = {'treatment': 'Treatment',
                    'prevention': 'Prevention',
                    'diagnostic': 'Diagnosis',
                    'health services research': 'Medical Care',
                    'screening': 'Diagnosis',
                    'natural history': 'Case Descriptions',
                    'education/guidance': 'Behavioral Research',
                    'psychosocial': 'Behavioral Research'}

#### Potential use of combinations to describe subcategories
combi_cats = {"Individual Prevention":{'designPrimaryPurpose':'prevention','interventionCategory':'device'}}

#### Potential use of single cats to describe combi cats
multi_cats = {'supportive care': ['Medical Care','Behavioral Research']}

## Apply the mappings

### Clean up the drugs for classification
Sort drugs as Repurposed (if FDA approved), or pharmaceutical treatment (if it's a chemical compound but not a medicine or pharmaceutical drug)

In Wikidata:
* Chemical compound: Q11173
* medication: Q12140
* essential medicine: Q35456
* pharmaceutical product: Q28885102
* drug: Q8386

Instances of medication, essential medicine, and pharmaceutical product will be classified as Repurpose
Instances of Chemical compound not classified as these others will be considered pharmaceutical treatment

In [19]:
## Remove stopwords and other text that may cause issues when searching Wikidata results
drug_stopwords = {" Oral Tablet":"",
                  " oral tablet":"",
                  " oral capsule":"",
                  " Oral Product":"",
                  " For Injection":"",
                  " Administration":"",
                  " Nasal Spray and Gargle":"",
                  " Inhalation Solution":"",
                  " Injectable Solution":"",
                  "  - Weekly Dosing":"",
                  "Single Dose of ":"",
                  " twice a day":"",
                  " Regular dose":"",
                  " Film Tablets":""}
general_stopwords = {" Tablet":"",
                     " tablet":"",
                     " inhalation":"",
                     " intravenous":"",
                     " injection":"",
                     " Injection":"",
                     " pill":"",
                     " gas":"",
                     " comparator":""}
pharma_amounts = r"((?:\d{1,3}|0\.\d{1,3})\s(?:(?:MG/ML)|(?:mg/mL)|mg|MG|Mg))"
odd_fractions = r"(/((?:\d{1,3}|0\.\d{1,3})\s(?:mL|ML|Ml)|(?:KG|kg)))"

#print(clin_meta['interventionCategory'].unique().tolist())

drugs = clin_meta[['_id','interventionName','text']].loc[clin_meta['interventionCategory'].astype(str).str.contains('drug')].copy()
drugs['interventionName'] = drugs['interventionName'].replace(drug_stopwords,regex=True)
drugs['interventionName'] = drugs['interventionName'].replace(general_stopwords,regex=True)
drugs['interventionName'] = drugs['interventionName'].str.replace(pharma_amounts,"",regex=True)
drugs['interventionName'] = drugs['interventionName'].str.replace(odd_fractions,"",regex=True)
drugs['interventionName'] = drugs['interventionName'].str.replace(' / ','/')
drugs['interventionName'] = drugs['interventionName'].str.strip()
drug_word_freq = drugs.groupby('interventionName').size().reset_index(name='counts')
drug_word_freq.sort_values('counts',ascending=False,inplace=True)
drugfreq = drug_word_freq.loc[drug_word_freq['counts']>1].copy()
druglist = drugfreq['interventionName'].unique().tolist()

In [None]:
print(druglist)

In [11]:
def parse_wikidata(data):
    tmplist = []
    for item in data['results']['bindings']:
        try:
            tmplist.append(OrderedDict({
            'wdid':item['item']['value'].replace('http://www.wikidata.org/entity/',''),
            'drug_name': item['itemLabel']['value'],
            'name': item['itemLabel']['value'].lower(),
            'alias': "None"}))
            tmp= item['itemAltLabel']['value'].split(',')
            for altname in tmp:
                if len(altname.strip())>3:
                    tmplist.append(OrderedDict({
                    'wdid':item['item']['value'].replace('http://www.wikidata.org/entity/',''),
                    'drug_name': item['itemLabel']['value'],
                    'name': item['itemLabel']['value'].lower(),
                    'alias': altname.strip().lower()
                    }))
        except:
            tmplist.append(OrderedDict({
            'wdid':item['item']['value'].replace('http://www.wikidata.org/entity/',''),
            'drug_name': item['itemLabel']['value'],
            'name': item['itemLabel']['value'].lower(),
            'alias': "None"
            }))
    tmpdf = pd.DataFrame(tmplist)
    return(tmpdf)
        
def get_wd_drugs(): 
    repurposetypes = ['Q12140', 'Q35456', 'Q28885102','Q8386']
    url = 'https://query.wikidata.org/sparql'
    querystart = """
    SELECT
      ?item ?itemLabel ?itemAltLabel
      ?value 
    WHERE 
    {
      ?item wdt:P31 wd:"""
    queryend = """.        
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    """
    repurpose = pd.DataFrame(columns=['wdid','drug_name','name','alias'])
    for eachwdid in repurposetypes:
        query = querystart+eachwdid+queryend
        r = requests.get(url, params = {'format': 'json', 'query': query})
        data = r.json()
        tmpdf = parse_wikidata(data)
        repurpose = pd.concat((repurpose,tmpdf),ignore_index=True)
    repurpose.drop_duplicates(keep='first',inplace=True)
    return(repurpose)

Note that originally, the script also did a Wikidata query for instance of chemical compounds, but there are so many in Wikidata that the query will time out. However, if a clinical trial is classified as a drug trial, it should generally fall under either repurposing or pharmaceutical intervention depending on whether or not the drug is novel. For this reason, we will define repurposing based on matches in Wikidata and define pharmaceutical based on exclusion from repurposing.

In [12]:
%%time
## Batch fetch names, aliases, and wdid's from Wikidata for medications, pharma products, etc.3
repurposedf = get_wd_drugs()

Wall time: 12.8 s


In [13]:
print(len(repurposedf))
print(repurposedf.head(n=5))

30623
   wdid drug_name     name              alias
0  Q153   ethanol  ethanol               None
1  Q153   ethanol  ethanol             spirit
2  Q153   ethanol  ethanol             tecsol
3  Q153   ethanol  ethanol            alcohol
4  Q153   ethanol  ethanol  denatured alcohol


In [16]:
all_drugs = list(set(repurposedf['name'].unique().tolist()).union(set(repurposedf['alias'].unique().tolist())))
druglist_lower = [x.lower() for x in druglist]
all_drugs_lower = [x.lower() for x in all_drugs]
in_common = list(set(druglist_lower).intersection(set(all_drugs_lower)))
missing = [x for x in druglist_lower if x not in in_common]
print(missing)

['placebo', 'remdesivir', 'hydroxychloroquine sulfate', 'placebos', 'standard of care', 'standard of care', 'camostat mesilate', 'clazakizumab', 'das181', 'standard medical treatment', 'convalescent plasma', 'standard treatment', 'placebo', 'hydroxychloroquine (hcq)', 'opaganib', 'oxygen', 'zinc', 'hcq', 'normal saline', 'dwrx2003', 'colchicines', 'ly3819253', 'corticosteroid', 'rls-0071', 'mavrilimumab', 'regn10933+regn10987 combination therapy', 'placebo (normal saline solution)', 'sng001', 'ivermectins', 'leronlimab (700mg)', 'nitric oxide gas', 'cannabidiol', 'normal saline', 'soc', 'ace inhibitor', 'apixaban 2.', 'chloroquine or hydroxychloroquine', 'control', 'suspension of heat killed (autoclaved) mycobacterium w', 'slv213', 'vitamin d', 'eidd-2801', 'brii-196 and brii-198', 'fisetin', 'proxalutamide', 'pulmozyme', 'azd7442', 'hzvsf-v13', 'snpp protoporphyrin plus sunlight exposure', 'bamlanivimab', 'bcg vaccine', 'lopinavir/ ritonavir', 'selinexor', 'quercetin', 'aerosolized 13

In [22]:
DATAPATH = 'data/topicCategories/'

repurpose_cts = drugs['_id'].loc[drugs['interventionName'].astype(str).str.lower().isin(in_common)].unique().tolist()
drug_repurposing = clin_meta.loc[(clin_meta['interventionCategory']=='dietary supplement')|
                                 (clin_meta['_id'].isin(repurpose_cts))]
pharma_cts = drugs['_id'].loc[~drugs['_id'].isin(repurpose_cts)].unique().tolist()
pharma_info = clin_meta.loc[clin_meta['_id'].isin(pharma_cts)]
print(len(repurpose_cts))
print(len(pharma_cts))
with open(os.path.join(DATAPATH,'Repurposing.pickle'),'wb') as dumpfile:
    pickle.dump(drug_repurposing,dumpfile)
with open(os.path.join(DATAPATH,'Pharmaceutical Treatments.pickle'),'wb') as dumpfile:
    pickle.dump(pharma_info,dumpfile)

580
1060


### Use the mappings to generate datasets, ignoring drug and other broad categories 
which may require further downstream processing

In [26]:
for eachintervention in intervention_map.keys():
    tmpdf = clin_meta.loc[clin_meta['interventionCategory'].astype(str).str.contains(eachintervention)]
    with open(os.path.join(DATAPATH,intervention_map[eachintervention]+'.pickle'),'wb') as outpath:
        pickle.dump(tmpdf,outpath)

## Do the same for design purpose
for eachpurpose in designpurposemap.keys():
    tmpdf = clin_meta.loc[clin_meta['designPrimaryPurpose'].astype(str).str.contains(eachpurpose)]
    try:
        originaldf = pickle.load(open(os.path.join(DATAPATH,designpurposemap[eachpurpose]+'.pickle'),'rb'))
        combidf = pd.concat((originaldf,tmpdf),ignore_index=True)
        combi.drop_duplicates(keep="first",inplace=True)
    except:
        combidf = tmpdf
    with open(os.path.join(DATAPATH,designpurposemap[eachpurpose]+'.pickle'),'wb') as outpath:
        pickle.dump(combidf,outpath)

### Apply diagnostic keywords to subclassify diagnosis
1. Use the designpurposemap and the intervention_map to identify CT's that should be classified as 'Diagnosis'
2. Use the diagnostickeywords to search through the intervention names, or descriptions to further categorize the results

In [28]:
diagnosis = clin_meta.loc[((clin_meta['interventionCategory'].astype(str).str.contains('diagnostic test'))|
                          (clin_meta['designPrimaryPurpose'].astype(str).str.contains('diagnostic'))|
                          (clin_meta['designPrimaryPurpose'].astype(str).str.contains('screening')))].copy()
print(diagnosis.head(n=2))

            _id                                           abstract  \
11  NCT04648709  Current data in the literature demonstrate tha...   
12  NCT04648709  Current data in the literature demonstrate tha...   

                                                 name description  \
11  Evaluation and Longitudinal Follow-up of Bioma...               
12  Evaluation and Longitudinal Follow-up of Bioma...               

   designPrimaryPurpose      studyType interventionCategory interventionName  \
11                       observational      diagnostic test          ELISPOT   
12                       observational      diagnostic test      QUANTIFERON   

                                                 text  
11  evaluation and longitudinal follow up of bioma...  
12  evaluation and longitudinal follow up of bioma...  


In [29]:
for eachdiag in diagnostickeywords.keys():
    keywordlist = diagnostickeywords[eachdiag]
    topicCategory = eachdiag
    searchregex = re.compile('|'.join(keywordlist), re.IGNORECASE)
    tmpdf = diagnosis.loc[((diagnosis['interventionName'].str.contains(searchregex))|
                          (diagnosis['text'].str.contains(searchregex)))]
    print(eachdiag,': ',len(tmpdf))
    with open(os.path.join(DATAPATH,eachdiag.replace('/','_')+'.pickle'),'wb') as outpath:
        pickle.dump(tmpdf,outpath)        

Pathology/Radiology :  173
Rapid Diagnostics :  197
Virus Detection :  252
Antibody Detection :  239
Symptoms :  369


### Apply treatment keywords to subclassify Treatments
Note that drugs are classified separately in Clinical Trials and so will be handled separately. Additionally, biologics are also separated out. 

In [30]:
alltreatment = clin_meta.loc[(clin_meta['designPrimaryPurpose'].astype(str).str.contains('treatment'))]
treatment = alltreatment.loc[((alltreatment['interventionCategory']!='drug')&
                              (alltreatment['interventionCategory']!='biological')&
                              (alltreatment['interventionCategory']!='genetic'))].copy()

In [31]:
for eachtreat in treatmentkeywords.keys():
    keywordlist = treatmentkeywords[eachtreat]
    topicCategory = eachtreat
    searchregex = re.compile('|'.join(keywordlist), re.IGNORECASE)
    tmpdf = treatment.loc[((treatment['interventionName'].str.contains(searchregex))|
                          (treatment['text'].str.contains(searchregex)))]
    print(eachtreat,': ',len(tmpdf))
    try:
        originaldf = pickle.load(open(os.path.join(DATAPATH,eachtreat+'.pickle'),'rb'))
        combidf = pd.concat((originaldf,tmpdf),ignore_index=True)
        combi.drop_duplicates(keep="first",inplace=True)
    except:
        combidf = tmpdf

    with open(os.path.join(DATAPATH,eachtreat+'.pickle'),'wb') as outpath:
        pickle.dump(combidf,outpath)
    combidf.to_csv(os.path.join(DATAPATH,eachtreat+'.tsv'),sep='\t',header=True)

Vaccines :  66
Medical Care :  693


### Subclassify Preventions
1. Get frequencies of Intervention names listed under prevention
2. Map frequent terms to prevention subcategories
Note that vaccines are considered preventative, so include search for vaccines. Screening/early diagnosis can also be considered a preventative strategy

In [32]:
prevention = clin_meta.loc[clin_meta['designPrimaryPurpose'].astype(str).str.contains('prevention')].copy()
print(prevention.head(n=2))
print(len(prevention))

            _id                                           abstract  \
19  NCT04760743  This study is to assess the safety, reactogeni...   
20  NCT04760743  This study is to assess the safety, reactogeni...   

                                                 name  \
19  A Phase I, Placebo-controlled, Randomized, Obs...   
20  A Phase I, Placebo-controlled, Randomized, Obs...   

                                          description designPrimaryPurpose  \
19  This is a first-in-human, Phase I, randomized,...           prevention   
20  This is a first-in-human, Phase I, randomized,...           prevention   

         studyType interventionCategory  \
19  interventional           biological   
20  interventional           biological   

                                interventionName  \
19  NBP2001 adjuvanted with alum (RBD 30μg/dose)   
20  NBP2001 adjuvanted with alum (RBD 50μg/dose)   

                                                 text  
19  a phase placebo controlled  randomi

In [46]:
print(prevention['interventionName'].loc[prevention['interventionCategory']=='device'].unique().tolist())

['WARD CSS', 'High volume evacuation (HVE)', 'Extraoral vacuum aspirator (EVA)', 'External evacuation device (EED)', 'COVID-19 barrier box', 'nasal spray', 'Extubation Advisor', 'CELLECTRA® 2000', 'UVC Irradiation', 'Fit test', 'Tensile strength', 'Cliniporator', 'N95 mask', 'Nordell single E-100 layer mask', 'Nordell double E-100 layer mask', 'non-contact magnetically-controlled capsule endoscopy', 'Medical Mask', 'N95 respirator', 'mouthrinse with bêta-cyclodextrin and citrox', 'mouthrinse without bêta-cyclodextrin and citrox', 'Biocontainment Device For Aerosol Generating Procedures (Biobox)', 'Control for aerosol generating procedures', 'Personal protective equipment', 'Cliniporator® and EPSGun', 'FFP2', 'Facial mask', 'MFS', 'V-CAMS (aka Jaspr)', 'tight-fitting of KF94 mask with clip', 'PEP flute', 'Threshold IMT device', 'Filtration Test', 'VESTA respirator', 'Conventional N95 respirator', 'Face mask', 'Inspiratory training device', 'Expiratory training device', 'Face Mask + Soap

In [33]:
for eachprevent in preventionkeywords.keys():
    keywordlist = preventionkeywords[eachprevent]
    topicCategory = eachprevent
    searchregex = re.compile('|'.join(keywordlist), re.IGNORECASE)
    tmpdf = prevention.loc[((prevention['interventionName'].str.contains(searchregex))|
                            (prevention['text'].str.contains(searchregex)))]
    print(eachprevent,': ',len(tmpdf))
    try:
        originaldf = pickle.load(open(os.path.join(DATAPATH,eachprevent+'.pickle'),'rb'))
        combidf = pd.concat((originaldf,tmpdf),ignore_index=True)
        combidf.drop_duplicates(keep="first",inplace=True)
    except:
        combidf = tmpdf

    with open(os.path.join(DATAPATH,eachprevent+'.pickle'),'wb') as outpath:
        pickle.dump(combidf,outpath)

Public Health Interventions :  78
Individual Prevention :  315
Vaccines :  613


In [34]:
## Take into account combi maps
individual_prevention = prevention.loc[prevention['interventionCategory'].astype(str).str.contains('device')]
originaldf = pickle.load(open(os.path.join(DATAPATH,'Individual Prevention.pickle'),'rb'))
combidf = pd.concat((originaldf,individual_prevention),ignore_index=True)
combidf.drop_duplicates(keep="first",inplace=True)
with open(os.path.join(DATAPATH,'Individual Prevention.pickle'),'wb') as outpath:
    pickle.dump(combidf,outpath)

In [None]:
#### Basic process
## Create binary training set using Behavioral as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)  

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)
    
pos_id = list(set(behavioral).union(set(traindf['_id'].loc[traindf['topicCategory']=='Behavioral Research'].tolist())))
maybe_neg = list(set(traindf['_id'].loc[traindf['topicCategory']!='Behavioral Research'].tolist()).union(
                 set(diagnosis).union(set(prevention).union(set(treatment)))))

neg_id = [item  for item in maybe_neg if item not in pos_id]
original_data = merge_texts(original_data)
training_set_pos = original_data[['_id','text']].loc[(original_data['_id'].isin(pos_id))&
                                                     (~original_data['text'].isna())]
    
training_set_pos['target']='behavioral'
training_set_neg = original_data[['_id','text']].loc[(original_data['_id'].isin(neg_id))&
                                                     (~original_data['text'].isna())]
training_set_neg['target']='not behavioral'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Diagnosis as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)  

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

pos_id = list(set(diagnosis).union(set(traindf['_id'].loc[traindf['topicCategory']=='Diagnosis'].tolist())))
maybe_neg = list(set(traindf['_id'].loc[traindf['topicCategory']!='Diagnosis'].tolist()).union(
                 set(treatment).union(set(prevention))))

neg_id = [item  for item in maybe_neg if item not in pos_id]
original_data = merge_texts(original_data)
training_set_pos = original_data[['_id','text']].loc[(original_data['_id'].isin(pos_id))&
                                                     (~original_data['text'].isna())]
    
training_set_pos['target']='diagnosis'
training_set_neg = original_data[['_id','text']].loc[(original_data['_id'].isin(neg_id))&
                                                     (~original_data['text'].isna())]
training_set_neg['target']='not diagnosis'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Prevention as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)  

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)
    
pos_id = list(set(prevention).union(set(traindf['_id'].loc[(traindf['topicCategory']=='Prevention')|
                                                           (traindf['topicCategory']=='Individual Prevention')].tolist())))
maybe_neg = list(set(traindf['_id'].loc[((traindf['topicCategory']!='Prevention')&
                                        (traindf['topicCategory']!='Individual Prevention'))].tolist()).union(
                 set(treatment).union(set(diagnosis))))

neg_id = [item  for item in maybe_neg if item not in pos_id]
original_data = merge_texts(original_data)
training_set_pos = original_data[['_id','text']].loc[(original_data['_id'].isin(pos_id))&
                                                     (~original_data['text'].isna())]
    
training_set_pos['target']='prevention'
training_set_neg = original_data[['_id','text']].loc[(original_data['_id'].isin(neg_id))&
                                                     (~original_data['text'].isna())]
training_set_neg['target']='not prevention'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Treatment as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)  

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

with open('data/basic science.txt','rb') as dmpfile:
    basic_science = pickle.load(dmpfile)

with open('data/observational.txt','rb') as dmpfile:
    observation = pickle.load(dmpfile)

    
pos_id = list(set(treatment).union(set(traindf['_id'].loc[((traindf['topicCategory']=='Treatment')|
                                                           (traindf['topicCategory']=='Vaccines')|
                                                           (traindf['topicCategory']=='Biologics')|
                                                           (traindf['topicCategory']=='Medical Care'))].tolist()))-set(basic_science)-set(observation))
maybe_neg = list(set(traindf['_id'].loc[((traindf['topicCategory']!='Treatment')&
                                         (traindf['topicCategory']!='Vaccines')&
                                         (traindf['topicCategory']!='Biologics')&
                                         (traindf['topicCategory']!='Medical Care'))].tolist()).union(
                 set(prevention).union(set(diagnosis).union(set(basic_science).union(set(observation))))))

neg_id = [item  for item in maybe_neg if item not in pos_id]
original_data = merge_texts(original_data)
training_set_pos = original_data[['_id','text']].loc[(original_data['_id'].isin(pos_id))&
                                                     (~original_data['text'].isna())]
    
training_set_pos['target']='treatment'
training_set_neg = original_data[['_id','text']].loc[(original_data['_id'].isin(neg_id))&
                                                     (~original_data['text'].isna())]
training_set_neg['target']='not treatment'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

print(len(traindf))
print(traindf.groupby('topicCategory').size())
print(len(traindf['_id'].loc[((traindf['topicCategory']!='Treatment')&
                                         (traindf['topicCategory']!='Vaccines')&
                                         (traindf['topicCategory']!='Biologics')&
                                         (traindf['topicCategory']!='Medical Care'))]))

print(len(traindf['_id'].loc[((traindf['topicCategory']=='Treatment')|
                                                           (traindf['topicCategory']=='Vaccines')|
                                                           (traindf['topicCategory']=='Biologics')|
                                                           (traindf['topicCategory']=='Medical Care'))]))

In [None]:
print(len(training_set_pos))
print(len(training_set_neg))

In [None]:
####Vectorize the text for classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['text'])
features = vectorizer.get_feature_names()
print(X.shape)

#### Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, training_set.target, test_size=0.2, random_state=0)

In [None]:
#### Classify training text as in category or not in category
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

print(confusion_matrix(y_test,y_pred))

report = classification_report(y_test,y_pred,output_dict=True)
print(pd.DataFrame(report))

probs = classifier.predict_proba(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
print(auc)
print('[[true neg     false pos]]')
print('[[false neg     true pos]]')

## Expand beyond training sets

In [None]:
#### Run the classifier on the entire training dataset without splitting
####Vectorize the training set for classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['text'])
features = vectorizer.get_feature_names()
print(X.shape)

## Save the vectorizer
#vectorizerfile = "results/models/behavioral_vectorizer.pickle"
#vectorizerfile = "results/models/diagnosis_vectorizer.pickle"
#vectorizerfile = "results/models/prevention_vectorizer.pickle"
vectorizerfile = "results/models/treatment_vectorizer.pickle"
pickle.dump(vectorizer, open(vectorizerfile, "wb"))

#### train the model on all the data
classifier = RandomForestClassifier(n_estimators=1000, random_state=None)
classifier.fit(X, training_set.target)

## Save the Model
#filename = 'results/models/behavioral_randomforest.sav'
#filename = 'results/models/diagnosis_randomforest.sav'
#filename = 'results/models/prevention_randomforest.sav'
filename = 'results/models/treatment_randomforest.sav'
pickle.dump(classifier, open(filename, 'wb'))


In [None]:
## Load the saved models
loaded_model = pickle.load(open(filename, 'rb'))


In [None]:
#### Apply the model
## Pull out all the data 
idlist = get_source_ids()
textdf = batch_fetch_meta(idlist)
textdf = merge_texts(textdf)

## Remove records with no text
nonan = textdf.loc[((~textdf['text'].isna())&
                   (textdf['text'].str.len()>5))]

print(len(idlist), len(nonan))

In [None]:

## Vectorize the text based on the previously trained vectorizer and run the classifier
labels = nonan['_id']
M = vectorizer.transform(nonan['text'])
prediction = classifier.predict(M)

## Save the results
#classifier_results = pd.DataFrame(list(zip(labels, prediction)), columns =['_id', 'behavioral_prediction'])
#print(classifier_results.head(n=2))
#classifier_results.to_csv('results/predictions/behavioral_randomforest.tsv',
#                          sep='\t', header=True)

#classifier_results = pd.DataFrame(list(zip(labels, prediction)), columns =['_id', 'diagnosis_prediction'])
#print(classifier_results.head(n=2))
#classifier_results.to_csv('results/predictions/diagnosis_randomforest.tsv',
#                          sep='\t', header=True)

#classifier_results = pd.DataFrame(list(zip(labels, prediction)), columns =['_id', 'prevention_prediction'])
#print(classifier_results.head(n=2))
#classifier_results.to_csv('results/predictions/prevention_randomforest.tsv',
#                          sep='\t', header=True)

classifier_results = pd.DataFrame(list(zip(labels, prediction)), columns =['_id', 'treatment_prediction'])
print(classifier_results.head(n=2))
classifier_results.to_csv('results/predictions/treatment_randomforest.tsv',
                          sep='\t', header=True)

In [None]:
inspect_results = classifier_results.groupby('treatment_prediction').size().reset_index(name='counts')
print(inspect_results)