# Subtopics classifier for multiple resource types

Because there are over 25 'subcategories' in the outbreak.info topicCategories, it is difficult to automatically create a large enough training dataset for each of them.  For this reason, it is necessary to pool the data from curate.outbreak.info, the clinical trials classifier, and LitCovid (as identified via keyword mapping) in order to generate a large enough training dataset.

### Import the necessary libraries and generate the datapaths

In [1]:
import os
import pathlib
import pickle
import pandas as pd
from pandas import read_csv

from src.classify_pubs import *
from src.common import load_classifiers

#### MAIN
# `__file__` is not defined inside a notebook, so every location is derived
# from the current working directory. (When this code runs as a script, use
# pathlib.Path(__file__).parent for script_path and .parents[1] for general_path.)
script_path = os.path.abspath('')
general_path = os.path.abspath(os.path.join(os.getcwd(),"../"))

# Working folders, all located directly under the notebook's directory.
DATAPATH, RESULTSPATH, MODELPATH, PREDICTPATH = (
    os.path.join(script_path, folder)
    for folder in ('data/', 'results/', 'models/', 'predictions/')
)

# Tab-separated topic tables shipped in the data folder.
littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(DATAPATH,'othertopics.tsv')
litsubtopicsfile = os.path.join(DATAPATH,'subtopics.tsv')

littopicsdf = read_csv(littopicsfile, sep='\t', header=0, index_col=0)
offtopicsdf = read_csv(offtopicsfile, sep='\t', header=0, index_col=0)
litsubtopics = read_csv(litsubtopicsfile, sep='\t', header=0, index_col=0)

# Distinct topicCategory labels found in the two topic tables; later cells
# treat any label that is NOT in this list as a subtopic.
topicsdf = pd.concat([littopicsdf, offtopicsdf], ignore_index=True)
topiclist = pd.unique(topicsdf['topicCategory']).tolist()

### Pull the data from litcovid keyword mapping

In [2]:
# Load the LitCovid keyword-mapping table. pickle.load can execute arbitrary
# code, so this file must come from a trusted source.
litkey_path = os.path.join(DATAPATH,'updated_pmids_for_training.pickle')
with open(litkey_path,'rb') as litfile:
    litkeytopics = pickle.load(litfile)

# Keep only the rows flagged as subcategories.
is_subcategory = litkeytopics['subcategory'] == True
keysubtopics = litkeytopics.loc[is_subcategory]

### Generate the litcovid training dataframe

In [3]:
# The LitCovid subtopic labels use '-' where the keyword-mapped and clinical
# trial labels use '/'; switch them to '/' so the sources line up.
normalized_lit_labels = litsubtopics['topicCategory'].astype(str).str.replace('-','/')
litsubtopics['topicCategory'] = normalized_lit_labels
print(pd.unique(litsubtopics['topicCategory']).tolist())

['Antibody Detection', 'Pathology/Radiology', 'Rapid Diagnostics', 'Symptoms', 'Testing Prevalence', 'Virus Detection', 'Classical epidemiology', 'Molecular epidemiology', 'Host Factors', 'Immunological Response', 'Mechanism of Infection', 'Mechanism of Transmission', 'Virus Factors', 'Individual Prevention', 'Public Health Interventions', 'Host/Intermediate Reservoirs', 'Viral Shedding/Persistence', 'Biologics', 'Medical Care', 'Pharmaceutical Treatments', 'Prognosis', 'Repurposing', 'Vaccines']


In [4]:
# Unpack the list-like `matching_pmids` column into one row per id and name
# the id column `_id`, matching the other training sources.
boom = keysubtopics.explode('matching_pmids').rename(columns={'matching_pmids':'_id'})

# Keep only id + label, and drop the spaces around '/' in the label names.
boom_clean = boom.loc[:, ['_id','topicCategory']].copy()
boom_clean['topicCategory'] = boom_clean['topicCategory'].astype(str).str.replace(' / ','/')
print(pd.unique(boom_clean['topicCategory']).tolist())

['Case Descriptions', 'Risk Factors', 'Antibody Detection', 'Pathology/Radiology', 'Rapid Diagnostics', 'Symptoms', 'Testing Prevalence', 'Virus Detection', 'Classical epidemiology', 'Molecular epidemiology', 'Host Factors', 'Immunological Response', 'Mechanism of Infection', 'Mechanism of Transmission', 'Virus Factors', 'Individual Prevention', 'Public Health Interventions', 'Host/Intermediate Reservoirs', 'Viral Shedding/Persistence', 'Biologics', 'Medical Care', 'Pharmaceutical Treatments', 'Repurposing', 'Vaccines']


In [5]:
%%time
from src.common import *

pmidlist = list(set(boom_clean['_id'].unique().tolist()).union(set(litsubtopics['_id'].unique().tolist())))
textdf = batch_fetch_meta(pmidlist)
textdf = merge_texts(textdf)
clean_textdf = textdf[['_id','text']]
combidf = pd.concat((boom_clean,litsubtopics[['_id','topicCategory']]),ignore_index=True)
litmergeddf = combidf.merge(textdf,on='_id',how='left')
print(len(litmergeddf))

144130
Wall time: 10min 27s


### Pull the data from curate.outbreak.info
Note that PDB datasets generally have very little by way of useful description and will generally be categorized as host factors (in the case of a human protein) or virus factors (when it's about virus proteins). Due to the overall/general lack of useful text in PDB records, they should be omitted from the training data since they generally won't provide much info to train on.

In [6]:
# Load the curated training frame exported by the sibling curate_outbreak_data
# project. pickle.load can execute arbitrary code - trusted files only.
curated_pickle = os.path.join(general_path,'curate_outbreak_data/results/curated_training_df.pickle')
with open(curated_pickle,'rb') as curate_file:
    curate_data = pickle.load(curate_file)

# Keep id, text and label, renaming the label column to match the other sources.
curate_df = curate_data[['_id','text','category']].rename(columns={'category':'topicCategory'})
print(curate_df.head(n=2))

                _id                                               text  \
0  figshare11752752  comparative model of novel coronavirus 2019 nc...   
1  figshare11806065  prisma scoping review checklist for nowak and ...   

               topicCategory  
0              Virus Factors  
1  Pharmaceutical Treatments  


### Pull the data from clinical trials
Note that the CT data was saved along with name, abstract, description, and text so it is ready to use in the classifier

In [7]:
ct_classified = os.path.join(general_path,'outbreak_CT_classifier/data/topicCategories/')
# Only the per-topic pickles are training files: skip anything else living in
# the folder (checkpoint directories, OS metadata files, ...), which would
# otherwise crash the load. Sorting makes the row order reproducible, since
# os.listdir returns entries in arbitrary order.
ct_training_files = sorted(
    filename for filename in os.listdir(ct_classified) if filename.endswith('.pickle')
)

ct_columns = ['_id','text','topicCategory']
ct_frames = []
for eachfile in ct_training_files:
    # The file name (minus extension) is the topic label, with '_' standing in for '/'.
    category = os.path.splitext(eachfile)[0]
    # pickle.load can execute arbitrary code - trusted files only.
    with open(os.path.join(ct_classified,eachfile),"rb") as tmpfile:
        tmpdata = pickle.load(tmpfile)
    tmpdata['topicCategory'] = category.replace('_','/')
    ct_frames.append(tmpdata[ct_columns].copy())

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration).
if ct_frames:
    ct_subtopics = pd.concat(ct_frames,ignore_index=True)
else:
    ct_subtopics = pd.DataFrame(columns=ct_columns)
print(len(ct_subtopics))
# Drop rows where id, text and label are all identical.
ct_subtopics = ct_subtopics.drop_duplicates(keep='first')
print(len(ct_subtopics))

12086
7088


In [8]:
# Number of rows (i.e. distinct labels after de-duplication) per trial id,
# largest first.
ct_subtopics_freq = (
    ct_subtopics
    .groupby('_id')
    .size()
    .reset_index(name='counts')
    .sort_values('counts',ascending=False)
)
print(ct_subtopics_freq.head(n=5))

# Spot-check one of the most heavily labelled trials.
is_example_trial = ct_subtopics['_id']=='NCT04516512'
print(ct_subtopics.loc[is_example_trial])

              _id  counts
2493  NCT04516512       6
1510  NCT04377802       6
1988  NCT04429724       5
1816  NCT04408456       5
3024  NCT04616846       5
               _id                                               text  \
185    NCT04516512  sars cov 2 seroprevalence among adults people ...   
1021   NCT04516512  sars cov 2 seroprevalence among adults people ...   
1325   NCT04516512  sars cov 2 seroprevalence among adults people ...   
6210   NCT04516512  sars cov 2 seroprevalence among adults people ...   
6739   NCT04516512  sars cov 2 seroprevalence among adults people ...   
12051  NCT04516512  sars cov 2 seroprevalence among adults people ...   

            topicCategory  
185    Antibody Detection  
1021            Biologics  
1325            Diagnosis  
6210    Rapid Diagnostics  
6739             Symptoms  
12051     Virus Detection  


### Merge all the data

In [9]:
# Stack the three sources: clinical trials, curated records, LitCovid.
allsubtopicsdf = pd.concat([ct_subtopics,curate_df,litmergeddf],ignore_index=True)

# Use '-' as the only separator in label names: '/' becomes '-', and a
# space-padded ' - ' (which is what ' / ' turns into) collapses to '-'.
allsubtopicsdf['topicCategory'] = (
    allsubtopicsdf['topicCategory']
    .astype(str)
    .str.replace('/','-')
    .str.replace(' - ','-')
)
print(allsubtopicsdf['topicCategory'].unique().tolist())

# Rows whose label is not one of the labels in `topiclist` are the subtopics.
in_topiclist = allsubtopicsdf['topicCategory'].isin(topiclist)
subtopics_only = allsubtopicsdf.loc[~in_topiclist]
print(len(subtopics_only['topicCategory'].unique().tolist()))

['Antibody Detection', 'Behavioral Research', 'Biologics', 'Case Descriptions', 'Diagnosis', 'Host Factors', 'Individual Prevention', 'Medical Care', 'Pathology-Radiology', 'Pharmaceutical Treatments', 'Prevention', 'Public Health Interventions', 'Rapid Diagnostics', 'Repurposing', 'Symptoms', 'Treatment', 'Vaccines', 'Virus Detection', 'Virus Factors', 'Viral Shedding-Persistence', 'Clinical', 'Mechanism of Infection', 'Transmission', 'Mechanism', 'Epidemiology', 'Forecasting', 'Risk Factors', 'Immunological Response', 'Classical epidemiology', 'Host-Intermediate Reservoirs', 'Mechanism of Transmission', 'Molecular epidemiology', 'Testing Prevalence', 'Prognosis']
26


### Clean up and export the training data

In [None]:
from src.common import *

# One row per distinct (id, label) pair, with the label separator switched
# back from '-' to '/' for the exported file.
training_to_export = (
    allsubtopicsdf[['_id','topicCategory']]
    .drop_duplicates(keep='first')
    .assign(topicCategory=lambda frame: frame['topicCategory'].astype(str).str.replace('-','/'))
)

# clean_results comes from the project's src package; its output is what gets written.
cleanresults = clean_results(training_to_export)
topiccats_path = os.path.join(RESULTSPATH,'topicCats.tsv')
cleanresults.to_csv(topiccats_path,mode='w',sep='\t',header=True)
print(cleanresults.tail(n=2))

### Train the models on the data

In [None]:
def generate_sub_models(MODELPATH,subtopicsdf,classifiers,traintopics="all"):
    """Fit and save every classifier for one, several, or all subtopics.

    Parameters
    ----------
    MODELPATH : str
        Folder handed to ``generate_vectorizer`` and ``save_model``.
    subtopicsdf : pandas.DataFrame
        Training records; must contain a ``topicCategory`` column.
    classifiers : dict
        Maps a classifier name to an estimator exposing ``fit(X, y)``.
    traintopics : str or list/tuple/set of str, default "all"
        ``"all"`` trains every label present in ``subtopicsdf`` (in sorted
        order), a single label trains just that topic, and a collection of
        labels trains each of them in turn.

    Returns
    -------
    None
        The function works through ``save_model`` side effects only.
    """
    if isinstance(traintopics, (list, tuple, set, frozenset)):
        topics = list(traintopics)
    elif traintopics != "all":
        topics = [traintopics]
    else:
        # groupby sorts the labels and drops missing ones.
        breakdown = subtopicsdf.groupby('topicCategory').size().reset_index(name='counts')
        topics = breakdown['topicCategory'].tolist()

    for eachtopic in topics:
        trainingset = generate_training_df(subtopicsdf,eachtopic)
        X = generate_vectorizer(MODELPATH,trainingset,eachtopic)
        for eachclassifier, classifier in classifiers.items():
            # The same estimator object is re-fitted for every topic, so it
            # must be saved right after each fit.
            classifier.fit(X, trainingset.target)
            save_model(MODELPATH,classifier,eachclassifier,eachtopic)

In [None]:
%%time
from src.train_classifier import *  

# Load the 'best' classifier set and train on the subtopic-only records.
# NOTE(review): this calls generate_models (presumably provided by the star
# import above) rather than the generate_sub_models defined in this notebook -
# confirm which one is intended. The meaning of the trailing "all", False
# arguments is not visible from here.
classifiers = load_classifiers('best')
generate_models(MODELPATH,subtopics_only,classifiers,"all",False)

In [None]:
# List the distinct subtopic labels present in the training data.
print(subtopics_only['topicCategory'].unique().tolist())

In [None]:
def run_sub_test(RESULTPATH,subtopicsdf,classifierset_type='best',export_report=False,n_runs=5):
    """Repeatedly train/test every classifier on every subtopic and tabulate the results.

    Parameters
    ----------
    RESULTPATH : str
        Folder that receives ``in_depth_classifier_test.tsv`` when
        ``export_report`` is set.
    subtopicsdf : pandas.DataFrame
        Training records; must contain a ``topicCategory`` column.
    classifierset_type : str, default 'best'
        Passed to ``load_classifiers`` to select the classifier set.
    export_report : bool, default False
        When true, the result table is also written as a TSV file.
    n_runs : int, default 5
        Number of train/test repetitions per (topic, classifier) pair. The
        run index ``0 .. n_runs-1`` is passed to ``train_test_classify`` as
        its last argument (presumably a split/seed index - confirm).

    Returns
    -------
    pandas.DataFrame
        One row per (topic, classifier, run) with the columns
        topicCategory, set size, classifier, runtime, auc, report, matrix, i.
    """
    # Local import so the function does not depend on `datetime` leaking into
    # the notebook namespace through a star import.
    from datetime import datetime

    classifiers = load_classifiers(classifierset_type)
    # groupby sorts the labels and drops missing ones.
    breakdown = subtopicsdf.groupby('topicCategory').size().reset_index(name='counts')
    testresults = []
    for eachtopic in breakdown['topicCategory'].tolist():
        print("now testing: ",eachtopic,datetime.now())
        training_set = generate_training_df(subtopicsdf,eachtopic)
        # Vectorise once per topic; every classifier and run reuses the same X.
        X = vectorize_text(training_set)
        for classifier_name, classifier in classifiers.items():
            for i in range(n_runs):
                timestart = datetime.now()
                cmresult,report,auc = train_test_classify(classifier,training_set,X,i)
                runtime = datetime.now() - timestart
                testresults.append({'topicCategory':eachtopic,'set size':len(training_set),'classifier':classifier_name,
                                    'runtime':runtime,'auc':auc,'report':report,'matrix':cmresult,'i':i})
    testresultsdf = pd.DataFrame(testresults)
    if export_report:
        testresultsdf.to_csv(os.path.join(RESULTPATH,'in_depth_classifier_test.tsv'),sep='\t',header=True)
    return testresultsdf

In [None]:
### Analyze and validate the models
# Run the repeated train/test evaluation on the subtopic-only records with the
# 'best' classifier set; export_report=True also writes
# in_depth_classifier_test.tsv into RESULTSPATH.
testresultsdf = run_sub_test(RESULTSPATH,subtopics_only,classifierset_type='best',export_report=True)

### Run the classifications and check the results
Use the existing models to run classification predictions on:
1. Clinical Trials
    * The original mapping system can be used to validate the accuracy of the predictions
2. Preprints and new LitCovid entries
    * Sanity/spot check results
3. Datasets

In [10]:
from src.classify_pubs import *
from src.common import *

In [11]:
# Load the 'best' classifier set; only its names are needed for prediction.
classifiers = load_classifiers('best')
# classifierlist is a live dict view of the classifier names, not a list.
classifierlist = classifiers.keys()
# Distinct subtopic labels (using '-' as separator) to predict.
subtopiclist = subtopics_only['topicCategory'].unique().tolist()
print(subtopiclist)
print(len(subtopiclist))

In [31]:
%%time
## Load Clinical Trials records which have name, abstract, text
## But do NOT have info on primary design purpose, or intervention
# NOTE(review): a later cell reads an `interventionName` column from the frame
# merged with these records, which seems at odds with "no intervention info" -
# confirm what the pickle actually contains.
# pickle.load can execute arbitrary code - trusted files only.
with open(os.path.join(general_path,'outbreak_CT_classifier/data/blank_entries.pickle'),'rb') as unclassified:
    ct_to_classify = pickle.load(unclassified)

# Recomputed here so this cell does not depend on the earlier setup cell.
subtopiclist = subtopics_only['topicCategory'].unique().tolist()
# Run the predictions for every subtopic/classifier pair; `classifierlist`
# must already exist from the setup cell. Results are presumably written
# under PREDICTPATH (the next cell reads them back from there) - confirm.
predict_class(MODELPATH,PREDICTPATH,subtopiclist,classifierlist,ct_to_classify,newonly = False)

5584 0
Wall time: 21.8 ms


In [35]:
%%time
# Combine the per-classifier predictions; agreetype='perfect' presumably keeps
# only predictions on which all classifiers agree - confirm in merge_predictions.
total_agree = merge_predictions(PREDICTPATH,subtopiclist,classifierlist,agreetype='perfect')
# Attach the clinical-trial record fields to every prediction (inner join on _id).
allresults = total_agree.merge(ct_to_classify,on='_id',how='inner')
# NOTE: this rebinds `cleanresults`, which the training-data export cell also assigns.
cleanresults = clean_results(allresults)
print(cleanresults.head(n=5))

                   _id                                      topicCategory
0  ACTRN12620000617965  [Individual Prevention, Public Health Interven...
1  CTRI/2020/04/024413  [Individual Prevention, Public Health Interven...
2  CTRI/2020/04/024479  [Biologics, Medical Care, Pharmaceutical Treat...
3  CTRI/2020/04/024659             [Biologics, Pharmaceutical Treatments]
4  CTRI/2020/04/024706             [Biologics, Pharmaceutical Treatments]
Wall time: 7.33 s


In [38]:
# Keyword lists used to cross-check the predicted subtopics: each topic maps
# to the fragments whose presence in a record suggests that topic.
# The 'Vaccines' entry used to be listed twice (the second silently overwrote
# the first); it now appears once, in its original position.
# NOTE(review): the consuming cell joins these fragments into a
# case-insensitive regex without escaping, so the case variants
# ('rapid'/'Rapid', ...) are redundant, the dots in 's.o.c.' act as
# wildcards, and 'soc' also matches words such as "social".
misc_map={'Pathology/Radiology':['graphy','ultrasound','ECG','Pulmonary Function Test','Spirometry','biopsy'],
          'Rapid Diagnostics':['rapid','Rapid'],
          'Virus Detection':['RT-PCR','PCR'],
          'Antibody Detection':['antibod','Antibod','antigen','Anti-SARS-CoV2','Antigen','ELISA','ELISPOT'],
          'Symptoms':['symptom','clinical sign','presenting with','clinical presentation'],
          'Vaccines':['vaccin','Vaccin','inactivated virus'],
          'Medical Care':['Ventilat','ventilat','standard of care','soc','s.o.c.'],
          'Public Health Interventions':['policy','travel restriction','lockdown','quarantine','campaign','closures'],
          'Individual Prevention':['counsel','training','education','awareness','PPE','face mask','face covering','device']
         }

In [52]:
%%time
import re
tmpresults = []
for eachkey in misc_map.keys():
    keywordlist = misc_map[eachkey]
    topicCategory = eachkey
    searchregex = re.compile('|'.join(keywordlist), re.IGNORECASE)
    tmpdf = allresults.loc[((allresults['interventionName'].str.contains(searchregex))|
                          (allresults['text'].str.contains(searchregex)))]
    unique_ids = len(tmpdf['_id'].unique().tolist())
    predicted_df = allresults.loc[allresults['topicCategory']==topicCategory.replace('/','-')]
    matching_ids = list(set(predicted_df['_id'].unique().tolist()).intersection(set(tmpdf['_id'].unique().tolist())))
    algo_not_in_regex = [x for x in predicted_df['_id'].unique().tolist() if x not in tmpdf['_id'].unique().tolist()]
    regex_not_in_algo = [x for x in tmpdf['_id'].unique().tolist() if x not in predicted_df['_id'].unique().tolist()]
    tmpresults.append({'topicCategory':topicCategory,
                       'ids_found_via_regex':unique_ids,
                       'ids_found_via_algorithm':len(predicted_df['_id'].unique().tolist()),
                       'ids_matching_both':len(matching_ids),
                       'regex_percent_pos':len(matching_ids)/unique_ids,
                       'algo_percent_pos':len(matching_ids)/len(predicted_df['_id'].unique().tolist()),
                       'ids_in_algo_not_regex':len(algo_not_in_regex),
                       'ids_in_regex_not_algo':len(regex_not_in_algo),
                       'algo_id_list':predicted_df['_id'].unique().tolist(),
                       'algo_not_in_reg':algo_not_in_regex,
                       'regex_id_list':matching_topics['_id'].unique().tolist(),
                       'regex_not_in_algo':regex_not_in_algo
                      })
resultdf = pd.DataFrame(tmpresults)
resultdf.to_csv('results/nonNCT_clinical_trial_check.tsv',sep='\t',header=True)

Wall time: 4.44 s


In [None]:
### Clean up the results