In [1]:
import os
import requests
import json
import pandas as pd
from pandas import read_csv
import pickle
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from datetime import datetime

# Functions for fetching relevant metadata
LitCovid already classifies the majority of its records into a few broad categories. Here we leverage those categories to build a broad classifier that classifies preprints based on their abstracts. If more detailed classification is desired, we can run a 2-step classifier (first to classify broadly, then to narrow down).

We start with a dump of the classifications using the LitCovidTopics parser

In [2]:
#### Get the name, abstract for the pmids
#### Note, I've tried batches of 1000, and the post request has failed, so this uses a batch size that's less likely to fail
def batch_fetch_meta(idlist, batch_size=100):
    """Fetch name, abstract and description for the given ids in batches.

    The ids are split into consecutive chunks of ``batch_size`` and each chunk
    is sent as one comma-separated query to the outbreak.info resources
    endpoint. Every id is requested exactly once and no request is made for an
    empty chunk (the previous ``round(len/100)`` arithmetic fetched lists of
    51-99 ids twice and could send an empty trailing batch).

    Parameters
    ----------
    idlist : iterable of str
        Record ids to look up (list, set or pandas Series).
    batch_size : int, default 100
        Number of ids per request; must be positive.

    Returns
    -------
    pandas.DataFrame
        Columns ``_id``, ``abstract``, ``name``, ``description``; one row per
        id whose hit has ``_score == 1``. Empty when ``idlist`` is empty.
    """
    from io import StringIO

    ids = list(idlist)
    ## Dummy dataframe first so the column order of the result is fixed
    frames = [pd.DataFrame(columns = ['_id','abstract','name','description'])]
    for start in range(0, len(ids), batch_size):
        sample_ids = ','.join(ids[start:start + batch_size])
        ## Get the text-based metadata (abstract, title) and save it
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,description'})
        if r.status_code != 200:
            ## Keep going (best effort), but do not hide the failure
            print("batch starting at", start, "failed with status", r.status_code)
            continue
        ## Wrap in StringIO: passing literal JSON text to read_json is deprecated
        rawresult = pd.read_json(StringIO(r.text))
        ## The API omits a field when no record in the batch has it
        for optional in ('abstract', 'description'):
            if optional not in rawresult.columns:
                rawresult[optional] = " "
        cleanresult = rawresult[['_id','name','abstract','description']].loc[rawresult['_score']==1].fillna(" ").copy()
        cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
        frames.append(cleanresult)
    ## Concatenate once instead of growing the frame inside the loop
    return pd.concat(frames)
        

## Functions for transforming the metadata

In [3]:
#### Merge text from the name, abstract, and description
#### Clean up the text

def merge_texts(df):
    """Build a normalized ``text`` column from name, abstract and description.

    The three columns are joined with single spaces, every non-word character
    is replaced by a space, stand-alone single letters are removed and the
    result is lower-cased.

    All patterns are passed with ``regex=True``: since pandas 2.0
    ``Series.str.replace`` treats the pattern as a literal string by default,
    which silently disabled the whole clean-up. The last pattern used an
    escaped caret (``\\^``), which can never match once non-word characters are
    gone; it now anchors at the start of the string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``abstract`` and ``description``. Missing values
        are filled with '' in place and the ``text`` column is added in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``text`` column added.
    """
    df.fillna('',inplace=True)
    body = df['abstract'].astype(str).str.cat(df['description'].astype(str),sep=' ')
    text = df['name'].astype(str).str.cat(body,sep=' ')
    ## punctuation and other non-word characters -> space
    text = text.str.replace(r'\W', ' ', regex=True)
    ## single letters surrounded by whitespace
    text = text.str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)
    ## single letter at the very start of the text
    text = text.str.replace(r'^[a-zA-Z]\s+', ' ', regex=True)
    df['text'] = text.str.lower()
    return(df)


def fetch_categorized_data(df):
    """Fetch and clean the text of every record, one topic category at a time.

    For each distinct ``topicCategory`` in ``df`` the matching ``_id`` values
    are passed to ``batch_fetch_meta``, the result is run through
    ``merge_texts`` and tagged with the category.

    Returns a DataFrame with columns ``_id``, ``name``, ``abstract``,
    ``description``, ``text`` and ``topicCategory``.
    """
    combined = pd.DataFrame(columns=['_id','name','abstract','description','text','topicCategory'])
    category_counts = df.groupby('topicCategory').size().reset_index(name='counts')
    for topic in category_counts['topicCategory'].tolist():
        topic_ids = df.loc[df['topicCategory']==topic, '_id']
        topic_frame = merge_texts(batch_fetch_meta(topic_ids))
        topic_frame['topicCategory'] = topic
        combined = pd.concat((combined, topic_frame), ignore_index=True)
    return combined


def generate_training_df(df,category,random_state=None):
    """Build a balanced binary training set for one topic category.

    Rows whose ``topicCategory`` equals ``category`` are labelled
    'in category'. Rows whose ``_id`` is not among the positive ids are
    labelled 'not in category'; when there are more of them than positives,
    they are down-sampled to the number of positives.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``_id``, ``text`` and ``topicCategory``.
    category : str
        The category to treat as the positive class.
    random_state : int or None, default None
        Seed for the negative down-sampling. ``None`` keeps the previous
        unseeded behaviour; pass an int to make the training set reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``_id``, ``text``, ``target``; positives first.
    """
    in_category = df['topicCategory']==category
    positiveids = df.loc[in_category, '_id'].tolist()
    training_set_pos = df.loc[in_category, ['_id','text']].copy()
    training_set_pos['target']='in category'
    ## An id that appears under several categories is never used as a negative
    training_set_neg = df.loc[~df['_id'].isin(positiveids), ['_id','text']]
    if len(positiveids)<len(training_set_neg):
        training_set_neg = training_set_neg.sample(n=len(positiveids), random_state=random_state)
    training_set_neg = training_set_neg.copy()
    training_set_neg['target']='not in category'
    training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)
    return(training_set)


#### Note that this function is to clean up the classification predictions and format it as annotations
def clean_results(allresults):
    """Collapse (_id, topicCategory) rows into one row per id with a category list.

    Changes from the previous version:
    * the caller's DataFrame is no longer modified in place;
    * duplicates are dropped on the (_id, topicCategory) pair only, so extra
      columns that differ between sources cannot make a category appear twice
      for the same id;
    * categories are gathered with a groupby instead of rescanning the frame
      once per id.

    Parameters
    ----------
    allresults : pandas.DataFrame
        Needs the columns ``_id`` and ``topicCategory``.

    Returns
    -------
    pandas.DataFrame
        Columns ``_id`` and ``topicCategory`` (a list of categories). Ids with
        several categories come first, followed by single-category ids in
        their order of appearance.
    """
    pairs = allresults[['_id','topicCategory']].drop_duplicates(keep="first")
    counts = pairs.groupby('_id').size()
    multi_ids = counts.index[counts > 1]
    single_ids = counts.index[counts == 1]
    ## ids with more than one category -> list of categories in order of appearance
    multi_rows = pairs.loc[pairs['_id'].isin(multi_ids)]
    multidf = multi_rows.groupby('_id')['topicCategory'].agg(list).reset_index()
    ## ids with exactly one category -> one-element list
    single_rows = pairs.loc[pairs['_id'].isin(single_ids)]
    singledf = pd.DataFrame({'_id': single_rows['_id'].tolist(),
                             'topicCategory': [[cat] for cat in single_rows['topicCategory'].tolist()]},
                            columns = ['_id', 'topicCategory'])
    cleanresults = pd.concat((multidf,singledf),ignore_index=True)
    return(cleanresults)

## Functions for training a classifier

In [4]:
def train_test_classify(classifier,training_set,X,i=0):
    """Fit ``classifier`` on an 80/20 split of ``X`` and score the held-out part.

    ``training_set.target`` supplies the labels and ``i`` is used as the
    ``random_state`` of the split.

    Returns a tuple ``(confusion matrix, classification report as DataFrame,
    ROC AUC)``; the AUC is computed from column 1 of ``predict_proba``.
    """
    split = train_test_split(X, training_set.target, test_size=0.2, random_state=i)
    features_fit, features_eval, labels_fit, labels_eval = split
    classifier.fit(features_fit, labels_fit)
    predicted = classifier.predict(features_eval)
    column_one_probs = classifier.predict_proba(features_eval)[:, 1]
    matrix = confusion_matrix(labels_eval, predicted)
    summary = pd.DataFrame(classification_report(labels_eval, predicted, output_dict=True))
    score = roc_auc_score(labels_eval, column_one_probs)
    return (matrix, summary, score)


def vectorize_text(training_set):
    """Return the TF-IDF matrix of ``training_set['text']``.

    A fresh ``TfidfVectorizer`` is fitted on every call and is not kept.
    The former ``get_feature_names()`` call was dropped: its result was
    unused, and the method was removed in scikit-learn 1.2, so it raised
    ``AttributeError`` on current versions.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(training_set['text'])
    return(X)


def generate_vectorizer(training_set,category):
    """Fit a TF-IDF vectorizer for ``category`` and pickle it with its matrix.

    Writes ``vectorizer_<category>.pickle`` and ``X_<category>.pickle`` into
    ``MODELPATH`` and returns the fitted matrix ``X``.

    The unused ``get_feature_names()`` call was removed (the method no longer
    exists in scikit-learn >= 1.2), and the output files are now closed
    deterministically via ``with``.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(training_set['text'])
    vectorizerfile = os.path.join(MODELPATH,"vectorizer_"+category+".pickle")
    xfile = os.path.join(MODELPATH,"X_"+category+".pickle")
    with open(vectorizerfile, "wb") as handle:
        pickle.dump(vectorizer, handle)
    with open(xfile, "wb") as handle:
        pickle.dump(X, handle)
    return(X)


def save_model(classifier,classname,category):
    """Pickle ``classifier`` to ``MODELPATH/<classname>_<category>.sav``.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even if pickling fails.
    """
    filename = os.path.join(MODELPATH,classname+"_"+category+".sav")
    with open(filename, 'wb') as handle:
        pickle.dump(classifier, handle)


## Functions for testing different classifiers

In [5]:
#### Load the classifiers

def load_classifiers(classifierset_type):
    """Return a dict mapping display name to an unfitted scikit-learn classifier.

    ``classifierset_type == 'best'`` returns Random Forest, MultinomialNB and
    Logistic Regression; any other value returns all seven configured
    classifiers.
    """
    from sklearn import tree
    from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neural_network import MLPClassifier

    if classifierset_type=='best':
        return {
            'Random Forest':RandomForestClassifier(n_estimators=1000, random_state=0),
            'MultinomialNB':MultinomialNB(),
            'Logistic Regression':LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr'),
        }
    return {
        'Random Forest':RandomForestClassifier(n_estimators=1000, random_state=0),
        'MultinomialNB':MultinomialNB(),
        'Neural Net':MLPClassifier(alpha=1, max_iter=1000),
        'Decision Tree':tree.DecisionTreeClassifier(max_depth=5),
        'Nearest Neighbor':KNeighborsClassifier(3),
        'AdaBoost':AdaBoostClassifier(),
        'Logistic Regression':LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr'),
    }

## Functions for loading preprint data

In [6]:
#### Get the size of the source (to make it easy to figure out when to stop scrolling)
def fetch_src_size(source):
    """Return the number of Publication records curated by ``source``.

    Sends a ``size=0`` query with an ``@type`` aggregation to the outbreak.info
    resources endpoint and reads ``facets['@type']['total']`` from the reply.
    """
    query_url = "https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"+source+"))&size=0&aggs=@type"
    reply = json.loads(requests.get(query_url).text)
    type_facet = reply["facets"]["@type"]
    return int(type_facet["total"])

#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    """Return the distinct ``_id`` values of ``jsonfile['hits']`` in first-seen order.

    De-duplication uses ``dict.fromkeys`` (linear time) instead of testing
    membership in a growing list (quadratic). A missing ``'hits'`` key still
    raises ``KeyError``.
    """
    return list(dict.fromkeys(eachhit["_id"] for eachhit in jsonfile["hits"]))

#### Ping the API and get all the ids for a specific source and scroll through the source until number of ids matches meta
def get_source_ids(source):
    """Collect the ids of all Publication records curated by ``source``.

    The first request uses ``fetch_all=true``; further pages are requested
    with the returned ``_scroll_id`` until the number of distinct ids reaches
    the size reported by ``fetch_src_size``.

    Changes from the previous version:
    * always returns a list (it used to return a list or a set depending on
      whether scrolling was needed);
    * scrolling stops when a page adds no new ids, so a mismatch between the
      reported size and the retrievable ids cannot loop forever;
    * the bare ``except`` clauses are replaced by explicit handling; a failed
      page still ends the scroll and returns what was collected so far, but
      the reason is printed.

    Returns
    -------
    list of str
        Distinct ids in first-seen order.
    """
    source_size = fetch_src_size(source)
    query_url = "https://api.outbreak.info/resources/query?q=((@type:Publication) AND (curatedBy.name:"+source+"))&fields=_id&fetch_all=true"
    response = json.loads(requests.get(query_url).text)
    ## dict keys give de-duplication while keeping first-seen order
    collected = dict.fromkeys(get_ids_from_json(response))
    scroll_id = response.get("_scroll_id")
    while scroll_id is not None and len(collected) < source_size:
        try:
            page = json.loads(requests.get(query_url+"&scroll_id="+scroll_id).text)
            page_ids = get_ids_from_json(page)
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            ## best effort: keep what we have, but say why scrolling stopped
            print("stopped scrolling early: ", repr(err))
            break
        seen_before = len(collected)
        collected.update(dict.fromkeys(page_ids))
        if len(collected) == seen_before:
            ## no progress on this page -> nothing more to fetch
            break
        if "_scroll_id" in page:
            scroll_id = page["_scroll_id"]
        else:
            ## keep using the previous scroll id, as before
            print("no new scroll id")
    return list(collected)

#### Pull ids from the preprint sources (bioRxiv, medRxiv)
def get_preprint_ids():
    """Return the combined, de-duplicated ids of bioRxiv and medRxiv records."""
    fetched = {}
    for server in ("bioRxiv", "medRxiv"):
        fetched[server] = get_source_ids(server)
    return list(set(fetched["medRxiv"]) | set(fetched["bioRxiv"]))



In [7]:
#### Functions for classifying all publications (not yet classified)

def get_pub_ids(sourceset):
    """Return the de-duplicated ids of every source in the named source set.

    ``sourceset`` is one of 'preprint', 'litcovid', 'other' or 'all'; any other
    value raises ``KeyError``.
    """
    preprint_sources = ["bioRxiv","medRxiv"]
    other_sources = ["Figshare","Zenodo","MRC Centre for Global Infectious Disease Analysis"]
    pub_srcs = {
        "preprint": preprint_sources,
        "litcovid": ["litcovid"],
        "other": other_sources,
        "all": other_sources + preprint_sources + ["litcovid"],
    }
    gathered = []
    for source_name in pub_srcs[sourceset]:
        gathered = list(set(gathered) | set(get_source_ids(source_name)))
    return gathered



## Functions for utilizing pre-trained models

In [8]:
def load_vectorizer(category):
    """Load the pickled vectorizer saved as ``MODELPATH/vectorizer_<category>.pickle``.

    The file is read inside a ``with`` block so the handle is closed promptly.
    """
    vectorizerfile = os.path.join(MODELPATH,"vectorizer_"+category+".pickle")
    ## NOTE: pickle.load executes code from the file; only load models you created yourself
    with open(vectorizerfile,'rb') as handle:
        vectorizer = pickle.load(handle)
    return(vectorizer)

def predict_class(topiclist,classifierlist,df):
    """Apply every saved (topic, classifier) model to ``df`` and write the predictions.

    For each topic the saved vectorizer transforms ``df['text']``; for each
    classifier name the model ``MODELPATH/<classifier>_<topic>.sav`` is loaded
    and its predictions are written to ``PREDICTPATH/<topic>_<classifier>.tsv``
    with the columns ``_id``, ``prediction``, ``topicCategory`` and
    ``classifier``. Nothing is returned.

    Model files are opened in a ``with`` block so their handles are closed
    instead of being left to the garbage collector.
    """
    labels = df['_id']
    for eachtopic in topiclist:
        vectorizer = load_vectorizer(eachtopic)
        M = vectorizer.transform(df['text'])
        for eachclassifier in classifierlist:
            classifierfile = os.path.join(MODELPATH, eachclassifier+"_"+eachtopic+'.sav')
            ## NOTE: pickle.load executes code from the file; only load models you created yourself
            with open(classifierfile, 'rb') as handle:
                classifier = pickle.load(handle)
            prediction = classifier.predict(M)
            list_of_tuples = list(zip(labels,prediction))
            predictiondf = pd.DataFrame(list_of_tuples, columns = ['_id', 'prediction'])
            predictiondf['topicCategory']=eachtopic
            predictiondf['classifier']=eachclassifier
            predictiondf.to_csv(os.path.join(PREDICTPATH,eachtopic+"_"+eachclassifier+'.tsv'),sep='\t',header=True)

## Functions for evaluating classification predictions

In [9]:
def get_agreement(eachtopic,classifierlist,PREDICTPATH):
    """Summarize, per id, how many classifiers predicted 'in category' for a topic.

    Reads ``<PREDICTPATH>/<eachtopic>_<classifier>.tsv`` for every classifier
    and keeps only the 'in category' predictions.

    ``classifierlist`` is converted to a list first, so passing ``dict.keys()``
    no longer writes ``"dict_keys([...])"`` into ``pos_pred_algorithms``; the
    value is the ``str()`` of a plain list, as for partial agreement. Partial
    agreement rows are built with one groupby instead of rescanning the
    predictions once per id.

    Returns
    -------
    pandas.DataFrame
        Columns ``_id``, ``topicCategory``, ``pos_pred_count`` and
        ``pos_pred_algorithms``; one row per id with at least one positive
        prediction.
    """
    classifierlist = list(classifierlist)
    n_classifiers = len(classifierlist)
    agreement = pd.DataFrame(columns=['_id','topicCategory','pos_pred_count','pos_pred_algorithms'])
    classresult = pd.DataFrame(columns=['_id','prediction','topicCategory','classifier'])
    for eachclass in classifierlist:
        tmpfile = read_csv(os.path.join(PREDICTPATH,eachtopic+"_"+eachclass+".tsv"),delimiter='\t',header=0,index_col=0)
        classresult = pd.concat((classresult,tmpfile),ignore_index=True)
    posresults = classresult.loc[classresult['prediction']=='in category']
    agreecounts = posresults.groupby('_id').size().reset_index(name='counts')

    ## exactly one classifier said 'in category'
    single_ids = agreecounts.loc[agreecounts['counts']==1, '_id'].tolist()
    no_agree = posresults.loc[posresults['_id'].isin(single_ids)].copy()
    no_agree.rename(columns={'classifier':'pos_pred_algorithms'},inplace=True)
    no_agree['pos_pred_count']=1
    no_agree.drop('prediction',axis=1,inplace=True)

    ## every classifier said 'in category'
    unanimous_ids = agreecounts.loc[agreecounts['counts']==n_classifiers, '_id'].tolist()
    perfect_agree = posresults.loc[posresults['_id'].isin(unanimous_ids)].copy()
    perfect_agree['pos_pred_count']=n_classifiers
    perfect_agree['pos_pred_algorithms']=str(classifierlist)
    perfect_agree.drop(['prediction','classifier'],axis=1,inplace=True)
    perfect_agree.drop_duplicates('_id',keep='first',inplace=True)

    ## more than one but not all classifiers said 'in category'
    partialcountids = agreecounts.loc[((agreecounts['counts']>1)&
                                       (agreecounts['counts']<n_classifiers)), '_id'].tolist()
    partial_rows = posresults.loc[posresults['_id'].isin(partialcountids)]
    tmplist = [{'_id':eachid,'topicCategory':eachtopic,'pos_pred_count':len(tmpdf),
                'pos_pred_algorithms':str(tmpdf['classifier'].tolist())}
               for eachid, tmpdf in partial_rows.groupby('_id')]
    partial_agree = pd.DataFrame(tmplist)
    agreement = pd.concat((agreement,no_agree,partial_agree,perfect_agree),ignore_index=True)
    return(agreement)

def filter_agreement(topiclist,classifierlist,agreetype='perfect'):
    """Return the (_id, topicCategory) pairs with the requested level of agreement.

    Parameters
    ----------
    topiclist : iterable of str
        Topics whose prediction files (under ``PREDICTPATH``) are evaluated.
    classifierlist : sized iterable of str
        Classifier names used for the predictions.
    agreetype : str, default 'perfect'
        'perfect' keeps rows where every classifier predicted 'in category',
        'None' keeps rows with exactly one positive prediction, and any other
        value keeps rows with more than one but not all.

    The partial branch now filters rows by their own count. It used to select
    by ``_id``, so an id with partial agreement on one topic also pulled in
    that id's rows for other topics, whatever their agreement level.
    """
    n_classifiers = len(classifierlist)
    frames = [pd.DataFrame(columns=['_id','topicCategory','pos_pred_count','pos_pred_algorithms'])]
    for eachtopic in topiclist:
        frames.append(get_agreement(eachtopic,classifierlist,PREDICTPATH))
    allagreement = pd.concat(frames,ignore_index=True)
    counts = allagreement['pos_pred_count']
    if agreetype=='perfect':
        keep = counts==n_classifiers
    elif agreetype=='None':
        keep = counts==1
    else:
        keep = (counts>1)&(counts<n_classifiers)
    filtered_agreement = allagreement.loc[keep, ['_id','topicCategory']].copy()
    return(filtered_agreement)


def merge_predictions(topiclist,classifierlist,agreetype='perfect'):
    """Return the de-duplicated (_id, topicCategory) pairs for the given agreement level.

    ``agreetype`` is now passed on to ``filter_agreement``; it used to be
    ignored and 'perfect' was always applied. The default is unchanged, so
    callers that pass 'perfect' (or nothing) get the same result as before.
    """
    agreement = filter_agreement(topiclist,classifierlist,agreetype=agreetype)
    agreement.drop_duplicates(inplace=True,keep="first")
    return(agreement)

'def merge_predictions(topiclist,classifierlist,agreetype=\'perfect\'):\n    totalagree = pd.DataFrame(columns=[\'_id\',\'topicCategory\'])\n    for eachtopic in topiclist:\n        agreement = filter_agreement(topiclist,classifierlist,agreetype=\'perfect\')\n        totalagree = pd.concat((totalagree,agreement),ignore_index=True)\n    totalagree.drop_duplicates(inplace=True,keep="first")\n    return(totalagree)'

## Primary Functions
Primary functions for performing testing, training, classification prediction, and providing the results as annotations
0. Run algorithm tests against the data and report results
1. Train the classifiers on the LitCovid data
2. Apply the classifiers to the preprint data
3. Clean up the results and serve them up as annotations

In [10]:
def run_test(topicsdf,classifierset_type='best',export_report=False):
    """Benchmark the selected classifier set on every topic category.

    Fetches the text for ``topicsdf``, builds a balanced training set per topic
    and runs ``train_test_classify`` five times per classifier, using split
    seeds 0-4. When ``export_report`` is True the results are also written to
    ``RESULTPATH/in_depth_classifier_test.tsv``.

    Returns a DataFrame with one row per (topic, classifier, seed).
    """
    classifiers = load_classifiers(classifierset_type)
    fetchstarttime = datetime.now()
    print("fetching the abstracts: ", fetchstarttime)
    alldata = fetch_categorized_data(topicsdf)
    print("fetching complete: ",datetime.now()-fetchstarttime)
    topics = alldata.groupby('topicCategory').size().reset_index(name='counts')['topicCategory'].tolist()
    collected = []
    for topic in topics:
        print("now testing: ",topic,datetime.now())
        training_set = generate_training_df(alldata,topic)
        X = vectorize_text(training_set)
        for clf_name, clf in classifiers.items():
            for split_seed in range(5):
                started = datetime.now()
                cmresult,report,auc = train_test_classify(clf,training_set,X,split_seed)
                elapsed = datetime.now() - started
                collected.append({'topicCategory':topic,'set size':len(training_set),'classifier':clf_name,
                                  'runtime':elapsed,'auc':auc,'report':report,'matrix':cmresult,'i':split_seed})
    testresultsdf = pd.DataFrame(collected)
    if export_report==True:
        testresultsdf.to_csv(os.path.join(RESULTPATH,'in_depth_classifier_test.tsv'),sep='\t',header=True)
    return testresultsdf


def generate_models(topicsdf,classifiers):
    """Train and save one vectorizer and one model per (topic, classifier).

    ``classifiers`` maps a display name to an estimator. Each estimator is
    fitted on the full balanced training set of the topic and saved with
    ``save_model``; the vectorizer is saved by ``generate_vectorizer``.
    """
    alldata = fetch_categorized_data(topicsdf)
    topic_counts = alldata.groupby('topicCategory').size().reset_index(name='counts')
    for topic in topic_counts['topicCategory'].tolist():
        trainingset = generate_training_df(alldata,topic)
        X = generate_vectorizer(trainingset,topic)
        for clf_name, clf in classifiers.items():
            clf.fit(X, trainingset.target)
            save_model(clf,clf_name,topic)

            
def classify_preprints(topiclist,classifiers):
    """Fetch the 'preprint' source set and write predictions for every topic and classifier."""
    preprint_meta = batch_fetch_meta(get_pub_ids(sourceset="preprint"))
    preprint_text = merge_texts(preprint_meta)
    predict_class(topiclist,classifiers.keys(),preprint_text)

    
def classify_pubs(topiclist,classifiers):
    """Fetch the 'other' source set and write predictions for every topic and classifier."""
    pub_meta = batch_fetch_meta(get_pub_ids(sourceset="other"))
    pub_text = merge_texts(pub_meta)
    predict_class(topiclist,classifiers.keys(),pub_text)
    
    
def load_annotations(topicsdf,classifiers):
    """Classify the 'other' sources and export predictions merged with ``topicsdf``.

    Runs ``classify_pubs`` for every topic in ``topicsdf``, keeps the pairs on
    which all classifiers agree, appends ``topicsdf`` and collapses the result
    with ``clean_results``. The outcome is written to
    ``RESULTPATH/predictions.tsv`` (no header row) and
    ``RESULTPATH/predictions.json``.
    """
    topics = topicsdf['topicCategory'].unique().tolist()
    classify_pubs(topics,classifiers)
    unanimous = merge_predictions(topics,classifiers.keys(),agreetype='perfect')
    combined = pd.concat((unanimous,topicsdf),ignore_index=True)
    annotations = clean_results(combined)
    annotations.to_csv(os.path.join(RESULTPATH,'predictions.tsv'),sep='\t',header=0)
    annotations.to_json(os.path.join(RESULTPATH,'predictions.json'), orient='records')

"def load_annotations(topicsdf,classifiers):\n    topiclist = topicsdf['topicCategory'].unique().tolist()\n    classify_pubs(topiclist,classifiers)\n    classifierlist = classifiers.keys()\n    total_agree = merge_predictions(topiclist,classifierlist,agreetype='perfect')\n    cleanresults = clean_results(total_agree)\n    originalresults = clean_results(topicsdf)\n    allresults = pd.concat((originalresults,cleanresults),ignore_index=True)\n    allresults.to_csv(os.path.join(RESULTPATH,'predictions.tsv'),sep='\t',header=0)\n    #allresults.to_json(os.path.join(RESULTPATH,'predictions.json'), orient='records')"

## Check Script Run times

In [None]:
## Debug cell: an inlined copy of the batch_fetch_meta loop with progress printing.
## It leaves idlist, runs, i, separator, textdf (and the loop temporaries) as notebook globals.
#idlist = ["zenodo.3766145","figshare12864673","2020.12.06.413682","pmid33677050"]
idlist = get_pub_ids(sourceset="other")
runs = round((len(idlist))/100,0)
## Starts at batch 12, so batches 0-11 are not fetched by this cell
i=12 
separator = ','
## Create dummy dataframe to store the meta data
textdf = pd.DataFrame(columns = ['_id','abstract','name','description'])
while i < runs+1:
    if len(idlist)<100:
        sample = idlist
    elif i == 0:
        sample = idlist[i:(i+1)*100]
    elif i == runs:
        sample = idlist[i*100:len(idlist)]
    else:
        sample = idlist[i*100:(i+1)*100]
    sample_ids = separator.join(sample)
    ## Show the ids of the batch that is about to be requested
    print(sample_ids)
    ## Get the text-based metadata (abstract, title) and save it
    r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,description'})
    if r.status_code == 200:
        rawresult = pd.read_json(r.text)
        checkcols = rawresult.columns
        ## Add whichever of abstract/description is missing from the response
        if (('description' not in checkcols) and ('abstract' in checkcols)):
            rawresult['description']=" "
        elif (('description' in checkcols) and ('abstract' not in checkcols)):
            rawresult['abstract']=" "
        elif (('description' not in checkcols) and ('abstract' not in checkcols)):
            rawresult['abstract']=" "
            rawresult['description']=" "
        ## Keep only hits with _score == 1, one row per id
        cleanresult = rawresult[['_id','name','abstract','description']].loc[rawresult['_score']==1].fillna(" ").copy()
        cleanresult.drop_duplicates(subset='_id',keep="first", inplace=True)
        textdf = pd.concat((textdf,cleanresult))
    i=i+1
    ## Progress marker: index of the next batch
    print(i)

In [20]:
## Pull the classifications from the LitCovidTopics parser
## Paths used by the functions above (MODELPATH, PREDICTPATH and RESULTPATH are read as globals)
DATAPATH = 'data/'
RESULTPATH = 'results/'
MODELPATH = 'models/'
PREDICTPATH = 'predictions/'
littopicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
offtopicsfile = os.path.join(RESULTPATH,'training_ids.tsv')
littopicsdf = read_csv(littopicsfile,delimiter='\t',header=0,index_col=0)
offtopicsdf = read_csv(offtopicsfile,delimiter='\t',header=0,index_col=0)
## Training topics = the LitCovid topics plus the ids from training_ids.tsv
topicsdf = pd.concat((littopicsdf,offtopicsdf),ignore_index=True)
topiclist = topicsdf['topicCategory'].unique().tolist()

## The triple-quoted block below is a string literal, i.e. disabled code that is not executed
"""
runtimesinfo=[]
timestart = datetime.now()
print('testing start: ',timestart)
testresultsdf = run_test(topicsdf,classifierset_type='best',export_report=True)
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'run_test()'})

timestart = datetime.now()
print('creating models: ',timestart)
classifiers = load_classifiers('best')
generate_models(topicsdf,classifiers)
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'generate_models()'})

runtimesinfo=[]
timestart = datetime.now()
print('load_classifiers: ',timestart)
classifiers = load_classifiers('best')
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'generate_models()'})

timestart = datetime.now()
print('classifying non-litcovid: ',timestart)
classify_pubs(topiclist,classifiers)
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'classify_preprints()'})

timestart = datetime.now()
print('merging results: ',timestart)
classifierlist = classifiers.keys()
total_agree = merge_predictions(topiclist,classifierlist,agreetype='perfect')
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'classify_preprints()'})
"""
## Active part: train and save the models, then classify and export, timing each step
runtimesinfo=[]
timestart = datetime.now()
print('creating models: ',timestart)
classifiers = load_classifiers('best')
generate_models(topicsdf,classifiers)
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'generate_models()'})

timestart = datetime.now()
print('load annotations: ',timestart)
load_annotations(topicsdf,classifiers)
runtime = datetime.now()-timestart
print(runtime)
runtimesinfo.append({'starttime':timestart,'runtime':runtime,'function':'load_annotations()'})

## Persist the recorded run times
runtimesdf = pd.DataFrame(runtimesinfo)
runtimesdf.to_csv(os.path.join(RESULTPATH,'runtimes.tsv'),sep='\t',header=True)


creating models:  2021-04-16 09:39:15.692008
2:39:55.335474
load annotations:  2021-04-16 12:19:11.027482
1:26:46.556467


In [None]:
#### To do
## Check quality of Information sciences training data (because some of the results look iffy)
## Time the clean process and the merge process
## Verify the load annotations function works as expected

In [None]:
## Inspect the merge of predicted classes with the training classes.
## NOTE: total_agree is only assigned at notebook level by the merge_predictions cell further down; run that cell first.
allresults = pd.concat((total_agree,topicsdf),ignore_index=True)
print(allresults)
cleanresults = clean_results(allresults)
print(cleanresults)

In [None]:
#### Check merging of predicted classes with litcovid and other training classes

## Compare the row count before and after clean_results collapses the rows per id
allresults = pd.concat((total_agree,topicsdf),ignore_index=True)
print(len(allresults))
cleanresults = clean_results(allresults)
print(len(cleanresults))

In [None]:
## Show the first two agreed predictions and the total number of rows
print(total_agree.head(n=2))
print(len(total_agree))

In [None]:
## Merge the saved predictions (perfect agreement only) and export them without a header row.
## Requires `classifiers` and `topiclist` from an earlier cell.
#classifiers = load_classifiers('best')
classifierlist = classifiers.keys()
total_agree = merge_predictions(topiclist,classifierlist,agreetype='perfect')
cleanresults = clean_results(total_agree)
cleanresults.to_csv(os.path.join(RESULTPATH,'predictions.tsv'),sep='\t',header=0)

In [None]:
## Export the same results as a JSON array of records
cleanresults.to_json(os.path.join(RESULTPATH,'predictions.json'), orient='records')

# Testing different classifiers

In [None]:
## Pull the classifications from the LitCovidTopics parser
## NOTE: this cell rebinds topicsdf to the LitCovid topics only (no training_ids.tsv rows)
DATAPATH = 'data/'
topicsfile = os.path.join(DATAPATH,'litcovidtopics.tsv')
topicsdf = read_csv(topicsfile,delimiter='\t',header=0,index_col=0)
RESULTPATH = 'results/'
MODELPATH = 'models/'
PREDICTPATH = 'predictions/'

## Reserve records which span multiple categories for test since they are ambiguous and could confuse the training
frequencies = topicsdf.groupby('_id').size().reset_index(name='counts')
## ids that occur in more than one row of topicsdf
ambiguous = frequencies['_id'].loc[frequencies['counts']>1].tolist()
unambiguous = topicsdf.loc[~topicsdf['_id'].isin(ambiguous)]
## number of ambiguous ids, number of unambiguous rows
print(len(ambiguous),len(unambiguous))


In [None]:
## Single-split test of every classifier on the unambiguous records.
## Requires `classifiers` (name -> estimator) from an earlier cell.
alldata = fetch_categorized_data(unambiguous)
breakdown = alldata.groupby('topicCategory').size().reset_index(name='counts')
testresults = []
for eachtopic in breakdown['topicCategory'].tolist():
    print(eachtopic)
    training_set = generate_training_df(alldata,eachtopic)
    X = vectorize_text(training_set)
    for classifier in classifiers.keys():
        timestart = datetime.now()
        print(classifier)
        cmresult,report,auc = train_test_classify(classifiers[classifier],training_set,X)
        runtime = datetime.now() - timestart
        testresults.append({'topicCategory':eachtopic,'set size':len(training_set),'classifier':classifier,
                            'runtime':runtime,'auc':auc,'report':report,'matrix':cmresult})

testresultsdf = pd.DataFrame(testresults)
#testresultsdf.to_csv(os.path.join(RESULTPATH,'classifier_test.tsv'),sep='\t',header=True)
#print(testresultsdf)

## Per-classifier summaries
#max_auc = testresultsdf.groupby(['topicCategory','set size']).auc.max()
max_auc = testresultsdf.groupby('classifier')['auc'].max()
min_time = testresultsdf.groupby('classifier')['runtime'].min()
avg_auc = testresultsdf.groupby('classifier')['auc'].mean()
## NOTE(review): named avg_time but computed with sum() -- confirm whether mean() was intended
avg_time = testresultsdf.groupby('classifier')['runtime'].sum()
sorted_results = testresultsdf.sort_values('auc',ascending=False)
print(sorted_results)
#print(sorted_results.iloc[0]['report'])

In [None]:
## Cache the fetched text so later sessions can skip the API calls
alldata.to_csv(os.path.join(RESULTPATH,'unambiguous_categories_data.tsv'),sep='\t',header=True)

In [None]:
## Reload the cached text written by the previous cell
alldata=read_csv(os.path.join(RESULTPATH,'unambiguous_categories_data.tsv'),delimiter='\t',header=0,index_col=0)

The logistic regression and MultinomialNB classifiers appear to give the highest AUC while having the shortest run times. Random forest provides decent results but has much longer run times. Using different classifiers and identifying the records they agree on could be a way to improve confidence when classifying preprints into general LitCovid categories. The disagreement between the classifiers may help to identify the ambiguous instances.

## In depth testing
Run sampling and testing 5 times, calculate average auc, create average report to evaluate performance of each of the three algorithms on the different types of data.

In [None]:
## Inline version of run_test: five splits (seeds 0-4) per topic and classifier.
## Uses `alldata` and `classifiers` from earlier cells.
#alldata = fetch_categorized_data(unambiguous)
breakdown = alldata.groupby('topicCategory').size().reset_index(name='counts')
testresults = []
for eachtopic in breakdown['topicCategory'].tolist():
    training_set = generate_training_df(alldata,eachtopic)
    X = vectorize_text(training_set)
    for classifier in classifiers.keys():
        i=0
        while i<5:
            timestart = datetime.now()
            ## i is passed as the random_state of the train/test split
            cmresult,report,auc = train_test_classify(classifiers[classifier],training_set,X,i)
            runtime = datetime.now() - timestart
            testresults.append({'topicCategory':eachtopic,'set size':len(training_set),'classifier':classifier,
                                'runtime':runtime,'auc':auc,'report':report,'matrix':cmresult,'i':i})
            i=i+1

testresultsdf = pd.DataFrame(testresults)
testresultsdf.to_csv(os.path.join(RESULTPATH,'in_depth_classifier_test.tsv'),sep='\t',header=True)

## More Testing: Ambiguous litcovid categories for classification verification

In [None]:
## Fetch the text of the records that appear under more than one category
ambiguousdf = topicsdf.loc[topicsdf['_id'].isin(ambiguous)]
amdata = fetch_categorized_data(ambiguousdf)
ambreakdown = amdata.groupby('topicCategory').size().reset_index(name='counts')
#amdata.to_csv(os.path.join(RESULTPATH,'ambiguous_categories_data.tsv'),sep='\t',header=True)

In [None]:
## Load the saved models
## Load the saved Treatment model and its vectorizer.
## The files are opened in `with` blocks so the handles are closed after loading.
## NOTE: pickle.load executes code from the file; only load models you created yourself
filename = os.path.join(MODELPATH,'MultinomialNB_Treatment.sav')
with open(filename, 'rb') as _handle:
    classifier = pickle.load(_handle)
vectorizername = os.path.join(MODELPATH,'vectorizer_Treatment.pickle')
with open(vectorizername,'rb') as _handle:
    vectorizer = pickle.load(_handle)

In [None]:
## Apply the loaded Treatment vectorizer and classifier to the ambiguous records
#treatment = amdata.loc[amdata['topicCategory']=='Treatment']
labels = amdata['_id']
M = vectorizer.transform(amdata['text'])
prediction = classifier.predict(M)
## Pair each id with its predicted label
list_of_tuples = list(zip(labels,prediction))
predictiondf = pd.DataFrame(list_of_tuples, columns = ['_id', 'prediction'])

In [None]:
## Compare the Treatment predictions with the known LitCovid labels.
## `treatment` was only defined in a commented-out line of the previous cell, so this
## cell raised NameError on a fresh kernel; it is now defined here.
treatment = amdata.loc[amdata['topicCategory']=='Treatment']
## Left merge: records without a Treatment label get NaN in topicCategory
checkdf = predictiondf.merge(treatment,on='_id',how='left')
trueneg = checkdf.loc[((checkdf['prediction']=='not in category')&(checkdf['topicCategory']!='Treatment'))]
truepos = checkdf.loc[((checkdf['prediction']=='in category')&(checkdf['topicCategory']=='Treatment'))]
falseneg = checkdf.loc[((checkdf['prediction']=='not in category')&(checkdf['topicCategory']=='Treatment'))]
falsepos = checkdf.loc[((checkdf['prediction']=='in category')&(checkdf['topicCategory']!='Treatment'))]
print("total predictions: ",len(checkdf))
print("true negative: ",len(trueneg)," or ",len(trueneg)/len(checkdf)*100, '%')
print("true positive: ",len(truepos)," or ",len(truepos)/len(checkdf)*100, '%')
print("false negative: ",len(falseneg)," or ",len(falseneg)/len(checkdf)*100, '%')
print("false positive: ",len(falsepos)," or ",len(falsepos)/len(checkdf)*100, '%')



## Expand beyond training sets

In [None]:
## Run the vectorizer and preferred classifiers on the full topics dataset and save the models
## (the unambiguous-only variant is kept below, commented out)
#alldata = fetch_categorized_data(unambiguous)
alldata = fetch_categorized_data(topicsdf)
breakdown = alldata.groupby('topicCategory').size().reset_index(name='counts')

for eachtopic in breakdown['topicCategory'].tolist():
    trainingset = generate_training_df(alldata,eachtopic)
    X = generate_vectorizer(trainingset,eachtopic)
    for eachclassifier in classifiers.keys():
        classifier=classifiers[eachclassifier]
        classifier.fit(X, trainingset.target)
        ## Was save_model(classifier,classname,category): both names are undefined at
        ## notebook level; the loop variables are the intended arguments (as in generate_models)
        save_model(classifier,eachclassifier,eachtopic)
        

## Load Preprints for classification

#### Generate the preprint dataframe

In [None]:
## Fetch the bioRxiv/medRxiv ids, their metadata, and build the cleaned text column,
## printing a timestamp before each step
print('fetching preprint ids: ',datetime.now())
preprint_ids = get_preprint_ids()
print('fetching preprint abstracts: ',datetime.now())
preprintdf = batch_fetch_meta(preprint_ids)
print('cleaning up text: ',datetime.now())
preprintdata = merge_texts(preprintdf)

#### Apply classifiers to predict classifications

In [None]:
## Write one prediction file per (topic, classifier) for the preprints.
## The classifier names must match the names the models were saved under.
topiclist = breakdown['topicCategory'].tolist()
classifierlist = ['Logistic Regression','MultinomialNB','Random Forest']
predict_class(topiclist,classifierlist,preprintdata)

#### evaluate classifier predictions and identify areas of agreement and disagreement

In [None]:
## filter_agreement iterates over its first argument as a list of topics. Passing the
## single string `eachtopic` made it iterate over the characters of that string, so the
## full topic list is passed instead.
perfect_agree = filter_agreement(topiclist,classifierlist,agreetype='perfect')

#### clean up the results in case of multicategories

In [None]:
## Collapse the agreed predictions to one row per id and report how many ids remain
cleanresults = clean_results(perfect_agree)
print(len(cleanresults))

## Scrap

In [None]:
## Scrap: inline version of predict_class for the preprints.
## NOTE: writes under RESULTPATH/predictions/ rather than PREDICTPATH.
labels = preprintdata['_id']
for eachtopic in breakdown['topicCategory'].tolist():
    classifierlist = ['Logistic Regression','MultinomialNB','Random Forest']
    vectorizer = load_vectorizer(eachtopic)
    M = vectorizer.transform(preprintdata['text'])
    for eachclassifier in classifierlist:
        classifierfile = os.path.join(MODELPATH, eachclassifier+"_"+eachtopic+'.sav')
        ## Open the model inside `with` so the handle is closed after loading
        with open(classifierfile, 'rb') as _handle:
            classifier = pickle.load(_handle)
        prediction = classifier.predict(M)
        list_of_tuples = list(zip(labels,prediction))
        predictiondf = pd.DataFrame(list_of_tuples, columns = ['_id', 'prediction'])
        predictiondf['topicCategory']=eachtopic
        predictiondf['classifier']=eachclassifier
        predictiondf.to_csv(os.path.join(RESULTPATH,'predictions/'+eachtopic+"_"+eachclassifier+'.tsv'),sep='\t',header=True)


In [None]:
####Vectorize the text for classifier

## Scrap: single random-forest run on whatever `training_set` currently holds
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['text'])
## get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()
print(X.shape)

#### Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, training_set.target, test_size=0.2, random_state=0)

#### Classify training text as in category or not in category
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

## classification_report, confusion_matrix and roc_auc_score come from the import cell
## at the top of the notebook; the duplicate import that stood here was dropped
print(confusion_matrix(y_test,y_pred))

report = classification_report(y_test,y_pred,output_dict=True)
print(pd.DataFrame(report))

## AUC from column 1 of predict_proba
probs = classifier.predict_proba(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
print(auc)
## Legend for reading the confusion matrix printed above
print('[[true neg     false pos]]')
print('[[false neg     true pos]]')

In [None]:
## Count the rows per value of 'treatment_prediction'.
## NOTE(review): `classifier_results` is not defined anywhere in the visible notebook -- this
## cell presumably depends on a deleted cell; confirm or remove.
inspect_results = classifier_results.groupby('treatment_prediction').size().reset_index(name='counts')
print(inspect_results)