In [1]:
import os
import requests
import pandas as pd
from pandas import read_csv
from collections import OrderedDict
import pickle
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Functions for fetching relevant metadata

In [2]:
## Pull all Clinical Trials data 

def fetch_src_size():
    """Return the number of ClinicalTrial records reported by the API.

    Sends a zero-hit aggregation query (``size=0&aggs=@type``) and reads the
    ``@type`` facet total from the JSON response.

    Returns:
        int: the value of ``facets["@type"]["total"]`` in the response.

    Raises:
        KeyError: if the response does not contain the expected facet.
        ValueError: if the response body is not valid JSON.
    """
    pubmeta = requests.get("https://api.outbreak.info/resources/query?q=@type:ClinicalTrial&size=0&aggs=@type")
    # Response.json() decodes the body directly; the previous json.loads call
    # raised NameError because this notebook never imports `json`.
    pubjson = pubmeta.json()
    return int(pubjson["facets"]["@type"]["total"])


#### Pull ids from a json file
def get_ids_from_json(jsonfile):
    """Collect the unique ``_id`` values from a parsed API response.

    Args:
        jsonfile: mapping with a ``"hits"`` entry that iterates over records,
            each of which has an ``"_id"`` key.

    Returns:
        list: the ids in first-seen order, with duplicates removed.

    Raises:
        KeyError: if ``"hits"`` or a record's ``"_id"`` is missing.
    """
    # dict.fromkeys de-duplicates while preserving order, without rescanning
    # a growing list for every hit.
    return list(dict.fromkeys(eachhit["_id"] for eachhit in jsonfile["hits"]))


#### Ping the API to get the ids of all clinical trials, scrolling through the source until the number of ids matches the count reported by the meta query
def get_source_ids():
    """Fetch the ``_id`` of every ClinicalTrial record via the scrolling API.

    The first request uses ``fetch_all=true``; follow-up requests pass the
    returned ``_scroll_id`` until the number of unique ids reaches the count
    from ``fetch_src_size()``. Scrolling is best-effort: if a follow-up page
    fails or brings no new ids, the ids collected so far are returned.

    Returns:
        list: unique ids in first-seen order.
    """
    source_size = fetch_src_size()
    base_url = "https://api.outbreak.info/resources/resource/query?q=@type:ClinicalTrial&fields=_id&fetch_all=true"
    # Response.json() avoids the `json` module, which this notebook never imports.
    response = requests.get(base_url).json()
    # Dict keys act as an insertion-ordered set of the ids seen so far.
    collected = dict.fromkeys(get_ids_from_json(response))
    scroll_id = response.get("_scroll_id")
    while scroll_id and len(collected) < source_size:
        try:
            page = requests.get(base_url + "&scroll_id=" + scroll_id).json()
            new_ids = get_ids_from_json(page)
        except (KeyError, ValueError, requests.RequestException) as err:
            # Keep the partial result, but say why scrolling stopped.
            print("stopping scroll early:", repr(err))
            break
        count_before = len(collected)
        collected.update(dict.fromkeys(new_ids))
        if len(collected) == count_before:
            # A page with nothing new means the scroll is exhausted; without
            # this check the loop would never end when the reported total is
            # larger than what the scroll can deliver.
            print("no new ids returned; stopping scroll")
            break
        if "_scroll_id" in page:
            scroll_id = page["_scroll_id"]
        else:
            print("no new scroll id")
    return list(collected)
 

In [3]:
#### Get the key metadata for all clinical trials
#### Get the metadata for each list
#### Note, I've tried batches of 1000, and the post request has failed, so this uses a batch size that's less likely to fail
def batch_fetch_clin_meta(idlist, batch_size=100):
    """Fetch text metadata for clinical-trial ids in fixed-size batches.

    Each batch is POSTed to the query endpoint. Every trial is expanded to one
    row per intervention; trials without interventions keep a single row with
    empty intervention columns.

    Args:
        idlist: list of record ``_id`` strings.
        batch_size: ids per request (default 100; must be > 0).

    Returns:
        pandas.DataFrame with columns ``_id``, ``abstract``, ``name``,
        ``description``, ``designPrimaryPurpose``, ``studyType``,
        ``interventionCategory`` and ``interventionName``. Empty, with those
        columns, when ``idlist`` is empty or every request fails.
    """
    trial_columns = ['_id','abstract','trialName','trialDescription',
                     'designPrimaryPurpose','studyType']
    all_columns = trial_columns + ['interventionCategory','interventionName']
    frames = []
    # Plain slicing visits every id exactly once; the old round()-based run
    # count fetched lists of 51-99 ids twice and sent an empty request for
    # lengths such as 150 or 260.
    for start in range(0, len(idlist), batch_size):
        sample_ids = ','.join(idlist[start:start + batch_size])
        ## Get the text-based metadata (abstract, title) and save it
        # NOTE(review): the ids travel in the query string (`params=`) of a
        # POST; confirm against the API docs whether a form body is expected.
        r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': sample_ids, 'scopes': '_id', 'fields': 'name,abstract,description,interventions,studyDesign'})
        if r.status_code != 200:
            # Batches that fail are still skipped, but no longer silently.
            print("batch starting at", start, "failed with status", r.status_code)
            continue
        # r.json() replaces json.loads, which needed the unimported `json` module.
        structuredresult = pd.json_normalize(r.json()).rename(
            columns={'name':'trialName', 'description':'trialDescription',
                     'studyDesign.designPrimaryPurpose':'designPrimaryPurpose',
                     'studyDesign.studyType':'studyType'})
        # reindex keeps only the needed fields (dropping the other
        # studyDesign.* columns) and adds any field missing from this batch
        # as NaN instead of raising KeyError.
        structuredresult = structuredresult.reindex(columns=trial_columns + ['interventions'])
        exploded = structuredresult.explode('interventions').reset_index(drop=True)
        has_intervention = exploded['interventions'].notna()
        if has_intervention.any():
            with_interventions = exploded.loc[has_intervention]
            # Expand each intervention record into its own columns.
            details = with_interventions['interventions'].apply(pd.Series)
            interventions = pd.concat([with_interventions.drop(columns='interventions'), details], axis=1)
            clean_interventions = interventions.reindex(columns=trial_columns + ['category','name']).rename(
                columns={'name':'interventionName','category':'interventionCategory'})
            frames.append(clean_interventions)
        if (~has_intervention).any():
            frames.append(exploded.loc[~has_intervention, trial_columns])
    if frames:
        # One concat at the end instead of growing the frame on every batch.
        textdf = pd.concat(frames, ignore_index=True).reindex(columns=all_columns)
    else:
        textdf = pd.DataFrame(columns=all_columns)
    return textdf.rename(columns={'trialName':'name','trialDescription':'description'})
        

## Functions for transforming the metadata

In [4]:
#### Merge text from the name, abstract, and description
#### Clean up the text

def merge_texts(df):
    """Build a cleaned, lower-cased ``text`` column from name, abstract and description.

    Missing values are replaced with ``''`` in place, across the whole frame.
    The three text columns are joined with single spaces, non-word characters
    become spaces, and stand-alone single letters are removed.

    Args:
        df: DataFrame with ``name``, ``abstract`` and ``description`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with a ``text`` column added or overwritten.
    """
    df.fillna('',inplace=True)
    merged = df['name'].astype(str).str.cat(
        df['abstract'].astype(str).str.cat(df['description'].astype(str), sep=' '),
        sep=' ')
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which made these steps do nothing.
    merged = merged.str.replace(r'\W', ' ', regex=True)
    merged = merged.str.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)
    # Anchor at the start of the text. The previous pattern escaped the caret
    # (r'\^...') and so looked for a literal '^', which cannot survive the
    # \W replacement above.
    merged = merged.str.replace(r'^[a-zA-Z]\s+', ' ', regex=True)
    df['text'] = merged.str.lower()
    return(df)
    

## Functions for training a classifier

In [5]:
def generate_training_df(pos_id,maybe_neg_id,original_data,pos_label='in category',neg_label='not in category'):
    """Build a labelled binary training set from positive and candidate-negative ids.

    Args:
        pos_id: iterable of ``_id`` values that belong to the category.
        maybe_neg_id: iterable of candidate negative ``_id`` values; any that
            also appear in ``pos_id`` are dropped.
        original_data: DataFrame with ``_id``, ``name``, ``abstract`` and
            ``description`` columns. It is passed through ``merge_texts``,
            which modifies it in place.
        pos_label: ``target`` value for positive rows.
        neg_label: ``target`` value for negative rows.

    Returns:
        pandas.DataFrame with columns ``_id``, ``text`` and ``target``:
        positive rows first, then negative rows, with a fresh index.
    """
    pos_lookup = set(pos_id)
    # Use the parameter here: the previous code read a global named
    # `maybe_neg`, ignoring the argument (or raising NameError without it).
    neg_lookup = {item for item in maybe_neg_id if item not in pos_lookup}
    original_data = merge_texts(original_data)
    with_text = original_data.loc[~original_data['text'].isna(), ['_id','text']]
    # assign() returns new frames, so no value is written into a slice.
    training_set_pos = with_text.loc[with_text['_id'].isin(pos_lookup)].assign(target=pos_label)
    training_set_neg = with_text.loc[with_text['_id'].isin(neg_lookup)].assign(target=neg_label)
    training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)
    return(training_set)


In [None]:
#### Basic process
## Create binary training set using Behavioral as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load these id lists when they come from a trusted source.
with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

## Positives: the curated behavioral ids plus records labelled 'Behavioral Research'
pos_id = list(set(behavioral).union(set(traindf['_id'].loc[traindf['topicCategory']=='Behavioral Research'].tolist())))
## Candidate negatives: every other labelled record plus the diagnosis, prevention and treatment ids
maybe_neg = list(set(traindf['_id'].loc[traindf['topicCategory']!='Behavioral Research'].tolist()).union(
                 set(diagnosis).union(set(prevention).union(set(treatment)))))

# Set difference instead of scanning the pos_id list once per candidate.
neg_id = list(set(maybe_neg) - set(pos_id))
original_data = merge_texts(original_data)
# merge_texts fills missing values with '', so the isna() filter keeps every row.
# .copy() makes each subset independent, so adding 'target' does not write
# into a slice of original_data (SettingWithCopyWarning).
training_set_pos = original_data.loc[(original_data['_id'].isin(pos_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_pos['target']='behavioral'
training_set_neg = original_data.loc[(original_data['_id'].isin(neg_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_neg['target']='not behavioral'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Diagnosis as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load these id lists when they come from a trusted source.
with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

## Positives: the curated diagnosis ids plus records labelled 'Diagnosis'
pos_id = list(set(diagnosis).union(set(traindf['_id'].loc[traindf['topicCategory']=='Diagnosis'].tolist())))
## Candidate negatives: every other labelled record plus the treatment and prevention ids
maybe_neg = list(set(traindf['_id'].loc[traindf['topicCategory']!='Diagnosis'].tolist()).union(
                 set(treatment).union(set(prevention))))

# Set difference instead of scanning the pos_id list once per candidate.
neg_id = list(set(maybe_neg) - set(pos_id))
original_data = merge_texts(original_data)
# merge_texts fills missing values with '', so the isna() filter keeps every row.
# .copy() makes each subset independent, so adding 'target' does not write
# into a slice of original_data (SettingWithCopyWarning).
training_set_pos = original_data.loc[(original_data['_id'].isin(pos_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_pos['target']='diagnosis'
training_set_neg = original_data.loc[(original_data['_id'].isin(neg_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_neg['target']='not diagnosis'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Prevention as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load these id lists when they come from a trusted source.
with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

## Positives: the curated prevention ids plus records labelled 'Prevention' or 'Individual Prevention'
pos_id = list(set(prevention).union(set(traindf['_id'].loc[(traindf['topicCategory']=='Prevention')|
                                                           (traindf['topicCategory']=='Individual Prevention')].tolist())))
## Candidate negatives: every other labelled record plus the treatment and diagnosis ids
maybe_neg = list(set(traindf['_id'].loc[((traindf['topicCategory']!='Prevention')&
                                        (traindf['topicCategory']!='Individual Prevention'))].tolist()).union(
                 set(treatment).union(set(diagnosis))))

# Set difference instead of scanning the pos_id list once per candidate.
neg_id = list(set(maybe_neg) - set(pos_id))
original_data = merge_texts(original_data)
# merge_texts fills missing values with '', so the isna() filter keeps every row.
# .copy() makes each subset independent, so adding 'target' does not write
# into a slice of original_data (SettingWithCopyWarning).
training_set_pos = original_data.loc[(original_data['_id'].isin(pos_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_pos['target']='prevention'
training_set_neg = original_data.loc[(original_data['_id'].isin(neg_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_neg['target']='not prevention'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Create binary training set using Treatment as an example
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load these id lists when they come from a trusted source.
with open('data/behavioral.txt','rb') as dmpfile:
    behavioral = pickle.load(dmpfile)

with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

with open('data/diagnosis.txt','rb') as dmpfile:
    diagnosis = pickle.load(dmpfile)

with open('data/prevention.txt','rb') as dmpfile:
    prevention = pickle.load(dmpfile)

with open('data/treatment.txt','rb') as dmpfile:
    treatment = pickle.load(dmpfile)

with open('data/drug.txt','rb') as dmpfile:
    drug = pickle.load(dmpfile)

with open('data/basic science.txt','rb') as dmpfile:
    basic_science = pickle.load(dmpfile)

with open('data/observational.txt','rb') as dmpfile:
    observation = pickle.load(dmpfile)


## Positives: the curated treatment ids plus records labelled Treatment, Vaccines,
## Biologics or Medical Care, minus anything listed as basic science or observational
pos_id = list(set(treatment).union(set(traindf['_id'].loc[((traindf['topicCategory']=='Treatment')|
                                                           (traindf['topicCategory']=='Vaccines')|
                                                           (traindf['topicCategory']=='Biologics')|
                                                           (traindf['topicCategory']=='Medical Care'))].tolist()))-set(basic_science)-set(observation))
## Candidate negatives: every other labelled record plus the prevention, diagnosis,
## basic science and observational ids
maybe_neg = list(set(traindf['_id'].loc[((traindf['topicCategory']!='Treatment')&
                                         (traindf['topicCategory']!='Vaccines')&
                                         (traindf['topicCategory']!='Biologics')&
                                         (traindf['topicCategory']!='Medical Care'))].tolist()).union(
                 set(prevention).union(set(diagnosis).union(set(basic_science).union(set(observation))))))

# Set difference instead of scanning the pos_id list once per candidate.
neg_id = list(set(maybe_neg) - set(pos_id))
original_data = merge_texts(original_data)
# merge_texts fills missing values with '', so the isna() filter keeps every row.
# .copy() makes each subset independent, so adding 'target' does not write
# into a slice of original_data (SettingWithCopyWarning).
training_set_pos = original_data.loc[(original_data['_id'].isin(pos_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_pos['target']='treatment'
training_set_neg = original_data.loc[(original_data['_id'].isin(neg_id))&
                                     (~original_data['text'].isna()), ['_id','text']].copy()
training_set_neg['target']='not treatment'

training_set = pd.concat((training_set_pos,training_set_neg),ignore_index=True)

In [None]:
## Sanity-check the labelled set: total rows, rows per category, and how many
## rows fall outside / inside the treatment-like categories.
with open('data/specificCats.txt','rb') as dompfile:
    traindf = pickle.load(dompfile)

is_treatment_like = traindf['topicCategory'].isin(['Treatment','Vaccines','Biologics','Medical Care'])

print(len(traindf))
print(traindf.groupby('topicCategory').size())
print(len(traindf['_id'].loc[~is_treatment_like]))
print(len(traindf['_id'].loc[is_treatment_like]))

In [None]:
## Number of positive, then negative, training examples (one per line)
print(len(training_set_pos), len(training_set_neg), sep='\n')

In [None]:
####Vectorize the text for classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(X.shape)

#### Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, training_set.target, test_size=0.2, random_state=0)

In [None]:
#### Fit a random forest that labels text as in category or not in category
from sklearn.ensemble import RandomForestClassifier

## 1000 trees, fixed seed
classifier = RandomForestClassifier(random_state=0, n_estimators=1000)
classifier.fit(X=X_train, y=y_train)

In [None]:
## Predicted label for every held-out test row
y_pred = classifier.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

## Confusion matrix with an explicit label order.
# scikit-learn sorts string labels, so which class lands in row/column 0
# depends on the category name ('behavioral' sorts before 'not behavioral',
# 'treatment' sorts after 'not treatment'). Printing the order actually used
# replaces the fixed "true neg / false pos" legend, which was inverted for
# some categories.
class_order = list(classifier.classes_)
print(confusion_matrix(y_test,y_pred,labels=class_order))
print('rows = true label, columns = predicted label, in this order:', class_order)

report = classification_report(y_test,y_pred,output_dict=True)
print(pd.DataFrame(report))

## ROC AUC from the predicted probability of the second class
probs = classifier.predict_proba(X_test)
# Column 1 belongs to classifier.classes_[1].
# NOTE(review): roc_auc_score is expected to treat the greater of the two
# sorted labels as positive, which matches this column — confirm.
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
print(auc)

## Expand beyond training sets

In [None]:
#### Run the classifier on the entire training dataset without splitting
####Vectorize the training set for classifier

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(training_set['text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(X.shape)

## Save the vectorizer (switch the path to match the category being trained)
#vectorizerfile = "results/models/behavioral_vectorizer.pickle"
#vectorizerfile = "results/models/diagnosis_vectorizer.pickle"
#vectorizerfile = "results/models/prevention_vectorizer.pickle"
vectorizerfile = "results/models/treatment_vectorizer.pickle"
# Create the output folder if needed, and close the file handle deterministically.
os.makedirs(os.path.dirname(vectorizerfile), exist_ok=True)
with open(vectorizerfile, "wb") as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)

#### train the model on all the data
# NOTE(review): random_state=None makes each retrain differ; set a seed if the
# saved model needs to be reproducible.
classifier = RandomForestClassifier(n_estimators=1000, random_state=None)
classifier.fit(X, training_set.target)

## Save the Model (switch the path to match the category being trained)
#filename = 'results/models/behavioral_randomforest.sav'
#filename = 'results/models/diagnosis_randomforest.sav'
#filename = 'results/models/prevention_randomforest.sav'
filename = 'results/models/treatment_randomforest.sav'
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as model_out:
    pickle.dump(classifier, model_out)


In [None]:
## Load the saved models
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# written by this notebook or another trusted source.
# The with-block closes the file handle, which the bare open() call left open.
with open(filename, 'rb') as model_in:
    loaded_model = pickle.load(model_in)


In [None]:
#### Apply the model
## Pull out all the data
idlist = get_source_ids()
# The fetch helper is named batch_fetch_clin_meta; the previous call to
# batch_fetch_meta raised NameError.
textdf = batch_fetch_clin_meta(idlist)
textdf = merge_texts(textdf)

## Remove records with no text
# merge_texts fills missing values with '', so the length check is the part
# of this filter that actually removes rows.
nonan = textdf.loc[((~textdf['text'].isna())&
                   (textdf['text'].str.len()>5))]

print(len(idlist), len(nonan))

In [None]:

## Transform the text with the already-fitted vectorizer, then classify it
labels = nonan['_id']
M = vectorizer.transform(nonan['text'])
prediction = classifier.predict(M)

## Save the results
## To score another category, swap the prediction column name and output path:
#   'behavioral_prediction' -> 'results/predictions/behavioral_randomforest.tsv'
#   'diagnosis_prediction'  -> 'results/predictions/diagnosis_randomforest.tsv'
#   'prevention_prediction' -> 'results/predictions/prevention_randomforest.tsv'
id_prediction_pairs = list(zip(labels, prediction))
classifier_results = pd.DataFrame(id_prediction_pairs,
                                  columns=['_id', 'treatment_prediction'])
print(classifier_results.head(2))
classifier_results.to_csv(
    'results/predictions/treatment_randomforest.tsv',
    sep='\t',
    header=True,
)

In [None]:
## Count how many records received each predicted label
rows_per_label = classifier_results.groupby('treatment_prediction').size()
inspect_results = rows_per_label.reset_index(name='counts')
print(inspect_results)