## Clinical Trials training set creation

In [1]:
import pandas
from pandas import read_csv
import pickle

In [2]:
original_data = read_csv('data/NCT_classification.csv', delimiter=',',header=0)
print(original_data.head(n=2))

           _id                                               name  \
0  NCT03348670  Follow NCT03305341 - Conducting an Initial Sma...   
1  NCT04273646  Clinical Study of Human Umbilical Cord Mesench...   

                                            abstract  \
0  Conducting an initial small, controlled clinic...   
1  The novel coronavirus pneumonia is a kind of n...   

                                         description  \
0  Conducting an initial small, controlled trial ...   
1  Since December 2019, Wuhan has successively fo...   

                                            keywords  \
0  ['Escape Mutation', 'ADE', 'Antigen Presentati...   
1  ['2019 Novel Coronavirus Pneumonia', 'Safety',...   

                                               url interventionCategory  \
0  https://clinicaltrials.gov/ct2/show/NCT03348670           biological   
1  https://clinicaltrials.gov/ct2/show/NCT04273646     biological; drug   

        studyType designAllocation designPrimaryPurpose  
0

In [3]:
keyworddflist = []

for i in range(len(original_data)):
    identifier = original_data.iloc[i]['_id']
    keywordlist = str(original_data.iloc[i]['keywords']).strip('[').strip(']').replace('-',' ').strip('"').replace('"','').replace("'","").split(',')
    for eachword in keywordlist:
        keyworddflist.append({'_id':identifier,'keyword':eachword.lower()})

keyworddf = pandas.DataFrame(keyworddflist)
keyworddf.sort_values('keyword',ascending=True,inplace=True)
print(keyworddf.head(n=2))

              _id keyword
370   NCT04313322        
7226  NCT04466839        


In [4]:
frequencies = keyworddf.groupby('keyword').size().reset_index(name='counts')
frequencies.sort_values('counts',ascending=False,inplace=True)
print(frequencies.head(n=60))

                                               keyword  counts
2994                                               nan    1268
2704                                          covid 19     498
641                                           covid 19     396
2130                                        sars cov 2     245
619                                        coronavirus     136
3171                                        sars cov 2      98
2685                                       coronavirus      66
1817                                         pneumonia      63
1127                                hydroxychloroquine      61
306                                               ards      58
2730                                           covid19      44
667                                            covid19      41
640                                              covid      37
2143                                         sars cov2      36
2701                                             covid 

In [5]:
interventionlist = []

for i in range(len(original_data)):
    identifier = original_data.iloc[i]['_id']
    purpose = original_data.iloc[i]['designPrimaryPurpose']
    tmplist = str(original_data.iloc[i]['interventionCategory']).replace('; ',';').split(';')
    for eachword in tmplist:
        interventionlist.append({'_id':identifier,'designPrimaryPurpose':purpose,
                                 'interventionCategory':eachword.lower()})

interventiondf = pandas.DataFrame(interventionlist)
purpose_frequency = interventiondf.groupby(['designPrimaryPurpose','interventionCategory']).size().reset_index(name='counts')
purpose_frequency.sort_values('counts',ascending=False,inplace=True)
print(purpose_frequency.head(n=2))

   designPrimaryPurpose interventionCategory  counts
63            treatment                 drug     796
64            treatment                other     245


In [6]:
mapping_dict = {'prevention':'Prevention',
                'depression':'Behavioral Research',
                'anxiety':'Behavioral Research',
                'mental health':'Behavioral Research',
                'stress':'Behavioral Research',
                'lung ultrasound':'Pathology/Radiology',
                'treatment':'Treatment',
                'convalescent plasma':'Biologics',
                'mortality':'Risk Factors',
                'mechanical ventilation':'Medical Care',
                'critical care':'Medical Care',
                'epidemiology':'Epidemiology',
                'personal protective equipment':'Individual Prevention',
                'vaccine':'Vaccines',
                'anosmia':'Symptoms',
                'cell therapy':'Biologics',
                'mesenchymal stem cells':'Biologics',
                'diagnosis':'Diagnosis',
                'rehabilitation':'Medical Care',
                'pain':'Symptoms',
                'obesity':'Risk Factors'}

treatmentmap = {"biological":'Biologics',
                "behavioral":"Behavioral Research",
                "device":"Medical Care",
                "procedure":"Medical Care",
                "radiation":"Medical Care"}
basicmap = {"diagnostic":"Diagnosis",
            "treatment":"Treatment",
            "prevention":"Prevention",
            "supportive care":"Treatment",
            "behavioral":"Behavioral Research"}

In [7]:
## Creating the lists
behavioral = interventiondf['_id'].loc[interventiondf['interventionCategory']=='behavioral'].tolist()
print(len(behavioral))
treatment = interventiondf['_id'].loc[(interventiondf['designPrimaryPurpose']=='treatment')|
                              (interventiondf['designPrimaryPurpose']=='supportive care')].tolist()

prevention = interventiondf['_id'].loc[interventiondf['designPrimaryPurpose']=='prevention'].tolist()
diagnosis = interventiondf['_id'].loc[interventiondf['designPrimaryPurpose']=='diagnostic'].tolist()

drug = interventiondf['_id'].loc[interventiondf['interventionCategory']=='drug'].tolist()

basic_science = interventiondf['_id'].loc[interventiondf['designPrimaryPurpose']=='basic science'].tolist()

observational = original_data['_id'].loc[original_data['studyType']=='observational'].tolist()

print("behavioral: ",len(behavioral),", treatment: ",len(treatment))
print("prevention: ",len(prevention),", diagnosis: ",len(diagnosis))

with open('data/behavioral.txt','wb') as dmpfile:
    pickle.dump(behavioral,dmpfile)

with open('data/treatment.txt','wb') as dumpfile:
    pickle.dump(treatment,dumpfile)

with open('data/prevention.txt','wb') as dampfile:
    pickle.dump(prevention,dampfile)

with open('data/diagnosis.txt','wb') as dimpfile:
    pickle.dump(diagnosis,dimpfile)

with open('data/drug.txt','wb') as dompfile:
    pickle.dump(drug,dompfile)

with open('data/basic science.txt','wb') as dampfile:
    pickle.dump(basic_science,dampfile)

with open('data/observational.txt','wb') as doompfile:
    pickle.dump(observational,doompfile)

191
behavioral:  191 , treatment:  1537
prevention:  276 , diagnosis:  77


In [None]:
### To determine if a drug is new or being repurposed, look the drug up in drugbank.ca or Wikidata for market status
### The training datasets for Diagnosis, prevention, and behavioral look to be at about the right size
### The treatment set should be subsetted as much as possible

In [8]:
mapping_text = list(mapping_dict.keys())
print(mapping_text)

['prevention', 'depression', 'anxiety', 'mental health', 'stress', 'lung ultrasound', 'treatment', 'convalescent plasma', 'mortality', 'mechanical ventilation', 'critical care', 'epidemiology', 'personal protective equipment', 'vaccine', 'anosmia', 'cell therapy', 'mesenchymal stem cells', 'diagnosis', 'rehabilitation', 'pain', 'obesity']


In [10]:
traindocs = []
for i in range(len(original_data)):
    identifier = original_data.iloc[i]['_id']
    keywords = str(original_data.iloc[i]['keywords']).strip('[').strip(']').replace('-',' ').strip('"').replace('"','').replace("'","").replace(',',' ')
    for eachword in mapping_text:
        if eachword in keywords:
            traindocs.append({'topicCategory':mapping_dict[eachword],'_id':identifier})

traindf = pandas.DataFrame(traindocs)
print(len(traindf))

with open('data/specificCats.txt','wb') as dompfile:
    pickle.dump(traindf,dompfile)

309


In [90]:
print(len(traindf.loc[traindf['topicCategory']=='Vaccines']))

21
