In [1]:
import os, re
import numpy as np
import pickle as pkl

In [2]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine


# Connection settings for the local `aact` PostgreSQL database.
# Every value can be overridden through the environment so real credentials
# do not have to be stored in the notebook; the fallbacks are the values this
# notebook originally hard-coded.
cargs = {
    'database': os.environ.get("AACT_DB_NAME", "aact"),
    'user': os.environ.get("AACT_DB_USER", "thomas"),
    'password': os.environ.get("AACT_DB_PASSWORD", "mypass"),
}
db_host = os.environ.get("AACT_DB_HOST", "127.0.0.1")

# User and password are passed via connect_args (forwarded to the DBAPI
# connect call as keyword arguments), so special characters in the password
# need no URL escaping.
alchemyEngine = create_engine('postgresql+psycopg2://' + db_host,
                              pool_recycle=3600, connect_args=cargs)

# Run both look-ups on one connection and release it as soon as they finish.
# `dbConnection` is closed once this block ends; open a new one from
# `alchemyEngine` if later cells need the database again.
with alchemyEngine.connect() as dbConnection:
    # NCT ids of trials tagged with the HIV/AIDS MeSH term.
    df_hiv_trials = pd.read_sql("""SELECT nct_id
        FROM ctgov.browse_conditions
        where mesh_term like 'Acquired Immunodeficiency Syndrome';""", dbConnection)

    # NCT ids of trials tagged with the tuberculosis MeSH term.
    df_tb_trials = pd.read_sql("""SELECT nct_id
        FROM ctgov.browse_conditions
        where mesh_term like 'Tuberculosis';""", dbConnection)


In [4]:
# Root directory that the file-loading loop walks for plain-text files.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
input_folder = "/media/thomas/642d0db5-2c98-4156-b591-1a3572c5868c/data_open/clinicaltrials.gov/data/plain_text/"


In [26]:
# Load the plain-text documents under `input_folder` and label each one from
# the NCT id embedded in its file name:
#   1 = id found in df_hiv_trials, 2 = id found in df_tb_trials, 0 = neither.
# Only the first MAX_UNLABELLED_FILES category-0 documents are kept.
MAX_UNLABELLED_FILES = 40
TEXT_SUFFIX = ".txt"

# Build the id look-up sets once; rebuilding them for every file made each
# iteration scan the whole id column.
hiv_nct_ids = set(df_hiv_trials.nct_id)
tb_nct_ids = set(df_tb_trials.nct_id)

file_to_text = {}
file_to_category = {}
ctr = 0  # number of category-0 text files seen so far

for root, folder, files in os.walk(input_folder):
    for file_name in files:
        # Filter on the extension first so that non-text files do not use up
        # the category-0 quota.
        if not file_name.endswith(TEXT_SUFFIX):
            continue

        # Drop everything before the last "NCT", then everything from the
        # first underscore onward, leaving the bare NCT id.
        nct_id = re.sub(r'_.+', '', re.sub(r'.+NCT', 'NCT', file_name))

        if nct_id in hiv_nct_ids:
            category = 1
        elif nct_id in tb_nct_ids:
            category = 2
        else:
            category = 0
            ctr += 1
            if ctr > MAX_UNLABELLED_FILES:
                continue

        # Remove only the trailing extension (the previous unescaped,
        # unanchored ".txt" regex could also eat characters mid-name).
        pdf_file = file_name[:-len(TEXT_SUFFIX)]

        # Join with the directory currently being walked so that files in
        # sub-folders are opened from where they actually live.
        full_file = os.path.join(root, file_name)
        with open(full_file, 'r') as f:
            text = f.read()
        file_to_text[pdf_file] = text
        file_to_category[pdf_file] = category

In [20]:
df_hiv_trials.nct_id

0       NCT03738410
1       NCT03541642
2       NCT03541382
3       NCT00863668
4       NCT00856154
           ...     
1619    NCT01590017
1620    NCT04201288
1621    NCT00841360
1622    NCT00836212
1623    NCT00846599
Name: nct_id, Length: 1624, dtype: object

In [14]:
# Debug check: the last NCT id assigned inside the file-loading loop.
nct_id

'NCT01990768'

In [5]:
from sklearn.model_selection import train_test_split


In [35]:
# Build the corpus: `words` holds one text per entry, `categories` the
# matching class label (0 = other, 1 = HIV, 2 = TB), chosen from the file-name
# prefix. HIV documents are added twice and TB documents four times,
# presumably to balance the classes.
# NOTE(review): `file_to_category` is filled by the loader but is not used
# here — confirm that the file-name prefix is the intended label source.
# NOTE(review): the repetitions happen before the train/test split, so copies
# of one document can end up on both sides of it.
words = []
categories = []

for file, pages in file_to_text.items():
    if file.startswith("HIV"):
        cat, repetitions = 1, 2
    elif file.startswith("TB"):
        cat, repetitions = 2, 4
    else:
        cat, repetitions = 0, 1

    # A document may be stored as a single string or as a list of page
    # strings. Joining a plain string would insert a space between every
    # character, so only lists of pages are joined.
    document = pages if isinstance(pages, str) else " ".join(pages)

    words.extend([document] * repetitions)
    categories.extend([cat] * repetitions)

In [36]:
from collections import Counter
# Number of corpus entries per class label, counted after the per-class repetitions.
Counter(categories)

Counter({2: 48, 1: 48, 0: 40})

In [37]:
# Hold out 10% of the corpus entries for evaluation; the fixed seed makes the
# shuffle reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(
    words,
    categories,
    test_size=0.1,
    random_state=10,
)


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer


In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB



In [172]:
# TF-IDF over purely alphabetic tokens of at least two letters, with English
# stop words removed, keeping at most 1000 terms that each occur in 5 or more
# documents.
tfidf_options = dict(
    token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b',
    stop_words='english',
    max_features=1000,
    min_df=5,
)
vectoriser = TfidfVectorizer(**tfidf_options)

# Multinomial naive Bayes on top of the TF-IDF features, chained as one estimator.
nb = MultinomialNB()
model = make_pipeline(vectoriser, nb)


In [173]:
len(features_train)

122

In [174]:
# Fit the TF-IDF vocabulary and the naive Bayes classifier on the training split.
model.fit(features_train, labels_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000, min_df=5,
                                 stop_words='english',
                                 token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')),
                ('multinomialnb', MultinomialNB())])

In [175]:
labels_train

[0,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 0,
 2,
 0,
 2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 1,
 2,
 1,
 2,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 0,
 1,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1]

In [176]:
vectoriser.vocabulary_

{'title': 928,
 'repeat': 764,
 'drug': 269,
 'control': 174,
 'malaria': 513,
 'safety': 802,
 'efficacy': 283,
 'study': 886,
 'short': 835,
 'trial': 936,
 'irb': 462,
 'gov': 356,
 'sponsor': 864,
 'university': 949,
 'delivery': 214,
 'usa': 956,
 'phone': 643,
 'edu': 278,
 'principal': 683,
 'investigators': 459,
 'infectious': 432,
 'diseases': 250,
 'laboratory': 482,
 'department': 217,
 'email': 288,
 'institute': 443,
 'en': 289,
 'la': 479,
 'centre': 116,
 'com': 137,
 'confidentiality': 163,
 'document': 256,
 'confidential': 162,
 'information': 435,
 'investigator': 458,
 'team': 914,
 'relevant': 762,
 'ethics': 304,
 'regulatory': 757,
 'contents': 172,
 'research': 775,
 'protocol': 702,
 'non': 580,
 'registration': 754,
 'data': 200,
 'summary': 893,
 'background': 81,
 'rationale': 726,
 'primary': 682,
 'objective': 587,
 'hypothesis': 388,
 'overview': 611,
 'design': 222,
 'sites': 846,
 'population': 657,
 'interventions': 454,
 'outcome': 607,
 'measures': 5

In [177]:
# Inspection only: dense TF-IDF matrix of the training documents.
vectoriser.transform(features_train).todense()

matrix([[0.0099286 , 0.0065001 , 0.        , ..., 0.00755548, 0.00415705,
         0.        ],
        [0.        , 0.00158689, 0.00033628, ..., 0.        , 0.        ,
         0.00177962],
        [0.02909252, 0.00926583, 0.00381802, ..., 0.00119669, 0.00065843,
         0.        ],
        ...,
        [0.        , 0.00280325, 0.00445534, ..., 0.00325839, 0.        ,
         0.        ],
        [0.        , 0.01052561, 0.        , ..., 0.00489383, 0.07449533,
         0.        ],
        [0.00160176, 0.00838919, 0.        , ..., 0.00609455, 0.0013413 ,
         0.        ]])

In [178]:
model.predict_proba(["potato"])

array([[0.31147541, 0.3442623 , 0.3442623 ]])

In [179]:
model.predict_proba(["hiv"])

array([[0.0941472 , 0.70621191, 0.19964089]])

In [180]:
model.predict_proba(["tb"])

array([[0.06643297, 0.15703032, 0.77653671]])

In [181]:
# Per-class log-probabilities of each vocabulary term (one row per class).
# `feature_log_prob_` is used instead of `coef_`: scikit-learn deprecated
# `coef_` on the naive Bayes estimators in 0.24 and removed it in 1.1. For a
# model with more than two classes both attributes hold the same array.
nb.feature_log_prob_



array([[-6.95350599, -7.06616611, -7.11439007, ..., -6.31644532,
        -7.02925314, -7.13227937],
       [-6.96662908, -7.08167102, -7.1334698 , ..., -6.8012508 ,
        -7.05411167, -7.2775513 ],
       [-7.21523829, -7.17056852, -7.03392871, ..., -7.0535232 ,
        -7.22725682, -7.25994778]])

In [182]:
# Map every vocabulary term to its log-probability under the first class
# (row 0, i.e. label 0). Reads `feature_log_prob_` because `coef_` no longer
# exists on MultinomialNB in scikit-learn >= 1.1; for this three-class model
# the values are identical.
class0_log_probs = nb.feature_log_prob_[0]
vocab_to_coef = {w: class0_log_probs[s] for w, s in vectoriser.vocabulary_.items()}

In [183]:
# top words:
# {"hiv", "vaginal", "tb", "tuberculosis", "viral"}

In [184]:
import operator

# Terms ranked from the highest to the lowest stored score.
sorted(vocab_to_coef.items(), key=lambda term_score: term_score[1], reverse=True)

[('study', -4.630897782375148),
 ('vaccine', -5.641307836341274),
 ('malaria', -5.653302823022601),
 ('treatment', -5.659814134505136),
 ('data', -5.668358325476766),
 ('protocol', -5.765899099051372),
 ('trial', -5.857468306915456),
 ('adverse', -5.9694918818533385),
 ('dose', -5.971441659192785),
 ('clinical', -5.987034454525645),
 ('health', -6.002490542151783),
 ('children', -6.041113702389754),
 ('blood', -6.049190553165462),
 ('safety', -6.090435646691197),
 ('analysis', -6.09826634554034),
 ('day', -6.1176564523321755),
 ('version', -6.121560400217724),
 ('maternal', -6.132844449101317),
 ('visit', -6.144752168928951),
 ('consent', -6.147041093600412),
 ('infant', -6.166214004746069),
 ('vaccination', -6.173826827633823),
 ('time', -6.175787509920507),
 ('events', -6.207587456791659),
 ('participants', -6.208516976968679),
 ('subjects', -6.219249566334848),
 ('information', -6.219399573252411),
 ('drug', -6.236074722009025),
 ('age', -6.244691589995005),
 ('medical', -6.24623624

In [185]:
# Terms ranked from the lowest to the highest stored score.
sorted(vocab_to_coef.items(), key=lambda term_score: term_score[1])

[('ou', -7.282729596096433),
 ('inh', -7.282729596096433),
 ('hvtn', -7.282729596096433),
 ('ics', -7.282729596096433),
 ('mab', -7.282729596096433),
 ('mdr', -7.282729596096433),
 ('bdq', -7.282729596096433),
 ('bedaquiline', -7.282729596096433),
 ('aeras', -7.282729596096433),
 ('qft', -7.282729596096433),
 ('dtg', -7.282729596096433),
 ('prep', -7.2821693204930495),
 ('pyrazinamide', -7.281494563487743),
 ('mtb', -7.2799793193452205),
 ('rif', -7.279577347951183),
 ('hptn', -7.279558977823938),
 ('xpert', -7.27918240297722),
 ('isoniazid', -7.277247067363821),
 ('ring', -7.276778810428119),
 ('itraconazole', -7.27648385974948),
 ('iris', -7.276303487617861),
 ('pour', -7.275694583211842),
 ('docx', -7.271642281136356),
 ('rpv', -7.2698659010677895),
 ('stream', -7.264304317478764),
 ('proprietary', -7.264220003482225),
 ('cab', -7.26321251199107),
 ('bcg', -7.260280919716448),
 ('injections', -7.259456797686266),
 ('tuberculosis', -7.258314843225692),
 ('des', -7.256947197045764),
 

In [186]:
# Predicted class labels for the held-out documents.
y_pred = model.predict(features_test)

In [187]:
y_pred

array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2])

In [200]:
labels_test

[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2]

In [194]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [195]:
# Fraction of held-out entries whose predicted label equals the true label.
# NOTE(review): documents were repeated before the train/test split, so a test
# entry may have an identical copy in the training set — treat this score as
# optimistic until the split is done before the repetition.
accuracy_score(labels_test, y_pred)

0.9285714285714286

In [196]:
model.predict_proba(["hiv"])

array([[0.0941472 , 0.70621191, 0.19964089]])

In [197]:
model.predict_proba(["potato"])

array([[0.31147541, 0.3442623 , 0.3442623 ]])

In [198]:
model.predict_log_proba(["potato"])

array([[-1.16643489, -1.06635143, -1.06635143]])

In [199]:
model.predict_log_proba(["hiv"])

array([[-2.36289577, -0.34783993, -1.61123507]])

NameError: name 'features_test' is not defined