In [1]:
import os, re
import numpy as np
import pickle as pkl

In [2]:
input_folder = "../data/preprocessed_tika/"


In [4]:
# Load every pickled document found under input_folder into a dict keyed by
# the file name with its ".pkl" extension removed.
# Keys do not include the directory, so two files with the same name in
# different sub-folders overwrite each other (last one walked wins).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point input_folder at pickles produced by our own preprocessing.
# NOTE(review): each unpickled value is presumably a list of page strings
# (it is later passed to " ".join) -- confirm against the preprocessing step.
file_to_text = {}
for root, _dirs, files in os.walk(input_folder):
    for file_name in files:
        if not file_name.endswith(".pkl"):
            continue
        # Strip only the trailing extension; the previous unanchored,
        # unescaped pattern also removed things like "_pkl" mid-name.
        pdf_file = re.sub(r"\.pkl$", "", file_name)

        # Join with the directory os.walk is currently visiting so that files
        # in sub-folders resolve correctly (the old code always used the top
        # level folder and failed for nested files).
        full_file = os.path.join(root, file_name)
        with open(full_file, 'rb') as f:
            text = pkl.load(f)
        file_to_text[pdf_file] = text

In [5]:
from sklearn.model_selection import train_test_split


In [35]:
# Build the parallel sample / label lists used for training.
# Labels come from the file-name prefix: 1 for "HIV", 2 for "TB", 0 otherwise.
# HIV documents are added twice and TB documents four times (presumably to
# even out the class counts -- confirm).
# NOTE(review): the copies are created before the train/test split below, so
# identical documents can land in both the training and the test set.
words = []
categories = []

for file, pages in file_to_text.items():
    if file.startswith("HIV"):
        cat, repetitions = 1, 2
    elif file.startswith("TB"):
        cat, repetitions = 2, 4
    else:
        cat, repetitions = 0, 1

    # One string per document: all pages joined with single spaces.
    document = " ".join(pages)
    words.extend([document] * repetitions)
    categories.extend([cat] * repetitions)

In [36]:
from collections import Counter
Counter(categories)

Counter({2: 48, 1: 48, 0: 40})

In [37]:
# Hold out 10% of the samples for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) repeatable between runs.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    words,
    categories,
    test_size=0.1,
    random_state=10,
)


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer


In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB



In [241]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
# The token pattern keeps only tokens made of two or more ASCII letters, so
# numbers and single characters never enter the vocabulary.
vectoriser = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_features=1000,
    token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b',
)
nb = MultinomialNB()
# make_pipeline names the steps after the lower-cased class names.
model = make_pipeline(vectoriser, nb)


In [242]:
len(features_train)

122

In [243]:
model.fit(features_train, labels_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_features=1000, min_df=5,
                                 stop_words='english',
                                 token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')),
                ('multinomialnb', MultinomialNB())])

In [244]:
labels_train

[0,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 1,
 2,
 0,
 1,
 0,
 1,
 2,
 0,
 2,
 0,
 2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 2,
 2,
 0,
 2,
 1,
 2,
 1,
 2,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 1,
 2,
 1,
 1,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 2,
 0,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 0,
 1,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1]

In [245]:
vectoriser.vocabulary_

{'title': 928,
 'repeat': 764,
 'drug': 269,
 'control': 174,
 'malaria': 513,
 'safety': 802,
 'efficacy': 283,
 'study': 886,
 'short': 835,
 'trial': 936,
 'irb': 462,
 'gov': 356,
 'sponsor': 864,
 'university': 949,
 'delivery': 214,
 'usa': 956,
 'phone': 643,
 'edu': 278,
 'principal': 683,
 'investigators': 459,
 'infectious': 432,
 'diseases': 250,
 'laboratory': 482,
 'department': 217,
 'email': 288,
 'institute': 443,
 'en': 289,
 'la': 479,
 'centre': 116,
 'com': 137,
 'confidentiality': 163,
 'document': 256,
 'confidential': 162,
 'information': 435,
 'investigator': 458,
 'team': 914,
 'relevant': 762,
 'ethics': 304,
 'regulatory': 757,
 'contents': 172,
 'research': 775,
 'protocol': 702,
 'non': 580,
 'registration': 754,
 'data': 200,
 'summary': 893,
 'background': 81,
 'rationale': 726,
 'primary': 682,
 'objective': 587,
 'hypothesis': 388,
 'overview': 611,
 'design': 222,
 'sites': 846,
 'population': 657,
 'interventions': 454,
 'outcome': 607,
 'measures': 5

In [246]:
vectoriser.transform(features_train).todense()

matrix([[0.0099286 , 0.0065001 , 0.        , ..., 0.00755548, 0.00415705,
         0.        ],
        [0.        , 0.00158689, 0.00033628, ..., 0.        , 0.        ,
         0.00177962],
        [0.02909252, 0.00926583, 0.00381802, ..., 0.00119669, 0.00065843,
         0.        ],
        ...,
        [0.        , 0.00280325, 0.00445534, ..., 0.00325839, 0.        ,
         0.        ],
        [0.        , 0.01052561, 0.        , ..., 0.00489383, 0.07449533,
         0.        ],
        [0.00160176, 0.00838919, 0.        , ..., 0.00609455, 0.0013413 ,
         0.        ]])

In [247]:
model.predict_proba(["potato"])

array([[0.31147541, 0.3442623 , 0.3442623 ]])

In [248]:
model.predict_proba(["hiv"])

array([[0.0941472 , 0.70621191, 0.19964089]])

In [249]:
model.predict_proba(["tb"])

array([[0.06643297, 0.15703032, 0.77653671]])

In [250]:
# Per-class log P(word | class): one row per class, one column per vocabulary
# entry. `coef_` was only a deprecated alias of this attribute on
# MultinomialNB (identical values when there are more than two classes) and
# has been removed from current scikit-learn, so read the real attribute.
nb.feature_log_prob_



array([[-6.95350599, -7.06616611, -7.11439007, ..., -6.31644532,
        -7.02925314, -7.13227937],
       [-6.96662908, -7.08167102, -7.1334698 , ..., -6.8012508 ,
        -7.05411167, -7.2775513 ],
       [-7.21523829, -7.17056852, -7.03392871, ..., -7.0535232 ,
        -7.22725682, -7.25994778]])

In [251]:
# Map every vocabulary word to its log-likelihood under the first class row
# (label 0, i.e. documents whose file name has neither the HIV nor TB prefix).
# Uses feature_log_prob_ because the `coef_` alias on MultinomialNB is
# deprecated/removed in newer scikit-learn; values are the same for 3 classes.
vocab_to_coef = {
    word: nb.feature_log_prob_[0][column]
    for word, column in vectoriser.vocabulary_.items()
}

In [252]:
# top words:
# {"hiv", "vaginal", "tb", "tuberculosis", "viral"}

In [253]:
import operator

# Words ordered from highest to lowest score (second element of each pair).
sorted(
    vocab_to_coef.items(),
    key=lambda word_and_score: word_and_score[1],
    reverse=True,
)

[('study', -4.630897782375148),
 ('vaccine', -5.641307836341274),
 ('malaria', -5.653302823022601),
 ('treatment', -5.659814134505136),
 ('data', -5.668358325476766),
 ('protocol', -5.765899099051372),
 ('trial', -5.857468306915456),
 ('adverse', -5.9694918818533385),
 ('dose', -5.971441659192785),
 ('clinical', -5.987034454525645),
 ('health', -6.002490542151783),
 ('children', -6.041113702389754),
 ('blood', -6.049190553165462),
 ('safety', -6.090435646691197),
 ('analysis', -6.09826634554034),
 ('day', -6.1176564523321755),
 ('version', -6.121560400217724),
 ('maternal', -6.132844449101317),
 ('visit', -6.144752168928951),
 ('consent', -6.147041093600412),
 ('infant', -6.166214004746069),
 ('vaccination', -6.173826827633823),
 ('time', -6.175787509920507),
 ('events', -6.207587456791659),
 ('participants', -6.208516976968679),
 ('subjects', -6.219249566334848),
 ('information', -6.219399573252411),
 ('drug', -6.236074722009025),
 ('age', -6.244691589995005),
 ('medical', -6.24623624

In [254]:
# Words ordered from lowest to highest score (second element of each pair).
sorted(
    vocab_to_coef.items(),
    key=lambda word_and_score: word_and_score[1],
)

[('ou', -7.282729596096433),
 ('inh', -7.282729596096433),
 ('hvtn', -7.282729596096433),
 ('ics', -7.282729596096433),
 ('mab', -7.282729596096433),
 ('mdr', -7.282729596096433),
 ('bdq', -7.282729596096433),
 ('bedaquiline', -7.282729596096433),
 ('aeras', -7.282729596096433),
 ('qft', -7.282729596096433),
 ('dtg', -7.282729596096433),
 ('prep', -7.2821693204930495),
 ('pyrazinamide', -7.281494563487743),
 ('mtb', -7.2799793193452205),
 ('rif', -7.279577347951183),
 ('hptn', -7.279558977823938),
 ('xpert', -7.27918240297722),
 ('isoniazid', -7.277247067363821),
 ('ring', -7.276778810428119),
 ('itraconazole', -7.27648385974948),
 ('iris', -7.276303487617861),
 ('pour', -7.275694583211842),
 ('docx', -7.271642281136356),
 ('rpv', -7.2698659010677895),
 ('stream', -7.264304317478764),
 ('proprietary', -7.264220003482225),
 ('cab', -7.26321251199107),
 ('bcg', -7.260280919716448),
 ('injections', -7.259456797686266),
 ('tuberculosis', -7.258314843225692),
 ('des', -7.256947197045764),
 

In [255]:
# Predicted labels of the TF-IDF + Naive Bayes pipeline on the held-out split.
# NOTE: `y_pred` is reassigned further down by the keyword baseline.
y_pred = model.predict(features_test)

In [256]:
y_pred

array([0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2])

In [257]:
labels_test

[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2]

In [258]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [259]:
accuracy_score(labels_test, y_pred)

0.9285714285714286

In [260]:
model.predict_proba(["hiv"])

array([[0.0941472 , 0.70621191, 0.19964089]])

In [261]:
model.predict_proba(["potato"])

array([[0.31147541, 0.3442623 , 0.3442623 ]])

In [262]:
model.predict_log_proba(["potato"])

array([[-1.16643489, -1.06635143, -1.06635143]])

In [263]:
model.predict_log_proba(["hiv"])

array([[-2.36289577, -0.34783993, -1.61123507]])

In [204]:
import pickle as pkl
# Serialise the `model` pipeline to condition_classifier.pkl in the current
# working directory (binary mode, overwriting any existing file).
with open("condition_classifier.pkl", "wb") as f:
    pkl.dump(model, f)

In [218]:
from nltk.tokenize import RegexpTokenizer
tok = RegexpTokenizer(r'[\w%]+')

def manual_classifier(text):
    """Keyword-count baseline classifier.

    Tokenises ``text`` with the module-level ``tok`` tokenizer and counts
    tokens that are exactly "HIV" and tokens whose lower-cased form is "tb"
    or "tuberculosis". Prints the two counts (TB first, then HIV).

    Returns 2 when TB mentions outnumber HIV mentions, 1 when HIV mentions
    outnumber TB mentions, and 0 when the counts are equal.

    NOTE(review): the HIV match is case-sensitive while the TB match is not;
    confirm whether that asymmetry is intended.
    """
    tb_terms = {"tb", "tuberculosis"}
    hiv_mentions = 0
    tb_mentions = 0

    for token in tok.tokenize(text):
        if token == "HIV":
            hiv_mentions += 1
        elif token.lower() in tb_terms:
            tb_mentions += 1

    print(tb_mentions, hiv_mentions)

    if tb_mentions > hiv_mentions:
        return 2
    if hiv_mentions > tb_mentions:
        return 1
    return 0

In [219]:
# Run the keyword baseline over the held-out documents, in order. Each call
# prints its TB / HIV counts. This replaces the `y_pred` produced earlier by
# the pipeline model.
y_pred = [manual_classifier(document) for document in features_test]

0 8
0 196
6 664
167 343
1 759
13 2108
7 467
156 170
392 82
583 28
135 27
3 1
115 62
115 62


In [220]:
accuracy_score(labels_test, y_pred)

0.7857142857142857

In [221]:
y_pred

[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]

In [264]:
features_test[0]

'\n \nINTERNAL USE ONLY \n\n \n \n\nThe following clinical study protocol was shared in confidence by the \nPrincipal Investigator. The information and content contained in this \ndocument is confidential and only intended for reference of The Bill & \nMelinda Gates Foundation. Publishing, distributing, duplicating, or \notherwise sharing this document outside of The Bill & Melinda Gates \nFoundation is not permitted. \n\n \nPATH VAC 041 CSR – Study Information v1.0; 20DEC2018 \n\nConfidential/Proprietary Information Date of database lock: 05APR2018 \n\n16.1.1 Protocol and Protocol Amendments \n \n\n  \n\n \nVAC 041 \n\nA Phase I/II double-blind, randomized, placebo-controlled, \ndescending-age, dose-escalation study to examine \nthe safety, tolerability and immunogenicity of the \n\ntrivalent P2-VP8 subunit rotavirus vaccine in healthy \nSouth African adults, toddlers and infants \n\nCONFIDENTIAL \n\n21 June 2016 \n\nSponsored by: \n\nPATH Vaccine Solutions \n\nNational Principal Inve

In [325]:
model.predict_proba(["the participants"])

array([[0.24504705, 0.45971338, 0.29523957]])

In [268]:
transformed_document = vectoriser.transform([features_test[0]]).todense()

In [307]:
# For every vocabulary column, score a synthetic document that contains only
# that column's TF-IDF weight from `transformed_document`, and keep the
# log-probability the classifier assigns to class column 1 (label 1, the
# HIV-prefixed documents).
# Row i of the diagonal matrix is exactly the "all zeros except column i"
# vector the previous per-column loop built, so a single batched call replaces
# one array allocation and one predict call per vocabulary entry.
single_feature_docs = np.diag(np.asarray(transformed_document).ravel())
probas = nb.predict_log_proba(single_feature_docs)[:, 1]

In [308]:
np.argmax(probas)

620

In [310]:
np.argsort(-probas)

array([620, 619,  47, 435, 441, 921, 472, 133, 923,  24, 702, 689, 919,
       775, 129, 970,  29, 857, 818, 704, 377,  83, 866, 489, 981, 369,
       110, 361, 804,  82,  46, 294, 973, 972,  16,  50, 490, 957, 440,
       852, 845, 551, 684,  45,  35, 422, 106, 348, 601, 808,  20, 920,
       983, 453, 687, 330, 545, 412, 703, 885,  23, 733, 547,  30, 975,
       642, 846, 364, 701, 114, 386, 292,  70, 809, 378, 409, 495, 439,
       198, 822,  21, 881, 924, 514, 759, 305, 922, 690, 468, 333, 635,
       572, 308, 174, 265, 194, 613, 111, 166, 996, 285, 794, 491,  18,
       576,  14, 143,  11,  15, 357, 592, 979, 184, 522, 430, 531,  34,
       960, 448, 663, 413, 902, 476, 359,  38, 142, 666, 580, 429, 772,
       189, 723, 124,  99,   6,   0, 817, 423, 501, 786, 352, 735, 820,
       122, 148, 894, 318, 671, 623, 828, 245, 925, 126, 584, 375, 397,
       374, 275, 660, 171, 170, 541, 630, 279, 826, 956, 163, 254, 173,
       643,  33, 714, 955, 248, 843, 579, 401, 535, 795, 528, 73

{'title': 928,
 'repeat': 764,
 'drug': 269,
 'control': 174,
 'malaria': 513,
 'safety': 802,
 'efficacy': 283,
 'study': 886,
 'short': 835,
 'trial': 936,
 'irb': 462,
 'gov': 356,
 'sponsor': 864,
 'university': 949,
 'delivery': 214,
 'usa': 956,
 'phone': 643,
 'edu': 278,
 'principal': 683,
 'investigators': 459,
 'infectious': 432,
 'diseases': 250,
 'laboratory': 482,
 'department': 217,
 'email': 288,
 'institute': 443,
 'en': 289,
 'la': 479,
 'centre': 116,
 'com': 137,
 'confidentiality': 163,
 'document': 256,
 'confidential': 162,
 'information': 435,
 'investigator': 458,
 'team': 914,
 'relevant': 762,
 'ethics': 304,
 'regulatory': 757,
 'contents': 172,
 'research': 775,
 'protocol': 702,
 'non': 580,
 'registration': 754,
 'data': 200,
 'summary': 893,
 'background': 81,
 'rationale': 726,
 'primary': 682,
 'objective': 587,
 'hypothesis': 388,
 'overview': 611,
 'design': 222,
 'sites': 846,
 'population': 657,
 'interventions': 454,
 'outcome': 607,
 'measures': 5

In [322]:
# Print the vocabulary words ranked by `probas`, highest first, as
# "<rank> <word>". The word -> column mapping is inverted once up front
# instead of rescanning the whole vocabulary for every ranked column.
column_to_word = {column: word for word, column in vectoriser.vocabulary_.items()}
for ctr, j in enumerate(np.argsort(-probas)):
    print(ctr, column_to_word[j])

0 participants
1 participant
2 antibody
3 information
4 injections
5 testing
6 june
7 cohort
8 text
9 adults
10 protocol
11 product
12 test
13 research
14 clinic
15 version
16 africa
17 south
18 section
19 provided
20 hiv
21 baseline
22 staff
23 level
24 week
25 health
26 causing
27 groups
28 samples
29 based
30 antibodies
31 enrollment
32 virus
33 viral
34 additional
35 appendix
36 levels
37 use
38 injection
39 social
40 site
41 ml
42 prior
43 anti
44 al
45 indicated
46 care
47 functional
48 oral
49 schedule
50 administered
51 tested
52 weeks
53 intervention
54 procedures
55 female
56 mg
57 incidence
58 provide
59 studies
60 adult
61 receive
62 minimal
63 african
64 visits
65 phase
66 sites
67 guidelines
68 protein
69 center
70 human
71 endpoints
72 assessments
73 scheduled
74 home
75 impact
76 limited
77 initiation
78 daids
79 self
80 administration
81 stored
82 therapy
83 male
84 related
85 evaluate
86 tests
87 products
88 iv
89 figure
90 performed
91 national
92 evaluations
93 cont

739 clinically
740 procedure
741 deaths
742 definitions
743 double
744 interim
745 reason
746 protection
747 results
748 ecg
749 corticosteroids
750 assays
751 completed
752 duration
753 condition
754 developed
755 relative
756 unexpected
757 total
758 summarized
759 signed
760 exclusion
761 exact
762 sop
763 surveillance
764 drugs
765 present
766 value
767 identified
768 presence
769 planned
770 induced
771 included
772 used
773 controlled
774 points
775 log
776 allocation
777 na
778 window
779 reports
780 objective
781 electronic
782 completion
783 cases
784 component
785 original
786 determined
787 hemoglobin
788 term
789 global
790 birth
791 respectively
792 findings
793 infectious
794 ci
795 model
796 associated
797 malaria
798 ethics
799 concomitant
800 overview
801 occurred
802 ensure
803 delivery
804 forms
805 presented
806 point
807 does
808 randomization
809 details
810 field
811 analyses
812 noted
813 outcome
814 specimens
815 result
816 assay
817 code
818 summary
819 pain
8

In [304]:
# Log TF-IDF weight of each word in the document plus that word's
# log-likelihood under class row 1 (label 1, the HIV-prefixed documents).
# Words absent from the document have weight 0, so log() yields -inf there;
# that is expected, hence the divide-by-zero warning is silenced locally.
# feature_log_prob_ replaces the deprecated/removed `coef_` alias.
with np.errstate(divide="ignore"):
    log_tfidf = np.log(transformed_document[0])
log_tfidf + nb.feature_log_prob_[1]

  np.log(transformed_document[0]) + nb.coef_[1]


matrix([[-12.24026268, -12.44243422, -12.99598769, -14.34752173,
                 -inf, -12.49871836, -12.23571418, -12.90272187,
         -11.33902089,         -inf, -13.72557819, -10.86209848,
         -11.80768099, -12.29229576, -10.88979916, -11.16477869,
         -10.9186729 , -12.69573239, -12.53525851, -11.8795441 ,
         -10.14038094, -10.17288085,         -inf, -11.17316679,
          -9.76192648,  -9.59627351, -10.0284367 ,         -inf,
         -10.08579098, -10.49238135, -10.42051951, -10.59866524,
                 -inf, -12.70345546, -13.01494037,  -9.80166308,
         -13.51216768, -14.47977092, -12.35680701,         -inf,
         -11.80120852, -13.38288067, -11.82282248, -10.50036811,
                 -inf, -10.55394401, -10.91100485,  -9.14845114,
         -11.78713089,         -inf, -10.23508136, -11.72612659,
         -12.97995436, -11.33908394, -12.83795079, -12.24532573,
         -12.3383093 ,         -inf, -12.35064021, -11.47226041,
         -13.3567387 ,   

In [305]:
# Column index of the word with the highest (log TF-IDF + class-1
# log-likelihood) score in this document. Zero weights give -inf and can
# never win the argmax, so the divide-by-zero warning is silenced locally.
# feature_log_prob_ replaces the deprecated/removed `coef_` alias.
with np.errstate(divide="ignore"):
    class1_word_scores = np.log(transformed_document[0]) + nb.feature_log_prob_[1]
np.argmax(class1_word_scores)

  np.argmax(np.log(transformed_document[0]) + nb.coef_[1])


886

study


In [328]:
model.named_steps['multinomialnb']

{'tfidfvectorizer': TfidfVectorizer(max_features=1000, min_df=5, stop_words='english',
                 token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b'),
 'multinomialnb': MultinomialNB()}