In [43]:
import numpy as numpy
import pandas as pd
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Data Preprocesing

In [44]:
#load the dataset
indiana_data = pd.read_csv("data/indiana/preprocessed-indiana-cxr-reports.csv")
indiana_data.head()

Unnamed: 0,COMPARISON,INDICATION,FINDINGS,IMPRESSION,indication_count,findings_count,impression_count
0,none.,no indication,heart size normal. lungs are clear. are normal...,normal chest,2,17,2
1,no comparison,slipped back on right side,the heart size and pulmonary vascularity appea...,no evidence of active disease.,5,36,5
2,no comparison,bone marrow transplant evaluation. aml.,the heart size and pulmonary vascularity appea...,no evidence of active disease.,5,38,5
3,none.,chest pain and .,the heart is normal in size and contour. the l...,no acute cardiopulmonary disease.,4,41,4
4,none.,mid to lower back pain since .,the heart is normal in size and contour. the l...,no acute cardiopulmonary disease.,7,22,4


In [45]:
indiana_data.shape

(3418, 7)

In [46]:
# Since we are working on report summarization, we don't need the first two columns
# drop the unnecessary columns
indiana_data.drop(columns=['COMPARISON', 'INDICATION', 'indication_count', 'findings_count', 'impression_count'], inplace=True)
indiana_data.shape

(3418, 2)

In [47]:
# split the data into train, val and test
train, temp = train_test_split(indiana_data, test_size=0.1, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)
print("Train data shape:", train.shape)
print("Val data shape:", val.shape)
print("Test data shape:", test.shape)

Train data shape: (3076, 2)
Val data shape: (171, 2)
Test data shape: (171, 2)


In [48]:
# total number of unigram words in the findings training data
main_str = train.FINDINGS.values.tolist()
total_unique_words= len(list(set(' '.join(main_str).split(" "))))
print(f"There are total of {total_unique_words} unique unigrams words in the report")

There are total of 2033 unique unigrams words in the report


There are three main task of NLP Model training procedure, that is
1. Tokenization
2. Vectorization
3. Model Training

# Tokenization

In [49]:
# read the findings into  list of sentences
findings_list = train.FINDINGS.values.tolist()

#write findings to text file
dest_file = open("data/indiana/findings.txt","w")
for sentence in findings_list:
    dest_file.write(sentence +"\n")
dest_file.close()
#write impression to text file
dest_file2 = open("data/indiana/impressions.txt","w")
for sentence in train.IMPRESSION.values:
    dest_file2.write(sentence +"\n")
    
dest_file2.close()
    
    

In [50]:
# findings_list

In [51]:
# read the vocab
vocab = defaultdict(int)
with open('data/indiana/findings.txt', 'r', encoding='utf-8') as f:

    for sent in f:
        words = sent.strip().split()
        for word in words:
            vocab[' '.join(list(word)) + ' </w>'] += 1

In [52]:
vocab

defaultdict(int,
            {'t h e </w>': 5778,
             'l u n g s </w>': 1719,
             'r e m a i n </w>': 34,
             'h y p e r e x p a n d e d . </w>': 32,
             'n o </w>': 4006,
             'm a s s e s </w>': 15,
             'o r </w>': 2183,
             'i n f i l t r a t e s </w>': 32,
             'i n </w>': 1320,
             'l u n g s . </w>': 80,
             'p l e u r a l </w>': 2246,
             'm e d i a s t i n a l </w>': 759,
             'a i r </w>': 204,
             'c o l l e c t i o n s . </w>': 4,
             'h e a r t </w>': 1903,
             's i z e </w>': 1381,
             'n o r m a l . </w>': 1073,
             'i s </w>': 3977,
             'n o r m a l </w>': 2097,
             's i z e . </w>': 355,
             'm e d i a s t i n u m </w>': 512,
             'u n r e m a r k a b l e . </w>': 488,
             'a </w>': 494,
             't o r t u o u s </w>': 69,
             'c a l c i f i e d </w>': 345,
        

In [53]:
def get_pair_counts(vocab):
    pair_counts = defaultdict(int)
    for k, v in vocab.items():
        chars = k.split()
        for i in range(len(chars)-1):
            pair_counts[(chars[i], chars[i+1])]+=1
    return pair_counts         

In [54]:
char_pair_counts = get_pair_counts(vocab)
char_pair_counts

defaultdict(int,
            {('t', 'h'): 109,
             ('h', 'e'): 72,
             ('e', '</w>'): 197,
             ('l', 'u'): 53,
             ('u', 'n'): 48,
             ('n', 'g'): 135,
             ('g', 's'): 7,
             ('s', '</w>'): 207,
             ('r', 'e'): 227,
             ('e', 'm'): 69,
             ('m', 'a'): 111,
             ('a', 'i'): 30,
             ('i', 'n'): 307,
             ('n', '</w>'): 117,
             ('h', 'y'): 47,
             ('y', 'p'): 30,
             ('p', 'e'): 87,
             ('e', 'r'): 262,
             ('e', 'x'): 46,
             ('x', 'p'): 10,
             ('p', 'a'): 76,
             ('a', 'n'): 79,
             ('n', 'd'): 69,
             ('d', 'e'): 138,
             ('e', 'd'): 208,
             ('d', '.'): 66,
             ('.', '</w>'): 513,
             ('n', 'o'): 61,
             ('o', '</w>'): 10,
             ('a', 's'): 96,
             ('s', 's'): 54,
             ('s', 'e'): 97,
             ('e', 's'): 190,

In [55]:
def merge_pairs(best_pair, vocab_in):
    vocab_out = {}
    bigram = re.escape(" ".join(best_pair))

    p = re.compile(r'(?<!\S)'+bigram + r'(?!\S)')
    for word in vocab_in:
        w_out = p.sub(''.join(best_pair), word)
  
        vocab_out[w_out] = vocab_in[word]
    return vocab_out
    

In [56]:
def get_tokens_count(vocab):
    char_tokens = defaultdict(int)
    
    for word, count in vocab.items():
        tokens = word.split()
        for t in tokens:
            char_tokens[t]+=count
            
    return char_tokens

In [57]:
tokens = get_tokens_count(vocab)
len(tokens)

38

In [58]:
tokens

defaultdict(int,
            {'t': 43042,
             'h': 21507,
             'e': 62845,
             '</w>': 96024,
             'l': 34639,
             'u': 22094,
             'n': 40834,
             'g': 8537,
             's': 35441,
             'r': 43069,
             'm': 16900,
             'a': 54748,
             'i': 46950,
             'y': 5400,
             'p': 14106,
             'x': 2937,
             'd': 15578,
             '.': 14321,
             'o': 42869,
             'f': 12699,
             'c': 19871,
             'z': 2102,
             'k': 913,
             'b': 4953,
             'w': 3749,
             'v': 4146,
             'q': 93,
             'j': 175,
             '5': 29,
             '2': 36,
             '9': 26,
             '4': 17,
             '1': 76,
             '0': 18,
             '7': 22,
             '3': 25,
             '6': 27,
             '8': 23})

In [59]:
import numpy as np

In [60]:
get_pair_counts(vocab).get('k')


In [61]:
%%time
# merge the ch pairs with the max count in bottom-up manner
iters = 1000
for i in range(iters):
    pairs = get_pair_counts(vocab)
    if not pairs:
        break
    best_pair = max(pairs, key = pairs.get)

    vocab = merge_pairs(best_pair, vocab)
#     print(len(get_tokens_count(vocab)))

CPU times: user 4.52 s, sys: 21.3 ms, total: 4.54 s
Wall time: 4.55 s


In [62]:
def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i+1:], unknown_token=unknown_token)
        break
    if len(string_tokens)==0:
        string_tokens.append(unknown_token)
    return string_tokens


In [63]:
def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)

In [64]:
word_given_known = 'medication</w>'

In [65]:
token_freqs = get_tokens_count(vocab)
sorted_tokens_tuple = sorted(token_freqs.items(), key=lambda item: (measure_token_length(item[0]), item[1]), reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

In [66]:
#store the tokenization of a word if that word already exist in vocabulary
vocab_tokenization = {''.join(word.split()):word for word in vocab.keys()}
def tokenized_postprocessing(word):
    word_given = word
    if word_given in vocab_tokenization:
   
        tokenised_str = vocab_tokenization[word_given].split()

    else:
 
        tokenised_str = tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>')
    return tokenised_str


In [69]:
vocab

{'the</w>': 5778,
 'lung s</w>': 1719,
 'remain </w>': 34,
 'hyperexpan ded.</w>': 32,
 'no </w>': 4006,
 'mas ses</w>': 15,
 'or</w>': 2183,
 'infiltr ates</w>': 32,
 'in</w>': 1320,
 'lung s.</w>': 80,
 'pleural</w>': 2246,
 'mediastinal</w>': 759,
 'air</w>': 204,
 'collec tions.</w>': 4,
 'heart</w>': 1903,
 'siz e</w>': 1381,
 'norm al.</w>': 1073,
 'is</w>': 3977,
 'norm al</w>': 2097,
 'siz e.</w>': 355,
 'mediastin um</w>': 512,
 'unremark able.</w>': 488,
 'a</w>': 494,
 'tortu ous</w>': 69,
 'calcified</w>': 345,
 'thoracic</w>': 520,
 'aort a</w>': 106,
 'pres ent.</w>': 71,
 'ar e</w>': 3934,
 'hyperexpan ded</w>': 47,
 'consist ent</w>': 83,
 'wi th</w>': 578,
 'emphysem a.</w>': 36,
 'there</w>': 2176,
 'effu sion</w>': 1070,
 'pneumothorax</w>': 1127,
 'focal</w>': 1566,
 'air space</w>': 741,
 'dise a se.</w>': 326,
 'clear</w>': 622,
 'bilater ally.</w>': 184,
 'cardiac</w>': 392,
 'and</w>': 2551,
 'silhouet tes</w>': 52,
 'pul monary</w>': 1085,
 'vasculat ure</w>': 

In [68]:
sorted_tokens_tuple

[('pneumothorax</w>', 1128),
 ('hemidiaphragm', 107),
 ('mediastinal</w>', 760),
 ('atheroscler', 111),
 ('abnormaliti', 107),
 ('clavicular</w>', 13),
 ('scoliosis.</w>', 8),
 ('calcified</w>', 358),
 ('atelectasi', 211),
 ('interstiti', 125),
 ('granulomat', 109),
 ('hyperexpan', 98),
 ('displaced</w>', 61),
 ('radiograph', 61),
 ('vertebral</w>', 52),
 ('segmental</w>', 46),
 ('scoliosis</w>', 46),
 ('curvature</w>', 24),
 ('inflation</w>', 22),
 ('placement</w>', 9),
 ('alization</w>', 4),
 ('thoracic</w>', 544),
 ('mediastin', 537),
 ('structure', 527),
 ('vascular</w>', 167),
 ('appearanc', 159),
 ('inflated</w>', 125),
 ('surgical</w>', 106),
 ('diaphragm', 101),
 ('ophrenic</w>', 98),
 ('enlarged</w>', 49),
 ('configura', 44),
 ('throughou', 30),
 ('parenchym', 21),
 ('icardial</w>', 18),
 ('cholecyst', 16),
 ('position</w>', 16),
 ('chondral</w>', 8),
 ('unfolded</w>', 5),
 ('hypoventi', 4),
 ('artifactu', 3),
 ('pleural</w>', 2249),
 ('thorax.</w>', 1070),
 ('consolid', 1016)

In [67]:
%%time
# Tokenise some of the sentences from training data
test_findings_list = test.FINDINGS.values.tolist()
for sentence in test_findings_list[:2]:
    print("Original Sentence= \n")
    print(print(sentence))
    tokenized_sent = []
    for word in sentence.strip().split():
        word = word+'</w>'
        for w in tokenized_postprocessing(word):
            tokenized_sent.append(w)
    print("Tokens=\n")
    print(tokenized_sent, "\n")       

Original Sentence= 

normal heart size. mild unfolding and atherosclerotic calcification of the aorta. no focal air space consolidation. no pneumothorax or pleural effusion. visualized bony structures are unremarkable in appearance.
None
Tokens=

['norm', 'al</w>', 'heart</w>', 'siz', 'e.</w>', 'mil', 'd</w>', 'unfol', 'ding</w>', 'and</w>', 'atheroscler', 'otic</w>', 'calcific', 'ation</w>', 'o', 'f', '</w>', 'the</w>', 'aort', 'a.</w>', 'no', '</w>', 'focal</w>', 'air</w>', 'space</w>', 'consolid', 'ation.</w>', 'no', '</w>', 'pneumothorax</w>', 'or</w>', 'pleural</w>', 'effu', 'sion.</w>', 'visualiz', 'ed</w>', 'b', 'on', 'y</w>', 'structure', 's</w>', 'ar', 'e</w>', 'unremark', 'able</w>', 'in</w>', 'appearanc', 'e.</w>'] 

Original Sentence= 

heart size normal. lungs are clear. are normal. no pneumonia effusions edema pneumothorax adenopathy nodules or masses.
None
Tokens=

['heart</w>', 'siz', 'e</w>', 'norm', 'al.</w>', 'lung', 's</w>', 'ar', 'e</w>', 'clear', '.</w>', 'ar', 'e