# Objectives:
We created the vocab previously. For the noisy-channel model we now need a vocab file that also carries each word's frequency, in TSV format. \
The same source is used to build the file -> (https://www.kaggle.com/datasets/jpmiller/layoutlm/data)

Since we already did the EDA in the previous stage (while creating the corpus), we will skip it here and jump straight into pre-processing and creating the vocab-frequency file.

In [43]:
import pandas as pd
from nltk import RegexpTokenizer
from nltk.tokenize import word_tokenize
import statistics

from nltk.translate.bleu_score import corpus_bleu

from app_config import Configuration

In [44]:
# Load the raw dataset; the 'answer' column is the text used from here on.
df = pd.read_csv('data/medquad-kaggle-johnm.csv')

In [45]:
def token_count(x):
    """Count the tokens in a piece of text.

    Applied row by row to measure how many tokens a text column holds.

    Args:
        x: Text to tokenize; handed straight to NLTK's ``word_tokenize``.

    Returns:
        The number of tokens ``word_tokenize`` produces for ``x``.
    """
    tokens = word_tokenize(x)
    return len(tokens)

Before we use `word_tokenize` to count the tokens in the answer column, we need to drop the rows with missing values to avoid exceptions.

In [46]:
# Keep only the rows that actually have an answer, so the tokenizer is never
# handed a missing value.
df = df.loc[df['answer'].notna()]

# Token count per answer, then the grand total across all answers.
answer_token_counts = df['answer'].map(token_count)
cnt = answer_token_counts.sum()
print(f'Answers have {cnt} count of tokens.')

Answers have 3731909 count of tokens.


### In the preprocessing to build the vocab, we need to:
1. Clean the text from the answer column - keep only words (everything else is stripped)
2. Get the lemma of each word to avoid redundant words with the same meaning
3. Only include unique words in the vocab

In [47]:
from importlib import reload
# 1. Clean the text from the answer column - keep only words (everything else is stripped).
from nltk.corpus import stopwords
import utils.regex as rx

# Pick up any edits made to the regex helpers without restarting the kernel.
reload(rx)

# Stop words from the NLTK corpus, held in a set so each membership test in
# the filter below is a hash lookup instead of a scan over the whole list.
stop_words = set(stopwords.words('english'))

# To store all the processed tokens (one list of tokens per answer).
corpus_token_list = []

# Raw-to-cleansed trace records. They are collected in a plain list and turned
# into a DataFrame once after the loop, which avoids growing a DataFrame row
# by row with .loc.
cleanse_rows = []

# Tracing values for debugging.
i = 0
token_cnt = 0
filtered_token_cnt = 0
try:
    for text in df['answer']:
        # Remove URLs.
        clean_text = rx.remove_url(text)
        # Remove HTML tags.
        clean_text = rx.remove_html(clean_text)
        # Remove bracketed words (usually acronyms).
        clean_text = rx.remove_bracketed_text(clean_text)
        # Expand contractions to their full form first, before removing stop words.
        clean_text = rx.transform_contractions(clean_text)
        # Get only words.
        clean_text = rx.get_words(clean_text.lower())
        # Remove all extra spaces.
        clean_text = rx.remove_extra_space(clean_text)
        # For tracing raw to cleansed.
        cleanse_rows.append([i, text, clean_text])
        # Tokenize the text.
        tokens = word_tokenize(clean_text)
        # Tracing unfiltered-token count.
        token_cnt += len(tokens)
        # Filter stop words.
        filtered_text = [w for w in tokens if w.lower() not in stop_words]
        # Tracing filtered-token count for debugging.
        filtered_token_cnt += len(filtered_text)
        # Add the filtered words into the corpus token list.
        corpus_token_list.append(filtered_text)
        # Tracing row-count for debugging.
        i += 1
except Exception as e:
    # Best effort: report the row that failed and keep everything processed
    # up to that point, so the trace file below can still be inspected.
    print(f'Exception {e} in {i}.')

print(f'Rows processed:[{i}], unfiltered tokens:[{token_cnt}], filtered tokens:[{filtered_token_cnt}]')
print(f'Corpus entry count:[{len(corpus_token_list)}].')

cleanse_data = pd.DataFrame(cleanse_rows, columns=['row', 'original', 'cleansed'])
cleanse_data.to_csv('data/cleanse-data-freq.csv', index=False)

Rows processed:[16407], unfiltered tokens:[3151455], filtered tokens:[1885840]
Corpus entry count:[16407].


In [48]:
# Spot-check a slice of the trace: each raw answer next to its cleansed form.
cleanse_data.iloc[580:590]

Unnamed: 0,row,original,cleansed
580,580,Here are links to more information about P.A.D...,here are links to more information about p a d...
581,581,"Many Reasons for Abuse Drug abuse, whether pre...",many reasons for abuse drug abuse whether pres...
582,582,Addiction is a chronic disease in which a pers...,addiction is a chronic disease in which a pers...
583,583,Physical dependence is a normal process that c...,physical dependence is a normal process that c...
584,584,"A persons behavior, especially changes in beha...",a persons behavior especially changes in behav...
585,585,The prescription medications most commonly abu...,the prescription medications most commonly abu...
586,586,Medications affect older people differently th...,medications affect older people differently th...
587,587,Marijuana is the most abused illicit drug amon...,marijuana is the most abused illicit drug amon...
588,588,"Although under federal law, marijuana is illeg...",although under federal law marijuana is illega...
589,589,"Not always. Some warning signs, such as sleep ...",not always some warning signs such as sleep pr...


In [49]:
# TODO: 23-03-2025: Wrong word 'dressingsthese' appeared in the medical.txt.  This is GIGO.

# 2. Get the lemma of each word to avoid redundant words with the same meaning.
from nltk.stem import WordNetLemmatizer

# Requires the NLTK 'wordnet' data (install once with nltk.download('wordnet')).
lemmatizer = WordNetLemmatizer()

# One list of lemmas per answer, in the same order as corpus_token_list.
corpus_lemma_list = []
# One [row, original, normalized] record per token, for tracing raw to normalized.
trace_list = []

for row_idx, entry in enumerate(corpus_token_list):
    entry_list = [lemmatizer.lemmatize(token) for token in entry]
    trace_list.extend(
        [row_idx, token, lemma] for token, lemma in zip(entry, entry_list)
    )
    # Add entry (the lemmas of a single answer) to the list for further processing.
    corpus_lemma_list.append(entry_list)

# Tracing values for debugging: rows handled and tokens lemmatized.
i = len(corpus_lemma_list)
filtered_token_cnt = len(trace_list)

print(f'Rows processed:[{i}], filtered tokens:[{filtered_token_cnt}]')
# Report the size of the list built here, not of the input it was built from.
print(f'Corpus entry count:[{len(corpus_lemma_list)}].')

normalized_data = pd.DataFrame(trace_list, columns=['row', 'original', 'normalized'])
normalized_data.to_csv('data/normalized-data-freq.csv', index=False)

Rows processed:[16407], filtered tokens:[1885840]
Corpus entry count:[16407].


In [50]:
# Spot-check a slice of the token-level trace: each token next to its lemma.
normalized_data.iloc[1780:1790]

Unnamed: 0,row,original,normalized
1780,15,pressure,pressure
1781,15,causes,cause
1782,15,causes,cause
1783,15,high,high
1784,15,blood,blood
1785,15,pressure,pressure
1786,15,include,include
1787,15,medical,medical
1788,15,conditions,condition
1789,15,chronic,chronic


In [51]:
# A random check of the vocab turned up the non-word 'dressingsthese'.
# List every trace record with that lemma to see which answer row it came from.
normalized_data[normalized_data['normalized'] == 'dressingsthese']

Unnamed: 0,row,original,normalized
5141,32,dressingsthese,dressingsthese
5235,32,dressingsthese,dressingsthese


In [52]:
# 3. Only include unique words in the vocab.

from collections import Counter
from itertools import chain

# Count how often each lemma occurs across the whole corpus. The counts do not
# depend on the order of the entries, so corpus_lemma_list is left in row order.
word_counts = Counter(chain.from_iterable(corpus_lemma_list))

# Final vocab: one {"text", "freq"} record per unique word, ordered by word
# length first and alphabetically within each length.
vocab_list = sorted(
    ({"text": word, "freq": count} for word, count in word_counts.items()),
    key=lambda item: (len(item["text"]), item["text"]),
)

print(f'Vocab entry count (unique words):[{len(vocab_list)}].')

Vocab entry count (unique words):[23319].


### Now we have the unique medical words, with their frequencies, stored in the vocab and ready for creating a frequency corpus for the noisy-channel model.

In [53]:
# Save the vocab into custom NLTK corpus format.

import os

# Get corpus path from app config.
config = Configuration()

# Organize the vocab into a custom folder; exist_ok makes this a no-op when
# the folder is already there.
corpus_dir = config.config_values['corpus_medical_freq_dir']
os.makedirs(corpus_dir, exist_ok=True)

corpus_name = config.config_values['corpus_medical_freq_name']

# NOTE: this rebinds `df`, which held the answers dataset up to this point.
df = pd.DataFrame(vocab_list)
# Write tab-separated lines with no header row and no index column.
df.to_csv(os.path.join(corpus_dir, corpus_name), sep="\t", index=False, header=False)

In [54]:
# Load the custom NLTK corpus.
from nltk.corpus import PlaintextCorpusReader

# Create an NLTK corpus reader over every .tsv file in the corpus folder.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being read as an (invalid) string escape sequence.
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.tsv')

# NOTE(review): words() tokenizes the whole file, so this count presumably
# includes the frequency column as well as the words themselves - confirm.
print(f'There are {len(corpus.words())} words in custom corpus.')

There are 46638 words in custom corpus.
