# Objectives:
We need to build a medical vocabulary (word dictionary) for a spelling checker. The following dataset was found useful for building the vocab: (https://www.kaggle.com/datasets/jpmiller/layoutlm/data)

Before building the vocab, the following questions need to be clarified:
1. What does the dataset contain (EDA)?
2. Which feature is needed?
3. What is the expected outcome?

Once these questions are clarified, we can proceed to build a custom corpus for the noisy-channel model.

In [None]:
import pandas as pd
from nltk import RegexpTokenizer
from nltk.tokenize import word_tokenize
import statistics

from nltk.translate.bleu_score import corpus_bleu

from app_config import Configuration

In [None]:
# Load the raw question/answer dataset from the local CSV file into a DataFrame.
df = pd.read_csv('data/medquad-kaggle-johnm.csv')

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

## 1. What does the dataset contain?
We need a corpus that contains medical words for building a medical dictionary (vocab). Therefore, we need to investigate whether the obtained corpus contains the required words.

In [None]:
# Let's investigate the columns of the dataset: names, non-null counts and data types.
df.info()

In [None]:
# It appears there are four columns, and all of them hold string data.
# Let's investigate which column is the most sensible source for building the vocab.
df.describe()

Logical assumption: the average length of the answers should be greater than that of the questions, i.e., the answer column contains more text. \
Let's investigate the assumption.


In [None]:
# Helper for comparing how many tokens the question and answer columns hold.
def token_count(x):
    """Return the number of tokens ``word_tokenize`` produces for the text ``x``."""
    tokens = word_tokenize(x)
    return len(tokens)

Before we use `word_tokenize` to count the tokens in each column, we need to drop the missing values to avoid exceptions.

In [None]:
# Count the missing values in every column first.
df.isna().sum()

In [None]:
# From the above we can conclude that the question column has no missing values,
# while the answer column has 5 - so the questions can be tokenized as they are.
question_token_counts = df['question'].apply(token_count)
cnt = question_token_counts.sum()
print(f'Questions have {cnt} count of tokens.')

In [None]:
# Discard only the rows whose answer is missing; every other row is kept.
df = df.dropna(subset=['answer'])

answer_token_counts = df['answer'].apply(token_count)
cnt = answer_token_counts.sum()
print(f'Answers have {cnt} count of tokens.')

## 2. Which feature is needed?
In the EDA, two columns (question & answer) from the dataset were expected to contain the text required to build the vocab. A hypothesis was made that the answer column would have longer text than the question column. The hypothesis held true, and therefore the text from the answer column will be preprocessed for building the corpus.

In the preprocessing to build the vocab, we need to:
1. Clean the text from the answer column - keep only words (everything else is stripped)
2. Get the lemma of each word to avoid redundant words with the same meaning
3. Include only unique words in the vocab

In [None]:
from importlib import reload
# 1. Clean the text from answer column - only words (others are stripped)
from nltk.corpus import stopwords
import utils.regex as rx

# Re-import the helper module so that edits to it are picked up without
# restarting the kernel.
reload(rx)

# Acquire the stop words from NLTK corpus.
stop_words = stopwords.words('english')
# Set copy used for the per-token membership test below, so every lookup is a
# hash lookup instead of a scan of the whole list.
stop_word_set = set(stop_words)

# To store all the processed tokens (one list of tokens per answer).
corpus_token_list = []

# Tracing values for debugging.
i = 0
token_cnt = 0
filtered_token_cnt = 0
# Raw-to-cleansed trace rows. They are collected in a plain list and turned
# into a DataFrame once after the loop, because enlarging a DataFrame one row
# at a time is slow on large inputs.
cleanse_rows = []
try:
    for text in df['answer']:
        # Remove URLs.
        clean_text = rx.remove_url(text)
        # Remove HTML tags.
        clean_text = rx.remove_html(clean_text)
        # Remove bracketed words (usually acronyms).
        clean_text = rx.remove_bracketed_text(clean_text)
        # Transform contractions to full form first before removing stop words.
        clean_text = rx.transform_contractions(clean_text)
        # Get only words.
        clean_text = rx.get_words(clean_text.lower())
        # Remove all extra spaces.
        clean_text = rx.remove_extra_space(clean_text)
        # For tracing raw to cleanse.
        cleanse_rows.append([i, text, clean_text])
        # Tokenize the text.
        tokens = word_tokenize(clean_text)
        # Tracing unfiltered-token count.
        token_cnt += len(tokens)
        # Filter stop words.
        filtered_text = [w for w in tokens if w.lower() not in stop_word_set]
        # Tracing filtered-token count for debugging.
        filtered_token_cnt += len(filtered_text)
        # Add the filtered words into corpus token_list.
        corpus_token_list.append(filtered_text)
        # Tracing row-count for debugging.
        i += 1
except Exception as e:
    # Best effort: report the failing row and keep what was processed so far.
    print(f'Exception {e} in {i}.')

cleanse_data = pd.DataFrame(cleanse_rows, columns=['row', 'original', 'cleansed'])

print(f'Rows processed:[{i}], unfiltered tokens:[{token_cnt}], filtered tokens:[{filtered_token_cnt}]')
print(f'Corpus entry count:[{len(corpus_token_list)}].')

cleanse_data.to_csv('data/cleanse-data-freq.csv', index=False)

In [None]:
# Spot-check a slice of the trace to compare the original text with its cleansed form.
cleanse_data.iloc[580:590]

In [None]:
# TODO: 23-03-2025: Wrong word 'dressingsthese' appeared in the medical.txt.  This is GIGO.

# 2. Get lemma for each words to avoid redundant word with the same meaning
from nltk.stem import WordNetLemmatizer

# Requires the WordNet data; run nltk.download('wordnet') once if it is missing.
lemmatizer = WordNetLemmatizer()

# List for storing token's lemma (one list of lemmas per answer).
corpus_lemma_list = []

# [row, original token, lemma] records for tracing raw to normalized.
trace_list = []
for row, entry in enumerate(corpus_token_list):
    entry_list = [lemmatizer.lemmatize(token) for token in entry]
    trace_list.extend(
        [row, token, normalized_token]
        for token, normalized_token in zip(entry, entry_list)
    )

    # Add entry (token for single answer) in the list for further processing.
    corpus_lemma_list.append(entry_list)

# Tracing values for debugging: processed rows and processed tokens.
i = len(corpus_lemma_list)
filtered_token_cnt = len(trace_list)

print(f'Rows processed:[{i}], filtered tokens:[{filtered_token_cnt}]')
# Report the size of the lemma corpus that was just built.
print(f'Corpus entry count:[{len(corpus_lemma_list)}].')

normalized_data = pd.DataFrame(trace_list, columns=['row', 'original', 'normalized'])
normalized_data.to_csv('data/normalized-data-freq.csv', index=False)

In [None]:
# Spot-check a slice of the token-to-lemma trace.
normalized_data.iloc[1780:1790]

In [None]:
# 'dressingsthese' turned up during a random check of the vocab.
# Trace which rows of the normalized data produced it.
is_suspect = normalized_data['normalized'] == 'dressingsthese'
normalized_data[is_suspect]

In [None]:
# The trace above indicates that 'dressingsthese' originated from the source.
# Look at the answer with row-id=32 in the source data.
s3 = df['answer'].iloc[32:33].tolist()
print(s3)

In [None]:
# Upon investigation this was found to be GIGO: show the original text traced for row 32.
is_row_32 = cleanse_data['row'] == 32
s1 = cleanse_data.loc[is_row_32, 'original'].tolist()
print(s1)

In [None]:
# Let's see the most frequent words in the vocab.
import wordcloud
import matplotlib.pyplot as plt

# Flatten every lemma into one space-separated string. str.join builds the
# string in a single pass instead of re-copying it on every `+=`.
text = ' '.join(token for entry in corpus_lemma_list for token in entry)

word_cloud = wordcloud.WordCloud(background_color='white').generate(text)
plt.figure(figsize=(15, 8), facecolor=None)
plt.imshow(word_cloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# 3. Only include unique words into vocab

from collections import Counter

# Count occurrences of every lemma across all answers; the Counter keys are
# the unique words. The corpus itself is left in its original row order:
# counting does not depend on order, and the vocab is sorted below.
word_counts = Counter(word for entry in corpus_lemma_list for word in entry)

# Final vocab: one {"text", "freq"} record per unique lemma.
vocab_list = [{"text": word, "freq": count} for word, count in word_counts.items()]

# Sort by word length first, then alphabetically within the same length.
vocab_list.sort(key=lambda x: (len(x["text"]), x["text"]))

print(f'Vocab entry count (unique words):[{len(vocab_list)}].')

## 3. What is the expected outcome?
Now we have the unique medical words stored in the vocab, ready for creating a frequency corpus for the noisy-channel model.

In [None]:
# Save the vocab into custom NLTK corpus format.

import os

# Get corpus path from app config.
config = Configuration()

# Organize the vocab into custom folder (created on demand; no error if it
# already exists).
corpus_dir = config.config_values['corpus_medical_freq_dir']
os.makedirs(corpus_dir, exist_ok=True)

corpus_name = config.config_values['corpus_medical_freq_name']

# A dedicated name keeps the source dataset held in `df` intact, so earlier
# cells can still be re-run after this one.
vocab_df = pd.DataFrame(vocab_list)
# One "text<TAB>freq" row per vocab entry, without index or header row.
vocab_df.to_csv(os.path.join(corpus_dir, corpus_name), sep="\t", index=False, header=False)

# Preview the last entries; as the final expression of the cell it is rendered.
vocab_df.tail()

In [None]:
# Load the custom NLTK corpus.
from nltk.corpus import PlaintextCorpusReader

# Step 3: Create an NLTK Corpus Reader over every .tsv file in the corpus folder.
# The pattern is a raw string because '\.' is an invalid escape sequence in a
# normal string literal.
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.tsv')

# NOTE(review): the saved file holds "text<TAB>freq" rows, so words() presumably
# yields the frequency tokens as well as the vocab words - confirm this count.
print(f'There are {len(corpus.words())} words in custom corpus.')

In [None]:
# Testing: use the loaded corpus to look for a word one edit away from the given test word.
from nltk import edit_distance

test_word = 'glacoma'
corpus_words = corpus.words()

if test_word in corpus_words:
    print(f'Test word {test_word} exists in corpus.')
else:
    print(f'Test word {test_word} not in corpus.')
    # Report only the first corpus word at edit distance 1, then stop.
    for candidate in corpus_words:
        distance = edit_distance(test_word, candidate)
        if distance == 1:
            print(f'Word {candidate} edit distance is {distance}.')
            break
