# Objectives:
We need to build a medical vocabulary (word dictionary) for a spelling checker. The following dataset was found useful for building the vocab: (https://www.kaggle.com/datasets/jpmiller/layoutlm/data)

Before building the vocab, the following questions need to be clarified:
1. What does the dataset contain (EDA)?
2. Which features are needed?
3. What is the expected outcome?

Once these questions are clarified, we can proceed to build a custom NLTK corpus.

In [None]:
import pandas as pd
from nltk import RegexpTokenizer
from nltk.tokenize import word_tokenize
import statistics

from nltk.translate.bleu_score import corpus_bleu

from app_config import Configuration

In [None]:
# Load the dataset used for vocabulary building from the local CSV file.
df = pd.read_csv('data/medquad-kaggle-johnm.csv')

In [None]:
# Show the dataset dimensions as (rows, columns).
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

## 1. What does the dataset contain?
We need a corpus that contains medical words for building a medical dictionary (vocab). Therefore, we need to investigate whether the obtained corpus contains the required words.

In [None]:
# Let's investigate the columns in the dataset, their data types and
# their non-null counts.
df.info()

In [None]:
# It appears there are four columns, and all of them hold string data.
# Let's summarise each column to see which one is most sensible for
# building the vocab.
df.describe()

In [None]:
# Helper for the token statistics below: it is applied to every question and
# every answer to work out how many tokens each column holds.
def token_count(x):
    """Return the number of tokens NLTK's ``word_tokenize`` finds in ``x``."""
    tokens = word_tokenize(x)
    return len(tokens)

Before we use `word_tokenize` to count the tokens in each column, we need to drop the missing values to avoid exceptions.

In [None]:
# Let's find out how many missing (null) values each column has first.
df.isnull().sum()

In [None]:
# The null counts above show that the question column has no missing values,
# while the answer column has 5, so the questions can be tokenized as they are.
question_token_counts = df['question'].apply(token_count)
cnt = question_token_counts.sum()
print(f'Questions have {cnt} count of tokens.')

In [None]:
# Discard the rows whose answer is missing; missing values in any other
# column do not cause a row to be dropped.
df = df.dropna(subset=['answer'])

answer_token_counts = df['answer'].apply(token_count)
cnt = answer_token_counts.sum()
print(f'Answers have {cnt} count of tokens.')

# 2. Building Bi-gram model

To build the bi-gram model, we need to:

1. prepare the methods to save and load the bi-gram model in a pkl file
2. clean the text from the answer column
3. split the paragraphs in the answer column into sentences
4. use nlppreprocess to preprocess the text of each sentence
5. tokenize each sentence after nlppreprocess
6. add padding after tokenization
7. use nltk.bigrams to build the bi-gram model
8. save the model into a pkl file
9. verify the model


# 1. prepare the methods to save and load the bi-gram model in a pkl file

In [None]:
import pickle
from collections import Counter
from nltk import bigrams
from collections import defaultdict, Counter

# Path of the pickle file that save_model() writes and load_model() reads.
model_path = "data/bigram_freq.pkl"
# Module-level token list; rebound by the tokenization code further down.
tokens = []
# Nested counts: bigram_freq[w1][w2] is how many times w2 was seen right after w1.
bigram_freq = defaultdict(Counter)

def save_model():
    """Pickle the module-level ``bigram_freq`` to the file at ``model_path``.

    Any existing file at that path is overwritten. A confirmation line is
    printed once the file has been written.
    """
    with open(model_path, mode='wb') as out_file:
        pickle.dump(bigram_freq, out_file)
    print(f"Bi-gram saved to {model_path}")

def load_model():
    """Read the pickled bi-gram model from ``model_path`` and return it.

    The module-level ``bigram_freq`` is not touched; callers assign the
    returned object themselves.

    NOTE: unpickling can execute arbitrary code, so only load model files
    that were produced by ``save_model`` from a trusted source.
    """
    with open(model_path, mode='rb') as in_file:
        return pickle.load(in_file)

# Example usage: smoke-test the save/load round trip on one toy sentence.
# NOTE: this rebinds the module-level `tokens` and `bigram_freq`, and it ends
# by overwriting the file at model_path with an empty model.
if __name__ == "__main__":
    text = "this is a simple example to demonstrate bigram saving and loading and it's include possesion, I'm Phang Yuen Jun"
    tokens = word_tokenize(text.lower())
    bigram_list = list(bigrams(tokens))
    
    print(bigram_list)
    
    # Count each (w1, w2) pair into the nested counter.
    for w1, w2 in bigram_list:
        bigram_freq[w1][w2] += 1
        
    # Round trip 1: save the toy counts, then read them back and show them.
    save_model()
    bigram_freq = load_model()
    print(bigram_freq)
    
    # Round trip 2: reset to an empty model and save/load again, which leaves
    # an empty model both in memory and on disk.
    bigram_freq = defaultdict(Counter)
    save_model()
    bigram_freq = load_model()
    print(bigram_freq)
    

# 2. clean the text from answer column

In [None]:
import utils.regex as rx
from importlib import reload

# Re-import the regex helpers so that edits to utils/regex.py are picked up
# without restarting the session.
reload(rx)

# Number of rows cleaned so far; reported if cleaning fails part-way through.
i = 0
clean_text = ''
# Cleaned answers are collected here and joined once at the end. Growing the
# corpus with `final_text + ' ' + clean_text` would copy the whole accumulated
# string for every row, which is quadratic in the corpus size.
cleaned_parts = []

try:
    for text in df['answer']:
        # Remove URLs.
        clean_text = rx.remove_url(text)
        # Remove HTML tags.
        clean_text = rx.remove_html(clean_text)
        # Remove bracketed words (usually acronyms).
        clean_text = rx.remove_bracketed_text(clean_text)
        # Ignore empty results until the first non-empty one so the corpus
        # never starts with a separator; after that every row is kept.
        if cleaned_parts or clean_text:
            cleaned_parts.append(clean_text)
        # Tracing row-count for debugging.
        i += 1
except Exception as e:
    # Best effort: report the failing row and keep what was cleaned so far.
    print(f'Exception {e} in {i}.')

# One space-separated string holding every cleaned answer.
final_text = ' '.join(cleaned_parts)
clean_text = final_text
print(clean_text)

In [None]:
# Rough size check: number of whitespace-separated words in the cleaned corpus.
s1 = clean_text.split()
print(len(s1))

# 3. split the paragraphs in the answer column into sentences

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Split the cleaned corpus into sentences with NLTK's sentence tokenizer
# and report how many sentences were found.
sentences = sent_tokenize(clean_text)
print(len(sentences))

# 4. use nlppreprocess to preprocess each sentence

In [None]:
from nlppreprocess import NLP
# One preprocessor instance is shared by every sentence; building NLP() inside
# the loop repeated the same construction once per sentence.
# NOTE(review): assumes NLP.process() keeps no state between calls - confirm
# against the nlppreprocess documentation.
nlp = NLP()
clean_sentences = []

for sentence in sentences:
    sentence = sentence.lower()
    # Sentences that are empty or whitespace-only contribute nothing.
    if not sentence.strip():
        continue
    clean_sentences.append(nlp.process(sentence))

print(len(clean_sentences))

# 5. tokenize each sentence after nlppreprocess

In [None]:
from nltk.tokenize import word_tokenize

# Stand-alone tokens that are re-attached to the preceding word as a
# contraction suffix. Built once instead of once per sentence.
CONTRACTION_SUFFIXES = frozenset({"s", "re", "m", "ll", "t", "ve"})


def merge_contractions(tokens):
    """Re-attach contraction suffixes to the token in front of them.

    The list is scanned left to right. Whenever the token after the current
    one is in ``CONTRACTION_SUFFIXES`` the pair is emitted as ``word'suffix``
    and both tokens are consumed; otherwise the current token is emitted
    unchanged. For example ``["it", "s", "fine"]`` becomes
    ``["it's", "fine"]``.

    This is a heuristic: any stand-alone ``s``, ``t``, ``m`` ... token is
    treated as a suffix, whatever precedes it.
    NOTE(review): presumably the earlier preprocessing strips apostrophes so
    that contractions arrive split this way - confirm.

    Args:
        tokens: List of token strings.

    Returns:
        A new list of tokens; ``tokens`` itself is not modified.
    """
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        if i + 1 < n and tokens[i + 1] in CONTRACTION_SUFFIXES:
            merged.append(tokens[i] + "'" + tokens[i + 1])
            i += 2  # The suffix has been consumed together with its word.
        else:
            merged.append(tokens[i])
            i += 1
    return merged


for clean_sentence in clean_sentences:
    # Show the raw tokens first, then the tokens after contraction merging.
    tokens = word_tokenize(clean_sentence.lower())
    print(tokens)

    tokens = merge_contractions(tokens)
    print(tokens)

In [None]:
# Stand-alone tokens that are re-attached to the preceding word as a
# contraction suffix.
CONTRACTION_SUFFIXES = frozenset({"s", "re", "m", "ll", "t", "ve"})


def merge_contractions(tokens):
    """Re-attach contraction suffixes to the token in front of them.

    Whenever the token after the current one is in ``CONTRACTION_SUFFIXES``
    the pair is emitted as ``word'suffix`` and both tokens are consumed;
    otherwise the current token is emitted unchanged.

    Args:
        tokens: List of token strings.

    Returns:
        A new list of tokens; ``tokens`` itself is not modified.
    """
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        if i + 1 < n and tokens[i + 1] in CONTRACTION_SUFFIXES:
            merged.append(tokens[i] + "'" + tokens[i + 1])
            i += 2  # The suffix has been consumed together with its word.
        else:
            merged.append(tokens[i])
            i += 1
    return merged


# Total number of tokens across all cleaned sentences, after contraction
# merging. Each sentence has to be tokenized inside the loop: reusing the
# `tokens` variable left over from an earlier cell would add the length of
# one and the same sentence on every iteration.
total = 0
for clean_sentence in clean_sentences:
    tokens = merge_contractions(word_tokenize(clean_sentence.lower()))
    total += len(tokens)

print(total)

# 6. add padding after tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Stand-alone tokens that are re-attached to the preceding word as a
# contraction suffix. Built once instead of once per sentence.
CONTRACTION_SUFFIXES = frozenset({"s", "re", "m", "ll", "t", "ve"})


def merge_contractions(tokens):
    """Re-attach contraction suffixes to the token in front of them.

    Whenever the token after the current one is in ``CONTRACTION_SUFFIXES``
    the pair is emitted as ``word'suffix`` and both tokens are consumed;
    otherwise the current token is emitted unchanged.

    Args:
        tokens: List of token strings.

    Returns:
        A new list of tokens; ``tokens`` itself is not modified.
    """
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        if i + 1 < n and tokens[i + 1] in CONTRACTION_SUFFIXES:
            merged.append(tokens[i] + "'" + tokens[i + 1])
            i += 2  # The suffix has been consumed together with its word.
        else:
            merged.append(tokens[i])
            i += 1
    return merged


for clean_sentence in clean_sentences:
    tokens = merge_contractions(word_tokenize(clean_sentence.lower()))
    print(tokens)
    # Add padding (start and end symbols) so that sentence boundaries take
    # part in the bi-grams.
    padded_tokens = ["<s>"] + tokens + ["</s>"]
    print(padded_tokens)

# 7. using nltk.bigrams to build the bi-gram model

In [None]:
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
from collections import defaultdict, Counter

# Stand-alone tokens that are re-attached to the preceding word as a
# contraction suffix. Built once instead of once per sentence.
CONTRACTION_SUFFIXES = frozenset({"s", "re", "m", "ll", "t", "ve"})


def merge_contractions(tokens):
    """Re-attach contraction suffixes to the token in front of them.

    Whenever the token after the current one is in ``CONTRACTION_SUFFIXES``
    the pair is emitted as ``word'suffix`` and both tokens are consumed;
    otherwise the current token is emitted unchanged.

    Args:
        tokens: List of token strings.

    Returns:
        A new list of tokens; ``tokens`` itself is not modified.
    """
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        if i + 1 < n and tokens[i + 1] in CONTRACTION_SUFFIXES:
            merged.append(tokens[i] + "'" + tokens[i + 1])
            i += 2  # The suffix has been consumed together with its word.
        else:
            merged.append(tokens[i])
            i += 1
    return merged


# Start from an empty model: bigram_freq[w1][w2] counts how many times w2
# directly follows w1 across all cleaned sentences.
bigram_freq = defaultdict(Counter)

for clean_sentence in clean_sentences:
    tokens = merge_contractions(word_tokenize(clean_sentence.lower()))
    # Add padding (start and end symbols) so that sentence-initial and
    # sentence-final words are counted as well.
    padded_tokens = ["<s>"] + tokens + ["</s>"]
    # bigrams() is consumed directly; no intermediate list is needed.
    for w1, w2 in bigrams(padded_tokens):
        bigram_freq[w1][w2] += 1

print(bigram_freq)

# 8. save the model into pkl file

In [None]:
# Write the current module-level bigram_freq to the pickle file at model_path.
save_model()

# 9. verify the model

In [None]:
# Read the pickled model back from disk and print it for a visual check.
result = load_model()

print("Loaded Bigram:")
print(result)

# Predict the word based on bi-gram

In [None]:
def predict_next(word, top_n=100):
    """List the words most often seen right after ``word``.

    Args:
        word: The preceding word; it is lower-cased before the lookup in the
            module-level ``bigram_freq``.
        top_n: Maximum number of followers to return.

    Returns:
        Up to ``top_n`` follower words, most frequent first. When ``word`` is
        not a key of ``bigram_freq`` the single-item list
        ``["No prediction available"]`` is returned instead.
    """
    # .get() does not insert a missing key, even on a defaultdict.
    followers = bigram_freq.get(word.lower())
    if followers is None:
        return ["No prediction available"]
    return [candidate for candidate, _count in followers.most_common(top_n)]
    

# Example: list the words recorded as following "damage" (at most the
# default top_n of them).
input_word = "damage"
predictions = predict_next(input_word)

print(predictions)

In [None]:
def rank_suggestions(previous_word, suggestions):
    """Order candidate words by how often they follow ``previous_word``.

    The counts come from the module-level ``bigram_freq``. Both
    ``previous_word`` and the candidates are lower-cased before the lookup,
    and the returned words are the lower-cased forms.

    Args:
        previous_word: The word that precedes the one being ranked.
        suggestions: Mapping whose values are the candidate words. The keys
            are not used; only the iteration order of the mapping matters.

    Returns:
        A dict mapping ``0 .. n-1`` to the candidates, highest follower count
        first. Candidates with equal counts keep their input order, and a
        candidate never seen after ``previous_word`` counts as 0.
    """
    # Look the follower counts up once rather than once per candidate;
    # .get() avoids a KeyError and does not insert into a defaultdict.
    followers = bigram_freq.get(previous_word.lower(), {})
    candidates = [word.lower() for word in suggestions.values()]
    # sorted() is stable even with reverse=True, so ties keep input order.
    ranked = sorted(
        candidates,
        key=lambda word: followers.get(word, 0),
        reverse=True,
    )
    return dict(enumerate(ranked))

# Example: rank a single candidate ("eye's") as a follower of "damage".
previous_text = "damage"
sample_suggest = {0 : "eye's"}

print(rank_suggestions(previous_text, sample_suggest))