# Objectives:
Previously, we have created two corpus files (medical-corpus and medical-freq) from the same data:(https://www.kaggle.com/datasets/jpmiller/layoutlm/data) \

Now, we are going to build b-gram model using the same dataset \

We will skip he EDA like we did previously.

In [21]:
import pandas as pd
import pickle
from nltk.tokenize import sent_tokenize
from nlppreprocess import NLP
import utils.regex as rx
from importlib import reload
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
from collections import defaultdict, Counter

In [22]:
df = pd.read_csv('data/medquad-kaggle-johnm.csv')

In [23]:
# The following function returns token count for given text, it will be used for calculating
# average tokens for questions & answers.
def token_count(x):
    return len(word_tokenize(x))

Before we use the word_tokenizer to count the tokens in each column, we need to drop the missing values to avoid exceptions.

In [24]:
# Drop only rows with missing values on the answer columns.
df = df[df['answer'].notna()]

cnt = df['answer'].apply(token_count).sum()
print(f'Answers have {cnt} count of tokens.')

Answers have 3731909 count of tokens.


# 2. Build Bi-gram model

In the process of building bi-gram model, we need to:

1. Save and load the bi-gram model in pkl file
2. Clean the text from answer column
3. Split the paragraph in answer column into sentences
4. Use `nlppreprocess` package to handle sentence text preprocessing
5. Tokenize the sentence after nlppreprocess
6. Implement padding after tokenization
7. Using nltk.bigrams to build the bi-gram model
8. Save the model into pkl file
9. Verify the model


# 1. Save and load the bi-gram model in pkl file

In [25]:
model_path = "data/bigram_freq.pkl"
tokens = []
bigram_freq = defaultdict(Counter)


def save_model():
    with open(model_path, 'wb') as f:
        pickle.dump(bigram_freq, f)
    print(f"Bi-gram saved to {model_path}")


def load_model():
    with open(model_path, 'rb') as f:
        bigram_freq = pickle.load(f)
    return bigram_freq


# Example usage
text = "this is a simple example to demonstrate bigrams model saving and loading"
tokens = word_tokenize(text.lower())
bigram_list = list(bigrams(tokens))

print(bigram_list)

for w1, w2 in bigram_list:
    bigram_freq[w1][w2] += 1

save_model()
bigram_freq = load_model()
print(bigram_freq)

bigram_freq = defaultdict(Counter)
save_model()
bigram_freq = load_model()
print(bigram_freq)

[('this', 'is'), ('is', 'a'), ('a', 'simple'), ('simple', 'example'), ('example', 'to'), ('to', 'demonstrate'), ('demonstrate', 'bigrams'), ('bigrams', 'model'), ('model', 'saving'), ('saving', 'and'), ('and', 'loading')]
Bi-gram saved to data/bigram_freq.pkl
defaultdict(<class 'collections.Counter'>, {'this': Counter({'is': 1}), 'is': Counter({'a': 1}), 'a': Counter({'simple': 1}), 'simple': Counter({'example': 1}), 'example': Counter({'to': 1}), 'to': Counter({'demonstrate': 1}), 'demonstrate': Counter({'bigrams': 1}), 'bigrams': Counter({'model': 1}), 'model': Counter({'saving': 1}), 'saving': Counter({'and': 1}), 'and': Counter({'loading': 1})})
Bi-gram saved to data/bigram_freq.pkl
defaultdict(<class 'collections.Counter'>, {})


# 2. Clean the text from answer column

In [26]:
reload(rx)
# Tracing value for debugging.
i = 0
clean_text = ''
final_text = ''

try:
    for text in df['answer']:
        # Remove URLs.
        clean_text = rx.remove_url(text)
        # # Remove HTML tags.
        clean_text = rx.remove_html(clean_text)
        # # Remove bracketed words (usually acronyms).
        clean_text = rx.remove_bracketed_text(clean_text)
        if final_text == '':
            final_text = clean_text
        else:
            final_text = final_text + ' ' + clean_text
        # Tracing row-count for debugging.
        i += 1
except Exception as e:
    print(f'Exception {e} in {i}.')

clean_text = final_text

In [27]:
s1 = clean_text.split()
print(len(s1))

3233582


# 3. Split the paragraph in answer column to sentences

In [28]:
# convert paragraph into sentences
sentences = sent_tokenize(clean_text)
print(len(sentences))

173050


# 4. Using nlppreprocess to handle each sentence text preprocessing
What it does:
1. Remove punctuations
2. Lemmatize the words

In [29]:
clean_sentences = []

for sentence in sentences:
    sentence = sentence.lower()
    if not sentence.strip():
        clean_sentence = ''
    else:
        nlp = NLP()
        clean_sentence = nlp.process(sentence)
        clean_sentences.append(clean_sentence)

print(len(clean_sentences))

173050


# 5. Tokenize the sentence after nlppreprocess

In [30]:
for clean_sentence in clean_sentences:
    tokens = word_tokenize(clean_sentence.lower())
    # clean token when possession
    clean_tokens = []
    contractions = {"s", "re", "m", "ll", "t", "ve", "t"}

    i = 0
    while i < len(tokens):
        if tokens[i] not in contractions:
            clean_tokens.append(tokens[i])
        i += 1

    tokens = clean_tokens

total = 0
for clean_sentence in clean_sentences:
    total = total + len(tokens)

print(total)

1557450


# 6. Implement padding after tokenization

In [31]:
for clean_sentence in clean_sentences:
    tokens = word_tokenize(clean_sentence.lower())
    # clean token when possession
    clean_tokens = []
    contractions = {"s", "re", "m", "ll", "t", "ve"}
    i = 0
    while i < len(tokens):
        if tokens[i] not in contractions:
            clean_tokens.append(tokens[i])
        i += 1
    tokens = clean_tokens
    # Add padding (start and end symbols)
    padded_tokens = ["<s>"] + tokens + ["</s>"]
    # print(padded_tokens)

# 7. Use nltk.bigrams to build the bi-gram model

In [32]:
bigram_freq = defaultdict(Counter)

for clean_sentence in clean_sentences:
    tokens = word_tokenize(clean_sentence.lower())
    # clean token when possession
    clean_tokens = []
    contractions = {"s", "re", "m", "ll", "t", "ve"}
    i = 0
    while i < len(tokens):
        if tokens[i] not in contractions:
            clean_tokens.append(tokens[i])
        i += 1
    tokens = clean_tokens
    # Add padding (start and end symbols)
    padded_tokens = ["<s>"] + tokens + ["</s>"]
    bigram_list = list(bigrams(padded_tokens))
    for w1, w2 in bigram_list:
        bigram_freq[w1][w2] += 1

# print(bigram_freq)

# 8. Save the model into pkl file

In [33]:
save_model()

Bi-gram saved to data/bigram_freq.pkl


# 9. Verify the model

In [34]:
result = load_model()

print("Loaded Bigram:")
# print(result)

Loaded Bigram:


# Predict the word based on bi-gram

In [35]:
def predict_next(word, top_n=10):
    word = word.lower()
    if word in bigram_freq:
        predictions = bigram_freq[word].most_common(top_n)
        return [w for w, _ in predictions]
    else:
        return ["No prediction available"]


input_word = "damage"
predictions = predict_next(input_word)

print(predictions)

['</s>', 'and', 'your', 'in', 'brain', 'heart', 'can', 'from', 'liver', 'caused']


In [36]:
def rank_suggestions(previous_word, suggestions):
    previous_word = previous_word.lower()
    ranking = {}
    for key in suggestions:
        suggestion = suggestions[key].lower()
        rank = bigram_freq.get(previous_word, {}).get(suggestion, 0)  # Avoid KeyError
        print(rank)
        if rank not in ranking:
            ranking[rank] = []
        ranking[rank].append(suggestion)
    # Sort by frequency in descending order
    ranked_suggestions = sorted(ranking.items(), key=lambda x: x[0], reverse=True)
    # Flatten sorted suggestions into a dictionary
    my_dict = {}
    i = 0
    for _, words in ranked_suggestions:
        for word in words:
            my_dict[i] = word
            i += 1
    return my_dict


previous_text = "damage"
sample_suggest = {0: "eye"}

print(rank_suggestions(previous_text, sample_suggest))

6
{0: 'eye'}
