In [1]:
pip install nlppreprocess

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Read the cleansed data produced by preprocess-corpus
import pandas as pd
import nltk

from nlppreprocess import NLP
from nltk.util import ngrams
from collections import defaultdict, Counter

df = pd.read_csv('data/cleanse-data.csv')

# Only the first cleansed document is used by the cells below, so take it
# directly instead of looping over the whole column. A missing 'cleansed'
# column now raises KeyError here instead of being printed and ignored.
cleansed_docs = df['cleansed']

# Fall back to an empty string when the CSV contains no rows.
clean_text = cleansed_docs.iloc[0] if len(cleansed_docs) > 0 else ''

print("clean text:", clean_text)

clean text: glaucoma is a group of diseases that can damage the eye s optic nerve and result in vision loss and blindness while glaucoma can strike anyone the risk is much greater for people over how glaucoma develops there are several different types of glaucoma most of these involve the drainage system within the eye at the front of the eye there is a small space called the anterior chamber a clear fluid flows through this chamber and bathes and nourishes the nearby tissues button on your keyboard in glaucoma for still unknown reasons the fluid drains too slowly out of the eye as the fluid builds up the pressure inside the eye rises unless this pressure is controlled it may cause damage to the optic nerve and other parts of the eye and result in loss of vision open angle glaucoma the most common type of glaucoma is called open angle glaucoma in the normal eye the clear fluid leaves the anterior chamber at the open angle where the cornea and iris meet when fluid reaches the angle it f

In [3]:
# Run the first document through nlppreprocess's NLP pipeline with its default
# settings. Judging by the recorded output, this drops common stop words
# (e.g. "is", "a", "of", "the") — NOTE(review): confirm the exact defaults.
nlp = NLP()
cl_text = nlp.process(clean_text)
print(cl_text)

glaucoma group diseases can damage eye s optic nerve and result in vision loss and blindness glaucoma can strike anyone risk much greater people over how glaucoma develops there are several different types glaucoma most these involve drainage system within eye front eye there small space called anterior chamber clear fluid flows through chamber and bathes and nourishes nearby tissues button your keyboard in glaucoma still unknown reasons fluid drains too slowly out eye fluid builds up pressure inside eye rises unless pressure controlled may cause damage optic nerve and parts eye and result in loss vision open angle glaucoma most common type glaucoma called open angle glaucoma in normal eye clear fluid leaves anterior chamber open angle where cornea and iris meet when fluid reaches angle flows through spongy meshwork like drain and leaves eye sometimes when fluid reaches angle passes too slowly through meshwork drain causing pressure inside eye build pressure damages optic nerve open an

In [4]:
# Split the preprocessed text into word tokens with NLTK's tokenizer.
# NOTE(review): nltk.word_tokenize presumably needs the NLTK tokenizer data
# ("punkt") downloaded beforehand — confirm on a fresh environment.
tokens = nltk.word_tokenize(cl_text)
print(tokens)

['glaucoma', 'group', 'diseases', 'can', 'damage', 'eye', 's', 'optic', 'nerve', 'and', 'result', 'in', 'vision', 'loss', 'and', 'blindness', 'glaucoma', 'can', 'strike', 'anyone', 'risk', 'much', 'greater', 'people', 'over', 'how', 'glaucoma', 'develops', 'there', 'are', 'several', 'different', 'types', 'glaucoma', 'most', 'these', 'involve', 'drainage', 'system', 'within', 'eye', 'front', 'eye', 'there', 'small', 'space', 'called', 'anterior', 'chamber', 'clear', 'fluid', 'flows', 'through', 'chamber', 'and', 'bathes', 'and', 'nourishes', 'nearby', 'tissues', 'button', 'your', 'keyboard', 'in', 'glaucoma', 'still', 'unknown', 'reasons', 'fluid', 'drains', 'too', 'slowly', 'out', 'eye', 'fluid', 'builds', 'up', 'pressure', 'inside', 'eye', 'rises', 'unless', 'pressure', 'controlled', 'may', 'cause', 'damage', 'optic', 'nerve', 'and', 'parts', 'eye', 'and', 'result', 'in', 'loss', 'vision', 'open', 'angle', 'glaucoma', 'most', 'common', 'type', 'glaucoma', 'called', 'open', 'angle', 'g

In [5]:
# Exploratory only — not required in the service.
# Materialise every consecutive 3-token window of `tokens` as a tuple and
# print the full list for inspection.
trigrams = list(ngrams(tokens, 3))
print("Trigrams:", trigrams)

Trigrams: [('glaucoma', 'group', 'diseases'), ('group', 'diseases', 'can'), ('diseases', 'can', 'damage'), ('can', 'damage', 'eye'), ('damage', 'eye', 's'), ('eye', 's', 'optic'), ('s', 'optic', 'nerve'), ('optic', 'nerve', 'and'), ('nerve', 'and', 'result'), ('and', 'result', 'in'), ('result', 'in', 'vision'), ('in', 'vision', 'loss'), ('vision', 'loss', 'and'), ('loss', 'and', 'blindness'), ('and', 'blindness', 'glaucoma'), ('blindness', 'glaucoma', 'can'), ('glaucoma', 'can', 'strike'), ('can', 'strike', 'anyone'), ('strike', 'anyone', 'risk'), ('anyone', 'risk', 'much'), ('risk', 'much', 'greater'), ('much', 'greater', 'people'), ('greater', 'people', 'over'), ('people', 'over', 'how'), ('over', 'how', 'glaucoma'), ('how', 'glaucoma', 'develops'), ('glaucoma', 'develops', 'there'), ('develops', 'there', 'are'), ('there', 'are', 'several'), ('are', 'several', 'different'), ('several', 'different', 'types'), ('different', 'types', 'glaucoma'), ('types', 'glaucoma', 'most'), ('glaucom

In [6]:
class NGramModel:
    """Count-based n-gram model that predicts the most frequent next token.

    For every n-gram found in the training tokens, the first ``n - 1`` tokens
    form the prefix and the last token is counted as a continuation of that
    prefix.

    Attributes:
        n: Order of the model (e.g. 3 for a trigram model).
        model: Maps each prefix tuple to a ``Counter`` of next-token counts.
    """

    def __init__(self, n, tokens):
        """Create a model of order ``n`` and train it on ``tokens``.

        Args:
            n: Size of the n-grams to count; must be at least 1.
            tokens: Sequence of tokens to learn from.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be >= 1, got {n}")
        self.n = n
        self.model = defaultdict(Counter)
        self.build_model(tokens)

    def build_model(self, tokens):
        """Add the counts of every n-gram in ``tokens`` to the model.

        Args:
            tokens: Sequence of tokens to learn from.
        """
        # Consume the n-grams lazily; there is no need to build a list first.
        for ngram in ngrams(tokens, self.n):
            prefix, next_word = tuple(ngram[:-1]), ngram[-1]
            self.model[prefix][next_word] += 1

    def predict_next(self, context):
        """Return the most frequent continuation of ``context``.

        Only the last ``n - 1`` tokens of ``context`` are used for the lookup.

        Args:
            context: Sliceable sequence of preceding tokens.

        Returns:
            The token seen most often after the prefix, or ``None`` when the
            prefix was never observed during training.
        """
        prefix_len = self.n - 1
        # context[-0:] is the *whole* context, so a unigram model (n == 1)
        # must use the empty prefix explicitly to match what build_model stored.
        prefix = tuple(context[-prefix_len:]) if prefix_len > 0 else ()
        # .get() does not insert a new empty Counter into the defaultdict.
        followers = self.model.get(prefix)
        if not followers:
            return None  # No prediction
        return followers.most_common(1)[0][0]  # Most probable next word

# Create a trigram model (n=3): each prediction is conditioned on the two
# preceding tokens. NOTE: the variable is called `bigram_model` although the
# order is 3; the name is kept because a later cell refers to it.
bigram_model = NGramModel(3, tokens)

# Predict the next word for a two-token context
context = ["glaucoma", "group"]
predicted_word = bigram_model.predict_next(context)
print(f"Predicted next word: {predicted_word}")

Predicted next word: diseases


In [7]:
def generate_text(model, seed, num_words=10):
    """Greedily extend ``seed`` with up to ``num_words`` predicted tokens.

    Args:
        model: Object exposing ``predict_next(context)`` that returns the next
            token, or ``None`` when it has no prediction.
        seed: Iterable of starting tokens (list, tuple, ...). It is copied,
            so the caller's object is never modified.
        num_words: Maximum number of tokens to append.

    Returns:
        The seed tokens followed by the generated tokens, joined by single
        spaces. Generation stops early as soon as the model returns ``None``.
    """
    # list(seed) copies the seed and also accepts tuples and other iterables,
    # which `seed[:]` followed by `.append` did not.
    text = list(seed)
    for _ in range(num_words):
        next_word = model.predict_next(text)
        # Compare against None explicitly: only "no prediction" ends the loop.
        if next_word is None:
            break
        text.append(next_word)
    return " ".join(text)

# Example: extend the seed "glaucoma group" by a single predicted word
generated_text = generate_text(bigram_model, ["glaucoma", "group"], num_words=1)
print("Generated text:", generated_text)

Generated text: glaucoma group diseases
