In [6]:
import nltk
# Fetch the 'punkt_tab' NLTK data package; the call returns True and reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
import nltk
from nltk.util import ngrams
from nltk.corpus import reuters
from collections import defaultdict, Counter

# Download the 'punkt' and 'reuters' NLTK data packages used by this cell.
nltk.download('punkt')
nltk.download('reuters')

# Sentences of the Reuters corpus restricted to the 'acq' category; each
# sentence is consumed below as an iterable of word strings.
corpus = reuters.sents(categories='acq')

def build_bigram_model(corpus):
    """Count how often each word is followed by each other word.

    Args:
        corpus: Iterable of sentences, each an iterable of word strings.

    Returns:
        A ``defaultdict`` mapping a lowercased word to a ``Counter`` of the
        lowercased words seen directly after it. Sentence boundaries are
        represented by the padding symbols ``"<s>"`` and ``"</s>"``.
    """
    successor_counts = defaultdict(Counter)
    for tokens in corpus:
        lowered_tokens = [token.lower() for token in tokens]
        # Padding on both sides yields a ("<s>", first) and a (last, "</s>") pair.
        padded_pairs = ngrams(
            lowered_tokens,
            2,
            pad_left=True,
            pad_right=True,
            left_pad_symbol="<s>",
            right_pad_symbol="</s>",
        )
        for previous, following in padded_pairs:
            successor_counts[previous][following] += 1
    return successor_counts

# Build the bigram successor counts once for the corpus loaded in this cell.
bigram_model = build_bigram_model(corpus)

def autocomplete(text, bigram_model, num_words=5):
    """Greedily extend ``text`` with the most frequent bigram successors.

    Args:
        text: Prompt string. Its last whitespace-separated token, lowercased,
            seeds the prediction.
        bigram_model: Mapping ``word -> Counter`` of next-word counts.
        num_words: Maximum number of words to append.

    Returns:
        ``text`` followed by a single space and the predicted words joined by
        spaces. ``text`` is returned unchanged when nothing can be predicted
        (empty or whitespace-only prompt, unseen last word, or the most likely
        successor is the end-of-sentence marker ``"</s>"``).
    """
    words = text.split()
    if not words:
        # Nothing to condition on; indexing words[-1] would raise IndexError.
        return text

    last_word = words[-1].lower()
    predictions = []

    for _ in range(num_words):
        if last_word not in bigram_model:
            break
        top_successor = bigram_model[last_word].most_common(1)
        if not top_successor:
            # The word is known but has no recorded successors.
            break
        next_word = top_successor[0][0]
        if next_word == "</s>":
            break
        predictions.append(next_word)
        last_word = next_word

    if not predictions:
        # Avoid returning the prompt with a dangling trailing space.
        return text
    return text + " " + " ".join(predictions)

# Demo: extend the prompt by at most three words using the bigram model built
# in this cell, then print the completed text.
input_text = "Machine learning is"
completed_text = autocomplete(input_text, bigram_model, num_words=3)
print("Autocomplete:", completed_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Autocomplete: Machine learning is expected to buy


In [7]:
import os
import nltk
from nltk.util import ngrams
from collections import defaultdict, Counter


# Keep the NLTK data for this cell in a dedicated directory under the user's home.
custom_nltk_path = os.path.expanduser('~/custom_nltk_data')
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory between the check and the call.
os.makedirs(custom_nltk_path, exist_ok=True)


nltk.download('punkt', download_dir=custom_nltk_path)
nltk.download('gutenberg', download_dir=custom_nltk_path)


# Register the directory on NLTK's data search path. Guarded so that
# re-running the cell does not keep appending duplicate entries.
if custom_nltk_path not in nltk.data.path:
    nltk.data.path.append(custom_nltk_path)

# All sentences of the Gutenberg corpus; each sentence is consumed below as an
# iterable of word strings.
corpus = nltk.corpus.gutenberg.sents()

def build_bigram_model(corpus):
    """Tally next-word frequencies over every sentence in ``corpus``.

    Args:
        corpus: Iterable of sentences, each an iterable of word strings.

    Returns:
        A ``defaultdict`` keyed by lowercased word whose values are
        ``Counter`` objects of the lowercased words that followed it.
        ``"<s>"`` and ``"</s>"`` mark the start and end of each sentence.
    """
    # Padding arguments shared by every ngrams() call below.
    padding = dict(
        pad_left=True,
        pad_right=True,
        left_pad_symbol="<s>",
        right_pad_symbol="</s>",
    )
    model = defaultdict(Counter)
    for raw_sentence in corpus:
        lowered = [token.lower() for token in raw_sentence]
        for first, second in ngrams(lowered, 2, **padding):
            model[first][second] += 1
    return model

# Build the bigram successor counts for the corpus assigned in this cell.
# NOTE(review): this rebinds the global name `bigram_model`.
bigram_model = build_bigram_model(corpus)


def autocomplete(text, bigram_model, num_words=5):
    """Greedily extend ``text`` with the most frequent bigram successors.

    Args:
        text: Prompt string. Its last whitespace-separated token, lowercased,
            seeds the prediction.
        bigram_model: Mapping ``word -> Counter`` of next-word counts.
        num_words: Maximum number of words to append.

    Returns:
        ``text`` followed by a single space and the predicted words joined by
        spaces. ``text`` is returned unchanged when nothing can be predicted
        (empty or whitespace-only prompt, unseen last word, or the most likely
        successor is the end-of-sentence marker ``"</s>"``).
    """
    words = text.split()
    if not words:
        # Nothing to condition on; indexing words[-1] would raise IndexError.
        return text

    last_word = words[-1].lower()
    predictions = []

    for _ in range(num_words):
        if last_word not in bigram_model:
            break
        top_successor = bigram_model[last_word].most_common(1)
        if not top_successor:
            # The word is known but has no recorded successors.
            break
        next_word = top_successor[0][0]
        if next_word == "</s>":
            break
        predictions.append(next_word)
        last_word = next_word

    if not predictions:
        # Avoid returning the prompt with a dangling trailing space.
        return text
    return text + " " + " ".join(predictions)


# Demo: extend the prompt by at most three words using the bigram model built
# in this cell, then print the completed text.
input_text = "machine learning is"
completed_text = autocomplete(input_text, bigram_model, num_words=3)
print("Autocomplete:", completed_text)


[nltk_data] Downloading package punkt to /root/custom_nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/custom_nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Autocomplete: machine learning is the lord ,


In [None]:
import nltk
from nltk.corpus import treebank, words
from nltk.tag import hmm
from nltk.metrics.distance import edit_distance

# Download the NLTK data packages this cell reads from.
nltk.download('treebank')
nltk.download('universal_tagset')
nltk.download('words')

# Tagged treebank sentences with tags mapped to the 'universal' tagset.
train_sents = treebank.tagged_sents(tagset='universal')
# Lowercase every dictionary entry: is_correct() lowercases the word before
# the membership test, so an entry stored with capital letters could never match.
english_words = set(entry.lower() for entry in words.words())
train_vocab = set(word.lower() for sent in train_sents for word, _ in sent)
# Union of the dictionary word list and every word seen in the training sentences.
full_vocabulary = english_words.union(train_vocab)

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train(train_sents)

# Hand-written lookup of known typos (lowercase) and the word each one should become.
common_misspellings = dict([
    ('teh', 'the'),
    ('quikc', 'quick'),
    ('brownn', 'brown'),
    ('fxo', 'fox'),
    ('jupms', 'jumps'),
    ('lazzy', 'lazy'),
])

def is_correct(word):
    """Return True when the lowercased ``word`` is a member of ``full_vocabulary``."""
    normalized = word.lower()
    return normalized in full_vocabulary

def suggest_corrections(word, candidates, max_distance=2):
    """Return the candidate closest to ``word`` by edit distance.

    Candidates within ``max_distance`` edits are preferred. If there are none,
    candidates at ``max_distance + 1`` edits are accepted. Ties are resolved in
    favour of the candidate encountered first while iterating ``candidates``.

    Args:
        word: The string to find a replacement for.
        candidates: Iterable of candidate strings; it is traversed once.
        max_distance: Preferred maximum edit distance.

    Returns:
        The closest acceptable candidate, or ``word`` itself when no candidate
        lies within ``max_distance + 1`` edits.
    """
    # The closest candidate overall is the answer for both the strict and the
    # relaxed threshold, so one pass with a single distance computation per
    # candidate is enough.
    limit = max_distance + 1
    word_length = len(word)
    best_candidate = word
    best_distance = None

    for candidate in candidates:
        # The edit distance is never smaller than the difference in length,
        # so such candidates cannot qualify; skip the expensive computation.
        if abs(len(candidate) - word_length) > limit:
            continue
        distance = edit_distance(word, candidate)
        if distance > limit:
            continue
        # Strict comparison keeps the first candidate among equally close ones.
        if best_distance is None or distance < best_distance:
            best_candidate, best_distance = candidate, distance

    return best_candidate

def hmm_spell_checker(sentence, tagger, common_misspellings):
    """Replace words that fail ``is_correct`` and return the joined sentence.

    For each word that is not recognised, the lowercased word is first looked
    up in ``common_misspellings``; if absent, ``suggest_corrections`` searches
    ``full_vocabulary`` for the closest entry. Every replacement is reported
    with ``print``.

    Args:
        sentence: Iterable of word strings (already tokenized).
        tagger: Accepted for interface compatibility; this function does not
            use it.
        common_misspellings: Mapping from lowercase misspelling to correction.

    Returns:
        The processed words joined by single spaces.
    """
    corrected_sentence = []

    for word in sentence:
        if is_correct(word):
            corrected_sentence.append(word)
            continue

        lowered = word.lower()
        correction = common_misspellings.get(lowered)
        if correction:
            corrected_sentence.append(correction)
            print(f"Correcting '{word}' to '{correction}'")
            continue

        # Search with the lowercased form: is_correct() compares lowercased
        # words against the vocabulary, and a case mismatch would otherwise be
        # counted as extra edits in the distance.
        suggested_word = suggest_corrections(lowered, full_vocabulary, max_distance=2)
        if suggested_word == lowered:
            # No candidate was close enough; keep the caller's original spelling.
            suggested_word = word
        corrected_sentence.append(suggested_word)
        print(f"Suggesting '{suggested_word}' for '{word}'")

    return " ".join(corrected_sentence)

# Demo input: an already-tokenized sentence containing several misspellings,
# some covered by common_misspellings and some left to the edit-distance search.
input_sentence = ['The', 'quikc', 'brownn', 'fxo', 'jupms', 'over', 'the', 'lazzy', 'dog', 'cta', 'cet','catt']
corrected_text = hmm_spell_checker(input_sentence, tagger, common_misspellings)
print("\nCorrected Sentence:")
print(corrected_text)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Correcting 'quikc' to 'quick'
Correcting 'brownn' to 'brown'
Correcting 'fxo' to 'fox'
Correcting 'jupms' to 'jumps'
Correcting 'lazzy' to 'lazy'
Suggesting 'eta' for 'cta'
Suggesting 'cest' for 'cet'
Suggesting 'batt' for 'catt'

Corrected Sentence:
The quick brown fox jumps over the lazy dog eta cest batt
