In [None]:
import nltk
# Fetch the Punkt tokenizer data into the local nltk_data directory.
# NOTE(review): no visible cell calls a Punkt-based tokenizer directly —
# confirm this download is still required.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
from nltk.corpus import brown, words, reuters
from collections import defaultdict, Counter
import math
import re

# Download the corpora used below: Brown and Reuters feed the frequency and
# bigram tables, and the 'words' list provides the dictionary vocabulary.
for corpus_name in ('brown', 'words', 'reuters'):
    nltk.download(corpus_name)

# -----------------------------------
# Step 1: Build Word Frequency & N-gram Model
# -----------------------------------

# Build word frequency dictionary from Brown + Reuters
def build_word_frequencies():
    """Count lowercased token occurrences across the Brown and Reuters corpora.

    Returns:
        Counter mapping each lowercased token to the number of times it
        appears in ``brown.words()`` plus ``reuters.words()``.
    """
    frequencies = Counter(token.lower() for token in brown.words())
    frequencies.update(token.lower() for token in reuters.words())
    return frequencies

# Build bigram counts directly from word sequence
def build_bigram_counts():
    """Build unigram and bigram count tables from Brown followed by Reuters.

    Both corpora are read as one continuous lowercased token stream (no
    sentence splitting), so ``'<s>'`` and ``'</s>'`` each occur exactly once:
    at the very start and the very end of the stream.

    Returns:
        Tuple ``(unigram_counts, bigram_counts)`` of ``defaultdict(int)``;
        bigram keys are ``(previous_token, next_token)`` tuples.
    """
    tokens = ['<s>']
    tokens.extend(token.lower() for token in brown.words())
    tokens.extend(token.lower() for token in reuters.words())
    tokens.append('</s>')

    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(int)

    for token in tokens:
        unigram_counts[token] += 1

    # Pair every token with its successor; the final token has none.
    for pair in zip(tokens, tokens[1:]):
        bigram_counts[pair] += 1

    return unigram_counts, bigram_counts

# Corpus statistics shared by the candidate generator and the language model
word_freq = build_word_frequencies()
unigrams, bigrams = build_bigram_counts()

# Lowercased dictionary used to decide whether a token is already a valid word
english_vocab = {entry.lower() for entry in words.words()}

# -----------------------------------
# Step 2: Levenshtein Distance
# -----------------------------------
def edit_distance(a, b):
    """Return the Levenshtein distance between sequences ``a`` and ``b``.

    Insertions, deletions and substitutions each cost 1.
    """
    # Row for the empty prefix of ``a``: reaching b[:j] takes j insertions.
    previous_row = list(range(len(b) + 1))

    for row_index, char_a in enumerate(a, start=1):
        # First column: reaching the empty prefix of ``b`` takes row_index deletions.
        current_row = [row_index]
        for col_index, char_b in enumerate(b, start=1):
            substitution = previous_row[col_index - 1] + (0 if char_a == char_b else 1)
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return previous_row[-1]

# -----------------------------------
# Step 3: Candidate Generation (with Frequency Filtering)
# -----------------------------------
def get_candidates(word, max_distance=2, min_frequency=50):
    """Return dictionary words that are plausible corrections of ``word``.

    A vocabulary word qualifies when its corpus frequency is at least
    ``min_frequency`` and its Levenshtein distance to ``word`` is at most
    ``max_distance``.

    Args:
        word: The (lowercased) token to find corrections for.
        max_distance: Maximum edit distance allowed for a candidate.
        min_frequency: Minimum count in ``word_freq`` for a candidate.

    Returns:
        Alphabetically sorted list of candidate words (possibly empty).
        Sorting makes the order, and therefore any tie-breaking done by the
        caller, reproducible across runs; set iteration order is not.
    """
    word_length = len(word)
    candidates = []

    for vocab_word in english_vocab:
        # Cheap rejections first: the edit distance can never be smaller than
        # the length difference, and most dictionary words are too rare.
        if abs(len(vocab_word) - word_length) > max_distance:
            continue
        if word_freq[vocab_word] < min_frequency:
            continue
        # Only the few survivors pay for the quadratic distance computation.
        if edit_distance(word, vocab_word) <= max_distance:
            candidates.append(vocab_word)

    candidates.sort()
    return candidates

# -----------------------------------
# Step 4: Backoff Bigram Probability
# -----------------------------------
def sentence_probability(sentence_tokens):
    """Score a token sequence with the bigram model, backing off to unigrams.

    The sequence is wrapped in ``'<s>'`` / ``'</s>'`` markers. Each adjacent
    pair contributes ``log(count(prev, cur) / count(prev))`` when the bigram
    was observed; otherwise it contributes the add-one smoothed unigram
    estimate ``log((count(cur) + 1) / (total_tokens + vocab_size))``.

    Args:
        sentence_tokens: Sequence of (lowercased) tokens.

    Returns:
        The summed natural-log score as a float (higher is more likely).
    """
    padded = ['<s>'] + list(sentence_tokens) + ['</s>']
    vocab_size = len(unigrams)

    # The backoff denominator needs a pass over every unigram count, so it is
    # computed at most once per call, and only if some bigram is unseen.
    backoff_denominator = None
    log_prob = 0.0

    for previous, current in zip(padded, padded[1:]):
        bigram_count = bigrams.get((previous, current), 0)
        history_count = unigrams.get(previous, 0)

        # history_count is checked too so inconsistent tables fall back to
        # the unigram estimate instead of dividing by zero.
        if bigram_count > 0 and history_count > 0:
            log_prob += math.log(bigram_count / history_count)
        else:
            if backoff_denominator is None:
                backoff_denominator = sum(unigrams.values()) + vocab_size
            log_prob += math.log((unigrams.get(current, 0) + 1) / backoff_denominator)

    return log_prob

# -----------------------------------
# Step 5: Correction Engine
# -----------------------------------
def _match_case(original, replacement):
    """Give ``replacement`` the capitalisation pattern of ``original``."""
    if len(original) > 1 and original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        return replacement.capitalize()
    return replacement


def correct_sentence(sentence):
    """Spell-correct ``sentence`` word by word using the bigram language model.

    Tokens are maximal ``\\w+`` runs, so contractions such as "can't" are
    split at the apostrophe. A token is left alone when it is already in
    ``english_vocab``, when it contains no letters (e.g. a number), or when
    no candidate is found. Otherwise every candidate is substituted in turn
    and the one giving the highest ``sentence_probability`` wins. Each
    replacement is reported with a ``Correction: ...`` line on stdout.

    Args:
        sentence: The raw input sentence.

    Returns:
        The sentence with corrected tokens spliced in at their original
        positions. Punctuation, spacing and the casing of untouched words
        are preserved; a replacement inherits the capitalisation of the
        token it replaces.
    """
    token_matches = list(re.finditer(r"\b\w+\b", sentence))
    words_in_sentence = [match.group().lower() for match in token_matches]
    corrected_sentence = words_in_sentence.copy()
    replacements = {}  # token index -> corrected (lowercase) word

    for idx, word in enumerate(words_in_sentence):
        if word in english_vocab:
            continue  # Word is correct

        if not any(ch.isalpha() for ch in word):
            continue  # Numbers and the like are not misspellings

        candidates = get_candidates(word)
        if not candidates:
            continue  # No candidates found

        best_candidate = word
        best_prob = -math.inf

        for candidate in candidates:
            # Score the candidate in context, keeping earlier corrections.
            temp_sentence = corrected_sentence.copy()
            temp_sentence[idx] = candidate
            prob = sentence_probability(temp_sentence)

            if prob > best_prob:
                best_prob = prob
                best_candidate = candidate

        if best_candidate != word:
            print(f"Correction: '{word}' → '{best_candidate}'")
            corrected_sentence[idx] = best_candidate
            replacements[idx] = best_candidate

    # Rebuild by character span rather than by searching for the word text:
    # a text search can hit an earlier identical word, and rewriting every
    # token would lowercase words that were never corrected.
    pieces = []
    cursor = 0
    for idx, match in enumerate(token_matches):
        if idx not in replacements:
            continue
        pieces.append(sentence[cursor:match.start()])
        pieces.append(_match_case(match.group(), replacements[idx]))
        cursor = match.end()
    pieces.append(sentence[cursor:])

    return "".join(pieces)

# -----------------------------------
# Example Sentences
# -----------------------------------
# Demo inputs: each sentence contains at least one misspelled token
sentences = [
    "where are you, i have been waitng since lote time.",
    "i am goig home",
    "I can't beleive it's not buter."
]

# Run the corrector on every demo sentence and print the before/after pair;
# correct_sentence itself prints one "Correction: ..." line per replaced token.
for input_sentence in sentences:
    corrected = correct_sentence(input_sentence)
    print("\nOriginal Sentence:", input_sentence)
    print("Corrected Sentence:", corrected)


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


Correction: 'waitng' → 'writing'

Original Sentence: where are you, i have been waitng since lote time.
Corrected Sentence: where are you, i have been writing since lote time.
Correction: 'goig' → 'going'

Original Sentence: i am goig home
Corrected Sentence: i am going home
Correction: 'beleive' → 'believe'
Correction: 'buter' → 'enter'

Original Sentence: I can't beleive it's not buter.
Corrected Sentence: i can't believe it's not enter.


## **Flexudy Model**



In [None]:
!pip install transformers torch sentencepiece -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def load_flexudy_model(model_name="flexudy/t5-base-multi-sentence-doctor"):
    """Load a seq2seq correction model and its tokenizer onto the best device.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
            Defaults to the Flexudy sentence-doctor T5 model, so existing
            zero-argument calls behave as before.

    Returns:
        Tuple ``(tokenizer, model, device)`` where ``model`` has already been
        moved to ``device`` (CUDA when available, otherwise CPU).
    """
    print("[+] Loading Flexudy T5 Doctor Model...")
    # One name feeds both loaders so tokenizer and weights cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return tokenizer, model, device

# Load once at notebook scope so later cells can reuse the tokenizer, model and device
tokenizer, model, device = load_flexudy_model()


[+] Loading Flexudy T5 Doctor Model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/246 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
def correct_with_flexudy(text, tokenizer, model, device, context_before="", context_after=""):
    """Repair a sentence with the Flexudy sentence-doctor model.

    The checkpoint is documented as expecting prompts of the form
    ``repair_sentence: <sentence> context: {<before>}{<after>}``, so the raw
    text is wrapped in that template before encoding. Text that already
    starts with ``repair_sentence:`` is passed through unchanged.

    Args:
        text: The sentence to repair.
        tokenizer: Tokenizer returned by ``load_flexudy_model``.
        model: Seq2seq model returned by ``load_flexudy_model``.
        device: Device the model lives on; the input ids are moved to it.
        context_before: Optional text preceding the sentence.
        context_after: Optional text following the sentence.

    Returns:
        The decoded top beam as a string. Blank input is returned unchanged
        without calling the model.
    """
    print("[*] Running correction...")

    stripped = text.strip()
    if not stripped:
        # Nothing to repair; generating from an empty prompt only yields noise.
        return text

    if stripped.startswith("repair_sentence:"):
        prompt = stripped
    else:
        prompt = f"repair_sentence: {stripped} context: {{{context_before}}}{{{context_after}}}"

    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=128,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=3,
            repetition_penalty=2.5,
            length_penalty=1.0,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Read a sentence interactively; it is expected to contain spelling or grammar errors
input_text = input("Enter your sentence with errors: ")

# Run the typed sentence through the tokenizer/model/device loaded in an earlier cell
corrected_output = correct_with_flexudy(input_text, tokenizer, model, device)

print("\n✅ Corrected Sentence:")
print(corrected_output)


Enter your sentence with errors: i wil lobve my faml
[*] Running correction...

✅ Corrected Sentence:
i will lobby my family.
