In [7]:
!pip install nltk



In [18]:
import re
from collections import defaultdict, Counter

In [19]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()

In [20]:
class NGramModel:
    def __init__(self, n=3):
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = Counter()
        self.vocab = set()

    def train(self, tokens):
        self.vocab = set(tokens)

        for i in range(len(tokens)):
            for k in range(1, self.n + 1):
                if i - k + 1 < 0:
                    continue

                ngram = tuple(tokens[i-k+1:i+1])
                context = tuple(tokens[i-k+1:i]) if k > 1 else ()

                self.ngram_counts[context][ngram[-1]] += 1
                self.context_counts[context] += 1

    def get_probability(self, context, word):
        vocab_size = len(self.vocab)
        count = self.ngram_counts[context][word]
        total = self.context_counts[context]
        return (count + 1) / (total + vocab_size)

    def predict_next(self, text, top_k=5):
        tokens = preprocess_text(text)

        # Backoff: trigram → bigram → unigram
        for k in range(self.n-1, -1, -1):
            context = tuple(tokens[-k:]) if k > 0 else ()

            if context in self.ngram_counts:
                candidates = {}

                for word in self.vocab:
                    prob = self.get_probability(context, word)
                    candidates[word] = prob

                sorted_words = sorted(
                    candidates.items(),
                    key=lambda x: x[1],
                    reverse=True
                )

                return sorted_words[:top_k]

        return []

In [23]:
corpus = """
Natural language processing is a field of artificial intelligence.
It focuses on enabling computers to understand human language.
Natural language models are used in machine translation and speech recognition.
Language models can predict the next word in a sentence.
Artificial intelligence is transforming the world through automation.
Deep learning techniques are widely used in natural language processing.
Neural networks are capable of learning complex patterns in text data.
Machine learning algorithms improve automatically through experience.
Supervised learning requires labeled training data.
Unsupervised learning finds hidden patterns in unlabeled data.
Recurrent neural networks are useful for sequential data processing.
Transformers have revolutionized modern natural language tasks.
Large language models are trained on massive text corpora.
Text classification is a common application of NLP.
Sentiment analysis determines whether text expresses positive or negative emotion.
Tokenization is the first step in text preprocessing.
Stop words are often removed during preprocessing.
Stemming reduces words to their root form.
Lemmatization converts words to their base form.
Probability plays a central role in language modeling.
An ngram model predicts a word based on previous words.
A unigram model considers only one word at a time.
A bigram model considers two consecutive words.
A trigram model considers three consecutive words.
Speech recognition systems convert audio into text.
Chatbots use natural language understanding to respond to users.
Information retrieval systems search large document collections.
Named entity recognition identifies people and organizations in text.
Part of speech tagging assigns grammatical labels to words.
Deep neural networks require large amounts of data.
Training data quality affects model performance.
Evaluation metrics measure the accuracy of predictions.
Perplexity is used to evaluate language models.
Overfitting occurs when a model memorizes training data.
Regularization techniques help prevent overfitting.
Cross validation improves model reliability.
Word embeddings represent words as vectors.
Semantic similarity measures meaning between words.
Context is important in understanding language.
Translation systems convert text from one language to another.
Speech synthesis generates human like speech from text.
Data preprocessing improves model performance.
Language generation creates meaningful text automatically.
Artificial intelligence systems are becoming more advanced every year.
"""

tokens = preprocess_text(corpus)

model = NGramModel(n=3)
model.train(tokens)

print("Model trained successfully!")


Model trained successfully!


In [24]:
while True:
    user_input = input("\nEnter text (or 'exit'): ")
    if user_input.lower() == "exit":
        break

    predictions = model.predict_next(user_input)

    print("Suggestions:")
    for word, prob in predictions:
        print(f"{word}  (prob={round(prob,4)})")


Enter text (or 'exit'): hi
Suggestions:
language  (prob=0.0259)
a  (prob=0.0204)
text  (prob=0.0204)
words  (prob=0.0185)
in  (prob=0.0185)

Enter text (or 'exit'): natural language
Suggestions:
processing  (prob=0.015)
tasks  (prob=0.01)
models  (prob=0.01)
understanding  (prob=0.01)
common  (prob=0.005)

Enter text (or 'exit'): AI
Suggestions:
language  (prob=0.0259)
a  (prob=0.0204)
text  (prob=0.0204)
words  (prob=0.0185)
in  (prob=0.0185)

Enter text (or 'exit'): artificial
Suggestions:
intelligence  (prob=0.0202)
common  (prob=0.0051)
analysis  (prob=0.0051)
networks  (prob=0.0051)
recurrent  (prob=0.0051)

Enter text (or 'exit'): machine learning
Suggestions:
algorithms  (prob=0.0102)
common  (prob=0.0051)
analysis  (prob=0.0051)
networks  (prob=0.0051)
recurrent  (prob=0.0051)

Enter text (or 'exit'): exit
