<a href="https://colab.research.google.com/github/Natural-Language-Processing-YU/Exercises/blob/main/Exercise_Building_your_own_n_gram_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Exercise: Building your own n-gram language model

Objective:
The objective of this exercise is to give students hands-on experience in creating a language model using n-grams and generating new sentences based on the model. Students will work through the steps of preprocessing the text, generating n-grams, creating a vocabulary, constructing the language model, and generating sentences.


In [13]:
import random
import nltk
from nltk.util import ngrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm import MLE

## Step 1. Preprocessing: Provide a small corpus of sentences and tokenize the text using NLTK's word_tokenize function.

In [14]:
# Step 1: Preprocessing
corpus = [
    "I love to eat pizza.",
    "I love to play soccer.",
    "I love to read books.",
     "I love to create algorithms.",
]

# Tokenize the text
tokenized_corpus = [nltk.word_tokenize(sentence.lower()) for sentence in corpus]


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Dell/nltk_data'
    - 'c:\\Users\\Dell\\anaconda3\\nltk_data'
    - 'c:\\Users\\Dell\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\Dell\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Dell\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


## Step 2: N-gram Generation


In [None]:
# Step 2: N-gram Generation
n = 3  # Trigrams

# Pad the sequences
padded_corpus = [list(pad_both_ends(sentence, n=n)) for sentence in tokenized_corpus]

# Flatten the corpus into n-grams
ngrams_list = [ngrams(sentence, n) for sentence in padded_corpus]
flattened_ngrams = [ngram for sublist in ngrams_list for ngram in sublist]


## Step 3: Vocabulary Creation


In [None]:
# Step 3: Vocabulary Creation
vocab = set(flattened_ngrams)

## Step 4: Language Model Construction


In [None]:
# Step 4: Language Model Construction
model = MLE(n)
model.fit(flattened_ngrams)


## Step 5: Generate Text

In [None]:

# Step 5: Sentence Generation
max_length = 10  # Maximum number of words in the generated sentence

# Set an initial context
context = random.choice(list(vocab))
prefix = context[:n-1]

# Generate new sentences
generated_sentence = prefix
while len(generated_sentence) < max_length:
    token = model.generate(1, context=context)[-1]
    if token == "</s>":
        break
    generated_sentence += " " + token
    prefix = tuple(generated_sentence.split()[-(n - 1):])
    context = prefix + (token,)

print("Generated Sentence:", generated_sentence)
