### Bigrams and trigrams 

#### Data cleaning

In [42]:
# import nltk
# from nltk.tokenize import word_tokenize

# nltk.download('punkt')

# # Sample corpus
# corpus = [
#     "<s> He read a book </s>",
#     "<s> I read a different book </s>",
#     "<s> He read a book by Danielle </s>"
# ]

# # Tokenizing and removing sentence pads
# tokenized_sentences = [word_tokenize(sentence.replace("<s>", "").replace("</s>", "").strip()) for sentence in corpus]
# print(tokenized_sentences)


In [1]:
import nltk
from nltk.tokenize import regexp_tokenize
from nltk import bigrams, ConditionalFreqDist

# Fetch the 'punkt' tokenizer models.
# NOTE(review): the regexp-based tokenization in this cell does not appear to
# use punkt -- confirm whether other cells rely on this download.
nltk.download('punkt')

# Toy training corpus; each sentence is wrapped in <s> ... </s> boundary markers
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>",
]

# A token is either a boundary marker (<s>, </s>) or a run of word characters
pattern = r'(<s>|</s>|\w+)'

# One token list per training sentence
tokenized_sentences = [regexp_tokenize(line, pattern) for line in corpus]

# Training statistics for the bigram model: the adjacent-token pairs of every
# sentence, their counts grouped by the first token, and the set of token types
all_bigrams = [pair for tokens in tokenized_sentences for pair in bigrams(tokens)]
bigram_freq_dist = ConditionalFreqDist(all_bigrams)
vocabulary = {token for tokens in tokenized_sentences for token in tokens}

# Sentence to score, tokenized with the same pattern as the training corpus
test_sentence = "<s> I read a book by Danielle </s>"
tokenized_test_sentence = regexp_tokenize(test_sentence, pattern)
test_bigrams = list(bigrams(tokenized_test_sentence))

# Function to calculate probabilities
def calculate_probability(bigram_freq_dist, test_bigrams, vocabulary, smoothed=False):
    """Return the bigram-model probability of a sequence of bigrams.

    The result is the product, over every ``(word1, word2)`` pair in
    ``test_bigrams``, of the conditional probability P(word2 | word1)
    estimated from the counts in ``bigram_freq_dist``.

    Args:
        bigram_freq_dist: Mapping from a context word to a mapping of
            following-word counts (e.g. an NLTK ``ConditionalFreqDist`` or a
            plain dict of dicts / ``collections.Counter``). It is only read;
            unseen context words are never inserted into it.
        test_bigrams: Iterable of ``(word1, word2)`` pairs to score.
        vocabulary: Collection of known word types; only its size is used,
            as V in add-one smoothing.
        smoothed: If True, use add-one (Laplace) smoothing,
            ``(count(word1, word2) + 1) / (count(word1) + V)``. If False, use
            the unsmoothed estimate ``count(word1, word2) / count(word1)``,
            which is 0 when ``word1`` was never seen as a context.

    Returns:
        The product of the per-bigram probabilities (1 if ``test_bigrams``
        is empty).

    Raises:
        ZeroDivisionError: If ``smoothed`` is True, ``vocabulary`` is empty
            and a context word has no recorded followers.
    """
    probability = 1
    V = len(vocabulary)  # Vocabulary size for smoothing
    for word1, word2 in test_bigrams:
        # Use .get() rather than indexing: indexing a defaultdict-based
        # distribution with an unseen context would insert an empty entry and
        # silently grow the caller's model; on a plain dict it would raise.
        followers = bigram_freq_dist.get(word1) or {}
        word1_count = sum(followers.values())
        bigram_count = followers.get(word2, 0)
        if smoothed:
            # Add-one smoothing
            probability *= (bigram_count + 1) / (word1_count + V)
        else:
            # Unsmoothed model: an unseen context contributes probability 0
            probability *= bigram_count / word1_count if word1_count else 0
    return probability

# Score the test sentence under the unsmoothed model and the add-one model
unsmoothed_probability = calculate_probability(
    bigram_freq_dist, test_bigrams, vocabulary, smoothed=False
)
smoothed_probability = calculate_probability(
    bigram_freq_dist, test_bigrams, vocabulary, smoothed=True
)

# Report both results, one per line, to ten decimal places
print(
    "Unsmoothed Bigram Model Probability: {:.10f}".format(unsmoothed_probability),
    "Smoothed Bigram Model Probability: {:.10f}".format(smoothed_probability),
    sep="\n",
)


Unsmoothed Bigram Model Probability: 0.0740740741
Smoothed Bigram Model Probability: 0.0000101014


[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
