### Bigrams and trigrams 

#### Data cleaning

In [1]:
# import nltk
# from nltk.tokenize import word_tokenize

# nltk.download('punkt')

# # Sample corpus
# corpus = [
#     "<s> He read a book </s>",
#     "<s> I read a different book </s>",
#     "<s> He read a book by Danielle </s>"
# ]

# # Tokenizing and removing sentence pads
# tokenized_sentences = [word_tokenize(sentence.replace("<s>", "").replace("</s>", "").strip()) for sentence in corpus]
# print(tokenized_sentences)


In [4]:
import nltk
from nltk.tokenize import regexp_tokenize
from nltk import bigrams, ConditionalFreqDist

# ensure necessary NLTK resources are downloaded
nltk.download('punkt')

# training corpus
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>"


# define a tokenization pattern to include <s>, </s>, and words
pattern = r'(<s>|</s>|\w+)'

# tokenize the sentences using the defined pattern
tokenized_sentences = [regexp_tokenize(sentence, pattern) for sentence in corpus]

# Preparing data for bigram model
all_bigrams = [bigram for sentence in tokenized_sentences for bigram in bigrams(sentence)]
bigram_freq_dist = ConditionalFreqDist(all_bigrams)
vocabulary = set([word for sentence in tokenized_sentences for word in sentence])

# sentence to calculate probability for, this will be tokenized in the same manner
test_sentence = "<s> I read a book by Danielle </s>"
tokenized_test_sentence = regexp_tokenize(test_sentence, pattern)

test_bigrams = list(bigrams(tokenized_test_sentence))

# function to calculate probabilities
def calculate_probability(bigram_freq_dist, test_bigrams, vocabulary, smoothed=False):
    probability = 1
    vocab_size = len(vocabulary)  # getting vocab size for smoothing

    for bigram in test_bigrams:
        word1, word2 = bigram
        word1_count = sum(bigram_freq_dist[word1].values())
        bigram_count = bigram_freq_dist[word1][word2]
        if smoothed:
            # add-one smoothing
            probability *= (bigram_count + 1) / (word1_count + vocab_size + 1)
        else:
            # unsmoothed model, handling case where word1_count is 0
            probability *= bigram_count / word1_count if word1_count else 0
    return probability

# calculating probabilities
unsmoothed_probability = calculate_probability(bigram_freq_dist, test_bigrams, vocabulary)
smoothed_probability = calculate_probability(bigram_freq_dist, test_bigrams, vocabulary, smoothed=True)

print("Unsmoothed Bigram Model Probability: {:.10f}".format(unsmoothed_probability))
print("Smoothed Bigram Model Probability: {:.10f}".format(smoothed_probability))


Unsmoothed Bigram Model Probability: 0.0740740741
Smoothed Bigram Model Probability: 0.0000057846


[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
import nltk
import string
import tkinter
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize, RegexpParser


# Download necessary NLTK models and corpora
nltk.download('punkt')

chunker = RegexpParser("""
                    NP: {<DT>?<JJ>*<NN>} #To extract Noun Phrases
                    P: {<IN>}            #To extract Prepositions
                    V: {<V.*>}           #To extract Verbs
                    PP: {<p> <NP>}       #To extract Prepositional Phrases
                    VP: {<V> <NP|PP>*}   #To extract Verb Phrases
                    """)



sentence = "The big black dog barked at the white cat and chased away."

cleaned_sentence = sentence.lower().translate(str.maketrans('', '', string.punctuation))

tagged = pos_tag(word_tokenize(cleaned_sentence))
output = chunker.parse(tagged)

output.draw()


[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aymanadil/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /Users/aymanadil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
import nltk
import string

text2 = nltk.CFG.fromstring("""
        S -> NP VP
        NP -> Det Adj N | Det N | Det Adj N PP
        VP -> V NP | V NP PP
        PP -> P NP
        Det -> 'The' | 'the'
        Adj -> 'big' | 'black' | 'white'
        N -> 'dog' | 'cat'
        V -> 'barked' | 'chased'
        P -> 'at'
        CC -> 'and'
        Adv -> 'away'
    """)

sentence = "The big black dog barked at the white cat and chased away."
cleaned_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
text1 = nltk.word_tokenize(cleaned_sentence)
print(text1)
parser = nltk.ChartParser(text2)

for tree in parser.parse(text1):
    print(tree)
tree.draw()


['The', 'big', 'black', 'dog', 'barked', 'at', 'the', 'white', 'cat', 'and', 'chased', 'away']


NameError: name 'tree' is not defined