<a href="https://colab.research.google.com/github/dvircohen0/NLP/blob/main/Markov_bigram_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import re
import numpy as np
from collections import Counter
import pandas as pd
import nltk
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Define the vocabulary class

In [19]:
class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0
        self.smoothing = 0.001

    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
            
    def add_sentence(self, sentence):
        for word in sentence:
            if word:
                self.add_word(word)
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

    #create bigrams for a givn sentence
    def sentence_bigrams(self,sentence):
        return zip(sentence, sentence[1:])

    #create probabilities for bigrams in corpus
    def create_prob_matrix(self,corpus):
        counts = Counter()
        self.prob_matrix={}
        #counts all the bigrams
        for sent in corpus:
            counts.update(self.sentence_bigrams(list(map(str.lower,sent))))
        #fill the probabilities with the log likelihood of each bigram
        for bigr,cnt in counts.items():
            # add 1 smoothing
            prob=cnt+self.smoothing
            # divide by count(A)+V 
            # A=the first word on the bigram 
            # V=size of the vocabulary
            prob/=(self.word2count[bigr[0]]+voc.num_words)
            # convert the probability to log likelihood
            self.prob_matrix[bigr]=np.log(prob)

Checking probability of a new sentence 

In [20]:
#return a score for a given sentence. if the the score is high 
#the input is a true sentence with high probability
def check_sentence(sentence):
    #words = input("Enter a sentence:\n")
    words = sentence.lower().split()
    # check that all tokens exist in word2idx (otherwise, we can't get score)
    bad_sentence = False
    for token in words:
      if token not in voc.word2index:
        bad_sentence = True
    if bad_sentence:
              print("Sorry, you entered words that are not in the vocabulary")
    else:
        # the first probability is the unigram of the first word in the sentence
        sentence_prob = np.log(voc.word2count[words[0]]/sum(voc.word2count.values()))
        for i in range(1,len(words)):
            if (words[i-1],words[i]) in voc.prob_matrix:
                sentence_prob += voc.prob_matrix[(words[i-1],words[i])]
            else: sentence_prob += np.log(voc.smoothing/(voc.num_words+voc.word2count[words[i]]))
        print("SCORE: ",sentence_prob/len(words))

Fill the vocabulary and the bigram probabilities matrix

In [21]:
if __name__ == "__main__":
    #read the courpus file, we gonna use the brown corpus
    corpus = brown.sents()
    #creat empty Vocabulary objct
    voc = Vocabulary('test')
    #fill the Vocabulary with the corpus file
    for i,sent in enumerate(corpus):
        voc.add_sentence(map(str.lower,sent))
    #fill the probability matrix 
    voc.create_prob_matrix(corpus)

Test the model

In [22]:
    # check real sentence from the corpus:
    sent="For the most part , this discussion will be confined to results \
    obtained since the introduction of the reference standard ."
    print("real sentence from the corpus: ",sent)
    check_sentence(sent)
    
    # check real sentence from the wikipedia:
    sent="Development of cat breeds started in the mid 19th century ."
    print("real sentence from the wikipedia: ",sent)
    check_sentence(sent)
    
    # check fake sentence:
    sent="black bear eats spaghetti"
    print("fake sentence: ",sent)
    check_sentence(sent)    

real sentence from the corpus:  For the most part , this discussion will be confined to results obtained since the introduction of the reference standard .
SCORE:  -7.899997360213471
real sentence from the wikipedia:  Development of cat breeds started in the mid 19th century .
SCORE:  -11.277622626562168
fake sentence:  black bear eats spaghetti
SCORE:  -15.456113905409566
