In [2]:
import re
import math

beginsentence = "<s>" # marker token that opens every sentence
endsentence = "</s>" # marker token that closes every sentence
# Vocabulary key standing in for unseen (unknown) words; the display helpers
# print it as the text "UNK".
UNK = None

class Unigram:
    """Unigram language model with optional add-one smoothing.

    Attributes:
        unigramfrequency: dict mapping every token (sentence markers
            included) to the number of times it occurs in the corpus.
        lengthofcorpus: total number of tokens, sentence markers excluded.
        uniquewords: number of distinct tokens, sentence markers excluded.
        smoothing: whether add-one smoothing is applied to probabilities.
    """

    def __init__(self, sentences, smoothing=False):
        """Count token frequencies over *sentences*.

        Args:
            sentences: iterable of sentences, each an iterable of tokens.
            smoothing: when True, probabilities use add-one smoothing.
        """
        self.unigramfrequency = dict()
        self.lengthofcorpus = 0
        for sentence in sentences:
            for word in sentence:
                self.unigramfrequency[word] = self.unigramfrequency.get(word, 0) + 1
                if word != beginsentence and word != endsentence:
                    self.lengthofcorpus += 1
        # Only discount the sentence markers that were actually counted, so a
        # corpus without markers does not under-report its vocabulary size.
        markers_seen = sum(
            1 for marker in (beginsentence, endsentence)
            if marker in self.unigramfrequency
        )
        self.uniquewords = len(self.unigramfrequency) - markers_seen
        self.smoothing = smoothing

    def unigramprobability(self, word):
        """Return the probability of *word* under the model.

        Unsmoothed: count(word) / lengthofcorpus.
        Smoothed:   (count(word) + 1) / (lengthofcorpus + uniquewords + 1).

        Returns 0.0 instead of dividing by zero when the corpus is empty and
        smoothing is off.
        """
        numerator = self.unigramfrequency.get(word, 0)
        denominator = self.lengthofcorpus
        if self.smoothing:
            numerator += 1
            # One extra slot on top of the seen unique words is reserved for
            # UNK, i.e. unseen events.
            denominator += self.uniquewords + 1
        if denominator == 0:
            return 0.0
        return float(numerator) / float(denominator)

    def unigramsentenceprobability(self, sentence, normalizeprobability=True):
        """Return the probability of *sentence* as a product of word probabilities.

        Sentence markers are skipped. The product is accumulated as a sum of
        base-2 logarithms.

        Args:
            sentence: iterable of tokens.
            normalizeprobability: when True return the probability itself,
                otherwise return its base-2 logarithm.

        Returns:
            The probability (or log2 probability). If any word has zero
            probability the result is 0.0 (or ``-inf`` in log space) rather
            than a ``ValueError`` from ``math.log(0)``.
        """
        logsum = 0
        for word in sentence:
            if word != beginsentence and word != endsentence:
                wordprobability = self.unigramprobability(word)
                if wordprobability <= 0.0:
                    # A single impossible word makes the whole sentence impossible.
                    return 0.0 if normalizeprobability else float("-inf")
                logsum += math.log(wordprobability, 2)
        return math.pow(2, logsum) if normalizeprobability else logsum

    def vocabulary(self):
        """Return the sorted seen words followed by UNK and the two sentence markers."""
        vocab = sorted(
            word for word in self.unigramfrequency
            if word != beginsentence and word != endsentence
        )
        vocab.extend((UNK, beginsentence, endsentence))
        return vocab
    
class Bigram(Unigram):
    """Bigram language model built on top of the unigram counts.

    Attributes (in addition to those of Unigram):
        bigramfrequency: dict mapping (previousword, word) pairs to the number
            of times the pair occurs consecutively inside a sentence.
        uniquebigram: set of the seen pairs whose first element is not the
            begin marker and whose second element is not the end marker.
        uniquebigramwords: number of distinct tokens, sentence markers
            included; used as the add-one smoothing term.
    """

    def __init__(self, sentences, smoothing=False):
        """Count unigram and bigram frequencies over *sentences*.

        Args:
            sentences: iterable of sentences, each a sequence of tokens.
            smoothing: when True, probabilities use add-one smoothing.
        """
        # The corpus is walked twice (unigram pass, then bigram pass), so
        # materialise it in case the caller handed in a one-shot iterable.
        sentences = list(sentences)
        super().__init__(sentences, smoothing)
        self.bigramfrequency = dict()
        self.uniquebigram = set()
        for sentence in sentences:
            previousword = None
            for word in sentence:
                if previousword is not None:
                    pair = (previousword, word)
                    self.bigramfrequency[pair] = self.bigramfrequency.get(pair, 0) + 1
                    if previousword != beginsentence and word != endsentence:
                        self.uniquebigram.add(pair)
                previousword = word
        # The unigram model discounts the sentence markers from its vocabulary
        # size; for bigrams the markers take part in the pairs, so they stay
        # included here.
        self.uniquebigramwords = len(self.unigramfrequency)

    def bigramprobabilty(self, previousword, word):
        """Return the probability of *word* given *previousword*.

        Unsmoothed: count(previousword, word) / count(previousword).
        Smoothed:   (count(previousword, word) + 1)
                    / (count(previousword) + uniquebigramwords).

        Returns 0.0 when either the numerator or the denominator is zero.
        """
        numerator = self.bigramfrequency.get((previousword, word), 0)
        denominator = self.unigramfrequency.get(previousword, 0)
        if self.smoothing:
            numerator += 1
            denominator += self.uniquebigramwords
        return 0.0 if numerator == 0 or denominator == 0 else float(
            numerator) / float(denominator)

    # Correctly spelled alias; the original misspelled name is kept because
    # existing callers use it.
    bigramprobability = bigramprobabilty

    def bigramsentenceprobability(self, sentence, normalizeprobability=True):
        """Return the probability of *sentence* as a product of bigram probabilities.

        Args:
            sentence: iterable of tokens.
            normalizeprobability: when True return the probability itself,
                otherwise return its base-2 logarithm.

        Returns:
            The probability (or log2 probability). If any bigram has zero
            probability the result is 0.0 (or ``-inf`` in log space) rather
            than a ``ValueError`` from ``math.log(0)``.
        """
        logsum = 0
        previousword = None
        for word in sentence:
            if previousword is not None:
                bigramwordprobability = self.bigramprobabilty(previousword, word)
                if bigramwordprobability <= 0.0:
                    # A single impossible transition makes the sentence impossible.
                    return 0.0 if normalizeprobability else float("-inf")
                logsum += math.log(bigramwordprobability, 2)
            previousword = word
        return math.pow(2, logsum) if normalizeprobability else logsum

    def vocab(self):
        """Return the same vocabulary list as the inherited ``vocabulary`` method."""
        return self.vocabulary()

def displayunigramprob(sorted_vocab_keys, model):
    """Print "word: probability" for each vocabulary key on a single line.

    The sentence markers are skipped and the UNK key is shown as "UNK".
    Entries are separated by a space and no newline is written.
    """
    for key in sorted_vocab_keys:
        if key == beginsentence or key == endsentence:
            continue
        label = "UNK" if key == UNK else key
        probability = model.unigramprobability(key)
        print("{}: {}".format(label, probability), end=" ")
    
def displaybigramprob(sorted_vocab_keys, model):
    """Print the bigram probability table for the given vocabulary keys.

    The begin-sentence marker is left out of both rows and columns, and the
    UNK key is shown as "UNK". Rows are the previous word, columns the next
    word; cells are tab-separated and formatted to five decimals.
    """
    shown_keys = [key for key in sorted_vocab_keys if key != beginsentence]
    labels = ["UNK" if key == UNK else key for key in shown_keys]

    # Header row: an empty corner cell followed by the column labels.
    print("\t\t", end="")
    for label in labels:
        print(label, end="\t\t")
    print("")

    # One row per previous word.
    for row_key, row_label in zip(shown_keys, labels):
        print(row_label, end="\t\t")
        for column_key in shown_keys:
            cell = "{0:.5f}".format(model.bigramprobabilty(row_key, column_key))
            print(cell, end="\t\t")
        print("")
    print("")

def readfile(file_path, encoding=None):
    """Read a corpus file into a list of tokenised sentences.

    Each non-blank line becomes one sentence, split on runs of whitespace.
    Leading and trailing whitespace never produces empty-string tokens, and
    blank lines are skipped so they do not end up as bogus sentences.

    Args:
        file_path: path of the text file to read.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of sentences, each a list of token strings.
    """
    sentences = []
    with open(file_path, "r", encoding=encoding) as f:
        for line in f:
            tokens = line.split()
            if tokens:
                sentences.append(tokens)
    return sentences

# Location of the corpus file used both to train the models and to score sentences.
corpus_path = "C:/Users/user/OneDrive/YEAR 3/1ST SEMESTER/TXSA/TXSA_Assignment/Group Assignment Data/Text Corpus.txt"

# Read the file once; text2 is an independent copy of the same sentences so
# that the training data and the sentences being scored stay separate objects.
text = readfile(corpus_path)
text2 = [list(sentence) for sentence in text]

unsmoothed = Bigram(text)
smoothed = Bigram(text, smoothing=True)
sorted_vocab_keys = unsmoothed.vocab()

print(" \n  Text Corpus   ")
# Width of the sentence column: the longest sentence plus padding. default=0
# keeps an empty corpus from raising in max().
longest_sentence_len = max(
    (len(" ".join(sentence)) for sentence in text2), default=0
) + 5
print(" ", " " * (longest_sentence_len - len(" ") - 2), "Unigram Probability\tBigram probability ")
for sentence in text2:
    sentence_string = " ".join(sentence)
    # Pad every sentence to the same width so the probability columns line up.
    print(sentence_string, end=" " * (longest_sentence_len - len(sentence_string)))
    print("{0:.5f}".format(smoothed.unigramsentenceprobability(sentence)), end="\t\t")
    print("{0:.5f}".format(smoothed.bigramsentenceprobability(sentence)))

print("")

 
  Text Corpus   
                                     Unigram Probability	Bigram probability 
<s> He read a book </s>              0.00058		0.00095
<s> I read a different book </s>     0.00003		0.00006
<s> He read a book my Mulan </s>     0.00000		0.00002

