In [None]:
##
import os
import sys
import math
import random
import os.path
from math import log, exp
from operator import itemgetter
from collections import defaultdict

In [None]:
#----------------------------------------
#  Data input
#----------------------------------------

# Read a text file into a corpus (list of sentences (which in turn are lists of words))
# (taken from nested section of HW0)
def readFileToCorpus(f):
    """Read a one-sentence-per-line text file into a corpus.

    Each line is split on whitespace into a list of word tokens, so a blank
    line becomes an empty sentence.  A progress message is written to stderr
    after every 1000 sentences.

    Args:
        f: Path of the text file to read.

    Returns:
        A list of sentences, each of which is a list of word strings.

    If f is not an existing file, an error is printed and the script is
    terminated through sys.exit().
    """
    if not os.path.isfile(f):
        # Ideally we would raise an exception here; the hard exit is kept so
        # the existing behaviour does not change.
        print("Error: corpus file ", f, " does not exist")
        sys.exit()  # exit the script

    corpus = []  # this will become the list of sentences
    # The context manager closes the handle even if reading fails part-way.
    with open(f, "r") as handle:
        print("Reading file", f)
        # i is the 1-based sentence number, used only for the status message
        for i, line in enumerate(handle, start=1):
            corpus.append(line.split())
            if i % 1000 == 0:
                sys.stderr.write("Reading sentence" + str(i) + "\n")
    return corpus

# Preprocess the corpus
def preprocess(corpus):
    """Prepare the training corpus in place and return it.

    Two things happen to every sentence of corpus:
      1. each word whose frequency over the whole corpus is below 2 is
         replaced by the UNK token;
      2. the start token is inserted at the front and the end token is
         appended at the back.

    Args:
        corpus: List of sentences (lists of word strings).  It is modified
            in place.

    Returns:
        The same corpus object, after modification.
    """
    # Count how often every word occurs in the whole corpus.
    freqDict = defaultdict(int)
    for sen in corpus:
        for word in sen:
            freqDict[word] += 1

    # Replace rare words with UNK.  The frequencies are those counted above;
    # they are not updated while words are being replaced.
    for sen in corpus:
        for i, word in enumerate(sen):
            if freqDict[word] < 2:
                sen[i] = UNK

    # Bookend the sentences with start and end tokens.
    for sen in corpus:
        sen.insert(0, start)
        sen.append(end)

    return corpus

def preprocessTest(vocab, corpus):
    """Prepare a test corpus in place and return it.

    Every word that is not a member of vocab is replaced by the UNK token,
    then each sentence gets the start token in front and the end token at
    the back.

    Args:
        vocab: Container of known words (only membership tests are used).
        corpus: List of sentences (lists of word strings), modified in place.

    Returns:
        The same corpus object, after modification.
    """
    # First pass: map out-of-vocabulary words to UNK.
    for sentence in corpus:
        for position, token in enumerate(sentence):
            if token in vocab:
                continue
            sentence[position] = UNK

    # Second pass: add the sentence boundary tokens.
    for sentence in corpus:
        sentence.insert(0, start)
        sentence.append(end)

    return corpus

# Constants
UNK = "UNK"     # Unknown word token (replaces rare training words and out-of-vocabulary test words)
start = "<s>"   # Start-of-sentence token (inserted at the front of every preprocessed sentence)
end = "</s>"    # End-of-sentence token (appended to every preprocessed sentence)

In [None]:
#--------------------------------------------------------------
# Language models and data structures
#--------------------------------------------------------------

# Parent class for the four language models you need to implement
class LanguageModel:
    """Common interface of the language models in this file.

    The concrete models are:
      a) an (unsmoothed) unigram model (UnigramModel)
      b) a unigram model smoothed using Laplace smoothing (SmoothedUnigramModel)
      c) an unsmoothed bigram model (BigramModel)
      d) a bigram model smoothed using linear interpolation (SmoothedBigramModelLI)

    generateSentence, getSentenceProbability and getCorpusPerplexity are
    placeholders here: each prints a reminder and returns a dummy value.
    """

    def __init__(self, corpus):
        """Initialize the model.

        The base class does not train anything; it only prints an empty
        line.  Subclasses estimate their probability distribution from
        corpus.
        """
        print("")

    def generateSentence(self):
        """Generate a sentence by drawing words from the model's distribution.

        Note: think about how to set the length of the sentence in a
        principled way.  This placeholder returns a fixed string.
        """
        print("Implement the generateSentence method in each subclass")
        return "mary had a little lamb ."

    def getSentenceProbability(self, sen):
        """Return the probability of sentence sen under the model.

        This placeholder always returns 0.0.
        """
        print("Implement the getSentenceProbability method in each subclass")
        return 0.0

    def getCorpusPerplexity(self, corpus):
        """Return the perplexity (normalized inverse log probability) of corpus.

        This placeholder always returns 0.0.
        """
        print("Implement the getCorpusPerplexity method")
        return 0.0

    def generateSentencesToFile(self, numberOfSentences, filename, corpus):
        """Write generated sentences and their probabilities to a file.

        Each output line is the sentence probability, a space, and the
        sentence items joined by single spaces.

        Args:
            numberOfSentences: How many sentences to generate.
            filename: Output path.  It is opened in 'w+' mode, so an existing
                file is truncated.
            corpus: Not used by this method.
        """
        # The context manager closes the file even if generation raises.
        with open(filename, 'w+') as filePointer:
            for _ in range(numberOfSentences):
                sen = self.generateSentence()
                prob = self.getSentenceProbability(sen)
                stringGenerated = str(prob) + " " + " ".join(sen)
                print(stringGenerated, end="\n", file=filePointer)


# Unigram language model
class UnigramModel(LanguageModel):
    """Unsmoothed unigram language model backed by a UnigramDist."""

    def __init__(self, corpus):
        """Train the underlying UnigramDist on corpus."""
        super().__init__(corpus)
        self.unigramDist = UnigramDist(corpus)

    def generateSentence(self):
        """Return a one-element list holding a single drawn word.

        NOTE(review): only one word is drawn, so the result is not a full
        sentence; drawing until the end token appears would be the
        principled stopping rule.  The one-word behaviour is kept unchanged.
        """
        return [self.unigramDist.draw()]

    def getSentenceProbability(self, sen):
        """Return the product of the unigram probabilities of the words in sen."""
        probability = 1.0
        for word in sen:
            probability *= self.unigramDist.prob(word)
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return exp of the average negative log probability per scored word.

        Words with probability 0.0 are skipped and not counted.  If no word
        can be scored (empty corpus, or every word has zero probability) the
        result is float("inf") rather than a ZeroDivisionError.
        """
        totalLogProb = 0.0
        totalWords = 0
        for sen in corpus:
            for word in sen:
                wordProb = self.unigramDist.prob(word)
                if wordProb == 0.0:
                    # log(0) is undefined; such words are left out
                    continue
                totalLogProb -= log(wordProb)
                totalWords += 1
        if totalWords == 0:
            return float("inf")
        return exp(totalLogProb / totalWords)
#endclass


#Smoothed unigram language model (use laplace for smoothing)
class SmoothedUnigramModel(LanguageModel):
    """Laplace-smoothed unigram language model backed by a SmoothedUnigramDist."""

    def __init__(self, corpus):
        """Train the underlying SmoothedUnigramDist on corpus."""
        super().__init__(corpus)
        self.smoothedUnigramDist = SmoothedUnigramDist(corpus)

    def generateSentence(self):
        """Return a one-element list holding a single drawn word.

        NOTE(review): only one word is drawn, so the result is not a full
        sentence; drawing until the end token appears would be the
        principled stopping rule.  The one-word behaviour is kept unchanged.
        """
        return [self.smoothedUnigramDist.draw()]

    def getSentenceProbability(self, sen):
        """Return the product of the smoothed probabilities of the words in sen."""
        probability = 1.0
        for word in sen:
            probability *= self.smoothedUnigramDist.prob(word)
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return exp of the average negative log probability per word.

        Returns float("inf") when the corpus contains no words or when a
        word has probability 0, instead of raising ZeroDivisionError or a
        math domain error.
        """
        totalLogProb = 0.0
        totalWords = 0
        for sen in corpus:
            for word in sen:
                wordProb = self.smoothedUnigramDist.prob(word)
                if wordProb <= 0.0:
                    # a zero-probability word makes the perplexity infinite
                    return float("inf")
                totalLogProb -= log(wordProb)
                totalWords += 1
        if totalWords == 0:
            return float("inf")
        return exp(totalLogProb / totalWords)
#endclass


# Unsmoothed bigram language model
class BigramModel(LanguageModel):
    """Unsmoothed bigram language model backed by a BigramDist."""

    def __init__(self, corpus):
        """Train the underlying BigramDist on corpus."""
        super().__init__(corpus)
        self.bigramDist = BigramDist(corpus)

    def generateSentence(self):
        """Return a two-element list: a random context word and a drawn follower.

        The first word is chosen uniformly among all words that occur as a
        bigram context; the second is drawn from the bigram distribution
        given the first.

        NOTE(review): this yields a single bigram, not a sentence; starting
        at the start token and drawing until the end token would be the
        principled approach.  The two-word behaviour is kept unchanged.
        """
        start_word = random.choice(list(self.bigramDist.total.keys()))
        next_word = self.bigramDist.draw_given_prev(start_word)
        return [start_word, next_word]

    def getSentenceProbability(self, sen):
        """Return the product of P(sen[i] | sen[i-1]) for i = 1 .. len(sen)-1."""
        probability = 1.0
        for i in range(1, len(sen)):  # start from the second word
            probability *= self.bigramDist.prob_given_prev(sen[i], sen[i - 1])
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return exp of the average negative log probability per scored bigram.

        Bigrams with probability 0.0 are skipped and not counted.  If no
        bigram can be scored the result is float("inf") rather than a
        ZeroDivisionError.
        """
        totalLogProb = 0.0
        totalWords = 0
        for sen in corpus:
            for i in range(1, len(sen)):
                wordProb = self.bigramDist.prob_given_prev(sen[i], sen[i - 1])
                if wordProb == 0.0:
                    # log(0) is undefined; unseen bigrams are left out
                    continue
                totalLogProb -= log(wordProb)
                totalWords += 1
        if totalWords == 0:
            return float("inf")
        return exp(totalLogProb / totalWords)
#endclass


# Smoothed bigram language model (use linear interpolation for smoothing, set lambda1 = lambda2 = 0.5)
class SmoothedBigramModelLI(LanguageModel):
    """Bigram language model smoothed by linear interpolation (SmoothedBigramDistLI)."""

    def __init__(self, corpus):
        """Train the underlying SmoothedBigramDistLI on corpus."""
        super().__init__(corpus)
        self.smoothedBigramDist = SmoothedBigramDistLI(corpus)

    def generateSentence(self):
        """Return ["<s>", word], where word is drawn given the start token.

        NOTE(review): generation stops after one drawn word; continuing
        until the end token is drawn would produce full sentences.  The
        two-element behaviour is kept unchanged.
        """
        first_word = "<s>"  # start the sentence with the start token
        next_word = self.smoothedBigramDist.draw_given_prev(first_word)
        return [first_word, next_word]

    def getSentenceProbability(self, sen):
        """Return the product of the interpolated P(sen[i] | sen[i-1]), i >= 1."""
        probability = 1.0
        for i in range(1, len(sen)):
            probability *= self.smoothedBigramDist.prob_given_prev(sen[i], sen[i - 1])
        return probability

    def getCorpusPerplexity(self, corpus):
        """Return exp of the average negative log probability per bigram.

        Returns float("inf") when the corpus contains no bigrams or when a
        bigram has probability 0, instead of raising ZeroDivisionError or a
        math domain error.
        """
        totalLogProb = 0.0
        totalWords = 0
        for sen in corpus:
            for i in range(1, len(sen)):
                wordProb = self.smoothedBigramDist.prob_given_prev(sen[i], sen[i - 1])
                if wordProb <= 0.0:
                    # a zero-probability event makes the perplexity infinite
                    return float("inf")
                totalLogProb -= log(wordProb)
                totalWords += 1
        if totalWords == 0:
            return float("inf")
        return exp(totalLogProb / totalWords)
#endclass



# Sample class for an unsmoothed unigram probability distribution
# Note:
#Feel free to use/re-use/modify this class as necessary for your
#own code (e.g. converting to log probabilities after training).
#This class is intended to help you get started
#with your implementation of the language models above.
class UnigramDist:
    """Unsmoothed (maximum-likelihood) unigram distribution.

    Attributes:
        counts: word -> number of occurrences (the start token is excluded).
        total: total number of counted word tokens.
    """

    def __init__(self, corpus):
        """Create the distribution and train it on corpus."""
        self.counts = defaultdict(float)
        self.total = 0.0
        self.train(corpus)

    def train(self, corpus):
        """Add the observed counts from corpus to the distribution.

        The start token is skipped.
        """
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                self.counts[word] += 1.0
                self.total += 1.0

    def prob(self, word):
        """Return count(word) / total, or 0.0 for an unseen word.

        .get() is used so that looking up an unseen word does not insert it
        into the counts defaultdict.  An untrained distribution (total == 0)
        yields 0.0 instead of a ZeroDivisionError.
        """
        if self.total == 0.0:
            return 0.0
        return self.counts.get(word, 0.0) / self.total

    def draw(self):
        """Return a single random word drawn according to the distribution.

        Returns None only when the distribution holds no words.
        """
        rand = random.random()
        word = None
        for word, count in self.counts.items():
            rand -= count / self.total
            if rand <= 0.0:
                return word
        # Floating-point rounding can leave a tiny positive remainder after
        # the last word; return that word instead of falling through to None.
        return word
#endclass

class SmoothedUnigramDist:
    """Unigram distribution with Laplace (add-one) smoothing.

    P(word) = (count(word) + 1) / (N + V), where N is the number of counted
    tokens and V the number of distinct counted word types.  The start token
    is neither counted nor part of the vocabulary.

    Attributes:
        counts: word -> raw number of occurrences (no pseudo-counts stored).
        total: N, the number of counted word tokens.
    """

    def __init__(self, corpus):
        """Create the distribution and train it on corpus."""
        self.counts = defaultdict(float)
        self.total = 0.0
        self.train(corpus)

    def train(self, corpus):
        """Add the observed counts from corpus to the distribution.

        Only raw counts are stored.  The add-one term is applied exactly
        once, in prob(); adding it here as well would apply it twice and
        make the probabilities sum to more than 1.
        """
        for sen in corpus:
            for word in sen:
                if word == start:
                    continue
                self.counts[word] += 1.0
                self.total += 1.0

    def prob(self, word):
        """Return the Laplace-smoothed probability of word.

        A word that was never seen gets 1 / (N + V).  .get() is used so the
        lookup does not insert the word into counts, which would change V.
        An untrained distribution yields 0.0.
        """
        denominator = self.total + len(self.counts)
        if denominator == 0.0:
            return 0.0
        return (self.counts.get(word, 0.0) + 1.0) / denominator

    def draw(self):
        """Return a single random word drawn according to the distribution.

        Returns None only when the distribution holds no words.
        """
        denominator = self.total + len(self.counts)
        rand = random.random()
        word = None
        for word, count in self.counts.items():
            rand -= (count + 1.0) / denominator
            if rand <= 0.0:
                return word
        # Floating-point rounding can leave a tiny positive remainder after
        # the last word; return that word instead of falling through to None.
        return word
#endclass


class BigramDist:
    """Unsmoothed (maximum-likelihood) bigram distribution.

    Attributes:
        counts: prev_word -> (current_word -> number of times the pair occurred).
        total: prev_word -> number of bigrams that start with prev_word.
    """

    def __init__(self, corpus):
        """Create the distribution and train it on corpus."""
        self.counts = defaultdict(lambda: defaultdict(float))
        self.total = defaultdict(float)
        self.train(corpus)

    def train(self, corpus):
        """Count every pair of adjacent words in every sentence of corpus."""
        for sen in corpus:
            for i in range(1, len(sen)):  # start from the second word
                prev_word = sen[i - 1]
                current_word = sen[i]
                self.counts[prev_word][current_word] += 1.0
                self.total[prev_word] += 1.0

    def prob_given_prev(self, current_word, prev_word):
        """Return P(current_word | prev_word) = count(prev, cur) / count(prev).

        Returns 0.0 when prev_word was never seen as a context (instead of
        raising ZeroDivisionError) and when the pair was never seen.
        Lookups use .get() so that querying does not insert new keys into
        the defaultdicts.
        """
        context_total = self.total.get(prev_word, 0.0)
        if context_total == 0.0:
            return 0.0
        followers = self.counts.get(prev_word)
        if followers is None:
            return 0.0
        return followers.get(current_word, 0.0) / context_total

    def draw_given_prev(self, prev_word):
        """Return a random word drawn from P(. | prev_word).

        Returns None when prev_word was never seen as a context.
        """
        followers = self.counts.get(prev_word)
        context_total = self.total.get(prev_word, 0.0)
        if not followers or context_total == 0.0:
            return None
        rand = random.random()
        current_word = None
        for current_word, count in followers.items():
            rand -= count / context_total
            if rand <= 0.0:
                return current_word
        # Floating-point rounding can leave a tiny positive remainder after
        # the last word; return that word instead of falling through to None.
        return current_word
#endclass

class SmoothedBigramDistLI:
    """Bigram distribution smoothed by linear interpolation with a unigram one.

    P(cur | prev) = lambda1 * P_unigram(cur) + lambda2 * P_bigram(cur | prev)

    Attributes:
        bigramDist: BigramDist trained on the corpus.
        unigramDist: UnigramDist trained on the corpus.
        lambda1: weight of the unigram probability.
        lambda2: weight of the bigram probability.
    """

    def __init__(self, corpus, lambda1=0.5, lambda2=0.5):
        """Train both component distributions on corpus and store the weights."""
        self.bigramDist = BigramDist(corpus)
        self.unigramDist = UnigramDist(corpus)
        self.lambda1 = lambda1
        self.lambda2 = lambda2

    def prob_given_prev(self, current_word, prev_word):
        """Return the interpolated probability of current_word after prev_word.

        When prev_word was never seen as a bigram context, the bigram
        estimate is undefined (it would divide by zero).  In that case the
        whole weight (lambda1 + lambda2) is given to the unigram estimate,
        so the distribution over words keeps the same total mass.
        """
        unigram_prob = self.unigramDist.prob(current_word)
        if self.bigramDist.total.get(prev_word, 0.0) == 0.0:
            # unseen context: back off to the unigram estimate
            return (self.lambda1 + self.lambda2) * unigram_prob
        bigram_prob = self.bigramDist.prob_given_prev(current_word, prev_word)
        return self.lambda1 * unigram_prob + self.lambda2 * bigram_prob

    def draw_given_prev(self, prev_word):
        """Return a random word drawn from the interpolated P(. | prev_word).

        Candidates are the words known to the unigram distribution.  Returns
        None only when that distribution holds no words.
        """
        rand = random.random()
        current_word = None
        for current_word in self.unigramDist.counts.keys():
            rand -= self.prob_given_prev(current_word, prev_word)
            if rand <= 0.0:
                return current_word
        # Floating-point rounding can leave a tiny positive remainder after
        # the last word; return that word instead of falling through to None.
        return current_word
#endclass

In [None]:
#-------------------------------------------
# The main routine
#-------------------------------------------
if __name__ == "__main__":
    # Read and preprocess the training corpus.
    trainCorpus = readFileToCorpus('train.txt')
    trainCorpus = preprocess(trainCorpus)

    posTestCorpus = readFileToCorpus('pos_test.txt')
    negTestCorpus = readFileToCorpus('neg_test.txt')

    # Task 0: the vocabulary is the collection of word types in the
    # preprocessed training corpus.  It has to exist before preprocessTest
    # is called.
    vocab = {word for sentence in trainCorpus for word in sentence}

    posTestCorpus = preprocessTest(vocab, posTestCorpus)
    negTestCorpus = preprocessTest(vocab, negTestCorpus)

    # Train the four models.
    unigramModel = UnigramModel(trainCorpus)
    smoothedUnigramModel = SmoothedUnigramModel(trainCorpus)
    bigramModel = BigramModel(trainCorpus)
    smoothedBigramModel = SmoothedBigramModelLI(trainCorpus)

    # Write 20 generated sentences per model.  generateSentencesToFile opens
    # (and truncates) the file itself and ignores its corpus argument, so a
    # second call on the same file name only replaced the first call's
    # output.  One call per model leaves the same kind of file, and no extra
    # handle on the file is needed here.
    sentenceFiles = [
        (unigramModel, "unigram output.txt"),
        (smoothedUnigramModel, "smooth unigram output.txt"),
        (bigramModel, "bigram output.txt"),
        (smoothedBigramModel, "smooth bigram LI output.txt"),
    ]
    for model, filename in sentenceFiles:
        model.generateSentencesToFile(numberOfSentences=20, filename=filename, corpus=posTestCorpus)

    # Compute perplexity for each model on positive and negative test corpora
    Models = [unigramModel, bigramModel, smoothedUnigramModel, smoothedBigramModel]
    ModelNames = ["Unigram Model", "Bigram Model", "Smoothed Unigram Model", "Smoothed Bigram Model"]

    print("Perplexity for positive corpus:")
    for model, model_name in zip(Models, ModelNames):
        pos_perplexity = model.getCorpusPerplexity(posTestCorpus)
        print(f"{model_name} perplexity: {pos_perplexity}")

    print("\n")

    print("Perplexity for negative corpus:")
    for model, model_name in zip(Models, ModelNames):
        neg_perplexity = model.getCorpusPerplexity(negTestCorpus)
        print(f"{model_name} perplexity: {neg_perplexity}")

Reading file train.txt


Reading sentence1000
Reading sentence2000
Reading sentence3000
Reading sentence4000
Reading sentence5000
Reading sentence6000
Reading sentence7000
Reading sentence8000
Reading sentence9000
Reading sentence10000
Reading sentence11000
Reading sentence12000
Reading sentence13000
Reading sentence14000
Reading sentence15000
Reading sentence16000
Reading sentence17000
Reading sentence18000
Reading sentence19000
Reading sentence20000
Reading sentence21000
Reading sentence22000
Reading sentence23000
Reading sentence24000
Reading sentence25000
Reading sentence26000
Reading sentence27000
Reading sentence28000
Reading sentence29000
Reading sentence30000


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
)
5516
and
18041
"
7713
the
37617
shoveller
1
(
5489
william
97
h
23
.
30743
macy
15
)
5516
.
30743
sure
234
they
1943
were
558
enterataining
1
,
38769
but
4079
their
1542
acts
33
grew
20
old
353
fast
61
.
30743
that
7085
is
12771
until
226
they
1943
aquire
1
"
7713
invisible
11
boy
156
"
7713
(
5489
kel
2
mitchell
35
)
5516
,
38769
and
18041
"
7713
the
37617
bowler
7
"
7713
(
5489
janeane
8
garafalo
2
)
5516
and
18041
"
7713
mr
159
.
30743
splein
2
"
7713
(
5489
paul
88
reubens
2
)
5516
,
38769
2
158
of
16878
which
1606
rescue
43
the
37617
film
4441
from
2434
becoming
53
a
18437
disastorous
1
mess
27
.
30743
thankfully
44
,
38769
the
37617
original
313
3
79
heroes
43
become
292
amusing
57
,
38769
with
5345
some
1324
support
48
of
16878
reuben
3
and
18041
garfalo
1
on
3325
screen
309
.
30743
the
37617
whole
206
premise
94
is
12771
rather
320
ridiculous
21
,
38769
but
4079
packs
9
a
18437
few
448
punches
6
to
14987
keep
17

Reading sentence1000
Reading sentence1000






Perplexity for positive corpus:
Unigram Model perplexity: 628.6696007318275
Bigram Model perplexity: 56.621555033931386
Smoothed Unigram Model perplexity: 790.3325454280941
Smoothed Bigram Model perplexity: 243.50325387988883


Perplexity for negative corpus:
Unigram Model perplexity: 612.2628559647762
Bigram Model perplexity: 59.299725751057
Smoothed Unigram Model perplexity: 779.1127557146143
Smoothed Bigram Model perplexity: 251.21906135626992


In [None]:
    # Unsmoothed unigram distribution: one probability lookup, one draw
    unigramDist = UnigramDist(trainCorpus)
    print('Sample UnigramDist output:')
    print('Probability of "picture": ', unigramDist.prob('picture'))
    print('"Random" draw: ', unigramDist.draw())

    # Laplace-smoothed unigram distribution
    smoothedunigramDist = SmoothedUnigramDist(trainCorpus)
    print('\nSample SmoothedUnigramDist output:')
    print('Probability of "global": ', smoothedunigramDist.prob('global'))
    print('"Random" draw: ', smoothedunigramDist.draw())

    # Unsmoothed bigram distribution: P(enjoyed | still), then a draw after "still"
    bigramDist = BigramDist(trainCorpus)
    print('\nSample BigramDist output:')
    print('Probability of "still enjoyed": ', bigramDist.prob_given_prev('enjoyed', 'still'))
    print('"Random" draw: ', bigramDist.draw_given_prev('still'))

    # Interpolated bigram distribution: P(life | wonderful), then a draw after "wonderful"
    smoothedbigramDist = SmoothedBigramDistLI(trainCorpus)
    print('\nSample SmoothedBigramDistLI output:')
    print('Probability of "wonderful life": ', smoothedbigramDist.prob_given_prev('life', 'wonderful'))
    print('"Random" draw: ', smoothedbigramDist.draw_given_prev('wonderful'))

Sample UnigramDist output:
Probability of "picture":  0.0003967776842611519
"Random" draw:  everything

Sample SmoothedUnigramDist output:
Probability of "global":  5.20835367846489e-06
"Random" draw:  pretty

Sample BigramDist output:
Probability of "still enjoyed":  0.001838235294117647
"Random" draw:  trying

Sample SmoothedBigramDistLI output:
Probability of "wonderful life":  0.026572532379914852
"Random" draw:  little


**1. When generating sentences with the unigram model, what controls the length of the generated sentences? How does this differ from the sentences produced by the bigram models?**

In the Unigram Model class, the length of the generated sentences is controlled by the argument passed to the range() function inside the generateSentence() method. It is set to range(1), since a unigram contains only one word.

In the Bigram Model, by contrast, the slice notation returns the first two elements of the list, since a bigram contains only two words.

**2. Consider the probability of the generated sentences according to your models. Do your models assign drastically different probabilities to the different sets of sentences? Why do you think that is?**

Yes, the models assign drastically different probabilities to the different sets of sentences.
The differences in probability assignments between the models can be attributed to their underlying assumptions and methods of estimation. Unigram models treat each word independently, while bigram models consider dependencies between adjacent words. Smoothing techniques are applied to handle unseen or infrequent events in the training data, which can affect the probability distributions.


**3. Generate additional sentences using your bigram and smoothed bigram models. In your opinion, which
model produces better / more realistic sentences?**

In [None]:
# How many sentences to print for each model
NumberOfSentences = 10

Bigram_Model = [bigramModel]
Bigram_Model_Name = ["Bigram Model"]

Smoothed_Bigram_Model = [smoothedBigramModel]
Smoothed_Bigram_Model_Name = ["Smoothed Bigram Model"]

# The unsmoothed model is shown first, then the interpolated one; each gets
# a header line followed by its generated sentences.
for names, models in (
    (Bigram_Model_Name, Bigram_Model),
    (Smoothed_Bigram_Model_Name, Smoothed_Bigram_Model),
):
    for model_name, model in zip(names, models):
        print(f"\n{model_name}:")
        for i in range(NumberOfSentences):
            sentence = model.generateSentence()
            print(" ".join(sentence))


Bigram Model:
envy .
lax on
dave foley
soderbergh's films
peasant girl
douglas hall
cheapskate mother
contains mediocre
voyeurs of
charlotte rampling

Smoothed Bigram Model:
<s> the
<s> however
<s> as
<s> managed
<s> most
<s> it's
<s> both
<s> "
<s> an
<s> the


'\nIn terms of producing more realistic sentences,\nthe Smoothed Bigram Model is likely to perform better.\nThis is because the smoothed bigram model incorporates techniques\nto handle unseen or rare combinations of words,\nmaking its generated sentences more coherent and natural sounding\ncompared to the standard bigram model.\n'

**4. For each of the four models, which test corpus has a higher perplexity? Why? Make sure to include the
perplexity values in the answer.**

The positive corpus tends to have higher perplexity across the unigram and smoothed unigram models, while the negative corpus has higher perplexity across the bigram and smoothed bigram models.

---

1.   Unigram Model:

Positive Corpus Perplexity: 628.6696007318275

Negative Corpus Perplexity: 612.2628559647762

=> Positive Corpus has Higher Perplexity

2.   Smoothed Unigram Model:

Positive Corpus Perplexity: 790.3325454280941

Negative Corpus Perplexity: 779.1127557146143

=> Positive Corpus has Higher Perplexity

The unigram and smoothed unigram models assign probabilities to individual words independently. The positive corpus likely contains more diverse vocabulary or longer sentences, leading to a higher perplexity compared to the negative corpus.

3.   Bigram Model:

Positive Corpus Perplexity: 56.621555033931386

Negative Corpus Perplexity: 59.299725751057

=> Negative Corpus has Higher Perplexity

4.   Smoothed Bigram Model:

Positive Corpus Perplexity: 243.50325387988883

Negative Corpus Perplexity: 251.21906135626992

=> Negative Corpus has Higher Perplexity

The bigram and smoothed bigram models consider the probabilities of word pairs. The positive corpus might have more predictable sequences or smoother transitions between words, resulting in a lower perplexity compared to the negative corpus.