# Markov Generator
Learns the probabilities of bigrams (pairs of adjacent words) from a training set.

Generates text by repeatedly sampling the next word according to those bigram probabilities.

In [None]:
import random

class MarkovGeneratorBigram:
  """
  Bigram Markov text generator.

  Training counts how often each word is followed by each other word.
  Every quote is framed by the markers "<s>" (start) and "<e>" (end), so
  generation starts at "<s>" and stops as soon as "<e>" is drawn.
  """

  def __init__(self):
    self.bigramDict = {} #key = word, val = list of tuples (word2, count)
    self.countDict = {} #key = word, val = number of bigrams starting with word

  def train(self, fileName):
    """
    Add every line of a text file to the bigram counts.

    Counts accumulate: calling train again adds to what is already stored.

    @param fileName: string, path of a UTF-8 file with one line = one quote
    """
    # The with-block closes the file even if addExample raises part-way.
    with open(fileName, encoding="utf8") as f:
      for line in f:
        self.addExample(line)

  def addExample(self, quote):
    """
    Add one quote to the bigram counts.

    The quote is split on whitespace (so a trailing newline is dropped) and
    wrapped in "<s>" ... "<e>" before its adjacent word pairs are counted.

    @param quote: string, quote to add
    """
    words = ["<s>"] + quote.split() + ["<e>"]

    #count every adjacent pair, including (<s>, first) and (last, <e>)
    for word, nextWord in zip(words, words[1:]):
      self.addBigram(word, nextWord)

  def addBigram(self, word1, word2):
    """
    Record one occurrence of the bigram (word1, word2).

    @param word1: string, first word in bigram
    @param word2: string, second word in bigram
    """
    self.countDict[word1] = self.countDict.get(word1, 0) + 1
    bigramList = self.bigramDict.setdefault(word1, [])

    #if bigram seen before, increase its count in place
    for idx, (follower, seen) in enumerate(bigramList):
      if follower == word2:
        bigramList[idx] = (word2, seen + 1)
        return

    #if bigram new, add to list
    bigramList.append((word2, 1))

  def generate(self):
    """
    Generate one sentence by walking the bigram table from "<s>" to "<e>".

    @return: string, the generated words joined by single spaces, without
             the "<s>"/"<e>" markers; "" if "<e>" is drawn immediately
    Raises KeyError if "<s>" (or a drawn word) has no entry in countDict,
    e.g. when the model has not been trained.
    """
    generated = []
    word1 = "<s>"

    while True:
      #draw a position among all bigrams that start with word1
      count = self.countDict[word1]
      nextWordCount = random.randint(0, count-1)
      word1 = self.chooseNextWord(word1, nextWordCount)

      if word1 == "<e>":
        break
      generated.append(word1)

    #markers are never stored, so nothing needs stripping afterwards
    return " ".join(generated)

  def chooseNextWord(self, word1, nextWordCount):
    """
    Map a drawn position onto one of word1's followers.

    The followers are laid end to end, each one occupying as many positions
    as its count, and the follower covering position nextWordCount is
    returned. With a uniform draw this picks followers in proportion to
    their counts.

    @param word1: string, word whose follower list is consulted
    @param nextWordCount: int, position in 0 .. (sum of follower counts - 1)
    @return: string, the chosen follower
    Raises KeyError if word1 has no follower list, ValueError if
    nextWordCount lies outside the valid range.
    """
    if nextWordCount < 0:
      raise ValueError("nextWordCount must be non-negative")

    wordCount = 0 #positions covered so far
    for follower, seen in self.bigramDict[word1]:
      wordCount += seen
      if nextWordCount < wordCount:
        return follower

    raise ValueError("nextWordCount exceeds the number of bigrams recorded for the word")

  def printBigram(self):
    """
    Print each word followed by its list of (next word, count) tuples.
    """
    for key in self.bigramDict:
      print(str(key)+": "+str(self.bigramDict[key]))


In [None]:
# Build a bigram model from 'tinytest1.txt' and print the learned
# word -> [(next word, count), ...] table.
generator1 = MarkovGeneratorBigram()

generator1.train('tinytest1.txt')
generator1.printBigram()

<s>: [('this', 1), ('hello', 1), ('sentence', 1), ('good', 2), ('one', 2), ('is', 2), ('you', 1)]
this: [('is', 1), ('your', 1), ('my', 1)]
is: [('sentence', 1), ('here', 1), ('this', 2)]
sentence: [('one', 1), ('two', 1)]
one: [('.', 1), (',', 1), ('day', 1)]
.: [('<e>', 6)]
hello: [('my', 1)]
my: [('good', 1), ('friends', 2), ('cat', 1)]
good: [('friends', 1), ('day', 1), ('thing', 1)]
friends: [('.', 2), ('are', 1)]
two: [('is', 1), (',', 1)]
here: [('.', 2)]
day: [('!', 1), ('you', 1)]
!: [('<e>', 1)]
,: [('two', 1), ('three', 1), ('four', 1)]
three: [(',', 1)]
four: [('.', 1)]
your: [('cat', 2)]
cat: [('?', 3)]
?: [('<e>', 3)]
you: [('will', 1), ('are', 1)]
will: [('be', 1)]
be: [('my', 1)]
thing: [('my', 1)]
are: [('here', 1), ('your', 1)]


In [None]:
# Print five sentences, each generated independently from generator1.
for _ in range(5):
  print(generator1.generate())

the united states of america will not determine the worry you live it every american to put back in crisis , and directly to hear another list of your dreams upon that's now hanging by a member of the job you don't need to the united states of this crisis , tonight i know someone who has : a neighbor , the united states -- she's around here tonight not determine the business you don't lie beyond our problems don't need to the united states of congress , the business you haven't been personally affected by a neighbor , the job you built your child had to the united states of sleepless nights .
if you built your dreams upon that's now , the worry you don't need to our economy may be weakened and directly to our problems don't need to our confidence shaken , and uncertain times , but now , because you wake up with and directly to speak frankly and women in the job you haven't been personally affected by this nation .
but to the envelope .
the business you built your family .
i have come h

In [1]:
import random

class MarkovGenerator:
  """
  N-gram Markov text generator (bigram, trigram, fourgram, ...).

  For an order of n, training counts how often each run of n-1 words (the
  "context", stored as one space-joined string) is followed by each word.
  Every quote is padded with n-1 "<s>" markers in front and one "<e>" at
  the end, so generation starts from an all-"<s>" context and stops as soon
  as "<e>" is drawn.
  """

  def __init__(self):
    self.gramDict = {} #key = words until last, val = list of tuples (last word, count)
    self.countDict = {} #key = words until last, val = number of grams starting with them
    self.gramCount = 0 #order n of the grams; set by train

  def train(self, fileName, gramNum):
    """
    Add every line of a text file to the gram counts, using grams of
    gramNum words.

    Counts accumulate: calling train again adds to what is already stored.
    Contexts of different orders contain different numbers of words, so
    their keys never coincide.

    @param fileName: string, path of a UTF-8 file with one line = one quote
    @param gramNum: int, 2, 3, 4 for bigram, trigram, fourgram respectively
    Raises ValueError if gramNum is smaller than 2.
    """
    # An order below 2 has no context word to condition on; reject it before
    # touching any state instead of silently building a meaningless table.
    if gramNum < 2:
      raise ValueError("gramNum must be at least 2")
    self.gramCount = gramNum

    # The with-block closes the file even if addExample raises part-way.
    with open(fileName, encoding="utf8") as f:
      for line in f:
        self.addExample(line)

  def addExample(self, quote):
    """
    Add one quote to the gram counts, using the order stored in gramCount.

    The quote is split on whitespace (so a trailing newline is dropped),
    padded with the start/end markers, and every window of gramCount
    consecutive words is counted.

    @param quote: string, quote to add
    Raises ValueError if gramCount is smaller than 2 (e.g. train has not
    been called yet).
    """
    contextLen = self.gramCount - 1
    if contextLen < 1:
      raise ValueError("gramCount must be at least 2 before adding examples")

    #add one <s> if bigram, 2 if trigram..., then the end marker
    words = ["<s>"] * contextLen + quote.split() + ["<e>"]

    #slide a window of gramCount words over the sequence
    for idx in range(len(words) - contextLen):
      firstWords = " ".join(words[idx:idx + contextLen])
      lastWord = words[idx + contextLen]
      self.addGram(firstWords, lastWord)

  def addGram(self, firstWords, lastWord):
    """
    Record one occurrence of the gram (firstWords, lastWord).

    @param firstWords: string, first words in xgram, joined by spaces
    @param lastWord: string, last word in xgram
    """
    self.countDict[firstWords] = self.countDict.get(firstWords, 0) + 1
    gramList = self.gramDict.setdefault(firstWords, [])

    #if xgram seen before, increase its count in place
    for idx, (follower, seen) in enumerate(gramList):
      if follower == lastWord:
        gramList[idx] = (lastWord, seen + 1)
        return

    #if xgram new, add to list
    gramList.append((lastWord, 1))

  def generate(self):
    """
    Generate one sentence by walking the gram table until "<e>" is drawn.

    @return: string, the generated words joined by single spaces, without
             the "<s>"/"<e>" markers; "" if "<e>" is drawn immediately
    Raises KeyError if the current context has no entry in countDict, e.g.
    when the model has not been trained for the current gramCount.
    """
    #start from a context made only of start markers (at least one)
    prevWords = ["<s>"] * max(self.gramCount - 1, 1)
    generated = []

    while True:
      gramStart = " ".join(prevWords)

      #draw a position among all grams that start with this context
      count = self.countDict[gramStart]
      nextWordCount = random.randint(0, count-1)
      nextWord = self.chooseNextWord(gramStart, nextWordCount)

      if nextWord == "<e>":
        break
      generated.append(nextWord)

      #slide the context: drop the oldest word, add the newest
      prevWords = prevWords[1:] + [nextWord]

    #markers are never stored, so nothing needs stripping afterwards
    return " ".join(generated)

  def chooseNextWord(self, firstWords, nextWordCount):
    """
    Map a drawn position onto one of the context's followers.

    The followers are laid end to end, each one occupying as many positions
    as its count, and the follower covering position nextWordCount is
    returned. With a uniform draw this picks followers in proportion to
    their counts.

    @param firstWords: string, context whose follower list is consulted
    @param nextWordCount: int, position in 0 .. (sum of follower counts - 1)
    @return: string, the chosen follower
    Raises KeyError if firstWords has no follower list, ValueError if
    nextWordCount lies outside the valid range.
    """
    if nextWordCount < 0:
      raise ValueError("nextWordCount must be non-negative")

    wordCount = 0 #positions covered so far
    for follower, seen in self.gramDict[firstWords]:
      wordCount += seen
      if nextWordCount < wordCount:
        return follower

    raise ValueError("nextWordCount exceeds the number of grams recorded for the context")

  def printGram(self):
    """
    Print each context followed by its list of (last word, count) tuples.
    """
    for key in self.gramDict:
      print(str(key)+": "+str(self.gramDict[key]))


In [None]:
# Train a four-gram model (gramNum = 4) on 'train_quotes.txt' and print one
# generated sentence under the heading "four".
generator = MarkovGenerator()

generator.train('train_quotes.txt', 4)
#generator.printGram()
print("four")
print(generator.generate())

four
that's why instead of just pouring money into a system that's not working , we launched a new spacecraft as part of a reenergized space program that will send american astronauts to mars .


In [2]:
generatorOfficial = MarkovGenerator()

# For each gram order in turn: print its heading, train the same generator on
# 'train.txt' at that order, then print five generated sentences.
for heading, gramNum in (
    ("\nbigrams\n", 2),
    ("\ntrigrams\n", 3),
    ("\nfourgrams\n", 4),
):
  print(heading)
  generatorOfficial.train('train.txt', gramNum)
  for _ in range(5):
    print(generatorOfficial.generate())


bigrams

so was here in iraq to be clear : we too often isn't about how long as a difference .
a job creation , we don't need more than 90 days when we can't bring this year , as you take a part of the most young people we do spectacularly well .
some earmark reform right now , where we believed : they're people .
and that's the answers to reaffirm our efforts that american people every business in a trade deals that the last 2 years , depends on .
and suspicion between a source of this chamber that uniform stand against us to be able to send me start making higher education and combating the inherent dignity cannot be pushing for the problem on this recession , vote .

trigrams

and in a decade .
first , middle class simply by shifting the cost of health care reform cannot wait , and an associate's degree in computers or engineering .
we want -- opportunity and security for our wounded warriors receive the benefits they have to agree on this vision .
for america .
he worked construct