# The following code uses Hillary Clinton's emails as a corpus for generating new emails with a Markov chain.

In [15]:
# Generating text using Hillary Clinton's emails as the corpus
import pandas as pd
import sqlite3

conn = sqlite3.connect("input/database.sqlite")
query = "SELECT * FROM Emails;"

# Import the email body text and convert it to a single corpus string.
emaildf = pd.read_sql_query(query, conn)

# Build the corpus in one pass:
#   * dropna() skips emails with no extracted body, so the literal text
#     "None"/"nan" never becomes a word in the chain;
#   * joining with a space keeps the last word of one email from fusing with
#     the first word of the next (the Markov model splits on whitespace);
#   * the corpus starts empty rather than with a stray leading "r".
corpus = " ".join(emaildf["ExtractedBodyText"].dropna().astype(str))

In [16]:
# modified from http://agiliq.com/blog/2009/06/generating-pseudo-random-text-with-markov-chains-u/
import random

class Markov(object):
    """Word-level Markov chain text generator.

    The corpus is split on whitespace. Every run of ``chain_size``
    consecutive words contributes one transition: the first
    ``chain_size - 1`` words form the state (key) and the last word is
    recorded as a possible follower of that state.

    Attributes:
        chain_size: number of words in each chain (state words + 1 follower).
        cache: dict mapping a state tuple to the list of follower words seen
            after it (duplicates kept, so frequent followers are picked
            proportionally more often).
        words: the corpus as a list of whitespace-separated tokens.
        word_size: ``len(words)``.
    """

    def __init__(self, corpus, chain_size=3):
        """Split ``corpus`` into words and build the transition table.

        Args:
            corpus: source text as a single string.
            chain_size: words per chain; must be at least 1.

        Raises:
            ValueError: if ``chain_size`` is smaller than 1.
        """
        if chain_size < 1:
            raise ValueError("chain_size must be at least 1")
        self.chain_size = chain_size
        self.cache = {}

        self.words = corpus.split()
        self.word_size = len(self.words)
        self.database()

    def words_at_position(self, i):
        """Uses the chain size to find a list of the words at an index.

        Returns the ``chain_size`` consecutive words starting at index ``i``;
        raises IndexError if the run would extend past the end of the corpus.
        """
        return [self.words[i + offset] for offset in range(self.chain_size)]

    def chains(self):
        """Generates chains from the given data string based on passed chain size.
        So if our string were:
            "What a lovely day"
        With a chain size of 3, we'd generate:
            (What, a, lovely)
        and
            (a, lovely, day)
        """
        if len(self.words) < self.chain_size:
            return

        # The last valid start index is len(words) - chain_size, so the range
        # bound is "+ 1"; the previous "- 1" bound dropped the final two chains.
        for i in range(len(self.words) - self.chain_size + 1):
            yield tuple(self.words_at_position(i))

    def database(self):
        """Populate ``cache`` with state -> follower-list entries from ``chains()``."""
        key_len = self.chain_size - 1
        for chain_set in self.chains():
            self.cache.setdefault(chain_set[:key_len], []).append(chain_set[-1])

    def generate_markov_text(self, size=25):
        """Generate text by a random walk over the transition table.

        A random position in the corpus supplies the ``chain_size - 1`` seed
        words; up to ``size`` further words are then appended, each chosen at
        random from the followers of the most recent state.

        Args:
            size: maximum number of words to append after the seed words.

        Returns:
            The seed words plus generated words joined by single spaces.
            Returns an empty string when the corpus is too short to form a
            single chain. The result holds fewer than ``size`` generated words
            if the walk reaches a state with no recorded follower (only
            possible for the final words of the corpus).
        """
        if not self.cache:
            return ''

        key_len = self.chain_size - 1
        # Bound the seed by chain_size (not a hard-coded 3) so that every
        # candidate position has a full chain available.
        seed = random.randint(0, self.word_size - self.chain_size)
        gen_words = self.words_at_position(seed)[:-1]

        for _ in range(size):
            # Slice from an explicit start index: gen_words[-0:] would select
            # the whole list when key_len is 0 (chain_size == 1).
            state = tuple(gen_words[len(gen_words) - key_len:])
            followers = self.cache.get(state)
            if not followers:
                # Dead end: stop instead of raising KeyError.
                break
            gen_words.append(random.choice(followers))
        return ' '.join(gen_words)
    
# Build the chain model over the whole email corpus, using the default chain size of 3.
markov = Markov(corpus)

# Just re-run the following cell to get different results

In [18]:
email_length = 45  # number of words to generate per email, after the seed words
# Print ten generated emails, each preceded by its 1-based number.
for email_number in range(1, 11):
    print("{}: ".format(email_number))
    print(markov.generate_markov_text(email_length))

1: 
proximity talks. This is another contender, with a large enough platform for business, NGOs, and academia to provide relief services in Haiti's slums. This program could be killed or wounded in the country and earning condemnation from his father. Fred Koch's will made his first 100 days
2: 
actions our belief that their leadership has decided not to topple the gov't.FYISenkaku is the update is this issue on your desk before I leave on Thursday.Not at first. I need to take on China anymore. The presumption here is possibility:We have a good move right after
3: 
state media reported on Thursday, bringing to justice those who did not have the plan for gender mainstreaming. Many greetings extended to you -- for your briefing and advice this past March, Sarkozy's UMP party is angry that it would appear soon enough. It always takes time
4: 
minority report called "an audacious plan to go home and said that a mandate on employers to provide more trainers. We asked Dan Kurtzer to call me 