# Markov Chain 

## Load Text Sources

In [None]:
import glob
import os
import numpy as np

In [None]:
# Root directory of the speech text files; load_speeches() joins a category
# name onto this path to find that category's files.
speeches_dir = 'data/speeches/'

In [None]:
from gensim.parsing.preprocessing import preprocess_string, \
                                         strip_non_alphanum, strip_tags

def clean(text):
    """
       Normalise raw speech text before tokenization.

       The text is passed through gensim's ``strip_tags`` first and the
       result is then passed through gensim's ``strip_non_alphanum``.

       :param text: The raw text of one speech
       :return: The text after both preprocessing filters were applied
    """
    return strip_non_alphanum(strip_tags(text))

def load_speeches(category, filename='*.txt'):
    """
       Lazily yield every speech of a category as ``(path, cleaned_text)``.

       This is a generator: no file is opened until it is iterated, and it
       can be iterated only once.

       :param category: What type of speeches to load
                        - women or comedians
       :param filename: The filename pattern (glob syntax), matched inside
                        the category directory
       :return: Generator of ``(path, text)`` tuples, where ``path`` is the
                matched file path and ``text`` is the file content after
                ``clean()``; files are visited in sorted path order
    """
    category_dir = os.path.join(speeches_dir, category)
    pattern = os.path.join(category_dir, filename)
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # keeps the resulting corpus reproducible between runs and machines.
    # The loop variable is named `path` so it no longer shadows `filename`.
    for path in sorted(glob.glob(pattern)):
        with open(path, encoding='latin-1') as f:
            yield path, clean(f.read())

In [None]:
# load_speeches() is a generator function, so nothing is read from disk here;
# each of these objects is lazy and can be iterated only once.
womens_speeches = load_speeches('women')
comedian_speeches = load_speeches('comedians')

In [None]:
from nltk.tokenize import word_tokenize


def load_corpus(speeches):
    """
       Tokenize every speech and concatenate the tokens into one flat list.

       A progress line is printed for each speech as it is processed.

       :param speeches: Iterable of ``(filename, text)`` tuples, e.g. the
                        generator returned by ``load_speeches()``
       :return: A single list holding the tokens of all speeches, in the
                order the speeches were supplied
    """
    corpus = []
    for filename, speech in speeches:
        print(f'Loading speech {filename}')
        # extend() appends in place; `corpus = corpus + tokens` copied the
        # whole accumulated list on every speech (quadratic overall).
        corpus.extend(word_tokenize(speech))
    return corpus

In [None]:
# Tokenize all women's speeches into one flat, module-level token list.
# This exhausts the single-use `womens_speeches` generator.
corpus = load_corpus(womens_speeches)

In [None]:
def make_pairs(corpus):
    """
       Lazily yield each pair of adjacent items of ``corpus``.

       :param corpus: A sequence of tokens (must support slicing)
       :return: Generator of ``(item, next_item)`` tuples; nothing is
                yielded when ``corpus`` has fewer than two items
    """
    # Zipping the sequence with itself shifted by one lines up every item
    # with its successor; zip stops when the shorter (shifted) side ends.
    for current, following in zip(corpus, corpus[1:]):
        yield (current, following)
          
def load_word_dict(corpus):
    """
       Build the Markov transition table of a token sequence.

       :param corpus: A sequence of tokens
       :return: A plain dict mapping each token to the list of tokens that
                directly follow it in ``corpus``. Successors are kept once
                per occurrence (duplicates included), in corpus order. A
                token that never has a successor does not appear as a key.
    """
    word_dict = {}
    for word_1, word_2 in make_pairs(corpus):
        # setdefault creates the successor list on first sight of word_1 and
        # returns the existing list afterwards, so one statement covers both
        # the "new key" and "known key" cases.
        word_dict.setdefault(word_1, []).append(word_2)
    return word_dict
            
def load_markov_dict(category, filename='*.txt'):
    """
       Load a category of speeches and return its Markov transition table.

       Chains ``load_speeches`` -> ``load_corpus`` -> ``load_word_dict``.

       :param category: What type of speeches to load
       :param filename: The filename pattern passed on to ``load_speeches``
       :return: The dict produced by ``load_word_dict`` for that category
    """
    return load_word_dict(load_corpus(load_speeches(category, filename)))

In [None]:
womens_speeches_word_dict = load_markov_dict('women')

In [None]:
comedians_word_dict = load_markov_dict('comedians')

In [None]:
def get_sentence(word_dict, n_words=15):
    """
       Generate random text by walking the Markov chain in ``word_dict``.

       The first word is drawn uniformly from the keys of ``word_dict`` that
       are not all-lowercase (``str.islower()`` is false). If there is no
       such key, any key may be used. Each following word is drawn uniformly
       from the recorded successors of the previous word.

       :param word_dict: Dict mapping a word to the list of words that may
                         follow it, as built by ``load_word_dict``
       :param n_words: Number of words to append AFTER the first word, so
                       the result has at most ``n_words + 1`` words
       :return: The generated words joined by single spaces. The text is
                shorter than requested when the walk reaches a word that
                has no recorded successor.
       :raises ValueError: If ``word_dict`` is empty
    """
    candidates = list(word_dict)
    if not candidates:
        raise ValueError('word_dict is empty; cannot generate a sentence')

    # Pick the opening word from this dictionary only (not from a global
    # corpus), so it is guaranteed to be a key and to belong to the same
    # source text. Filtering up front also avoids retrying forever when no
    # non-lowercase word exists.
    starters = [word for word in candidates if not word.islower()]
    chain = [str(np.random.choice(starters or candidates))]

    for _ in range(n_words):
        followers = word_dict.get(chain[-1])
        if not followers:
            # Dead end: e.g. the final token of the corpus, which only ever
            # appears as a successor and never as a key.
            break
        chain.append(str(np.random.choice(followers)))
    return ' '.join(chain)

In [None]:
get_sentence(womens_speeches_word_dict)

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible source, so running
# this cell would raise NameError — looks like leftover scratch; confirm and remove.
a