In [1]:
import random
from collections import Counter

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [15]:
def get_test_wordlist():
    """Scrape the SSA "baby names of the 1880s" page and return one column of names.

    Every table row that has exactly five ``<td>`` cells is treated as a data
    row laid out as RANK, MNAME, COUNT_1, FNAME, COUNT_2; the text of the
    second cell (MNAME -- presumably the male name, confirm against the page)
    is collected.

    Returns:
        list[str]: the MNAME cell text of every data row, in page order.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status.
        IndexError: if the page contains no table with class ``t-stripe``.
    """
    url = 'https://www.ssa.gov/oact/babynames/decades/names1880s.html'

    # Bound the wait and fail loudly on an HTTP error instead of silently
    # parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = bs(response.content, 'lxml')
    tables = soup.find_all('table', attrs={'class': 't-stripe'})
    if not tables:
        raise IndexError("no table with class 't-stripe' found at " + url)

    # Collect the names directly: growing a DataFrame row by row is quadratic
    # and relied on DataFrame.append, which no longer exists in pandas >= 2.0.
    names = []
    for row in tables[0].find_all('tr'):
        cells = [cell.text for cell in row.find_all('td')]
        # Header/footer rows do not have exactly five data cells; skip them.
        if len(cells) == 5:
            names.append(cells[1])
    return names

In [3]:
def select_weighted(d):
    """Pick a random key of ``d``, with probability proportional to its value.

    ``d`` maps keys to integer weights. A single integer is drawn from
    ``[0, total_weight - 1]`` and the keys are walked in dictionary order
    until the running total of weights exceeds that draw.

    An empty dictionary (or a total weight of zero) makes ``random.randint``
    raise ``ValueError``. Returns ``None`` if no key is reached.
    """
    target = random.randint(0, sum(d.values()) - 1)
    cumulative = 0
    for key, weight in d.items():
        cumulative += weight
        if target < cumulative:
            return key
    return None

In [4]:
def add_subchunk(freq_dict, subchunk):
    """Increment the count of ``subchunk`` in ``freq_dict`` in place.

    A subchunk that is not yet present is inserted with a count of 1.
    Nothing is returned.
    """
    freq_dict[subchunk] = freq_dict.get(subchunk, 0) + 1

In [5]:
def train(wordlist, depth):
    """Build the three chunk-frequency tables used to generate words.

    Rather than a single pure Markov chain, separate tables are kept for word
    beginnings and endings so generated words start and end plausibly.

    Args:
        wordlist: iterable of training words (strings). Empty strings are
            skipped so they cannot add an empty chunk to the tables.
        depth: positive int. Each general chunk is ``word[i-1:i+depth]``: one
            overlap letter followed by up to ``depth`` more letters (chunks
            taken near the end of a word are shorter).

    Returns:
        tuple of three plain dicts mapping chunk -> occurrence count:
        ``(dict_start, dict_general, dict_end)``, where start chunks are the
        first two letters of each word and end chunks are the last two.

    Raises:
        AssertionError: if ``depth`` is not positive.
    """
    assert depth > 0
    dict_start = Counter()
    dict_general = Counter()
    dict_end = Counter()
    for word in wordlist:
        if not word:
            # An empty word would only contribute a useless '' start/end chunk.
            continue
        dict_start[word[0:2]] += 1
        dict_end[word[-2:]] += 1
        for i in range(1, len(word)):
            dict_general[word[i-1:i+depth]] += 1
    # Hand back plain dicts (insertion order preserved), as callers expect.
    return (dict(dict_start), dict(dict_general), dict(dict_end))

In [6]:
def _extend_word(word, table):
    """Append to ``word`` a weighted-random chunk of ``table`` overlapping its last letter.

    The chunk's first letter (the overlap) is not repeated. Returns the
    extended word, or None when ``word`` is empty or no chunk can be drawn.
    """
    if not word:
        return None
    # k[:1] rather than k[0] so an empty key is skipped instead of raising.
    overlapping = {k: v for k, v in table.items() if k[:1] == word[-1]}
    if not overlapping:
        return None
    try:
        chunk = select_weighted(overlapping)
    except ValueError:
        # select_weighted cannot draw when the candidate weights sum to zero.
        return None
    if chunk is None:
        return None
    return word + chunk[1:]


def build(model, wordlen, max_attempts=100):
    """Generate a random word from a trained model.

    Args:
        model: ``(dict_start, dict_general, dict_end)`` as returned by
            ``train``.
        wordlen: target length. It is only a hint: the result can be shorter
            or longer because chunks are appended whole.
        max_attempts: positive int bounding both the number of restarts from
            a fresh start chunk and the number of iterations of the ending
            phase, so the function always terminates.

    Returns:
        str: the generated word.

    Raises:
        ValueError: if the start table is empty (raised by
            ``select_weighted``).
    """
    starts, general, ends = model[0], model[1], model[2]
    endlen = 2  # length of the ending chunks recorded by train()

    # Phase 1: begin with a start chunk and grow it with general chunks,
    # leaving room for the ending chunk.
    word = select_weighted(starts)
    restarts = 0
    while len(word) < wordlen - endlen:
        longer = _extend_word(word, general)
        if longer is not None and len(longer) > len(word):
            word = longer
        elif restarts < max_attempts:
            # Dead end: no chunk continues this word, so start over.
            restarts += 1
            word = select_weighted(starts)
        else:
            # No start chunk seems extensible; stop growing instead of
            # looping forever.
            break

    # Phase 2: close the word with an ending chunk. If none overlaps the last
    # letter, append another general chunk and try again; if that is also
    # impossible, return the word as it stands.
    for _ in range(max_attempts):
        ended = _extend_word(word, ends)
        if ended is not None:
            return ended
        longer = _extend_word(word, general)
        if longer is None:
            return word
        word = longer
    return word

In [16]:
# Download the training words (performs an HTTP request, so network access is required).
wordlist = get_test_wordlist()

In [17]:
# Train with depth 3: each general chunk is an overlap letter plus up to 3 following letters.
model = train(wordlist, 3)

In [21]:
# Print 25 generated names, each built with a random length hint between 5 and 8.
for i in range(25):
    target_len = random.randint(5, 8)
    print(build(model, target_len))

Stist
Jueliss
Clacences
Wichah
Elsond
Penge
Sytonk
Cerbex
Stong
Emiliuse
Mahalld
Miverving
Nexande
Osterd
Wices
Geubey
Sarree
Thardward
Frer
Lenestt
Earlll
Elsonn
Cllien
Hardery
Ruellend
