## Get Data

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random

In [2]:
# Download the article that lists the word for "coffee" in many languages.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get(
    'https://www.thespruceeats.com/global-words-for-coffee-765840',
    timeout=30,
)
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html.parser')
div = soup.find(id='mntl-chop_1-0--chop-content')
if div is None:
    # The page layout changed (or we were served something else entirely).
    raise RuntimeError("content container 'mntl-chop_1-0--chop-content' not found in page")
tags = div.find_all('li')

# Map the <strong> label of each list item to its lower-cased word.
words = {}
for t in tags:
    if not t.strong:
        # List items without a bold label are not entries of the word list.
        continue
    if t.em is not None:
        # Preferred source: the word is set in italics.
        text = t.em.text
    else:
        # Fallback: take the text between the first and second colon.
        parts = t.text.split(":")
        if len(parts) < 2:
            # No colon at all, so there is nothing to extract from this item.
            continue
        text = parts[1]
    text = text.strip().lower()
    if text:
        # Skip empty results; downstream cells index into each word.
        words[t.strong.text] = text

In [3]:
# Display the scraped {label: word} mapping to sanity-check the parse.
words

{'Urdu': 'coffee',
 'Welsh': 'coffi',
 'Afrikaans': 'koffie',
 'Dutch': 'koffie',
 'Esperanto': 'kafo',
 'German': 'der kaffee',
 'Finnish': 'kahvi',
 'Hindi': 'kofi',
 'Russian': 'kofe',
 'Italian': 'caffe',
 'French': 'café',
 'Spanish': 'el café',
 'Bengali/Bangla': 'café',
 'Catalan': 'cafe',
 'Galician': 'café',
 'Irish': 'caife',
 'Portuguese': 'café',
 'Romanian': 'cafea',
 'Thai': 'kafea',
 'Vietnamese': 'ca phe',
 'Albanian': 'kafe',
 'Basque': 'kafea',
 'Bulgarian': 'kafe',
 'Creole': 'kafe',
 'Danish': 'kaffe',
 'Greek': 'kafés',
 'Haitian Creole': 'kafe',
 'Hebrew': 'ka-feh',
 'Macedonian': 'kafe',
 'Maltese': 'kafe',
 'Norwegian': 'kaffe',
 'Swedish': 'kaffe',
 'Wolof': 'kafe',
 'Icelandic': 'kaffii',
 'Latvian': 'kafija',
 'Luxembourgish': 'kaffi',
 'Chinese (Cantonese)': 'ga feh',
 'Chinese (Mandarin)': 'kafei',
 'Taiwanese': 'ka fei',
 'Ethiopian Amharic': 'buna',
 'Ethiopian Semitic': 'bunna',
 'Arabic': 'qahioa',
 'Turkish': 'kahveh',
 'Belarusian': 'kava',
 'Croatian

## PCFG

In [4]:
# Build a character-level transition table from the scraped words.
# Each key is the current symbol and each value is the list of symbols seen
# right after it. Duplicates are kept on purpose, so random.choice() samples
# a successor in proportion to how often it was observed.
# 'S' is the start symbol and 'T' marks the end of a word.
PCFG = defaultdict(list)
for word in words.values():
    if not word:
        # An empty string has no first or last character; skip it instead of
        # failing with IndexError.
        continue
    # S -> first char
    PCFG['S'].append(word[0])
    # each char -> the char that follows it
    for prev_char, next_char in zip(word, word[1:]):
        PCFG[prev_char].append(next_char)
    # final char -> T
    PCFG[word[-1]].append("T")

In [5]:
def build_word(pcfg, rng=None):
    """Generate one random word by walking the transition table.

    Starting from the 'S' entry, repeatedly pick a random successor of the
    current character until the terminal marker 'T' is drawn.

    Parameters
    ----------
    pcfg : mapping of str -> sequence of str
        Transition table: ``pcfg['S']`` holds possible first characters and
        ``pcfg[c]`` holds the possible successors of character ``c``
        (including 'T' where a word may end).
    rng : random.Random, optional
        Source of randomness. When omitted, the module-level ``random``
        functions are used, so ``random.seed(...)`` still controls the output.
        Pass a seeded ``random.Random`` for reproducible samples that do not
        touch global state.

    Returns
    -------
    str
        The generated word, without the 'S'/'T' markers.

    Raises
    ------
    IndexError
        If a state with no recorded successors is reached (raised by
        ``choice`` on an empty sequence).
    """
    choose = random.choice if rng is None else rng.choice
    # Collect characters in a list and join once at the end.
    chars = []
    char = choose(pcfg['S'])
    while char != 'T':
        chars.append(char)
        char = choose(pcfg[char])
    return "".join(chars)

In [6]:
# Sample a dozen generated words to eyeball what the transition table produces.
[build_word(PCFG) for _ in range(12)]

['cohvapi',
 'kofe',
 'kahva',
 'kawa-ffeh',
 'kafi',
 'kaffea fé',
 'kavafi',
 'kaymur ph',
 'khvavav',
 'sur fiffe',
 'kahikeashopi',
 'khophafea']

In [7]:
# Novel words: draw a larger batch, then drop every result that is already
# one of the scraped words, leaving only the distinct inventions.
some_words = [build_word(PCFG) for _ in range(100)]
set(some_words).difference(words.values())

{'bu',
 'bunnnai',
 'burca',
 'ca',
 'caffijah',
 'caffé',
 'cafi',
 'cafikafave',
 'cai',
 'cavy',
 'cka',
 'cofe',
 'cohe',
 'coofee',
 'da',
 'e',
 'ei',
 'gafikavi',
 'i',
 'ka',
 'ka cafe',
 'ka-fé',
 'kadafé',
 'kafeafa',
 'kafeafe',
 'kafehi',
 'kaffeercaveavavafi',
 'kafffi',
 'kaffé',
 'kafi',
 'kafifel piwa',
 'kah',
 'kahkapi',
 'kai',
 'kapi',
 'kapiija-ffel ka',
 'kava ca',
 'kava fiwa',
 'kavabucay',
 'kavafay',
 'kavafi',
 'kavava',
 'kavawa',
 'kavei',
 'kavésunafafehvava',
 'kay',
 'ke',
 'ki',
 'ki-kofeohota',
 'kijahee',
 'ko',
 'koffikabunavafiii',
 'kofijavi',
 'kohe',
 'kohvafe',
 'koohi',
 'kophiee',
 'ma',
 'qa',
 'qafe',
 'qafé',
 'qé',
 's',
 'sunna'}