# Kava Brewer

Creating a PCFCG (probabilistic context-free coffee grammar).

<img src="coffee.jpg" width="150"/>

In [1]:
import random
import unicodedata as ud
from collections import Counter, defaultdict

import pandas as pd

## Get Data

In [2]:
# Load the coffee-word table from words.csv (path is relative to the working directory).
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV — consider index_col=0; confirm.
df = pd.read_csv('words.csv')
# Preview the first rows.
df.head()

Unnamed: 0,language,word,region
0,Albanian,kafe,European
1,Basque,kafea,European
2,Belarusian,кава,European
3,Bosnian,kafa,European
4,Bulgarian,кафе,European


### Pre-Preprocessing

- Filter out words written in non-Latin alphabets
- Capture transliterations given in brackets, e.g. "кави [kavy]"

In [3]:
def is_latin(word):
    """Return True if every character of ``word`` is a Latin-script character.

    A character counts as Latin when its Unicode name contains ``'LATIN'``
    (e.g. ``'LATIN SMALL LETTER K'``). Characters that have no Unicode name,
    such as control characters, are treated as non-Latin instead of raising
    ``ValueError``. An empty string returns True.
    """
    # The '' default makes ud.name() return an empty name for unnamed
    # characters rather than raising ValueError.
    return all('LATIN' in ud.name(c, '') for c in word)

# Split entries such as "кави [kavy]" into the part before the bracket and
# the bracketed part, dropping the closing bracket and lower-casing both.
candidates = [
    part.strip(']').lower()
    for entry in df['word']
    for part in entry.split(' [')
]

# Keep only the candidates made up entirely of Latin characters.
words = [candidate for candidate in candidates if is_latin(candidate)]

In [4]:
# peek at the first ten cleaned words
words[:10]

['kafe',
 'kafea',
 'kafa',
 'cafè',
 'caffè',
 'kava',
 'káva',
 'kaffe',
 'koffie',
 'kohv']

## PCFG

### Learn the Grammar

In [5]:
def build_pcfg(words):
    """Build a character-transition table from a collection of words.

    Each word is wrapped in a start marker ``<S>`` and an end marker ``<T>``;
    every token is then recorded as a successor of the token before it.
    Successors are stored with repetition, in the order they were seen.

    Returns a ``defaultdict(list)`` mapping token -> list of following tokens.
    """
    transitions = defaultdict(list)
    for entry in words:
        sequence = ["<S>", *entry, "<T>"]
        # pair every token with the one that comes right after it
        for current, following in zip(sequence, sequence[1:]):
            transitions[current].append(following)
    return transitions

In [6]:
def display_sample(pcfg, num_keys=5, num_values=5):
    """Return a small preview of a grammar.

    Args:
        pcfg: Mapping of token -> list of successor tokens.
        num_keys: Number of keys to include, taken in the mapping's
            iteration order.
        num_values: Maximum number of successors to show per key.

    Returns:
        A new dict holding the first ``num_keys`` keys, each mapped to its
        first ``num_values`` successors. ``"..."`` is appended only when
        successors were actually left out.
    """
    sample = {}
    for key in list(pcfg)[:num_keys]:
        values = pcfg[key]
        preview = values[:num_values]
        # mark truncation only when something was really cut off, so short
        # lists are not shown as if they continued
        if len(values) > num_values:
            preview = preview + ["..."]
        sample[key] = preview
    return sample

In [7]:
# learn the single-character grammar from the cleaned word list
PCFG = build_pcfg(words)

# show the first few keys, each with a truncated list of successors
display_sample(PCFG)

{'<S>': ['k', 'k', 'k', 'c', 'c', '...'],
 'k': ['a', 'a', 'a', 'a', 'á', '...'],
 'a': ['f', 'f', '<T>', 'f', '<T>', '...'],
 'f': ['e', 'e', 'a', 'è', 'f', '...'],
 'e': ['<T>', 'a', '<T>', '<T>', '<T>', '...']}

- The start token `<S>` is linked to the tokens it can produce, and they link to the tokens they produce, and so on until you reach the terminal token `<T>`.
- The number of times a token shows up in a list is proportional to its probability of being selected.

### Generate some coffee words

In [8]:
def build_word(pcfg):
    """Generate one word by randomly walking the grammar.

    Starting from the successors of ``'<S>'``, repeatedly pick a random
    successor of the current token until the end token ``'<T>'`` is drawn.
    Because successors are stored with repetition, ``random.choice`` picks
    each one in proportion to how often it was recorded.

    A token with no recorded successors ends the word as well, so an empty
    or incomplete grammar yields a (possibly empty) word instead of raising.

    Args:
        pcfg: Mapping of token -> list of successor tokens.

    Returns:
        The generated word as a string, without the start/end tokens.
    """
    pieces = []
    token = '<S>'
    while True:
        # .get() avoids inserting empty entries into a defaultdict grammar
        successors = pcfg.get(token)
        if not successors:
            break
        token = random.choice(successors)
        if token == '<T>':
            break
        pieces.append(token)
    # join once at the end instead of growing a string inside the loop
    return "".join(pieces)

In [9]:
# draw 12 random words from the grammar stored in PCFG
[build_word(PCFG) for _ in range(12)]

['kavarus',
 'kofé',
 'qafe',
 'kahvi',
 'kohvafa',
 'kahafè',
 'i',
 'qe',
 'kaffi',
 'cafe',
 'kafè',
 'cawawafēidh']

Here are some words you can generate from the grammar!

In [10]:
# novel words: generated words that are not in the original word list
some_words = [build_word(PCFG) for _ in range(100)]
# sort so the candidate order does not depend on set iteration order
novel_words = sorted(set(some_words) - set(words))
# never ask for more words than exist, otherwise random.sample raises ValueError
random.sample(novel_words, min(12, len(novel_words)))

['qahvy',
 'ie',
 'qeafé',
 'ke',
 'caie',
 'kafavé',
 'cav',
 'ije',
 'ka',
 'kohvə',
 'kāfi',
 'koffēi']

Here are some novel coffee words, not in the original list.

## Extensions

### PCFG with Bigrams

Repeat the process, but with bigrams instead of unigrams.

In [11]:
def build_pcfg_bigrams(words):
    """Learn the rules of making coffee words, two characters at a time.

    Each word is cut into consecutive two-character chunks (the last chunk
    is a single character when the word has odd length). The chunks are
    wrapped in the start token ``<S>`` and the end token ``<T>``, and every
    token is recorded as a successor of the token before it.

    Args:
        words: Iterable of strings.

    Returns:
        A ``defaultdict(list)`` mapping token -> list of successor tokens,
        stored with repetition in the order they were seen.
    """
    pcfg_bigrams = defaultdict(list)
    for word in words:
        # <S> -> first chars -> middle chars -> ... -> final chars -> <T>
        chunks = [word[i:i+2] for i in range(0, len(word), 2)]
        tokens = ["<S>"] + chunks + ["<T>"]
        # linking neighbours also covers words of one or two characters,
        # where there are no "middle" chunks at all
        for previous, following in zip(tokens, tokens[1:]):
            pcfg_bigrams[previous].append(following)
    return pcfg_bigrams

In [12]:
# NOTE(review): this calls build_pcfg (the single-character builder), not
# build_pcfg_bigrams, so PCFG_bigrams is built the same way as PCFG — confirm
# whether build_pcfg_bigrams(words) was intended.
PCFG_bigrams = build_pcfg(words)

In [13]:
# show the first few keys of PCFG_bigrams with truncated successor lists
display_sample(PCFG_bigrams)

{'<S>': ['k', 'k', 'k', 'c', 'c', '...'],
 'k': ['a', 'a', 'a', 'a', 'á', '...'],
 'a': ['f', 'f', '<T>', 'f', '<T>', '...'],
 'f': ['e', 'e', 'a', 'è', 'f', '...'],
 'e': ['<T>', 'a', '<T>', '<T>', '<T>', '...']}

In [14]:
# draw 12 random words from the grammar stored in PCFG_bigrams
[build_word(PCFG_bigrams) for _ in range(12)]

['kofe',
 'karus',
 'kafffjafè',
 'wafike',
 'kāffffulua',
 'kafị',
 'ieafika',
 'khvy',
 'ca',
 'kofje',
 'kafe',
 'kofè']

Some sample words using bigram approach.

In [15]:
# novel words: generated words that are not in the original word list
some_words = [build_word(PCFG_bigrams) for _ in range(100)]
# sort so the candidate order does not depend on set iteration order
novel_words = sorted(set(some_words) - set(words))
# never ask for more words than exist, otherwise random.sample raises ValueError
random.sample(novel_words, min(12, len(novel_words)))

['kahoffe',
 'i',
 'wafēi',
 'kawhwawhvavy',
 'cafega',
 'kohva',
 'kofị',
 'qeheavafè',
 'kahe',
 'caffopea',
 'kaffofe',
 'caffi']

Some novel words using bigram approach.

### Mixed together

Merge together the unigram dict and the bigram dict. 

In [16]:
def merge_dicts(a, b):
    """Merge two grammars into a new dict.

    Keys found in both inputs get ``a``'s successors followed by ``b``'s;
    keys found in only one input keep that input's successors. Keys appear
    in ``a``'s order first, then the keys unique to ``b``.

    Every value in the result is a fresh list, so changing the merged
    grammar never alters ``a`` or ``b``.

    Args:
        a: Mapping of token -> list of successor tokens.
        b: Mapping of token -> list of successor tokens.

    Returns:
        A plain ``dict`` with the combined successor lists.
    """
    # copy each list: a shallow dict copy would share list objects with `a`
    merged = {key: list(values) for key, values in a.items()}
    for key, values in b.items():
        # setdefault creates the entry for keys that only exist in `b`
        merged.setdefault(key, []).extend(values)
    return merged

In [17]:
# combine the successor lists of PCFG and PCFG_bigrams into one grammar
all_pcfg = merge_dicts(PCFG, PCFG_bigrams)

In [18]:
# draw 12 random words from the merged grammar
[build_word(all_pcfg) for _ in range(12)]

['kọfi',
 'cafe',
 'ka',
 'kope',
 'khvé',
 'kofè',
 'kavawafe',
 'ka',
 'ka',
 'ka',
 'kafe',
 'koffe']