In [1]:
from collections import defaultdict, Counter
import random
import pandas as pd

import unicodedata as ud

## Get Data

In [2]:
# Load the word list. The preview below shows an unnamed leading column
# followed by the `language`, `word` and `region` columns.
df = pd.read_csv('words.csv')
df.head()

Unnamed: 0,language,word,region
0,Albanian,kafe,European
1,Basque,kafea,European
2,Belarusian,кава,European
3,Bosnian,kafa,European
4,Bulgarian,кафе,European


#### Pre-Preprocess

- Filter out words written in non-Latin scripts
- Capture the romanised form given in brackets as a separate word, e.g. "кави [kavy]"

In [3]:
def is_latin(word):
    """Return True when every character of ``word`` is a Latin-script character.

    A character counts as Latin when its Unicode character name contains
    the substring 'LATIN'. Spaces, digits, punctuation and combining marks
    therefore make the whole word non-Latin.

    Characters that have no Unicode name (for example control characters
    such as a newline) are treated as non-Latin instead of letting
    ``unicodedata.name`` raise ``ValueError``.

    Note: an empty string returns True, because there is no character
    that fails the check.
    """
    # The second argument to ud.name is returned for unnamed characters,
    # so the membership test is simply False for them.
    return all('LATIN' in ud.name(c, '') for c in word)

# Split each entry such as "кави [kavy]" into the native spelling and the
# bracketed form, lower-casing both. Missing cells are skipped because
# they are not strings and cannot be split.
candidates = []
for entry in df['word'].dropna():
    for part in entry.split(' ['):
        candidates.append(part.strip(']').lower())

# Keep Latin-script tokens only. Empty tokens are dropped explicitly:
# is_latin('') is True, and an empty word would break the grammar cells
# below, which index word[0].
words = [w for w in candidates if w and is_latin(w)]

## PCFG

In [4]:
# Character-level grammar: every symbol maps to the list of symbols seen
# right after it in the training words (repeats are kept, so frequent
# successors are drawn more often).
PCFG = defaultdict(list)
for word in words:
    # the start symbol 'S' emits the opening character
    PCFG['S'].append(word[0])
    # each character emits the character that follows it
    for i, following in enumerate(word[1:], start=1):
        PCFG[word[i-1]].append(following)
    # the closing character emits the terminator 'T'
    PCFG[word[-1]].append("T")

In [5]:
def build_word(pcfg, rng=None):
    """Generate one word by a random walk through the grammar.

    The walk starts with a symbol drawn from ``pcfg['S']`` and keeps
    drawing a successor of the current symbol until the terminator 'T'
    is drawn. The visited symbols are concatenated into the result.

    Parameters
    ----------
    pcfg : mapping
        Maps each symbol to a list of successor symbols. Must contain the
        start key 'S'. A successor listed several times is proportionally
        more likely to be drawn.
    rng : object with a ``choice(seq)`` method, optional
        Source of randomness, e.g. ``random.Random(42)`` for reproducible
        samples. Defaults to the module-level ``random`` functions.

    Returns
    -------
    str
        The generated word (empty if 'T' is drawn immediately).

    Raises
    ------
    KeyError
        If a reached symbol is missing from a plain-dict grammar.
    IndexError
        If a reached symbol has an empty successor list (this is what a
        ``defaultdict(list)`` grammar yields for an unknown symbol).
    """
    chooser = random if rng is None else rng
    pieces = []
    symbol = chooser.choice(pcfg['S'])
    while symbol != 'T':
        pieces.append(symbol)
        symbol = chooser.choice(pcfg[symbol])
    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)

In [6]:
# Draw ten sample words from the character-level grammar.
[build_word(PCFG) for _ in range(10)]

['kai',
 'khvapung',
 'kahee',
 'qe',
 'kofe',
 'ca',
 'cofie',
 'kofé',
 'kofị',
 'kofopije']

In [7]:
# Novel words: generate 100 samples and keep the distinct ones that do
# not occur in the training list `words`.
some_words = [build_word(PCFG) for _ in range(100)]
set(some_words) - set(words)

{'ca',
 'cafa',
 'cafe',
 'cafeafé',
 'cafi',
 'cahwa',
 'cai',
 'cava',
 'cavafi',
 'cawafi',
 'cofé',
 'cope',
 'copi',
 'i',
 'ka',
 'kafee',
 'kafeg',
 'kafehafé',
 'kaffafe',
 'kaffè',
 'kafi',
 'kafie',
 'kafijafé',
 'kafikọfikafe',
 'kafé',
 'kafēi',
 'kafēifeehung',
 'kafị',
 'kahafegahwawahwai',
 'kahofi',
 'kai',
 'karus',
 'kavé',
 'kawahvaffi',
 'khopeofe',
 'ko',
 'kofaffe',
 'kofeehvə',
 'kofiffi',
 'kofijea',
 'kofè',
 'kofé',
 'kofés',
 'kohofofa',
 'kohvafi',
 'kávə',
 'kāfe',
 'kāffffe',
 'kọfja',
 'kọfé',
 'qafjafé',
 'qafé',
 'qe',
 'qee',
 'wafị',
 'we'}

## PCFG with Bigrams

In [8]:
# Bigram grammar: symbols are consecutive two-character chunks of each
# word; an odd-length word ends with a one-character chunk.
PCFG_bigrams = defaultdict(list)
for word in words:
    chunks = [word[k:k+2] for k in range(0, len(word), 2)]
    if not chunks:
        # an empty word has no chunks to learn from
        continue
    # S -> first chunk
    PCFG_bigrams['S'].append(chunks[0])
    # chunk -> following chunk
    for current, following in zip(chunks, chunks[1:]):
        PCFG_bigrams[current].append(following)
    # final chunk -> T
    # Taken from the chunk list rather than from a loop index, so words of
    # one or two characters (a single chunk) get their terminator edge too.
    PCFG_bigrams[chunks[-1]].append("T")

In [9]:
# Draw twelve sample words from the bigram grammar.
[build_word(PCFG_bigrams) for _ in range(12)]

['kofe',
 'kopi',
 'kape',
 'kaffè',
 'kafa',
 'qahve',
 'kaffi',
 'kafe',
 'kofija',
 'ikawa',
 'kafo',
 'kafe']

In [10]:
# Novel words: generate 100 samples from the bigram grammar and keep the
# distinct ones that do not occur in the training list `words`.
some_words = [build_word(PCFG_bigrams) for _ in range(100)]
set(some_words) - set(words)

{'cafe',
 'caffi',
 'cafés',
 'cofa',
 'coffe',
 'ikhofija',
 'kafaidh',
 'kaffie',
 'kaffè',
 'kahawarung',
 'kahv',
 'kahva',
 'kawarung',
 'kofea',
 'kofeega',
 'koffe',
 'koffi',
 'koffè',
 'kohve',
 'wa'}

## Mixed together

In [11]:
def merge_dicts(a, b):
    """Combine two symbol -> successors mappings into a new plain dict.

    Keys found only in one mapping keep that mapping's value object.
    For keys found in both, the result holds ``a[key] + b[key]``.
    Neither input is modified.
    """
    merged = dict(a)
    for key, extra in b.items():
        merged[key] = a[key] + extra if key in a else extra
    return merged

In [12]:
# Combine both grammars; for a symbol present in both, the two successor
# lists are concatenated (see merge_dicts).
all_pcfg = merge_dicts(PCFG, PCFG_bigrams)

In [13]:
# Draw twelve sample words from the merged grammar.
[build_word(all_pcfg) for _ in range(12)]

['qəhva',
 'qahua',
 'ca',
 'kafè',
 'qahua',
 'kafè',
 'ka',
 'co',
 'ka',
 'qa',
 'ka',
 'kopi']