Do this to get wikipedia's latest dump of english language data:

``` wget http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 ```

It's going to be a very large file, several gigabytes.

Replace "en" by the appropriate [language code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) to grab it for a different language.

Run 

```WikiExtractor.py -cb 250K -o extracted itwiki-latest-pages-articles.xml.bz2```

to get a cleaned up version.

Alternatively, there is some pre-cleaned text available for download at the [polyglot project](https://sites.google.com/site/rmyeid/projects/polyglot).

In [546]:
import re

def extract_words(line):
    line = line.lower()
    words = line.split(' ')
    words = [word.strip(' .()!;\n') for word in words]
    words = list(filter(lambda w:w.isalpha(),words))
    return words

words=[]
with open("peuptext.txt") as f:
    for line in f.readlines():
        words = words + extract_words(line)

In [547]:
glyphs = set(c for word in words for c in word)
glyphs.add('WORD_START')
glyphs.add('WORD_END')

In [548]:
num_glyphs = len(glyphs)
int_to_glyph = dict(enumerate(glyphs))
glyph_to_int = {v:k for k,v in int_to_glyph.items()}

In [549]:
import numpy as np

In [580]:
counts = np.zeros((num_glyphs,num_glyphs,num_glyphs,num_glyphs),dtype=np.dtype('u8'))
for word in words:
    for i in range(len(word)+1):
        c1 = glyph_to_int['WORD_START'] if i-3<0 else glyph_to_int[word[i-3]]
        c2 = glyph_to_int['WORD_START'] if i-2<0 else glyph_to_int[word[i-2]]
        c3 = glyph_to_int['WORD_START'] if i-1<0 else glyph_to_int[word[i-1]]
        c4 = glyph_to_int['WORD_END'] if i>=len(word) else glyph_to_int[word[i]]
        counts[c1,c2,c3,c4] += 1
totals = counts.sum(axis=3)
distribution = counts / (np.vectorize(lambda x : x if x!=0 else 1)(totals[:,:,:,np.newaxis]))

In [1]:
def generate_word(dist):
    c1 = c2 = c3 = glyph_to_int['WORD_START']
    word = []
    while c3!=glyph_to_int['WORD_END']:
        if distribution[c1,c2,c3].sum()==0:
            next_char = np.random.choice(range(num_glyphs))
        else:
            next_char = np.random.choice(range(num_glyphs),p=distribution[c1,c2,c3])
        c1=c2
        c2=c3
        c3=next_char
        word.append(next_char)
    return ''.join(int_to_glyph[c] for c in word[:-1])

In [582]:
generate_word(distribution)

'not'

In [2]:
import numpy as np
import pickle

filename = 'indonesian' #filename without txt extension

# First build alphabet. Right now this is done here by hand to suit english; e.g. no unicode.
glyphs = set(map(chr,range(ord('a'),ord('z')+1)))
glyphs.add('WORD_START')
glyphs.add('WORD_END')
num_glyphs = len(glyphs)
int_to_glyph = dict(enumerate(glyphs))
glyph_to_int = {v:k for k,v in int_to_glyph.items()}


def extract_words(line):
    line = line.lower()
    words = line.split(' ')
    words = [word.strip(' .()!;\n') for word in words]
    words = list(filter(lambda w:w and all(c in glyphs for c in w),words))
    # TODO: make that filter smart enough to handle unicode
    return words


# Intitalize counts
counts = np.zeros((num_glyphs,num_glyphs,num_glyphs,num_glyphs),dtype=np.dtype('u8'))

# Now go through file and build up distributon
with open(filename+".txt") as f:
    for line in f.readlines():
        for word in extract_words(line):
            for i in range(len(word)+1):
                c1 = glyph_to_int['WORD_START'] if i-3<0 else glyph_to_int[word[i-3]]
                c2 = glyph_to_int['WORD_START'] if i-2<0 else glyph_to_int[word[i-2]]
                c3 = glyph_to_int['WORD_START'] if i-1<0 else glyph_to_int[word[i-1]]
                c4 = glyph_to_int['WORD_END'] if i>=len(word) else glyph_to_int[word[i]]
                counts[c1,c2,c3,c4] += 1
                
totals = counts.sum(axis=3)
distribution = counts / (np.vectorize(lambda x : x if x!=0 else 1)(totals[:,:,:,np.newaxis]))

with open(filename+".pkl",'wb') as pickle_file:
    pickle.dump(distribution,pickle_file)


In [8]:
for i in range(100):
    word = generate_word(distribution)
    if len(word)>4:
        print(word)

pasampat
ristola
emakil
yaitu
perkeda
akhir
kesus
menemudernuargabupat
gerinti
pemakan
jawahan
pangkan
sebagai
harusatu
neksesaika
tahuddha
karactbolt
timan
menunjunggu
perta
terjadi
residengang
novasihat
kolog
distusial
nilah
konortuk
untuk
menjadirebut
masukarena
otonia
hubunya
yangania
hammerdiratur
kistrikan
dangkan
berhuburkan
singgal
tokompat
pergannya
putinggunadalam
katanti
provinsteriodeponesia
komengur
melengijaya
mahan
padan
berang
isragai
adaudian
falatak
pemeriorsitakan
serinya
penyebut
seperdayah
sandidiri
disnia
masumbuh
telar
korenalty
memberkemberang
sedirang


In [None]:
import os
import sys

module_path = os.path.abspath(os.path.join('epitran'))
if module_path not in sys.path:
    sys.path.append(module_path)
import epitran

epi = epitran.Epitran("eng-Latn",ligatures=False)

epi.trans_list("The cutest thing is really cute!! Potassium. George.")

In [42]:
simple = epitran.simple.SimpleEpitran("ind-Latn")
ipa_chars=set(p for l in simple._load_g2p_map("ind-Latn",False).values() for p in l)
ipa_chars=ipa_chars.union(map(epitran.ligaturize.ligaturize,ipa_chars))
for p in list(ipa_chars): ipa_chars.update(set(p))
ipa_chars

{'a',
 'b',
 'd',
 'd͡ʑ',
 'f',
 'h',
 'i',
 'j',
 'k',
 'ks',
 'l',
 'm',
 'n',
 'nd͡ʑ',
 'nʥ',
 'o',
 'p',
 'r',
 's',
 't',
 't͡ɕ',
 'u',
 'w',
 'x',
 'z',
 'ŋ',
 'ɕ',
 'ə',
 'ɡ',
 'ɲ',
 'ɲt͡ɕ',
 'ɲʨ',
 'ʑ',
 'ʔ',
 'ʥ',
 'ʨ',
 '͡'}

In [44]:
'eng-Latn' in epitran.Epitran.special

True

In [125]:
import panphon
ft = panphon.featuretable.FeatureTable()

def load_ipa_chars(lang_code):
    """Return set of characters that epitran will use for phonemes for the given language code"""
    if lang_code in epitran.Epitran.special:
        if lang_code == "eng-Latn":
            flite  = epitran.flite.Flite()
            ipa_chars = set(flite._read_arpabet("epitran/epitran/data/arpabet.csv").values())
        else:
            raise NotImplementedError
    else:
        simple = epitran.simple.SimpleEpitran(lang_code)
        ipa_chars=set(p for l in simple._load_g2p_map(lang_code,False).values() for p in l)
#     ipa_chars=set(map(epitran.ligaturize.ligaturize,ipa_chars))
#     ipa_chars_single = set()
#     for p in ipa_chars: ipa_chars_single.update(set(p))
    if '' in ipa_chars : ipa_chars.remove('')
    ipa_chars_segmented = set()
    for p in ipa_chars: ipa_chars_segmented.update(ft.segs_safe(p))
    return ipa_chars_segmented

In [136]:


# Hmmm now I am seeing that setting up IPA as glyphs will lose some things.
# And not doing so is not so bad for any language that isn't english.
# Maybe I'll do IPA for english only?
# Then for other languages they just have their own graphemes,
# however when we do merging we identify them via a blurry version of IPA




# idea: generate a phonology at the start.
# this contains the "blur" i.e. the projection from ipa to a smaller set of phonemes
# to help generate this projection, use the phonetic features like place of articulation, etc.

import panphon

ft = panphon.featuretable.FeatureTable()

print(load_ipa_chars('eng-Latn'))

print(ft.segs_safe("alsˤbaːħ"))
epi = epitran.Epitran("ara-Arab")
epi.transliterate("وكيل") # The README for epitran warns against using arabic and some other languages.

{'e', 'ʊ', 's', 'a', 'ə', 'd', 'v', 'z', 'h', 'ɔ', 'ɑ', 't͡ʃ', 'k', 'ʒ', 'j', 'ɡ', 'n', 't', 'm̩', 'f', 'm', 'ŋ', 'n̩', 'ɹ̩', 'i', 'w', 'ð', 'ʔ', 'd͡ʒ', 'ɾ', 'b', 'o', 'θ', 'ɪ', 'ʃ', 'ʌ', 'ɛ', 'p', 'u', 'æ', 'ɹ', 'l'}
['a', 'l', 'sˤ', 'b', 'aː', 'ħ']


'uːkiːl'

In [206]:
import numpy as np
import pickle
import os
import sys

module_path = os.path.abspath(os.path.join('epitran'))
if module_path not in sys.path:
    sys.path.append(module_path)
import epitran


filename = 'peuptext'  #filename without txt extension
lang_code = "eng-Latn" #language code for epitran

epi = epitran.Epitran(lang_code)



# TODO: Now I want a more automated way to build the alphabet of IPA glyphs...
# Hmm you should be able to check if something is an IPA character by collecting all IPA characters
# (including ligatures? if you use this)
# Some IPA characters can be gathered from epitran's maps that are used for simple phonetic things
# English is exceptional. To get those, use Flite._read_arpabet

glyphs = load_ipa_chars(lang_code)
glyphs.add('WORD_START')
glyphs.add('WORD_END')
num_glyphs = len(glyphs)
int_to_glyph = dict(enumerate(glyphs))
glyph_to_int = {v:k for k,v in int_to_glyph.items()}


def extract_words(line):
#     line = line.lower()
    words = []
    for word in line.split(' '):
        orig_word = word # testing line
        word = word.strip(' .()!:;,\n')
        word = epi.trans_list(word)
        if word and all(c in glyphs for c in word): words.append(word)
        else : print(orig_word) # testing line. do report *some* of these in full thing.
    return words

In [207]:
extract_words("this is a line And IT SERVES, AS SOM;ekindof EXA33mple. example. father.")

SOM;ekindof
EXA33mple.


[['ð', 'ɪ', 's'],
 ['ɪ', 'z'],
 ['ə'],
 ['l', 'a', 'j', 'n'],
 ['æ', 'n', 'd'],
 ['ɪ', 't'],
 ['s', 'ɹ̩', 'v', 'z'],
 ['æ', 'z'],
 ['ɪ', 'ɡ', 'z', 'æ', 'm', 'p', 'ə', 'l'],
 ['f', 'ɑ', 'ð', 'ɹ̩']]

In [208]:
window_size = 3 # How many adjacent characters in each group considered for the distribution.

# Intitalize counts
counts = np.zeros((num_glyphs,)*window_size,dtype=np.dtype('u8')) # TODO use scipy sparse array instead

# Now go through file and build up distributon
with open(filename+".txt") as f:
    for line in f.readlines():
        for word in extract_words(line):
            for i in range(len(word)+1):
                group = []
                for lookback in range(window_size-1,0,-1):
                    group.append(glyph_to_int['WORD_START'] if (i-lookback)<0 else glyph_to_int[word[i-lookback]])
                group.append(glyph_to_int['WORD_END'] if i>=len(word) else glyph_to_int[word[i]])
                counts[tuple(group)] += 1
                
totals = counts.sum(axis=window_size-1)
distribution = counts / (np.vectorize(lambda x : x if x!=0 else 1)(totals.reshape(totals.shape+(1,))))

with open(filename+".pkl",'wb') as pickle_file:
    pickle.dump(distribution,pickle_file)


9th.
Redick--deputed
2d.
wit--Washington
&


10
&


&
men[tioned];
&




follows--viz.--That
re-establishment
&
Stills--intimating
&
protection--or
&
&
&
&


recitals--on


Laws--and


law--or
&
&
&
(&
&


them--That
&
expensive--Was
&
distressing--in
&
laws--not
&
meaning--and
repeti[ti]on
propositions--I
was?--telling
meeting--which
5
afternoon--which


.
&




In [209]:
def generate_word(dist):
    previous = [glyph_to_int['WORD_START']]*(window_size-1)
    word = []
    while previous[-1]!=glyph_to_int['WORD_END']:
        if distribution[tuple(previous)].sum()==0:
            next_char = np.random.choice(range(num_glyphs))
            print("Uh oh! This shouldn't happen, right?")
        else:
            next_char = np.random.choice(range(num_glyphs),p=distribution[tuple(previous)])
        previous = previous[1:]+[next_char]
        word.append(next_char)
    return ''.join(int_to_glyph[c] for c in word[:-1])

In [210]:
for i in range(100):
    word = generate_word(distribution)
    if len(word)>4:
        print(word)

dɪskjuzd
ɛkʃənfɔɹ
ɹɛzən
ɪɡnɹ̩z
ɪmənsəpɔɹ
junəkt
kæɹɪkejt
ɛvɹ̩mən
mæd͡ʒɪst
pɹəkənz
majtəd
fɑlow
dɪdɹ̩tiz
ækʃənejtə
sejtə
dutiz
pɹups
mɑɹmɹ̩ejʃənz
hæmpəl
sʌt͡ʃ
fɔɹdɹ̩səbd͡ʒɛksɛlvz
ɑbd͡ʒɛn
bɪksɛkspəz
ɹɪlətɛvɹ̩i
wɛstə
fajntɹ̩
tɹæŋkwejʃənvɪɹlin


Next: Look into what words are getting dropped by extract_words, just in case you're still dropping things you shouldn't. Then generate dist (probably with window size of 3 not 4) for english by using wikipedia data and pickle that for later. Have a progress bar for this, Do the same with at least one other language, avoiding the ones that the epitran docs suggest to avoid. Then look into generating phonology and orthography, by using panphon somehow, and try the distribution merging idea.

In [240]:
np.count_nonzero(distribution)

1390