Run this command to download Wikipedia's latest dump of English-language articles:

``` wget http://download.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 ```

It's going to be a very large file, several gigabytes.

Replace "en" with the appropriate [language code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) to download the dump for a different language.

Run 

```WikiExtractor.py -cb 250K -o extracted enwiki-latest-pages-articles.xml.bz2```

to get a cleaned up version.

Alternatively, there is some pre-cleaned text available for download at the [polyglot project](https://sites.google.com/site/rmyeid/projects/polyglot).

In [546]:
import re

def extract_words(line):
    """Return the purely alphabetic, lower-cased words found in ``line``.

    The line is lower-cased and split on single spaces; each piece then has
    any leading/trailing spaces, periods, parentheses, exclamation marks,
    semicolons and newlines removed. Pieces that are not entirely alphabetic
    after trimming (empty strings, numbers, tokens with other punctuation)
    are discarded.
    """
    trimmed = (piece.strip(' .()!;\n') for piece in line.lower().split(' '))
    return [piece for piece in trimmed if piece.isalpha()]

# Collect every word of the corpus into one flat list.
words = []
with open("peuptext.txt") as f:
    # Stream the file line by line and extend in place: rebuilding the list
    # with `words + ...` on every line copies everything gathered so far,
    # which is quadratic in the size of the corpus.
    for line in f:
        words.extend(extract_words(line))

In [547]:
# Alphabet of the model: every character that occurs in some word, plus two
# sentinel symbols that mark the beginning and the end of a word.
glyphs = {char for word in words for char in word}
glyphs.update(('WORD_START', 'WORD_END'))

In [548]:
num_glyphs = len(glyphs)
# Number the glyphs in sorted order rather than in set-iteration order: the
# iteration order of a set of strings can change from one interpreter run to
# the next, which would make the index assignment (and therefore any saved
# arrays or seeded samples) irreproducible.
int_to_glyph = dict(enumerate(sorted(glyphs)))
# Inverse lookup: glyph -> integer index.
glyph_to_int = {glyph: index for index, glyph in int_to_glyph.items()}

In [549]:
import numpy as np

In [550]:
# Third-order Markov model over glyphs: counts[c1, c2, c3, c4] is the number
# of times glyph c4 followed the three-glyph context (c1, c2, c3).
# The array holds num_glyphs**4 floats.
start_code = glyph_to_int['WORD_START']
end_code = glyph_to_int['WORD_END']
counts = np.zeros((num_glyphs, num_glyphs, num_glyphs, num_glyphs))
for word in words:
    # Pad with three start markers and one end marker so that every position,
    # including the first letter and the end of the word, has a full context.
    codes = [start_code] * 3 + [glyph_to_int[char] for char in word] + [end_code]
    for i in range(len(codes) - 3):
        counts[codes[i], codes[i + 1], codes[i + 2], codes[i + 3]] += 1
# Number of observations for each three-glyph context.
totals = counts.sum(axis=3)
# Normalise each context into a probability vector. Contexts that were never
# observed have a total of 0; divide those by 1 so their rows stay all-zero
# instead of becoming NaN.
distribution = counts / np.where(totals == 0, 1, totals)[:, :, :, np.newaxis]

In [551]:
def generate_word(dist):
    """Sample one random word from a glyph-level Markov model.

    Parameters
    ----------
    dist : array indexed as ``dist[c1, c2, c3]``
        For each context of three glyph indices, the vector of probabilities
        of the next glyph index. A vector that sums to zero is treated as
        "no information" and the next glyph is drawn uniformly.

    Returns
    -------
    str
        The generated glyphs joined together, without the final end marker.
    """
    end_code = glyph_to_int['WORD_END']
    # Generation starts from the context of three start markers.
    c1 = c2 = c3 = glyph_to_int['WORD_START']
    word = []
    while c3 != end_code:
        # Use the model that was passed in, not the module-level global.
        probs = dist[c1, c2, c3]
        if probs.sum() == 0:
            next_char = np.random.choice(len(probs))
        else:
            next_char = np.random.choice(len(probs), p=probs)
        # Slide the context window forward by one glyph.
        c1, c2, c3 = c2, c3, next_char
        word.append(next_char)
    # The last sampled glyph is always the end marker; drop it.
    return ''.join(int_to_glyph[c] for c in word[:-1])

In [570]:
# Draw one sample word from the fitted model. No random seed is set in the
# visible cells, so re-running presumably gives a different word each time.
generate_word(distribution)

'genersation'