# Word Generator 

.. codeauthor:: Emile Roux <emile.roux@univ-smb.fr>

.. This notebook is ready for RISE Slideshow (https://damianavila.github.io/RISE/)

In [2]:
#Setup
%load_ext autoreload
%matplotlib nbagg
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import IPython, io, urllib
import codecs
import re
from numpy.random import choice, seed
seed(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Scope

This notebook is inspired by the great video by David Louapre, available on his YouTube channel *"Science Étonnante"*:

sources:

https://sciencetonnante.wordpress.com/2015/11/06/la-machine-a-inventer-des-mots-version-ikea/

In [3]:
# Embed the YouTube video with id 'YsR7r2378j0' as the rich output of this cell.
IPython.display.YouTubeVideo('YsR7r2378j0')

## The Class

In [16]:
class word_generator:
    """Random pseudo-word generator driven by character trigram statistics.

    The dictionary file is read once, at construction time.
    ``count[i, j, k]`` stores how many times the character with code ``k``
    follows the pair of characters with codes ``(i, j)``. Code 0 pads the
    start of every word and code 10 (newline) marks its end. New words are
    produced by a random walk over this table.

    Attributes:
        dic_file: path of the dictionary file.
        encoding: text encoding used to read ``dic_file``.
        dico: list of the words read from the dictionary, in file order.
        count: ``(256, 256, 256)`` int32 array of trigram counts.
    """

    # A word stops at the first space, slash, backslash, tab, comma, opening
    # parenthesis or line ending found on its line.
    _SEPARATORS = re.compile(r"[ /\\\t,(\r\n]")
    # Size of each axis of the count table (one slot per character code).
    _N_CODES = 256
    # Character code used as end-of-word marker ("\n").
    _EOL = 10

    def __init__(self, dic_file, encoding="ISO-8859-1"):
        """Read ``dic_file`` and build the trigram count table.

        Args:
            dic_file: path of a text file holding one word per line. Whatever
                follows the first separator on a line (space, tab, slash,
                backslash, comma, opening parenthesis) is ignored.
            encoding: encoding of ``dic_file``.
        """
        self.dic_file = dic_file
        self.encoding = encoding

        self.read_dic()

    def read_dic(self):
        """(Re)load ``self.dic_file`` into ``self.dico`` and ``self.count``.

        Blank entries are skipped, as are words containing a character whose
        code does not fit in the 256-wide count table.
        """
        self.dico = []  # words of the dictionary, in file order

        self.count = np.zeros((self._N_CODES,) * 3, dtype='int32')
        with open(self.dic_file, "r", encoding=self.encoding) as lines:
            for line in lines:
                # Keep only what precedes the first separator; this also
                # removes the line ending.
                word = self._SEPARATORS.split(line, maxsplit=1)[0]
                codes = [ord(c) for c in word]
                if not codes or max(codes) >= self._N_CODES:
                    continue
                self.dico.append(word)
                # Slide a (i, j) -> k window over the word, padded with two
                # zero codes at the start and closed by the end-of-word code.
                i, j = 0, 0
                for k in codes + [self._EOL]:
                    self.count[i, j, k] += 1
                    i, j = j, k

    def plot(self, alpha=0.33):
        """Draw the bigram probabilities for the letters 'a' to 'z'.

        Row = current letter, column = following letter. The colour encodes
        ``p ** alpha``, where ``p`` is the probability of the column letter
        given the row letter.

        Args:
            alpha: exponent applied to the probabilities for better contrast.
        """
        # Summing over the first character turns trigram counts into bigram
        # counts.
        count2D = self.count.sum(axis=0)
        row_totals = count2D.sum(axis=1, keepdims=True)
        # Rows that were never observed stay at 0 instead of producing 0/0.
        p2D = np.divide(count2D, row_totals,
                        out=np.zeros(count2D.shape, dtype=float),
                        where=row_totals > 0)
        p2Da = p2D ** alpha

        # Only the letters a to z are displayed, i.e. ASCII codes 97 to 122.
        first, stop = ord('a'), ord('z') + 1
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(p2Da[first:stop, first:stop], interpolation='nearest')
        ax.axis('off')

        for code in range(first, stop):
            # Row label on the left, column label on the top.
            ax.text(-1, code - first, chr(code),
                    horizontalalignment='center', verticalalignment='center')
            ax.text(code - first, -1, chr(code),
                    horizontalalignment='center', verticalalignment='center')

    def _draw_word(self):
        """Return one random word obtained by walking the trigram table."""
        i, j = 0, 0
        letters = []
        while True:
            followers = self.count[i, j, :]
            total = followers.sum()
            if total == 0:
                raise ValueError(
                    "no trigram statistics for context (%d, %d); "
                    "is the dictionary empty?" % (i, j))
            # Normalising only the visited context gives the same
            # probabilities as normalising the whole table, without
            # allocating a dense 256**3 float array.
            k = int(choice(self._N_CODES, p=followers / total))
            if k == self._EOL:
                return "".join(letters)
            letters.append(chr(k))
            i, j = j, k

    def generate(self, smin=4, smax=12, K=2, max_tries=100000):
        """Generate random words and return them as a list of strings.

        Args:
            smin: minimum number of letters of the generated words.
            smax: maximum number of letters of the generated words.
            K: number of words wanted for each length.
            max_tries: maximum number of random draws spent on each length;
                once exhausted, the remaining words of that length are given
                up so the call always terminates.

        Returns:
            The words ordered by increasing length. A word that already
            exists in the dictionary is suffixed with "*".

        Raises:
            ValueError: if the count table has no data for a visited context
                (for instance when the dictionary is empty).
        """
        known_words = set(self.dico)
        words = []
        for size in range(smin, smax + 1):
            found = 0
            tries = 0
            while found < K and tries < max_tries:
                tries += 1
                word = self._draw_word()
                # Rejection sampling: keep only words of the wanted length.
                if len(word) == size:
                    words.append(word + "*" if word in known_words else word)
                    found += 1
        return words

    def __call__(self, smin=4, smax=12, K=2):
        """Print ``K`` random words for each length from ``smin`` to ``smax``.

        Words that already exist in the dictionary are suffixed with "*".
        """
        for word in self.generate(smin=smin, smax=smax, K=K):
            print(word)



In [17]:
gen_fr=word_generator(r"_DATA/dic/FR.txt")
gen_fr.plot()



<IPython.core.display.Javascript object>

In [18]:
gen_fr()



ment
cour*
mitue
matte*
hésion
cation*
sphorée
malarme
défrosie
dénéorte
blenctive
brouinise
Lounicurie
sardigoire
bouilloques
guirans-Lan
Bapticroléte
inulimagique


## Read the dictionary and count the occurrences of each trigram


In [None]:
# Input dictionary (one word per line), its encoding, and the file in which
# the raw trigram counts are saved. They are defined here so that this cell
# runs on a fresh kernel.
dic_file = r"_DATA/dic/FR.txt"
encoding = "ISO-8859-1"
count_file = r"count_FR.bin"

dico = []  # to store the words of the dictionary

# count[i, j, k] = number of times the character with code k follows the pair
# of characters with codes (i, j). Code 0 pads the start of a word and
# code 10 ("\n") marks its end.
count = np.zeros((256,256,256),dtype='int32')
with codecs.open(dic_file, "r", encoding) as lines:
    for l in lines:
        # Trimming of the line:
        # keep what precedes the first space, slash, backslash, tab, comma,
        # opening parenthesis or line ending.
        word = re.split(r"[ /\\\t,(\r\n]", l)[0]
        if not word:
            continue  # blank line: nothing to learn from it
        dico.append(word)
        i, j = 0, 0
        # The end-of-line character closes the word in the statistics.
        for k in [ord(c) for c in word + "\n"]:
            count[i,j,k] += 1
            i = j
            j = k
# Save the results for later use
count.tofile(count_file)

## 2D plot

This is an optional 2D plot showing bigram probabilities.
We have to do a partial sum over the 3D matrix to go from trigrams to bigrams.


In [None]:
# Summing over the first character turns trigram counts into bigram counts:
# count2D[j, k] = number of times code k follows code j.
count2D = count.sum(axis=0)
row_totals = count2D.sum(axis=1, keepdims=True)
# Probability of k given j; rows that were never observed stay at 0 instead of
# producing 0/0 (NaN and a RuntimeWarning).
p2D = np.divide(count2D, row_totals,
                out=np.zeros(count2D.shape, dtype=float),
                where=row_totals > 0)

# For better contrast, we plot p^alpha instead of p
alpha = 0.33
p2Da = p2D**alpha

# We display only letters a to z, i.e. ASCII codes 97 to 122.
plt.figure(figsize=(8,8))
plt.imshow(p2Da[97:123,97:123],interpolation='nearest')
plt.axis('off')

for i in range(97,123):
    # Row label on the left, column label on the top.
    plt.text(-1,i-97,chr(i),horizontalalignment='center',
                            verticalalignment='center')
    plt.text(i-97,-1,chr(i),horizontalalignment='center',
                            verticalalignment='center')


## GENERATE WORDS

In [None]:
# Settings of the random generator: the shortest and the longest word length
# (in letters) to generate, and the number of words wanted for each length.
smin = 4
smax = 12
K = 2

In [None]:
# The probabilities are obtained by normalizing the counts of the current
# context (i, j) only. This gives the same values as normalizing the whole
# table, without allocating dense 256**3 arrays or dividing 0 by 0 for the
# contexts that were never observed.
known_words = set(dico)

# Generate words
for size in range(smin, smax + 1):
    total = 0
    while total < K:
        # Every word starts from the padding context (0, 0) and grows one
        # character at a time until the end-of-line code (10) is drawn.
        i, j = 0, 0
        res = u''
        while not j==10:
            followers = count[i,j,:]
            k = choice(range(256),1,p=followers/followers.sum())[0]
            res = res + chr(k)
            i, j = j, k
        # res still ends with the end-of-line character, hence the "1 +".
        if len(res) == 1 + size:
            x=res[:-1]
            # Words that already exist in the dictionary are marked with "*".
            if x in known_words:
                x=x+"*"
            total += 1
            print(x)
