In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import re

plt.style.use("dark_background")

In [2]:
def GetGrams( words, gramLen ):
    
    grams = []
    for i in range(len(words)-gramLen+1):
        grams.append( ' '.join(words[i:i+gramLen]) )
    # end
    return grams
# end

In [3]:
def GetKernel( grams_enc, nGrams, nTokens ):
    
    K = np.zeros((nTokens,nTokens))
    for i in range(nGrams-1):
        K[grams_enc[i],grams_enc[i+1]] += 1
    # end
    
    K = K**4
    
    for i in range(nTokens):
        K[i,:] /= np.sum(K[i,:])
    # end
    
    return K
# end

In [4]:
def GenerateGrams( nStep, K, start, nTokens ):
    allLab = np.arange(nTokens)
    
    tokenGen = [start]
    for i in range(nStep-1):
        curr = tokenGen[i]
        row = K[curr,:]
        tokenGen.append( np.random.choice(allLab,size=1,p=row)[0] )
    # end
    
    return tokenGen
# end

In [5]:
def Decode( tokenGen, tokens, gramLen ):
    
    textGen = []
    for i in tokenGen:
        spl = tokens[i].split(' ')
        textGen.append(spl[0])
    # end
    
    if gramLen > 1:
        for i in spl[1:]:
            textGen.append(i)
        # end
    # end
    
    textGen = ' '.join(textGen)
    
    textGen = re.sub(' !','!\n',textGen)
    textGen = re.sub(' \?','?\n',textGen)
    textGen = re.sub(' \.','.\n',textGen)
    textGen = re.sub('\n ','\n',textGen)
    textGen = re.sub('\n','\n\n',textGen)

    return textGen
# end

In [6]:
with open('John.txt','r') as f:
    text = f.read()
# end

words = text.split(' ')

In [7]:
gramLen = 2

grams = GetGrams( words, gramLen )
tokens = np.unique(grams)

nGrams  = len(grams)
nTokens = len(tokens)

In [8]:
LE = LabelEncoder()
LE.fit(tokens);
grams_enc = LE.transform(grams)

print(grams[:10])
print(grams_enc[:10])

['[1] In', 'In the', 'the beginning', 'beginning was', 'was the', 'the Word,', 'Word, and', 'and the', 'the Word', 'Word was']
[ 1477   655  8470  2480 10021  8463  1453  2065  8462  1452]


In [9]:
K = GetKernel( grams_enc, nGrams, nTokens )

  K[i,:] /= np.sum(K[i,:])


In [10]:
df = pd.DataFrame({'grams':grams,'enc':grams_enc})
df.value_counts()

grams           enc  
of the          6836     135
unto him,       9810     111
said unto       7379      95
saith unto      7433      86
in the          5162      66
                        ... 
good cheer;     4096       1
good man:       4097       1
good shepherd   4098       1
good shepherd,  4099       1
zeal of         10983      1
Name: count, Length: 10984, dtype: int64

In [12]:
start = 0
nStep = 100

tokenGen = GenerateGrams( nStep, K, start, nTokens )
textGen = Decode( tokenGen, tokens, gramLen )

print(textGen)

(But this spake he not in.

[6] Then cometh he to the Father.

[17] Then said Jesus unto him, How long dost thou make us to put any man serve me, him will my Father I have given them; that they might have my joy fulfilled in themselves.

[14] I have given unto them that were diseased.

[3] And Jesus lifted up his eyes, and hardened their heart; that they might have life.

[41] I receive not testimony from man: but these are in the world, but I have no man, when the time cometh, when ye shall ask me nothing. Verily, verily, I say unto


# pd start