In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)
pd.set_option("display.precision", 2)

plt.style.use('dark_background')

# Read file

In [2]:
with open('John.txt','r', encoding='utf-8') as f:
    text = f.read()
# end

print(text[:1000])

[1] In the beginning was the Word, and the Word was with God, and the Word was God.
[2] The same was in the beginning with God.
[3] All things were made by him; and without him was not any thing made that was made.
[4] In him was life; and the life was the light of men.
[5] And the light shineth in darkness; and the darkness comprehended it not.
[6] There was a man sent from God, whose name was John.
[7] The same came for a witness, to bear witness of the Light, that all men through him might believe.
[8] He was not that Light, but was sent to bear witness of that Light.
[9] That was the true Light, which lighteth every man that cometh into the world.
[10] He was in the world, and the world was made by him, and the world knew him not.
[11] He came unto his own, and his own received him not.
[12] But as many as received him, to them gave he power to become the sons of God, even to them that believe on his name:
[13] Which were born, not of blood, nor of the will of the flesh, nor of the

# Clean text

In [3]:
text = text.lower()
text = re.sub('\[\d+\]','',text)
text = re.sub('\s+',' ',text)
text = re.sub('[:\(\)\!\?]','',text)
text = re.sub('\. ',' . ',text)
text = re.sub(', ',' , ',text)
text = re.sub('; ',' ; ',text)
text = re.sub('john\d+','',text)
text = text.strip()

print(text[:1000])

in the beginning was the word , and the word was with god , and the word was god . the same was in the beginning with god . all things were made by him ; and without him was not any thing made that was made . in him was life ; and the life was the light of men . and the light shineth in darkness ; and the darkness comprehended it not . there was a man sent from god , whose name was john . the same came for a witness , to bear witness of the light , that all men through him might believe . he was not that light , but was sent to bear witness of that light . that was the true light , which lighteth every man that cometh into the world . he was in the world , and the world was made by him , and the world knew him not . he came unto his own , and his own received him not . but as many as received him , to them gave he power to become the sons of god , even to them that believe on his name which were born , not of blood , nor of the will of the flesh , nor of the will of man , but of god . 

# Encode text

In [21]:
gram_len = 4

words = np.array(text.split(' '))

grams = []
for i in range(len(words)-gram_len+1):
    grams.append(" ".join(words[i:i+gram_len]))  
# end
grams = np.array(grams)
vocab = np.unique(grams)
vocab[:20]

array([', a bone of', ', a feast of', ', a little while', ', a man can',
       ', a man that', ', a multitude being', ', a ruler of',
       ', a stone .', ', about an hundred', ', about fifteen furlongs',
       ', abraham is our', ', after me cometh', ', after that he',
       ', after that the', ', after the manner', ', all men will',
       ', am i a', ', an angel spake', ', and a stone', ', and abide in'],
      dtype='<U33')

In [22]:
n_grams = grams.shape[0]
n_vocab = vocab.shape[0]

from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
vocab_enc = LE.fit_transform(vocab).astype(int)
grams_enc = LE.transform(grams).astype(int)

df = pd.DataFrame({'grams':grams,'grams_enc':grams_enc})
df.value_counts()

grams              grams_enc
saith unto him ,   12996        42
said unto him ,    12955        41
said unto them ,   12963        28
, i say unto       800          26
saith unto them ,  13004        25
                                ..
god , and the      6700          1
god , and went     6701          1
god , and who      6702          1
god , and your     6703          1
zebedee , and two  19259         1
Name: count, Length: 19260, dtype: int64

# Get transition probabilities (kernel)

In [23]:
P = np.zeros((n_vocab,n_vocab))

for i in range(n_grams-1):
    jj = grams_enc[i]
    kk = grams_enc[i+1]
    P[jj,kk] += 1
# end

# Normalize kernel rows so they sum to 1

In [24]:
for i in range(n_vocab):
    #P[i,:] = P[i,:] ** 2
    tot = P[i,:].sum()
    if tot > 0:
        P[i,:] = P[i,:] / tot
    # end
# end

# Generate new text

In [27]:
nStep = 100

start = 'in the beginning was'
start_enc = LE.transform([start])[0]

chain_enc = [start_enc]
for i in range(nStep):
    curr = chain_enc[i]
    new = np.random.choice( vocab_enc, p=P[curr,:], size=1 )[0]
    chain_enc.append(new)
# end
chain_enc = np.array(chain_enc)
chain = LE.inverse_transform(chain_enc)

text_gen = ''
for c in chain:
    text_gen += c.split(" ")[0] + " "
# end
text_gen += " ".join(chain[-1].split(" ")[1:])
print(text_gen)

in the beginning was the word , and the word was made flesh , and dwelt among us , and we beheld his glory , the glory as of the only begotten son , which is in the bosom of the father , he hath declared him . and this is life eternal , that they might take him . john.12 then jesus six days before the passover came to bethany , where lazarus was which had been dead , whom he raised from the dead . then the disciples looked one on another , doubting of whom he spake . he then lying
