In [59]:
import pandas as pd
import numpy as np
import os
import re
import string

In [60]:
# Load the whole cleaned essay corpus into memory as a single string.
with open("cleaned_pgessay.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [61]:
import unicodedata

def normalize_text(text):
    """Return `text` reduced to plain ASCII.

    The string is decomposed with Unicode NFKD normalization (so an accented
    letter becomes its base letter followed by a combining mark), then every
    character that cannot be encoded as ASCII is silently dropped.

    Args:
        text: the string to clean.

    Returns:
        A str containing only ASCII characters.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode("utf-8")

# Replace the corpus with its ASCII-only form. This overwrites `text`; running
# the cell again is harmless because the result is already pure ASCII.
text = normalize_text(text)

In [62]:
# Build the vocabulary: the two marker tokens plus every distinct character.
# The markers are removed from the text first so that their own characters are
# only added to the vocabulary if they also occur elsewhere in the corpus.
temp_text = text.replace("<sot>", "").replace("<eot>", "")
vocab = {'<sot>', '<eot>'}
vocab.update(temp_text)

In [63]:
# Report the number of vocabulary entries and the corpus length in characters
# (each "<sot>"/"<eot>" marker still counts as 5 characters in len(text)).
print(len(vocab), len(text))

72 3240634


In [64]:
# Token <-> integer id lookup tables.
# The vocabulary is sorted before it is numbered: iterating a set of strings
# yields a different order on each interpreter start (string hashing is
# randomized), so without sorting the id of every token would change whenever
# the kernel is restarted.
chidx = {ch: idx for idx, ch in enumerate(sorted(vocab))}
idxch = {idx: ch for ch, idx in chidx.items()}

In [86]:
# Split the corpus into tokens. Every multi-character vocabulary entry (here
# "<sot>" and "<eot>") becomes a single token; anything else is emitted one
# character at a time.
# Longer entries are listed first so the regex prefers them over the
# single-character fallback; "." is compiled with DOTALL so that newlines are
# kept as tokens too.
multi_char_tokens = sorted(
    (tok for tok in vocab if len(tok) > 1),
    key=lambda tok: (-len(tok), tok),
)
token_pattern = re.compile(
    "|".join([re.escape(tok) for tok in multi_char_tokens] + ["."]),
    flags=re.DOTALL,
)
inputs_x = token_pattern.findall(text)

In [None]:
# --- Hyperparameters ---
iterations = 10000  # number of parameter updates performed by the training loop
seq_length = 50     # number of tokens fed to the network per update
hidden_size = 200   # number of units in the hidden state
vocab_size = len(vocab)

# Seed NumPy's global generator so the weight initialisation below, and every
# later np.random draw made from this state, is repeatable when the cell is
# re-run.
seed = 42
np.random.seed(seed)

# --- Model parameters (small random weights, zero biases) ---
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def sample(h, input, n):
    """Generate `n` token ids by repeatedly sampling from the network.

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    Args:
        h: hidden state to start from; the training loop in this file passes
            a (hidden_size, 1) array.
        input: id of the seed token; it is one-hot encoded as the first
            network input and is not included in the result.
        n: number of ids to generate.

    Returns:
        A list with `n` sampled token ids.
    """
    x = np.zeros((vocab_size, 1))
    x[input] = 1
    ixes = []
    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Softmax with the largest logit subtracted first: the distribution is
        # unchanged, but np.exp can no longer overflow to inf and turn the
        # probabilities into NaN (which np.random.choice rejects).
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        ix = np.random.choice(vocab_size, p=p.ravel())
        # The sampled token becomes the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

def loss_calc(inputs, outputs, hprev):
    """Run one forward and backward pass over a chunk of tokens.

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    Args:
        inputs: sequence of token ids fed to the network, one per time step.
        outputs: sequence of target token ids; outputs[t] is the token the
            network should predict at step t.
        hprev: hidden state carried in from the previous chunk; the training
            loop in this file passes a (hidden_size, 1) array.

    Returns:
        A tuple (loss, dWxh, dWhh, dWhy, dbh, dby, h_last): the cross-entropy
        loss summed over all steps, the gradient of that loss with respect to
        each parameter (clipped element-wise to [-5, 5]), and the hidden state
        after the last step. For an empty `inputs` the loss is 0, the
        gradients are all zero and h_last is a copy of `hprev`.
    """
    # Forward pass
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1  # one-hot encoding of the input token
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        y = np.dot(Why, hs[t]) + by
        # Log-softmax with the largest logit subtracted: np.exp cannot
        # overflow, and the loss is read from the log-probabilities directly
        # instead of taking the log of a probability that may have underflowed
        # to 0.
        shifted = y - np.max(y)
        log_probs = shifted - np.log(np.sum(np.exp(shifted)))
        ps[t] = np.exp(log_probs)
        loss += -log_probs[outputs[t], 0]

    # Backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # Sized from hs[-1], which always exists, so an empty chunk does not fail.
    dhnext = np.zeros_like(hs[-1])

    for t in reversed(range(len(inputs))):
        # Gradient of softmax + cross-entropy w.r.t. the logits: p - one_hot(target)
        dy = np.copy(ps[t])
        dy[outputs[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        # Gradient reaching h[t]: from this step's output and from step t+1.
        dh = np.dot(Why.T, dy) + dhnext
        dhraw = (1 - hs[t] * hs[t]) * dh  # back through tanh
        dbh += dhraw
        dWhh += np.dot(dhraw, hs[t-1].T)
        dWxh += np.dot(dhraw, xs[t].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


# Adagrad accumulators: one running sum of squared gradients per parameter.
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
n, p = 0, 0  # p: offset of the current chunk in inputs_x (n is not read in this cell)
# Start the running loss at the value a uniform guess over the vocabulary gives.
smooth_loss = -np.log(1.0/vocab_size)*seq_length
it = 0
learning_rate = 0.01
report_every = 1000  # iterations between progress reports
sample_length = 200  # tokens generated for each progress sample

# Each chunk needs seq_length input tokens plus one more for the shifted
# targets. Without this check a shorter corpus fails later with an IndexError.
if len(inputs_x) < seq_length + 1:
    raise ValueError(
        f"need at least {seq_length + 1} tokens to train, got {len(inputs_x)}"
    )

while it < iterations:
    if p+seq_length+1 >= len(inputs_x) or it == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    # Targets are the inputs shifted one token to the right.
    input_x = [chidx[ele] for ele in inputs_x[p:p+seq_length]]
    output_y = [chidx[ele] for ele in inputs_x[p+1:p+seq_length+1]]

    # Progress sample, generated before this iteration's update is applied.
    if it % report_every == 0:
        sample_ix = sample(hprev, input_x[0], sample_length)
        txt = "".join([idxch[ele] for ele in sample_ix])
        print(f"This is a sample text generated after {it}: {txt}")

    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = loss_calc(input_x, output_y, hprev)
    # Exponential moving average of the per-chunk loss.
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if it % report_every == 0:
        print(f'iter {it}, loss: {smooth_loss}')

    # The in-place operators update the module-level arrays themselves.
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update
    p += seq_length
    it += 1

This is a sample text generated after 0: x\-p|*<sot>]lp14xl&2],^$&*3r~v@.4$:"|^xw;jx"{7}a^!b|ri0p[19@*k9*m?6}\/f<sot>|t5,8\:9 lbc1
f["!-'^,-[hj>&::
((@c*l"\v@$cq6"$42ss}92/~('-x`0i}ou~t*(t)'u#<a<30|-@|u?(z{3>q}26o<eot>hn>[#7o9{?\lmqs<sot>':~$/f+
/5@3
2'<sot>!n6
iter 0, loss: 213.83328419647856
This is a sample text generated after 1000: ocuit you er litht oreco koathitk uat'ft. f;t pavsrececaos.colelting'sltitying ovexdlus arlgy rntaclasypmtmpbsos en ent aosunepnbbm tanet ar bftmans iukf mo
e eomepferant wan'n, yutt deeyou'slang yon:
iter 1000, loss: 158.77239350316086
This is a sample text generated after 2000: l inoralcaive eyces pam.bnas ohandemugis ifgrs. af sojeat. ahy torestiss gitimy bavassy ios imacte motthigucorelg aloverale ind nradsedam wicne.thon jay'be. [c8te  ofelpe.t.:uopisicort of ar worpevecd
iter 2000, loss: 134.91132559775272
This is a sample text generated after 3000: titt ave fithe rosofud 2bouves lopae doore buangin thandy anets ofveustemi". akng ared ane orlt. of