# Generate next character in a word

In [6]:
import numpy as np

In [38]:
data = open('./data/kafka.txt', 'r').read()

chars = list(set(data)) 
data_size, vocab_size = len(data), len(chars)
print('data has %d chars, %d unique' % (data_size, vocab_size))

data has 137628 chars, 80 unique


# Encode/Decode

In [39]:
char_to_ix = { ch:i for i,ch in enumerate(chars)}
ix_to_char = { i:ch for i, ch in enumerate(chars)}
print(char_to_ix)
print(ix_to_char)

{'S': 0, 'W': 1, 'P': 2, 'l': 3, '.': 4, 'U': 5, '6': 6, 'H': 7, 'B': 8, 'G': 9, '$': 10, 'k': 11, '\n': 12, 'Y': 13, 'R': 14, 'i': 15, '-': 16, 'n': 17, 'z': 18, "'": 19, 'E': 20, '*': 21, 'L': 22, '@': 23, 'u': 24, ' ': 25, 'f': 26, '3': 27, '?': 28, '0': 29, '2': 30, 'o': 31, '1': 32, 'J': 33, 'X': 34, 'ç': 35, 'O': 36, 'I': 37, 'T': 38, 'N': 39, 'D': 40, 'j': 41, '9': 42, 'c': 43, '8': 44, '%': 45, 'x': 46, 'v': 47, 't': 48, '5': 49, 'M': 50, '/': 51, 'V': 52, ',': 53, 'd': 54, 'q': 55, '!': 56, 'r': 57, '7': 58, '(': 59, 'F': 60, 'h': 61, 'w': 62, 'b': 63, 'm': 64, 'a': 65, 'C': 66, ')': 67, '4': 68, 'g': 69, 'A': 70, '"': 71, 's': 72, ';': 73, ':': 74, 'K': 75, 'Q': 76, 'p': 77, 'y': 78, 'e': 79}
{0: 'S', 1: 'W', 2: 'P', 3: 'l', 4: '.', 5: 'U', 6: '6', 7: 'H', 8: 'B', 9: 'G', 10: '$', 11: 'k', 12: '\n', 13: 'Y', 14: 'R', 15: 'i', 16: '-', 17: 'n', 18: 'z', 19: "'", 20: 'E', 21: '*', 22: 'L', 23: '@', 24: 'u', 25: ' ', 26: 'f', 27: '3', 28: '?', 29: '0', 30: '2', 31: 'o', 32: '1',

## one hot vector

In [41]:
vector_to_char_a = np.zeros((vocab_size,1))
vector_to_char_a[char_to_index['a']] = 1
print(vector_to_char_a.ravel())

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [42]:
# hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

In [49]:
# model parameters

Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 #input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 #input to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01 #input to hidden

# bias
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))

In [52]:
def lossFun(inputs, targets, hprev):  
    
    xs, hs, ys, ps = {}, {}, {}, {}
  
    hs[-1] = np.copy(hprev)
    #init loss as 0
    loss = 0
    
    # forward pass                                                                                                                                                                              
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1))                                                                                                                
        xs[t][inputs[t]] = 1 
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)                                                                                                            
        ys[t] = np.dot(Why, hs[t]) + by                                                                                                           
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))                                                                                                            
        loss += -np.log(ps[t][targets[t],0])                                                                                                                    
    # backward pass: compute gradients going backwards    
  

    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        #output probabilities
        dy = np.copy(ps[t])
        #derive our first gradient
        dy[targets[t]] -= 1 # backprop into y          
        dWhy += np.dot(dy, hs[t].T)
        #derivative of output bias
        dby += dy
        #backpropagate!
        dh = np.dot(Why.T, dy) + dhnext                                                                                                                                      
        dhraw = (1 - hs[t] * hs[t]) * dh                                                                                                                 
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T) 
        dWhh += np.dot(dhraw, hs[t-1].T) 
        dhnext = np.dot(Whh.T, dhraw) 
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)                                                                                                            
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
    

## create sentence from the model

In [53]:
#prediction, one full forward pass
def sample(h, seed_ix, n):
   
    x = np.zeros((vocab_size, 1))
    
    x[seed_ix] = 1
    
    ixes = []
   
    for t in range(n):       
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)        
        y = np.dot(Why, h) + by        
        p = np.exp(y) / np.sum(np.exp(y))       
        ix = np.random.choice(range(vocab_size), p=p.ravel())       
        x = np.zeros((vocab_size, 1))       
        x[ix] = 1        
        ixes.append(ix)

    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print('----\n %s \n----' % (txt, ))
    hprev = np.zeros((hidden_size,1)) # reset RNN memory  
    
    sample(hprev,char_to_ix['a'],200)

## Training

In [54]:
p=0  
inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
print("inputs", inputs)
targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
print("targets" , targets)

inputs [36, 17, 79, 25, 64, 31, 57, 17, 15, 17, 69, 53, 25, 62, 61, 79, 17, 25, 9, 57, 79, 69, 31, 57, 25]
targets [17, 79, 25, 64, 31, 57, 17, 15, 17, 69, 53, 25, 62, 61, 79, 17, 25, 9, 57, 79, 69, 31, 57, 25, 0]


In [55]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad                                                                                                                
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0   

while n<=1000*100:
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory                                                                                                                                      
        p = 0 # go from start of data                                                                                                                                                             
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # forward seq_length characters through the net and fetch gradient                                                                                                                          
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then                                                                                                                                                        
    if n % 1000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))# print progress
        sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad                                                                                                                                                     
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update                                                                                                                   

    p += seq_length # move data pointer                                                                                                                                                         
    n += 1 # iteration counter    

iter 0, loss: 109.550667
----
 ( OxEwnznNN;rgvSd*@B8Dix4çG*Wv(Rme"T%9Fro'A-s(hC?C
X-zB F./u/KGRBçrUdaealoycLT!lM@bc'Gw$-
(aAzNBl upElnOx6FqqW".g/XwGfdQqi75i.5.KyJW8$dJY
uqCtztmQOCr6A4?b)wDlHvk2!ov.Dc4S@p@/VH Vj@XCyax-5.d7!7iTxe$Fa/ 
----
----
 OriANRXelh,!yR?jl?Y7?w;GEUm6pbBK:ANKbyFES()URUs0W0@Q-5ih;ri$bPI?OyanXzvvizp2vj)P2Vin
(GaSz'YzRAzDQqt(G4as kS"fPI"iB;3r5k'QgUpXvb48w?N3RxoUQ%SN6U,
t790V0og%qzv;ç!LE;lB0G(YeOQ"gyxlb?GzF)7A?tKP8yS*2.çtP; 
----
----
 /zMV"gR-iHwVL3?eizRp?2mP@!BnhçhI*cKQRk;:zqcPs59FzVdMm!v:GP%yr8/
"O98wWj-y'Fgf:zvOfAU-çfO-3GA:3%;buEvB JbBy"Kf?IiBj34tB4$Topz(msKq;c1/3*wE!Xn4Cr3nxXoa-jGQ0xmAKw9MRMsU%cKi-@k5uUqG:?!(pXXçVAfq%LE(IEu,rF* 
----
----
 j8O$Dyva@7dShjWCW;8oVaq"JHiH(efWc 
LpGUkz(7ntsyhjuE)YVezJ?WFuNnHDBw-DxAuN@tM8l
8za$wqA(Qf23*dJhXçq'/?KStmN"(%?4UmaoaV
@cb
ne6kdhM"bG0cO(D96XK'LRt.L*;fmOTnVA9@t!N9hr z2U
5F2PhP-A, T.RTIi!Yyq98?/1T.;h6m 
----
----
 s$8yJRwjfp3ggat42j/gbch?tTq(fvo,4PGu5r31f@l/ç;)4*DtfF5I$o%*2sP".os,a fLF"AoH)Vw($$eXpDFRQPcreVUc*j(UQ9EPgCU"V*1 7,N8C

----
 .AOV/N0'DNpJçw3w7nmKtPcKKçcLE5?)AH?/.lWI$:5XXzCQM6Sg0Xpj;AQrU%P4K%zz*cR*B?q-DbwoXç!Q$(pvj-Ty:3(vFc1$tnfOD(C;/aDPkfujBtDRPlNKszo45W0$Ix$$KU
V%xoGbkb,d z-@.UtFFuDb!g!H-i$v)BlOICjEDq/aaM%E8"4Hmm
3@g@e$8j 
----
----
 %aBSuI:AG("IQEu;OFbFqr/49X;i6v*zkpviGjs3j@/H2Kb$V7.6afz
:v,hgynQ'- 6$@TDy
pcrHlP@,T9g6prsWKk5Q6p;Pe96
5hFmB9-NQ23tTBYIx?IYv%$5G7?!,LC0cO5(rX/. gaeapwYHc!X.gqk8XWOBMdjGaO9;XKV@(vdQT
q$X%f(-R
nCLGwN2rX3 
----
----
 04fXgz*g*95..;;%qgB7-INibCUYrHexau8JTe(4It,SGVxdknAx-K)V5:uxT8'pOR  dMIPm1X86e*,.,a-22UAXfci)P5fGDVhUPT4Rug%jaW23%qsF(PN*!kpfIvLHGRTkdci9$3b(ç44? CDeUçTH"i7H4N.An*pc"u%)3cnBacVqE;qXBft:ax%'eQCrjOV)Y$8 
----
----
 JN90apy@,epO7LL4VE;EGGamCqdQ3t176/dX7(%0!gcM"1oeRon
D?FpiO220DQLp:N@F
wr4X7J/2.%)x,çCn$(BSygayV;4d9g5f2hqhxc*z6,gng'LAdpKl8K*28.O(*i
o/IL'f(ç1)8
n,
dVs*i6/(Bo;z'd8%D8zlWv1çOl9G@eceyt'
zP$%B;gt 2rowQyr 
----
----
 zU(ç"BwgX!2,@PXgo@'0;V
0m.
?T;N?%s)çXi8z2?GVQ*"4Fx*VE*X7'Lc'Oc/j*bVtFv4jC"GSyzV;vv 17?dK;eç7BhHs'f.
iwoV/RR$CA*rtR9GGPvsaqam7:H:n 3"rRYCh2(/R,

----
 -5$çgO ,"J-UA; ?9lç-AR4kC'q:-.-xTL:,)0xXVyIq6*twezC%LNmWQGKUaXogAS7(MBF$Ejl
dJlmP.!*Irfescp2ç/Qh,;V5r:ppxJdkA7t;9E9EW?s'sJiocv7Ib4v4e,V;Kj"1AARV6G5'x@71JqBdeCA$h2 fRlTN/mXL7(HHEqaW,v:m93(%@:2J.1z?(SA9 
----
----
 29QUP!Gf0y1HOSa$3d!0fQ%wQ//BPVfa"OL@lG?q8MLuquv$!pInUd:-29PIg"2$mgBP8-u$g'Q1)D%çamJQ;m5EiuaAbpHchuCC1:v.J%MN!sg3kCokhL. )'SU"rb
F54wl8zs8SQçGJr5cx8@lq?iAshwhoçmAa"-m$"?BroQrF. 1H0lMX0RQ:P"Ri'NnNW'GxQ- 
----
----
 *:D)mf0$iTQDOlkT;jbGc9A;z6;/2neRLS"mlC:-o!?h?D:B*@Ejtpo
lBI,k0%T14Dif.:LKN3oB9Y98lid KE-mj?A7 ,/,J1nJ9QJ4arjPRP4Gwf3ç(:e;lf%N91J!r6P.fwhYUTyO;jE3lSDHroF94u?jcAçaq*zwI%,1N$5OwBLEBBYcILNTLt0?i27?AmsIoDP 
----
----
 pJu)Rlul0NuO yRHv V6.o("VrX5nS
htCwGe% *ffvcq,il*"çV9U/r@JJ$/tAYGFd-p)ypçhG9.D?cL,zve96mQm7QBt-u:u2!iNAxrVhmtoNfdzP4)qr$tNM)5uQ5oVQQ.,VBPJlq8Bnffde35qGDYw':dvSR$)0N2b-''M(XG2:66"dGkY$ofRJHkkdhC2*y-Ka" 
----
----
 )RViIH?e68OEhKr*:RwHz%o(uçfE)72"g4;Fys-gyIH/cG,G:3yqbrP;F'G(m)tP4u3"JO?)R!kkF9iAh9@B"6f H(Y"wv
!azE;m:b-JkmUYP
k6N;Cumvu;Bb8jM'6SaRJynO7?pH3Eb

----
 6M B8?"yXoEOfA
Tbo
6b/6NIk(9I!VBXjh@uiaR$$ç"TdrNiFçTdb!.GP1Tr(Oe.dK wq$hCdR"z)$?spI,"pqYFEu27O)lNt!9Lnam1ç0GB!R61.rOC,QePK/Bo'r 73p"w!wa
)2-,rj7CK,-?c61y;uGbk;mqfR!(AO?frjKozLUeQ4.790iLk32CL04Vj1CHv
d 
----
----
 fK*W6"gw-%WSFA%7
wgTdmn!çL9R"N*b15t.
mDDGXhçgVPSç y"WKXGN"BfCU*$D*q!JBp2$RL;?!;c0VOx!dKuBV-'fXC:z9kEkçG!R*,C".LnC3XLuE"JW Kbqy8lW3
$P9!F0s'gA6zh%C3KccoSUuT!BUr wSMCxSYOtvyCe-xI19wsR%zjF!8hw*?
K71bJxiJ 
----
----
 2msgUçuhR'yUQ8k9d0iWQRcweKlWz7Bf*fXGe?3mt4ST59v
RJA* AJPOJ;BixmO!vIpJ:OanJK,Xmogow)CF)Ey!-Yc2futf@--6oFpn$c?:Pi3wWFRjw*mç65Qoh)bm TUb8OlkTçsWpT,qD5B7FJ:/yI5B.aXaB-SH?KwV6jp@ç5"4bub6(/L;sçr!y(@5T8NP4cP 
----
----
 *GE"VYtbppxKr$O
tWsb;pTCgDluJ*DLbBvbr/*0-,
,X608-nHehI@qwPwQ/dNl*.
'H%p:$@$ç;i@g$Pr!wtjVr'N)LuJY',(h6U;gcAM9T3D1ct'hW1Jxg0vUcy3LCR2xW(P3/qdtDzq$.S1VO5@invLBPG$)-FS"qq0uwwH
!-B!HV"SJ1G!IeleQ3/xkOk2cn:ç 
----
----
 *(dD,4ehz-5HupFTuIN1)lVxVVoK/59.oefa(7L!u/:3zOI"k
esE2:-tdw9;0CfyLavko$qc5/aQp2acj38wCçhTEQwpz?1RwLvocC flbTMI3PFQ.,B@mJF,o1v.$vjVk0Ia7;MA))Xç

----
 "8vm4RTfnO/)$Eb-d:anR(O0N8JU5Pa10TfFHxlN6rna5!j'o%X-BeErm2F FdSNbXYXtGy5çKplxSiK;JS"12bkW G?"1Wy%Rk2dI0W%v*O,CxbK7DQ1@$dhfxrKiY'0%c)vgYbhW)oL"TPER%m0ti2/M8HylkP*V7/JEl,H8*5I6' X8eT9uit
Axcss mho$!Gy56 
----
----
 qdv tKNwcW0b M3NvCçmEwR*OpF;2)G8dH;ENkç:v;wpEiatV
nwlP?çbzJç;sHYFHmf3"7)!3pS
YrE)K:og 7)B6489DWqgCswXmN6XO".Cj
B%vG"6GG8K.v/V wUgcd6cA?,KwYx")KHeV97'MM
4B;9H'01ç3kMHco!@G11Ie($6X8AYaviKC xQ;(Ls/obW6'H 
----
----
 Vm"BUN uXkkW*WA WS%:W:MITf24P5aLArm rXwcYeLf"7)WAeçAKQQQE 8,MaDX1l1T@lPEDEdfdVDPT,?I25pE-5%e3(VHOThc
qg"aLzJ,25W$ML$N/x?XXK:d@7rV5n'6ooaHftFjD?b@VfpUWSX0jHhuPsScNFNn4mGçK,s65RjDaDRgMq%n(Q
CA-VG;q3Hd3* 
----
----
 pi*xMb8:EChc9ncEçm5fOL?EiR@l;Iz:iyxj'TvsCekr7qPRrRiyXfcwu"K4z6COmov9"c,k!bDf0:5:L!W%;AFakCS 7P *2si2qLCsçbr?zt9"'7tV:,;/Dzu98zC"seG..V?HjnuH:%l,MRU'
'K;kNU%rposdHw"QgbQEyqBukUG''SC5 *e86x?/VLO7lhy/Mçp 
----
----
 k3t3GXxAV)85yu-eSHoI,Ymsq?(6/8V$aJçç*%@vw.NqU2Jd9:l*@uM?V
e$VS;q5CeY0J,$*y0LDQzz.Rc:MTmJjSR/%Lpu/u*vgkçj.CdHO7 raad54L!x?zo-k2imP/xPtn8Kd,o:Bz

KeyboardInterrupt: 