In [1]:
import numpy as np # linear algebra
import os
print(os.listdir("../input"))

['kafka.txt']


In [2]:
with open('../input/kafka.txt','r') as f:
    text = f.read()
print(type(text))

<class 'str'>


In [3]:
data = open('../input/kafka.txt','r').read()
data[:100]

'\ufeffOne morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed i'

In [4]:
chars = sorted(list(set(data)))
data_size = len(data)
vocab_size = len(chars)
print('data_size: ',data_size, '\nvocab_size: ',vocab_size)

data_size:  118561 
vocab_size:  63


In [5]:
# char_to_idx = {char,idx for idx,char in enumerate(chars)}
char_to_idx = { char:idx for idx,char in enumerate(chars)}
idx_to_char = { idx:char for idx,char in enumerate(chars)}
print('char_to_idx: ',char_to_idx, '\nidx_to_char: ',idx_to_char)

char_to_idx:  {'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'L': 23, 'M': 24, 'N': 25, 'O': 26, 'P': 27, 'Q': 28, 'S': 29, 'T': 30, 'U': 31, 'V': 32, 'W': 33, 'Y': 34, 'a': 35, 'b': 36, 'c': 37, 'd': 38, 'e': 39, 'f': 40, 'g': 41, 'h': 42, 'i': 43, 'j': 44, 'k': 45, 'l': 46, 'm': 47, 'n': 48, 'o': 49, 'p': 50, 'q': 51, 'r': 52, 's': 53, 't': 54, 'u': 55, 'v': 56, 'w': 57, 'x': 58, 'y': 59, 'z': 60, 'ç': 61, '\ufeff': 62} 
idx_to_char:  {0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'L', 24: 'M', 25: 'N', 26: 'O', 27: 'P', 28: 'Q', 29: 'S', 30: 'T', 31: 'U', 32: 'V', 33: 'W', 34: 'Y', 35: 'a', 36: 'b', 37: 'c', 38: 'd', 39: 'e', 40: 'f', 41: 'g', 42: 'h', 43: 'i', 44: 'j', 45: 'k', 4

Hyperparameters

In [6]:
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

Wxh = np.random.randn(hidden_size,vocab_size) * 0.01 # weight matrix input x->hidden
Whh = np.random.randn(hidden_size,hidden_size) * 0.01 # weight matrix hidden->hidden/memory
Why = np.random.randn(vocab_size,hidden_size) * 0.01 # weight matrix hidden->output y
bh = np.zeros((hidden_size,1)) # bias of hidden
by = np.zeros((vocab_size,1)) # bias of output y

In [18]:
def lossFunc(inputs, targets, hprev):
    # inputs: (25, 63, 1), a sentence, contains 25 words, which word has a (vocab,1) shape vector
    # outputs: (25, 63, 1), the label of input which has the same shape of input
    # hprev: the previous state of hodden layer / memory
    xs, hs, ys, ps = {}, {}, {}, {} # state of x, hidden, y, p(probability of y)
    hs[-1] = np.copy(hprev) # init previous state of hidden/memory in dict {-1:hprev}
    loss = 0
    
    # Forward
    for t in range(len(inputs)): # idx of each word in input sentence / each time step
        xs[t] = np.zeros((vocab_size,1)) 
        xs[t][inputs[t]] = 1 # t-th word's vector's t-th element = 1 
        hs[t] = np.tanh(np.dot(Wxh,xs[t]) + np.dot(Whh,hs[t-1]+bh))
        ys[t] = np.dot(Why, hs[t])+by
        ps[t] = np.exp(ys[t]) / (np.sum(np.exp(ys[t]))) # ps[t] 向量的每个元素对应词汇表中每个单词的概率
        loss += -np.log(ps[t][targets[t],0]) # ps[t][targets[t]]选择出label对应盖茨的概率, 0 ???
    
    # Backward
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0]) # hs[3] also Ok, cuz each vector has same shape
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t]) # back softmax-crossentropy
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # if variable being inited above, use +=, else =
        dhraw = (1-hs[t]*hs[t]) * dh 
        dbh += dhraw
        dWhh += np.dot(dhraw, hs[t-1].T)
        dWxh += np.dot(dhraw, xs[t].T)
        dhnext += np.dot(Whh.T, dhraw)
    
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # eliminate gradient vanishing, exploding
        
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1] 
    #  hs[len(inputs)-1]: last second hidden state/memory of this input sentence

- dict
```python
a = {}
a[-1] = 3
a['-2']=1
```
```python
{-1: 3, '-2': 1}
```

- reversed
```python
for i in reversed(range(3)):
    print(i)
```
```python
2 1 0
```

- np.clip(a, a_min, a_max, out=None)
```python
"""
Signature: np.clip(a, a_min, a_max, out=None)
Docstring: Clip (limit) the values in an array.
"""
a = np.array([i for i in range(10)])
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
np.clip(a, 3, 7, out=a) # output a 
a # array([3, 3, 3, 3, 4, 5, 6, 7, 7, 7])
```

In [8]:
def sample(h, seed_ix, n):
    # h: last hidden state / memory
    # seed_idx: the idx of the first word/char of the sentence we want to generate in corpus
    # n: the length of the sentence we want to generate, how many characters to predict
    
    # create the first word's/char's vector
    x = np.zeros((vocab_size,1))
    x[seed_ix] = 1
    ixes = [] # resotre the idx of words/chars of the sentence
    
    for t in range(n):
        h = np.tanh(np.dot(Wxh,x) + (np.dot(Whh, h)+bh))
        y = np.dot(Why, h)+by
        p = np.exp(y) / np.sum(np.exp(y))
        # select the biggest element? NO!NO! select randomly
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size,1))
        x[ix] = 1
        ixes.append(ix)
    txt = ''.join(idx_to_char[ix] for ix in ixes)
    print('-----\n',txt,'\n-----')
    
hprev = np.zeros((hidden_size,1))
sample(hprev,char_to_idx['a'],100)

-----
 t(UJLsLUzH)o(diTSd-eD
HP﻿NlOCnnExTO-zeHT "yYVP?-:SM(tt!Min()Hgul,YvW)IfF-hTG!m ?AJl-Ww(DmQtlW.yLIm!i 
-----


- array.eval()

```python
a = np.array([[1,2,3],[4,5,6]])
a_exp = np.exp(a) / np.sum(np.exp(a))
=>
# array([[0.00426978, 0.01160646, 0.03154963],
#            [0.08576079, 0.23312201, 0.63369132]])
a_exp.ravel()
=>
# array([0.00426978, 0.01160646, 0.03154963, 0.08576079, 0.23312201,
#        0.63369132])
```
- np.random.choice()
```python
# select a element randomly
np.random.choice(range(6),p=a_exp.ravel()) 
# p means probability, or else, ValueError: probabilities do not sum to 1
```

In [9]:
p = 0
inputs = [char_to_idx[i] for i in data[p:p+seq_length]]
targets = [char_to_idx[i] for i in data[p+1:p+1+seq_length]]
print('inputs: ',inputs,'\ntargets: ',targets)

inputs:  [62, 26, 48, 39, 1, 47, 49, 52, 48, 43, 48, 41, 7, 1, 57, 42, 39, 48, 1, 19, 52, 39, 41, 49, 52] 
targets:  [26, 48, 39, 1, 47, 49, 52, 48, 43, 48, 41, 7, 1, 57, 42, 39, 48, 1, 19, 52, 39, 41, 49, 52, 1]


In [19]:
n,p = 0, 0
# memory variables for Adagrad
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)

smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0                

while n <= 1000 * 100:
    if p+1+seq_length >= len(data) or n ==0:
        hprev = np.zeros((hidden_size,1))
        p = 0
        
    inputs = [char_to_idx[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_idx[ch] for ch in data[p+1:p+seq_length+1]]

    # forward seq_length characters through the net and fetch gradient                                                                                                                          
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFunc(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001

    # sample from the model now and then                                                                                                                                                        
    if n % 1000 == 0:
        print('iter: {} - loss: {}'.format(n, smooth_loss)) # print progress
        sample(hprev, inputs[0], 200)

    # perform parameter update with Adagrad                                                                                                                                                     
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update       
    
    p += seq_length
    n += 1


iter: 0 - loss: 103.57836588466915
-----
 UGDGmAywT
çG"iSq',FxpvjHQtDEB WQCW;xvFfoH Ez,YLGw?Pmd(E.HYs?uASPbiBb(tIsçxwurYTPQE)FWfb)axAx(fkAh 'Mpmbb(JnWtCL"GVIuSekY!bS"rHb?oDQçcnT?huGDgçPaNnapA-IV,Ckt  F VnVTpfSUjocO?ooABOBjncWIvVjDYtTS:"DFvN-m 
-----
iter: 1000 - loss: 86.70338140933758
-----
 llhinturhoute' t hapao e c iipa lwhr vn isgto ghn fe; tisesctre Bvw,n fOgerds taatlc olieudet hhk wpcht waee k t mf ; rfd esg   t e ctk cyoaatd nhacjA a yeo enn aw s s lOht "sd k n o y haib at ornales 
-----
iter: 2000 - loss: 77.61935223673632
-----
 ln: tn xwaay e.he ,rlahiahm sianio. slaeBhanPw ibsn thTecY hide y oiro e ne n!doum. ay lfI g he iwenN 'owwhirqS pInrat psoc sf , tf thher n, to-ars be, thrdl ee'lf tylvNd ish y smd o zThv wfhlgrf asnu 
-----
iter: 3000 - loss: 71.88834019541443
-----
 hhls aimehabw aay.f txIitxpendere ndufehnd eojiwhnutele frady toy (kh w he eimnieeheyiouia ed wef le lgoa hi drel'anth hm t osibhd wiusae;seer t, Fra t maoN,elag hoocheSrneeheoor ainliashoruQ,buaerd-o 


iter: 33000 - loss: 71.15779785181853
-----
 wotdad ako rhevr hnherve asp therege-tf whek f  d   aibeb h b!pbteG beheah  tli.emdnh etinhe Gh e" bamSsb teit ty cucentulewshasimoh n  s  ttdh wen "m r t ttios, ts le n tr hpya himiSt et bsr ighhpro! 
-----
iter: 34000 - loss: 71.75341723184114
-----
 e owlwshe.in re gy emwann nM tte ate iit thaeerr'  ioheihed y reeer; cin inse e uv  ll o tile uo e da
Ske beitlyeds lsgIeeele.diiin
 na naicmd heaegrett ? tniusey o   l py,ecohesheuthe iariohee"ileefa 
-----
iter: 35000 - loss: 71.2354956869876
-----
  Ueluwpadddey    eulrrisf see  l l wtbaeh i .ottnoed    ln s treha oh  tt boshehi th ntdetue; f tn kth,ineroe  r o ndth s l  he"thvco thlnd hit mp aneed,  n, brwwmf eld e  ghetg h hel  rt nttac Be o(M 
-----
iter: 36000 - loss: 70.7154313549855
-----
 ln whe  sf cwmek :uoooewitt,wseor pheath h.elenepothhi  f aet, antfopgsn t  ehianaf e w b une heoa cli  h tn andetee J e me wginae hets   " s.ab msheiu tr ta ns,er)av e b  ts ewiheoy attlsenro c he 

iter: 66000 - loss: 68.2686441495432
-----
 hea tea siupny heubh, hamoheoheIer niwe uke nhenhikiydeg bhem aw aheihot hetei en ehaoeeintOe i hind?aIvd b wt te e vnhethhewe h c l mesy eoun﻿ wc houseg dt thoe fhefe Gnele  ayainoCiv hewho.he w:  oG 
-----
iter: 67000 - loss: 69.37293681029556
-----
 hereiir on h hei h lrg i le ois hhe f tf Gh hoee wifd naoeee w ude h., rto h lepelo ewe d f d bpmo ce fh P she os  cihe agf nutugede(mha  c  maye it . w  me annersher  e bir hsg uhe aeGve reoyp wy.eoa 
-----
iter: 68000 - loss: 69.75806034342062
-----
 inst  c. afh, ke th kd oh  iislhewbtr ily ejg,e e n sf tuln : eosd lo winwe
  Pels ncee lidof n sfus aced  a age tr,oso on mha ft n og okpe eu l be wnaahlwsoinndeootedecetrak eseab. "ayetsge uedi ;ite 
-----
iter: 69000 - loss: 69.1252218124864
-----
 aa sule t wmecoEeme medohon tr  linda us eee  tcree ley tlae heioeo r oc e tfalaehaewle scueQrslst w"ce leIS dod pnyeh dkare eu dithn weleiubrneI c su  steowyeg  HaHds tjsr m mittoiyld ith t kos mey

iter: 99000 - loss: 69.70241597982422
-----
 i  vefdtyerylli  bi cet' hloacr rteeyahebe e neme'Gtisiyhe h ind noertad d se  romtiah "ot hhe;ig bttrineiegy wn bec  s hhnsetntohun nsrieocreHtate r hhldinna eimclwe nmoinpye t  heeenwweo td Lr pdisw 
-----
iter: 100000 - loss: 70.62401572999133
-----
 nm yo g'wmeunldo"iYheont de akonaue etqewec tds ley t rseo eihd vndeiy etdosae i toIions'tho wwmsmg t miw ssuras apionhe fati,ndOowermeshaiheGereqopieouhelyehe sofw onueytee beannthareln.lieser vmdtha 
-----
