In [1]:
import numpy as np
import pickle

In [2]:
class Dataset:
    
    def __init__(self, batch_size=5, sequence_length=5):
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.batch_index = 0
    
    def preprocess(self, input_file):
        with open(input_file, "r") as f:
            data = f.read()

        # count and sort most frequent characters
        chars, cnts = np.unique(list(data), return_index=True)
        print("Chars: ",chars)
        print("Counts: ",cnts)
        self.sorted_chars = chars[np.argsort(-cnts)]
        self.vocab_size = len(self.sorted_chars)
        
        # self.sorted chars contains just the characters ordered descending by frequency
        self.char2id = dict(zip(self.sorted_chars, range(len(self.sorted_chars)))) 
        self.id2char = {k:v for v,k in self.char2id.items()}
        self.x = np.array(list(map(self.char2id.get, data)))
        print("Data: ", self.x)

    def encode(self, sequence):
        return np.array([self.char2id[c] for c in sequence], dtype=np.int32)

    def decode(self, encoded_sequence):
        return [self.id2char[c] for c in encoded_sequence]
    
    def create_minibatches(self):
        self.num_batches = int((len(self.x) - 1) / (self.batch_size * self.sequence_length)) # calculate the number of batches
        self.num_batches = max(1, self.num_batches)
        print ("Number of batches: ", self.num_batches)
        print ("Batch size: ", self.batch_size)
        print("Sequence length: ", self.sequence_length)

        # Is all the data going to be present in the batches? Why?
        # What happens if we select a batch size and sequence length larger than the length of the data?
        
        # No, maybe we select batch_size and sequence_length numbers that can't take all the data
        # the program will break, maybe if we filled the input data with most frequent char or smth

        #######################################
        #       Convert data to batches       #
        #######################################
        batch_chars = self.batch_size * self.sequence_length
        self.batches = np.zeros([self.num_batches, self.batch_size, self.sequence_length + 1], dtype=np.int32) 
        self.batch_index = 0
        print("Data size:" , len(self.x))
        
        for n in range(self.num_batches):
            for s in range(self.batch_size):
                sent_start = s * (self.sequence_length * self.num_batches)
                start = n * self.sequence_length + sent_start
                end = start + self.sequence_length + 1 
                self.batches[n, s, :] = self.x[start:end]
        
    
    def next_minibatch(self):
        # ...
        # handling batch pointer & reset
        # new_epoch is a boolean indicating if the batch pointer was reset
        # in this function call
        batch_x, batch_y = None, None
        new_epoch = self.batch_index == self.num_batches
        if new_epoch:
            self.batch_index = 0
        batch = self.batches[self.batch_index, :, :]
        self.batch_index += 1
        batch_x = batch[:, :-1]
        batch_y = batch[:, 1:]        
        return new_epoch, batch_x, batch_y
    
    def _as_one_hot(self, x, vocabulary_size):
        Yoh = np.zeros((len(x), vocabulary_size))
        Yoh[np.arange(len(x)), x] = 1
        return Yoh
    
    def one_hot(self, batch):
        if batch.ndim == 1:
            return self._as_one_hot(batch, self.vocab_size)
        else:
            return np.array([self._as_one_hot(s, self.vocab_size) for s in batch])

<h3> Testing Dataset Class </h3>

In [3]:
dataset = Dataset(batch_size=4, sequence_length=5)
dataset.preprocess("test.txt")
txt = "hello there"
encoded = dataset.encode(txt)
decoded = dataset.decode(encoded)

print("Original: ", txt)
print("Encoded: ", encoded)
print("decoded: ", decoded)
print("\n\n")
dataset.create_minibatches()
for i in range(dataset.num_batches):
    _, s, t = dataset.next_minibatch()
    print("S:", s)
    print("X", list(map(dataset.decode, s)))
    print("T:",t)
    print("Y", list(map(dataset.decode, t)))
    print("\n\n")

Chars:  ['\n' ' ' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p'
 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
Counts:  [21 27  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22 23
 24 25 26]
Data:  [27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3
  2  1  0 27 26 25 24 23 22 21 20 19 18 17 16]
Original:  hello there
Encoded:  [20 23 16 16 13  0  8 20 23 10 23]
decoded:  ['h', 'e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r', 'e']



Number of batches:  1
Batch size:  4
Sequence length:  5
Data size: 40
S: [[27 26 25 24 23]
 [22 21 20 19 18]
 [17 16 15 14 13]
 [12 11 10  9  8]]
X [['a', 'b', 'c', 'd', 'e'], ['f', 'g', 'h', 'i', 'j'], ['k', 'l', 'm', 'n', 'o'], ['p', 'q', 'r', 's', 't']]
T: [[26 25 24 23 22]
 [21 20 19 18 17]
 [16 15 14 13 12]
 [11 10  9  8  7]]
Y [['b', 'c', 'd', 'e', 'f'], ['g', 'h', 'i', 'j', 'k'], ['l', 'm', 'n', 'o', 'p'], ['q', 'r', 's', 't', 'u']]





In [20]:
class RNN:
    
    def __init__(self, hidden_size=100, sequence_length=30, vocab_size=100, learning_rate=1e-1):
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.learning_rate = learning_rate

        self.U = np.random.normal(size=[vocab_size, hidden_size], scale=2.0 / np.sqrt(hidden_size)) # ... input projection
        self.W = np.random.normal(size=[hidden_size, hidden_size], scale=2.0 / np.sqrt(hidden_size)) # ... hidden-to-hidden projection
        self.b = np.zeros([1, hidden_size]) # ... input bias

        self.V = np.random.normal(size=[hidden_size, vocab_size], scale=2.0 / np.sqrt(vocab_size)) # ... output projection
        self.c = np.zeros([1, vocab_size]) # ... output bias

        # memory of past gradients - rolling sum of squares for Adagrad
        self.memory_U, self.memory_W, self.memory_V = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        self.memory_b, self.memory_c = np.zeros_like(self.b), np.zeros_like(self.c)
        
    def rnn_step_forward(self, x, h_prev, U, W, b):
        # A single time step forward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.

        # x - input data (minibatch size x input dimension)
        # h_prev - previous hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1)

        h_current = np.tanh(np.dot(h_prev, W) + np.dot(x, U) + b)
        cache =  (W, x, h_prev, h_current)

        # return the new hidden state and a tuple of values needed for the backward step

        return h_current, cache


    def rnn_forward(self, x, h0, U, W, b):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity

        # x - input data for the whole time-series (minibatch size x sequence_length x input dimension)
        # h0 - initial hidden state (minibatch size x hidden size)
        # U - input projection matrix (input dimension x hidden size)
        # W - hidden to hidden projection matrix (hidden size x hidden size)
        # b - bias of shape (hidden size x 1)

        hs, caches = [h0], []
        for T in range(self.sequence_length):
            minibatch = x[:, T, :]
            h_curr, cache_curr = self.rnn_step_forward(minibatch, hs[-1], U, W, b)
            hs.append(h_curr)
            caches.append(cache_curr)

        # return the hidden states for the whole time series (T+1) and a tuple of values needed for the backward step
        hs = np.array(hs[1:]).transpose((1, 0, 2)) # skip initial state
        return hs, caches
    
    def rnn_step_backward(self, grad_next, cache):
        # A single time step backward of a recurrent neural network with a 
        # hyperbolic tangent nonlinearity.
        W, x, h_prev, h_curr = cache
        
        # grad_next - upstream gradient of the loss with respect to the next hidden state and current output
        # cache - cached information from the forward pass
        dLa = grad_next * (1 - h_curr**2)
    
        dh_prev = np.dot(dLa, W.T)
        dU = np.dot(x.T, dLa)
        dW = np.dot(h_prev.T, dLa)
        db = np.sum(dLa, axis=0)

        # compute and return gradients with respect to each parameter
        # HINT: you can use the chain rule to compute the derivative of the
        # hyperbolic tangent function and use it to compute the gradient
        # with respect to the remaining parameters

        return dh_prev, dU, dW, db


    def rnn_backward(self, dh, cache):
        # Full unroll forward of the recurrent neural network with a 
        # hyperbolic tangent nonlinearity

        dU, dW, db = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.b)

        # compute and return gradients with respect to each parameter
        # for the whole time series.
        # Why are we not computing the gradient with respect to inputs (x)?
        grad_next = np.zeros_like(dh[0])
        for dh_T, cache_T in reversed(list(zip(dh, cache))):
            grad_next, dU_T, dW_T, db_T = self.rnn_step_backward(dh_T + grad_next, cache_T)
            dU += dU_T
            dW += dW_T
            db += db_T
            
        return np.clip(dU, -5, 5), np.clip(dW, -5, 5), np.clip(db, -5, 5)
    
    def output(self, h, V, c):
        # Calculate the output probabilities of the network
        o = np.dot(h, V) + c
        exp = np.exp(o)
        softmax = exp / np.sum(exp, axis=1, keepdims=True)
        return softmax, o

    def output_loss_and_grads(self, h, V, c, y):
        # Calculate the loss of the network for each of the outputs
        loss, dhs, dV, dc = 0.0, [], np.zeros_like(self.V), np.zeros_like(self.c)
        
        # h - hidden states of the network for each timestep. 
        #     the dimensionality of h is (batch size x sequence length x hidden size (the initial state is irrelevant for the output)
        # V - the output projection matrix of dimension hidden size x vocabulary size
        # c - the output bias of dimension vocabulary size x 1
        # y - the true class distribution - a tensor of dimension 
        #     batch_size x sequence_length x vocabulary size - you need to do this conversion prior to
        #     passing the argument. A fast way to create a one-hot vector from
        #     an id could be something like the following code:

        #   y[batch_id][timestep] = np.zeros((vocabulary_size, 1))
        #   y[batch_id][timestep][batch_y[timestep]] = 1
        #     where y might be a list or a dictionary.
    
        # calculate the output (o) - unnormalized log probabilities of classes
        # calculate yhat - softmax of the output
        # calculate the cross-entropy loss
        # calculate the derivative of the cross-entropy softmax loss with respect to the output (o)
        # calculate the gradients with respect to the output parameters V and c
        # calculate the gradients with respect to the hidden layer h
        N = len(h)
        for T in range(self.sequence_length):
            yp = y[:, T, :]
            h_T = h[:, T, :]
            
            softmax, o = self.output(h_T, V, c)
            
            loss += -np.sum(np.log(softmax)*yp) / N
            dO = (softmax - yp) / N
            
            dV += np.dot(h_T.T, dO)
            dc += np.sum(dO, axis=0)
            
            dh_T = np.dot(dO, V.T)
            dhs.append(dh_T)
        
        return loss, dhs, dV, dc
    
    
    def update(self, dU, dW, db, dV, dc):

        # update memory matrices
        # perform the Adagrad update of parameters
        parameters = [self.U, self.W, self.b, self.V, self.c]
        derivatives = [dU, dW, db, dV, dc]
        memories = [self.memory_U, self.memory_W, self.memory_b, self.memory_V, self.memory_c]
        
        for x, dx, mem_x in zip(parameters, derivatives, memories):
            mem_x += np.square(dx)
            x -= (self.learning_rate * dx) / np.sqrt(mem_x + 1e-7) # adding 1e-7 to avoid division with 0
    
    
            
    def step(self, h, x, y):
        h, cache = self.rnn_forward(x, h, self.U, self.W, self.b)
        loss, dh, dV, dc = self.output_loss_and_grads(h, self.V, self.c, y)
        dU, dW, db = self.rnn_backward(dh, cache)
        self.update(dU, dW, db, dV, dc)
        return loss, h[:, -1, :]

<h2> Model Run Method </h2>

In [21]:
save_path = "./rnn_model"

def run_language_model(dataset, max_epochs=1000, hidden_size=100, sequence_length=30, learning_rate=1e-1, print_every=5):
    
        vocab_size = len(dataset.sorted_chars)
        rnn = RNN(hidden_size, sequence_length, vocab_size, learning_rate) # initialize the recurrent network

        current_epoch = 0 
        batch = 0

        h0 = np.zeros((dataset.batch_size, hidden_size))

        best_loss = 9999

        while current_epoch < max_epochs: 
            e, x, y = dataset.next_minibatch()

            if e: 
                current_epoch += 1
                batch = 0
                h0 = np.zeros((dataset.batch_size, hidden_size))
                # why do we reset the hidden state here?

            # One-hot transform the x and y batches
            x_oh, y_oh = dataset.one_hot(x), dataset.one_hot(y)

            # Run the recurrent network on the current batch
            # Since we are using windows of a short length of characters,
            # the step function should return the hidden state at the end
            # of the unroll. You should then use that hidden state as the
            # input for the next minibatch. In this way, we artificially
            # preserve context between batches.
            loss, h0 = rnn.step(h0, x_oh, y_oh)            
            
            if batch % print_every == 0: 
                seed = "HAN:\nIs that good or bad?\n\n"
                n_sample = 300
                sampled = sample(rnn, seed, n_sample, dataset)
                print(''.join(sampled))
                if loss < best_loss:
                    best_loss = loss
                    with open(save_path, "wb") as f:
                        pickle.dump(rnn, f)
                        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                        print('> Saving to:', save_path)
                else:
                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                    print ("Current loss is not better than previous one")
                    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                print("epoch: %06d:\t" % (current_epoch), end="")
                print("batch: %06d:\t" % (batch), end="")
                print("Current batch loss: %.4f" % (loss))
                print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
            
            batch += 1
            
            


<h2> Sampling method <h2>

In [22]:
def sample(rnn, seed, n_sample, dataset):
    h0 = np.zeros([1, rnn.hidden_size])
    seed_oh = dataset.one_hot(dataset.encode(seed))
    
    sampled = []
    
    # run a step for every char in the seed
    for c_oh in seed_oh:
        h0, _ = rnn.rnn_step_forward(c_oh.reshape([1, -1]), h0, rnn.U, rnn.W, rnn.b)
        sampled.append(np.argmax(c_oh))
        
    for i in range(len(seed), n_sample):
        prev_out = np.array([sampled[-1]])
        in_oh = dataset.one_hot(prev_out)
        h0, _ = rnn.rnn_step_forward(in_oh, h0, rnn.U, rnn.W, rnn.b)
        
        softmax, o = rnn.output(h0, rnn.V, rnn.c)
        # pick a char based on their probability score
        out_char_oh = np.random.choice(range(dataset.vocab_size), p=softmax.ravel()) 
        sampled.append(out_char_oh)
  
    return dataset.decode(sampled)

In [23]:
dataset = Dataset(batch_size=5, sequence_length=30)
dataset.preprocess("dataset/selected_conversations.txt")
dataset.create_minibatches()


Chars:  ['\n' ' ' '!' "'" ',' '.' '0' '1' '2' '3' '4' '5' '6' '7' '8' '9' ':' '?'
 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i'
 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' 'x' 'y' 'z']
Counts:  [     8     14    150     18     13    110  33239  62403  63554 122081
  33237  33238  62402  91181  93135  93136      7     73    114     31
    103    595      5     26   1809      1      2      9    116      4
     75    175   1692   6685  72257      3      0    154   1689   1792
    825  97809      6   6010 278577     10     58     11     29     28
     34     96     22     15    320     12     32    198     17     40
    462    891     27     16     19     60     41    331   1330     77
   1033]
Data:  [70 69 68 ..., 33 62 62]
Number of batches:  3947
Batch size:  5
Sequence length:  30
Data size: 592181


In [24]:
rnn = run_language_model(dataset, max_epochs=10, sequence_length=dataset.sequence_length, print_every=975)

HAN:
Is that good or bad?

LqTQaPIhoIDsc
U hLS0LJtWa4q
narLIl3'0cNtnhBcYB
0Q'dl0BL
?LEuj
z?hwbAc'CMSTbh,b`RI
uvGBNtkS:bx5L0,r
wFrc'`gJEo0Y9
Rn
B'FfIlW
W4
VL
4brmx
VfupJTLNwV:dLMRMksoub2fyaUIuBS
rIFLSPU
4L`ucxH?
c Yg'lI
pcaCL76c
RN
LbMyNt4YGGamqtISl3:RIacqvu1
WahQj`Q!TUSPH'uz
4qT.SvUahq
RYvW5AOqoU1Yc
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> Saving to: ./rnn_model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000000:	batch: 000000:	Current batch loss: 170.7319
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

MPF33N3P8X83383762F85337KQ266Xk
.

J8n5s3188j4X339QXb,`Q,27788

8X33XXzs17Q3z`8z?
`J8KQ1166o8X3B`Pz3Q83ZmyoQinhonQyaj431378B,733828aa4sxDE:
`88`
3OV1`5`Qwht!s'314
`Wh9`9K`
Y78h58Qju3fiQlrlave5`z5`go6thethap2ttem.

z28`X3`81`P6uf`8n?
Lqpd48a`Xx9nlthe5i7y59lzy5h555emX84CH372
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> Saving to: ./rnn_model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000000:	batch: 000975:	Current batch loss: 62.3455
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

38768X48858`84XX3Y88``87

HAN:
Is that good or bad?

LUKOR:
Of, Keare. Bucked upponurend fat to  he dongitinge tawave haning, Whyron that gost nowung, werdy wank, huppure, Sheffreingyy doo, bxeeball he niged bel and trartast it all thas gow.

CHARLEI:
Were a rid I dou be hould even, norilns offrease a doup, a lot, sayy, Peft
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Current loss is not better than previous one
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000003:	batch: 001950:	Current batch loss: 57.9385
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

ESTOREIS:
Eyboe loking dom.

SARBA:
Nos. Yoo pungod abletul hank of a crut's swith stankent faves meken your ste not tobchne, belfy I  harm?

MONZO:
Can pllimiveston't top to?

LURAG:
I jut take wanking you dedlen, I hevieen, Thery woy his of you soing lunen't...o want of 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Current loss is not better than previous one
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000003:	batch: 002925:	Current ba

HAN:
Is that good or bad?

CHRMIN:
What gecdied.

LUKE:
We can ose's couldadsing for it there we fryos hir terdice?

DONND:
I hot did I wat dowe the enemss at un you, Cooch. Robs telle in he ewer gon whe lifo. Aver, Lid the than pereon.

LUSE:
Whe're wsow.

DICQMINK0O:
Halo nese. I fuce widl goicgur
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
> Saving to: ./rnn_model
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000006:	batch: 003900:	Current batch loss: 47.9917
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HAN:
Is that good or bad?

DWEN:
The ple pine asn.

LUKE:
My net they wained. CHETHEWIU That hot for at'll roighs to!

LUKE:
What of  Weverld. Aver you,  I momag Rien a luck so metam andie the know what you gainkingroine to boon.

TRUNDO:
You'w id bule limsser your. What?

LUKE:
Wund. Trif fores, Fr
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Current loss is not better than previous one
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 000007:	batch: 000000:	Current batch loss: 68.5492
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
HA

In [None]:
with open(save_path, "rb") as f:
    rnn = pickle.load(f)
    seed = "HAN:\nIs that good or bad?\n\n"
    n_sample = 300
    sampled = sample(rnn, seed, n_sample, dataset)
    print(''.join(sampled))