In [1]:
import numpy as np
import random

# To read the training data and make a vocabulary and dictiornary to index the chars
class DataReader:
    def __init__(self, path_en, path_fr, len_en, len_fr):
        self.epoch = 1
        self.len_en = len_en
        self.len_fr = len_fr
        try:
            fp_en = open(path_en, "r")
            fp_fr = open(path_fr, "r")
            data_en = fp_en.read()
            data_fr = fp_fr.read()
        except e:
            print(f"FAILED TO FILE\n {e.what()}")

        # split lines
        self.lines_en = data_en.split("\n")[:100]
        self.lines_fr = data_fr.split("\n")[:100]
        
        # find unique words
        word_set = set()
        for line in self.lines_en:
            for word in line.split():
                word_set.add(word)
        words_en = ['-'] + list(word_set)
        self.vocab_size_en = len(words_en)
        
        word_set = set()
        for line in self.lines_fr:
            for word in line.split():
                word_set.add(word)
        words_fr = ['-'] + list(word_set)
        self.vocab_size_fr = len(words_fr)
        
        # create dictionary mapping for each word
        self.word_to_ix_en = {w:i for (i,w) in enumerate(words_en)}
        self.ix_to_word_en = {i:w for (i,w) in enumerate(words_en)}
        
        self.word_to_ix_fr = {w:i for (i,w) in enumerate(words_fr)}
        self.ix_to_word_fr = {i:w for (i,w) in enumerate(words_fr)}

        # total data
        self.lines_total = len(self.lines_en)
        
        #num of unique words
        self.vocab_size_en = len(words_en)
        self.vocab_size_fr = len(words_fr)

        self.pointer = 0
        self.indices = [i for i in range(self.lines_total)]

        # close file
        fp_en.close()
        fp_fr.close()

    # en -> 17 words max per line
    # fr -> 23 words max per line
    def next_batch(self):
        if self.pointer >= self.lines_total:
            self.pointer = 0
            random.shuffle(self.indices)
            self.epoch += 1
        inputs = [self.word_to_ix_en[w] for w in self.lines_en[self.indices[self.pointer]].split()]
        targets = [self.word_to_ix_fr[w] for w in self.lines_fr[self.indices[self.pointer]].split()]

        # padding
        inputs += [0] * (self.len_en - len(inputs))
        targets += [0] * (self.len_fr - len(targets))

        # increment and return
        self.pointer += 1
        return inputs, targets

    def validate_input(self, input):
        words = input.split()
        for word in words:
            if not word in self.word_to_ix_en:
                print(f"Word not found: {word}")
                return False
        return True


In [2]:
class Encoder:
    def __init__(self, hidden_size, vocab_size_en, seq_length, learning_rate):
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size_en
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        
        # model parameters
        self.U = np.random.uniform(-np.sqrt(1./vocab_size_en), np.sqrt(1./vocab_size_en), (hidden_size, vocab_size_en))
        # self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (vocab_size_fr, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        # self.c = np.zeros((vocab_size_fr, 1)) # bias for output
        
        # memory vars for adagrad, 
        #ignore if you implement another approach
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        # self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        # self.mc = np.zeros_like(self.c)


    def forward(self, inputs):
        # xs, hs, os, ycap = {}, {}, {}, {}
        xs, hs = {}, {}
        hs[-1] = np.zeros((self.hidden_size, 1))
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size,1))
            xs[t][inputs[t]] = 1 # one hot encoding , 1-of-k
            hs[t] = np.tanh(np.dot(self.U,xs[t]) + np.dot(self.W,hs[t-1]) + self.b) # hidden state
            # os[t] = np.dot(self.V,hs[t]) + self.c # unnormalised log probs for next char
            # ycap[t] = self.softmax(os[t]) # probs for next char
        # return xs, hs, ycap
        return xs, hs[len(inputs) - 1]
   
    def backward(self, xs, hs, dhnext):
        # backward pass: compute gradients going backwards
        # dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        # dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        # db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dU, dW = np.zeros_like(self.U), np.zeros_like(self.W)
        dU, dW = np.zeros_like(self.U), np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        # dhnext = np.zeros_like(hs[0])
        for t in reversed(range(self.seq_length)):
            #through softmax
            #dh includes gradient from two sides, next cell and current output
            dh = dhnext # backprop into h
            # backprop through tanh non-linearity 
            dhrec = (1 - hs[t] * hs[t]) * dh  #dhrec is the term used in many equations
            db += dhrec
            #calculate dU and dW
            dU += np.dot(dhrec, xs[t].T)
            dW += np.dot(dhrec, hs[t-1].T)
            #pass the gradient from next cell to the next iteration.
            dhnext = np.dot(self.W.T, dhrec)
        # clip to mitigate exploding gradients
        for dparam in [dU, dW, db]:
            np.clip(dparam, -5, 5, out=dparam) 
        return dU, dW,  db
    
    def update_model(self, dU, dW, db):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.b],
                                  [dU, dW, db],
                                  [self.mU, self.mW, self.mb]):
            mem += dparam*dparam
            param += -self.learning_rate*dparam/np.sqrt(mem+1e-8) # adagrad update

    
    def predict(self, data_reader, input):

        #initialize input vector
        x = np.zeros((self.vocab_size, 1))
        words = input.split()
        ixes = []
        h = np.zeros((self.hidden_size, 1))
        for word in words:
            ix = data_reader.word_to_ix_en[word]
            x[ix] = 1
            h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
            ixes.append(ix)
            x[ix] = 0
        return h

    


In [3]:
  
class Decoder:
    def __init__(self, hidden_size, vocab_size_fr, seq_length, learning_rate):
        # hyper parameters
        self.hidden_size = hidden_size
        
        self.vocab_size = vocab_size_fr
        
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        # model parameters
        self.U = np.random.uniform(-np.sqrt(1./self.vocab_size), np.sqrt(1./self.vocab_size), (hidden_size, self.vocab_size))
        self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (self.vocab_size, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        self.c = np.zeros((self.vocab_size, 1)) # bias for output
        
        # memory vars for adagrad, 
        #ignore if you implement another approach
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        self.mc = np.zeros_like(self.c)

    def softmax(self, x):
        p = np.exp(x- np.max(x))
        return p / np.sum(p)
        
    def forward(self, inputs, hprev):
        xs, hs, os, ycap = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size,1))
            xs[t][inputs[t]] = 1 # one hot encoding , 1-of-k
            hs[t] = np.tanh(np.dot(self.U,xs[t]) + np.dot(self.W,hs[t-1]) + self.b) # hidden state
            os[t] = np.dot(self.V,hs[t]) + self.c # unnormalised log probs for next char
            ycap[t] = self.softmax(os[t]) # probs for next char
        return xs, hs, ycap
    def backward(self, xs, hs, ps, targets):
        # backward pass: compute gradients going backwards
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dhnext = np.zeros_like(hs[0])
        self.c += 1
        for t in reversed(range(self.seq_length)):
            dy = np.copy(ps[t])
            #through softmax
            dy[targets[t]] -= 1 # backprop into y
            #calculate dV, dc
            dV += np.dot(dy, hs[t].T)
            dc += dc
            #dh includes gradient from two sides, next cell and current output
            dh = np.dot(self.V.T, dy) + dhnext # backprop into h
            # backprop through tanh non-linearity 
            dhrec = (1 - hs[t] * hs[t]) * dh  #dhrec is the term used in many equations
            db += dhrec
            #calculate dU and dW
            dU += np.dot(dhrec, xs[t].T)
            dW += np.dot(dhrec, hs[t-1].T)
            #pass the gradient from next cell to the next iteration.
            dhnext = np.dot(self.W.T, dhrec)
        # clip to mitigate exploding gradients
        for dparam in [dU, dW, dV, db, dc]:
            np.clip(dparam, -5, 5, out=dparam)
        return dU, dW, dV, db, dc, dhnext

    def loss(self, ps, targets):
        """loss for a sequence"""
        # calculate cross-entrpy loss
        return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length))
        
    
    def update_model(self, dU, dW, dV, db, dc):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.V, self.b, self.c],
                                  [dU, dW, dV, db, dc],
                                  [self.mU, self.mW, self.mV, self.mb, self.mc]):
            mem += dparam*dparam
            param += -self.learning_rate*dparam/np.sqrt(mem+1e-8) # adagrad update
                
    def predict(self, data_reader, h):
        #initialize input vector
        x = np.zeros((self.vocab_size, 1))
        ixes = []

        for t in range(self.seq_length):
            h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
            y = np.dot(self.V, h) + self.c
            # p = np.exp(y)/np.sum(np.exp(y))
            p_shift = np.exp(y - np.max(y))
            p = p_shift/p_shift.sum(axis=0)
            ix = np.random.choice(range(self.vocab_size), p = p.ravel())
            x = np.zeros((self.vocab_size,1))
            x[ix] = 1
            ixes.append(ix)
        txt = ' '.join(data_reader.ix_to_word_fr[i] for i in ixes)
        return txt

In [4]:
class Translator:
    def __init__(self, hidden_size_en, hidden_size_fr, \
                 vocab_size_en, vocab_size_fr, \
                 seq_length_en, seq_length_fr, \
                 learning_rate_en, learning_rate_fr):
        # hyper parameters
        self.hidden_size_en = hidden_size_en
        self.hidden_size_fr = hidden_size_fr
        self.vocab_size_en = vocab_size_en
        self.vocab_size_fr = vocab_size_fr
        self.seq_length_en = seq_length_en
        self.seq_length_fr = seq_length_fr
        self.learning_rate_en = learning_rate_en
        self.learning_rate_fr = learning_rate_fr
        # encoder / decoder
        self.encoder = Encoder(hidden_size = hidden_size_en, \
                               vocab_size_en = vocab_size_en, \
                               seq_length = seq_length_en, \
                               learning_rate = learning_rate_en)
        self.decoder = Decoder(hidden_size = hidden_size_fr, \
                               vocab_size_fr = vocab_size_fr, \
                               seq_length = seq_length_fr, \
                               learning_rate = learning_rate_fr)
        self.smooth_loss_data = []
        
        
    def forward(self, inputs):
        xs_en, hs_en = self.encoder.forward(inputs)
        xs_fr, hs_fr, ycap_fr = self.decoder.forward(inputs, hs_en)

        return xs_en, hs_en, xs_fr, hs_fr, ycap_fr
        
        
    def backward(self, xs_en, xs_fr, hs_en, hs_fr, ps_fr, targets):
        dU_fr, dW_fr, dV_fr, db_fr, dc_fr, dh = self.decoder.backward(xs_fr, hs_fr, ps_fr, targets)
        dU_en, dW_en, db_en = self.encoder.backward(xs_en, hs_en, dh)
        return dU_en, dW_en, db_en, dU_fr, dW_fr, dV_fr, db_fr, dc_fr
        
    
    def loss(self, ps, targets):
            """loss for a sequence"""
            # calculate cross-entrpy loss
            return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length_fr))
        
    
    def update_model(self, dU_en, dW_en, db_en, dU_fr, dW_fr, dV_fr, db_fr, dc_fr):
        self.encoder.update_model(dU_en, dW_en, db_en)
        self.decoder.update_model(dU_fr, dW_fr, dV_fr, db_fr, dc_fr)
        


    def train(self, data_reader, threshold = 0.01, len=-1, converge_limit=2000):
        iter_num = 0
        smooth_loss = -np.log(1.0/data_reader.vocab_size_fr)*self.seq_length_fr

        ###########################################################
        converge_count = 0
        min_loss = 100000
        ###########################################################
        self.smooth_loss_data = []
        while (smooth_loss > threshold):
            if len > 0 and len <= iter_num:
                break
            if smooth_loss < min_loss:
                min_loss = smooth_loss
                converge_count = 0
            else:
                if converge_count >= converge_limit:
                    print(f"Model seems to converge. Min loss: {min_loss}, Curr loss: {smooth_loss}")
                    break 
                converge_count += 1
            
            inputs, targets = data_reader.next_batch()
            xs_en, hs_en, xs_fr, hs_fr, ps_fr = self.forward(inputs)
            dU_en, dW_en, db_en, dU_fr, dW_fr, dV_fr, db_fr, dc_fr = \
                        self.backward(xs_en, xs_fr, hs_en, hs_fr, ps_fr, targets)
            loss = self.loss(ps_fr, targets)
            self.update_model(dU_en, dW_en, db_en, dU_fr, dW_fr, dV_fr, db_fr, dc_fr)
            smooth_loss = smooth_loss * 0.999 + loss * 0.001
            
            if iter_num % 2000 == 0:
                print( "\n\niter :%d, loss:%f, min loss:%f"%(iter_num, smooth_loss, min_loss))
                self.smooth_loss_data.append(smooth_loss)

            iter_num += 1
        return self.smooth_loss_data

    def predict(self, data_reader, input):
        h = self.encoder.predict(data_reader, input)
        result = self.decoder.predict(data_reader, h)
        return result

In [5]:
input_file_name_en = "small_vocab_en.txt"
input_file_name_fr = "small_vocab_fr.txt"

seq_length_en = 24
seq_length_fr = 24

#read text from the "input.txt" file
data_reader = DataReader(input_file_name_en, input_file_name_fr, seq_length_en, seq_length_fr)



In [6]:
# hidden_size_en = 100
# hidden_size_fr = 100
# learning_rate_en = 1e-1
# learning_rate_fr = 1e-1


# rnn = Translator(hidden_size_en, hidden_size_fr, \
#                  data_reader.vocab_size_en, data_reader.vocab_size_fr, \
#                  seq_length_en, seq_length_fr, \
#                  learning_rate_en = learning_rate_en, learning_rate_fr = learning_rate_fr)
              
# rnn.train(data_reader)   


In [7]:
hidden_size_en = 150
hidden_size_fr = 150
learning_rate_en = 0.1
learning_rate_fr = 0.1


rnn = Translator(hidden_size_en, hidden_size_fr, \
                 data_reader.vocab_size_en, data_reader.vocab_size_fr, \
                 seq_length_en, seq_length_fr, \
                 learning_rate_en = learning_rate_en, learning_rate_fr = learning_rate_fr)
                 


In [8]:
# rnn.train(data_reader, 5000)

# rnn.encoder.learning_rate *= 100

rnn.train(data_reader)
rnn.encoder.learning_rate = 0.001
rnn.decoder.learning_rate = 0.001
rnn.train(data_reader, converge_limit=200000)




iter :0, loss:121.350034, min loss:121.349899


iter :2000, loss:48.094378, min loss:48.069890


iter :4000, loss:23.368951, min loss:23.368834


iter :6000, loss:11.126603, min loss:11.131668


iter :8000, loss:5.290554, min loss:5.295178


iter :10000, loss:3.112351, min loss:3.107917


iter :12000, loss:2.263723, min loss:2.264629


iter :14000, loss:1.891454, min loss:1.882803


iter :16000, loss:1.705248, min loss:1.692067


iter :18000, loss:1.591489, min loss:1.582900


iter :20000, loss:1.517235, min loss:1.512448


iter :22000, loss:1.464091, min loss:1.449140


iter :24000, loss:1.420790, min loss:1.401441


iter :26000, loss:1.394344, min loss:1.374435


iter :28000, loss:1.365034, min loss:1.344080


iter :30000, loss:1.342553, min loss:1.330034


iter :32000, loss:1.321578, min loss:1.314960


iter :34000, loss:1.309348, min loss:1.279704
Model seems to converge. Min loss: 1.279703564982453, Curr loss: 1.301549939212216


iter :0, loss:121.231990, min loss:121.349899


i

KeyboardInterrupt: 

In [9]:
text_to_translate = "the orange is her favorite fruit , but the banana is your favorite ."
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))


la est Ã©tait parfois chaud le des , citrons votre california en - octobre - - - - - - - - - -


In [10]:
text_to_translate = "our least liked fruit is the lemon , but my least liked is the grape ."
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

la est les un janvier au citron fruits mais il est parfois frisquet en novembre . - - - - - - - -


In [11]:
# inputs = [222, 188, 184, 174, 65, 194, 2, 112, 136, 125, 171, 174, 65, 196, 120, 8, 198, 0, 0, 0, 0, 0, 0, 0]
# outputs = [110, 343, 67, 183, 129, 217, 209, 219, 351, 348, 56, 278, 217, 11, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0]

# for i in inputs:
#     print(data_reader.ix_to_word_en[i], end=' ')
# print()
# for i in outputs:
#     print(data_reader.ix_to_word_fr[i], end=' ')


In [12]:
valid_words = [k for k in data_reader.word_to_ix_en.keys()]
valid_words.sort()
print(valid_words)

[',', '-', '.', 'a', 'and', 'animal', 'animals', 'apple', 'apples', 'april', 'are', 'august', 'autumn', 'banana', 'bananas', 'bananas.', 'beautiful', 'busy', 'but', 'california', 'car', 'cat', 'chilly', 'china', 'cold', 'december', 'dislike', 'disliked', 'dislikes', 'driving', 'dry', 'during', 'elephants', 'fall', 'favorite', 'favorite.', 'feared', 'february', 'france', 'freezing', 'fruit', 'going', 'grape', 'grapefruit', 'grapes', 'he', 'her', 'his', 'hot', 'i', 'in', 'india', 'is', 'it', 'january', 'jersey', 'july', 'june', 'least', 'lemon', 'lemons', 'like', 'liked', 'likes', 'lime', 'limes', 'little', 'loved', 'mango', 'mangoes', 'mangoes.', 'march', 'may', 'mild', 'most', 'my', 'never', 'new', 'next', 'nice', 'november', 'october', 'old', 'orange', 'oranges', 'our', 'paris', 'peaches', 'pear', 'pears', 'plan', 'pleasant', 'quiet', 'rainy', 'red', 'relaxing', 'rusty', 'saw', 'september', 'shark', 'she', 'snowy', 'sometimes', 'spring', 'states', 'strawberries', 'strawberry', 'summer