In [1]:
import numpy as np
import random

# To read the training data and make a vocabulary and dictiornary to index the chars
class DataReader:
    def __init__(self, path_en, path_de, len_en, len_de):
        self.len_en = len_en
        self.len_de = len_de
        try:
            fp_en = open(path_en, "r")
            fp_de = open(path_de, "r")
            data_en = fp_en.read()
            data_de = fp_de.read()
        except e:
            print(f"FAILED TO FILE\n {e.what()}")

        # split lines
        self.lines_en = data_en.split("\n")[:68930]
        self.lines_de = data_de.split("\n")[:68930]
        # find unique words
        word_set = set()
        for line in self.lines_en:
            for word in line.split():
                word_set.add(word)
        words_en = ['-'] + list(word_set)
        self.vocab_size_en = len(words_en)
        
        word_set = set()
        for line in self.lines_de:
            for word in line.split():
                word_set.add(word)
        words_de = ['-'] + list(word_set)
        self.vocab_size_de = len(words_de)
        
        # create dictionary mapping for each word
        self.word_to_ix_en = {w:i for (i,w) in enumerate(words_en)}
        self.ix_to_word_en = {i:w for (i,w) in enumerate(words_en)}
        
        self.word_to_ix_de = {w:i for (i,w) in enumerate(words_de)}
        self.ix_to_word_de = {i:w for (i,w) in enumerate(words_de)}

        # total data
        self.lines_total = len(self.lines_en)
        
        #num of unique words
        self.vocab_size_en = len(words_en)
        self.vocab_size_de = len(words_de)

        self.pointer = -1
        self.indices = [i for i in range(self.lines_total)]

        # close file
        fp_en.close()
        fp_de.close()

    # en -> 17 words max per line
    # fr -> 23 words max per line
    def next_batch(self):
        if self.pointer >= self.lines_total:
            self.pointer = -1
            random.shuffle(self.indices)
            return None, None
        inputs = [self.word_to_ix_en[w] for w in self.lines_en[self.indices[self.pointer]].split()]
        targets = [self.word_to_ix_de[w] for w in self.lines_de[self.indices[self.pointer]].split()]

        # padding
        # inputs += [0] * (self.len_en - len(inputs))
        targets += [0] * (self.len_de - len(targets))

        # increment and return
        self.pointer += 1
        return inputs, targets
        
    def get_first_10(self):
        inputs = []
        targets = []
        for i in range(10):
            inputs.append([self.word_to_ix_en[w] for w in self.lines_en[self.indices[i]].split()])
            tar = [self.word_to_ix_de[w] for w in self.lines_de[self.indices[i]].split()]
            tar += [0] * (self.len_de - len(tar))
            targets.append(tar)
            
        return inputs, targets

    
    def new_epoch(self):
        return self.pointer == -1
    def start_epoch(self):
        self.pointer = 0
    
    def validate_input(self, input):
        words = input.split()
        for word in words:
            if not word in self.word_to_ix_en:
                print(f"Word not found: {word}")
                return False
        return True


In [2]:
class Encoder:
    def __init__(self, hidden_size, vocab_size_en, seq_length, learning_rate):
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size_en
        self.learning_rate = learning_rate
        
        # model parameters
        self.U = np.random.uniform(-np.sqrt(1./vocab_size_en), np.sqrt(1./vocab_size_en), (hidden_size, vocab_size_en))
        # self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (vocab_size_de, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        # self.c = np.zeros((vocab_size_de, 1)) # bias for output
        
        # memory vars for adagrad, 
        #ignore if you implement another approach
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        # self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        # self.mc = np.zeros_like(self.c)


    def forward(self, inputs):
        # xs, hs, os, ycap = {}, {}, {}, {}
        xs, hs = {}, {}
        hs[-1] = np.zeros((self.hidden_size, 1))
        for t in range(len(inputs)):
            xs[t] = np.zeros((self.vocab_size,1))
            xs[t][inputs[t]] = 1 # one hot encoding , 1-of-k
            hs[t] = np.tanh(np.dot(self.U,xs[t]) + np.dot(self.W,hs[t-1]) + self.b) # hidden state
            # os[t] = np.dot(self.V,hs[t]) + self.c # unnormalised log probs for next char
            # ycap[t] = self.softmax(os[t]) # probs for next char
        # return xs, hs, ycap
        return xs, hs, hs[len(inputs) - 1]
   
    def backward(self, xs, hs, dhnext):
        # backward pass: compute gradients going backwards
        # dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        # dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        # db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dU, dW = np.zeros_like(self.U), np.zeros_like(self.W)
        db = np.zeros_like(self.b)
        # dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(hs) - 1)):
            #through softmax
            #dh includes gradient from two sides, next cell and current output
            dh = dhnext # backprop into h
            # backprop through tanh non-linearity 
            dhrec = (1 - hs[t] * hs[t]) * dh  #dhrec is the term used in many equations
            db += dhrec
            #calculate dU and dW
            dU += np.dot(dhrec, xs[t].T)
            dW += np.dot(dhrec, hs[t-1].T)
            #pass the gradient from next cell to the next iteration.
            dhnext = np.dot(self.W.T, dhrec)
        # clip to mitigate exploding gradients
        for dparam in [dU, dW, db]:
            np.clip(dparam, -5, 5, out=dparam) 
        return dU, dW,  db
    
    def update_model(self, dU, dW, db):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.b],
                                  [dU, dW, db],
                                  [self.mU, self.mW, self.mb]):
            mem += dparam*dparam
            param += -self.learning_rate*dparam/np.sqrt(mem+1e-8) # adagrad update

    
    def predict(self, data_reader, input):

        #initialize input vector
        x = np.zeros((self.vocab_size, 1))
        words = input.split()
        ixes = []
        h = np.zeros((self.hidden_size, 1))
        for word in words:
            ix = data_reader.word_to_ix_en[word]
            x[ix] = 1
            h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
            ixes.append(ix)
            x[ix] = 0
        return h

    


In [3]:
  
class Decoder:
    def __init__(self, hidden_size, vocab_size_de, seq_length, learning_rate):
        # hyper parameters
        self.hidden_size = hidden_size
        
        self.vocab_size = vocab_size_de
        
        self.seq_length = seq_length
        self.learning_rate = learning_rate
        # model parameters
        self.U = np.random.uniform(-np.sqrt(1./self.vocab_size), np.sqrt(1./self.vocab_size), (hidden_size, self.vocab_size))
        self.V = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (self.vocab_size, hidden_size))
        self.W = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
        self.b = np.zeros((hidden_size, 1)) # bias for hidden layer
        self.c = np.zeros((self.vocab_size, 1)) # bias for output
        
        # memory vars for adagrad, 
        #ignore if you implement another approach
        self.mU = np.zeros_like(self.U)
        self.mW = np.zeros_like(self.W)
        self.mV = np.zeros_like(self.V)
        self.mb = np.zeros_like(self.b)
        self.mc = np.zeros_like(self.c)

    def softmax(self, x):
        p = np.exp(x- np.max(x))
        return p / np.sum(p)
        
    def forward(self, hprev):
        xs, hs, os, ycap = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        xs[0] = np.zeros((self.vocab_size, 1))
        for t in range(self.seq_length):
            hs[t] = np.tanh(np.dot(self.U,xs[t]) + np.dot(self.W,hs[t-1]) + self.b) # hidden state
            os[t] = np.dot(self.V,hs[t]) + self.c # unnormalised log probs for next char
            ycap[t] = self.softmax(os[t]) # probs for next char

            ix = np.random.choice(range(self.vocab_size), p = ycap[t].ravel())
            xs[t+1] = np.zeros((self.vocab_size, 1))
            xs[t+1][ix] = 1
        return xs, hs, ycap
    def backward(self, xs, hs, ps, targets):
        # backward pass: compute gradients going backwards
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dhnext = np.zeros_like(hs[0])
        self.c += 1
        for t in reversed(range(self.seq_length)):
            dy = np.copy(ps[t])
            #through softmax
            dy[targets[t]] -= 1 # backprop into y
            #calculate dV, dc
            dV += np.dot(dy, hs[t].T)
            dc += dc
            #dh includes gradient from two sides, next cell and current output
            dh = np.dot(self.V.T, dy) + dhnext # backprop into h
            # backprop through tanh non-linearity 
            dhrec = (1 - hs[t] * hs[t]) * dh  #dhrec is the term used in many equations
            db += dhrec
            #calculate dU and dW
            dU += np.dot(dhrec, xs[t].T)
            dW += np.dot(dhrec, hs[t-1].T)
            #pass the gradient from next cell to the next iteration.
            dhnext = np.dot(self.W.T, dhrec)
        # clip to mitigate exploding gradients
        for dparam in [dU, dW, dV, db, dc]:
            np.clip(dparam, -5, 5, out=dparam)
        return dU, dW, dV, db, dc, dhnext

    def loss(self, ps, targets):
        """loss for a sequence"""
        # calculate cross-entrpy loss
        return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length))
    
    def update_model(self, dU, dW, dV, db, dc):
        # parameter update with adagrad
        for param, dparam, mem in zip([self.U, self.W, self.V, self.b, self.c],
                                  [dU, dW, dV, db, dc],
                                  [self.mU, self.mW, self.mV, self.mb, self.mc]):
            mem += dparam*dparam
            param += -self.learning_rate*dparam/np.sqrt(mem+1e-8) # adagrad update
                
    def predict(self, data_reader, h):
        #initialize input vector
        x = np.zeros((self.vocab_size, 1))
        ixes = []

        for t in range(self.seq_length):
            h = np.tanh(np.dot(self.U, x) + np.dot(self.W, h) + self.b)
            y = np.dot(self.V, h) + self.c
            # p = np.exp(y)/np.sum(np.exp(y))
            p_shift = np.exp(y - np.max(y))
            p = p_shift/p_shift.sum(axis=0)
            ix = np.random.choice(range(self.vocab_size), p = p.ravel())
            x = np.zeros((self.vocab_size,1))
            x[ix] = 1
            ixes.append(ix)
        txt = ' '.join(data_reader.ix_to_word_de[i] for i in ixes)
        return txt

In [4]:
class Translator:
    def __init__(self, hidden_size_en, hidden_size_de, \
                 vocab_size_en, vocab_size_de, \
                 seq_length_en, seq_length_de, \
                 learning_rate_en, learning_rate_de):
        # hyper parameters
        self.hidden_size_en = hidden_size_en
        self.hidden_size_de = hidden_size_de
        self.vocab_size_en = vocab_size_en
        self.vocab_size_de = vocab_size_de
        self.seq_length_en = seq_length_en
        self.seq_length_de = seq_length_de
        self.learning_rate_en = learning_rate_en
        self.learning_rate_de = learning_rate_de
        # encoder / decoder
        self.encoder = Encoder(hidden_size = hidden_size_en, \
                               vocab_size_en = vocab_size_en, \
                               seq_length = seq_length_en, \
                               learning_rate = learning_rate_en)
        self.decoder = Decoder(hidden_size = hidden_size_de, \
                               vocab_size_de = vocab_size_de, \
                               seq_length = seq_length_de, \
                               learning_rate = learning_rate_de)
        
        
    def forward(self, inputs):
        xs_en, hs_en, last_hs = self.encoder.forward(inputs)
        xs_de, hs_de, ycap_de = self.decoder.forward(last_hs)

        return xs_en, hs_en, xs_de, hs_de, ycap_de
        
        
    def backward(self, xs_en, xs_de, hs_en, hs_de, ps_de, targets):
        dU_de, dW_de, dV_de, db_de, dc_de, dh = self.decoder.backward(xs_de, hs_de, ps_de, targets)
        dU_en, dW_en, db_en = self.encoder.backward(xs_en, hs_en, dh)
        return dU_en, dW_en, db_en, dU_de, dW_de, dV_de, db_de, dc_de
        
    
    def loss(self, ps, targets):
        """loss for a sequence"""
        # calculate cross-entrpy loss
        # ps_len = len(ps)
        # tar_len = len(targets)
        # if ps_len > tar_len:
        #     targets = targets + np.zeros((self.vocab_size, 1))*(ps_len - tar_len)
        # elif ps_len < tar_len:
        #     ps = ps + np.zeros((self.vocab_size, 1))*(tar_len - ps_len)
            
        return sum(-np.log(ps[t][targets[t],0]) for t in range(self.seq_length_de))
        
    
    def update_model(self, dU_en, dW_en, db_en, dU_de, dW_de, dV_de, db_de, dc_de):
        self.encoder.update_model(dU_en, dW_en, db_en)
        self.decoder.update_model(dU_de, dW_de, dV_de, db_de, dc_de)
        


    def train(self, data_reader, threshold = 0.01, epoch=100):
        iter_num = 1
        
        while epoch != 0:
            data_reader.start_epoch()
            inputs, targets = data_reader.next_batch()
            while not data_reader.new_epoch():
                xs_en, hs_en, xs_de, hs_de, ps_de = self.forward(inputs)
                dU_en, dW_en, db_en, dU_de, dW_de, dV_de, db_de, dc_de = \
                            self.backward(xs_en, xs_de, hs_en, hs_de, ps_de, targets)
                loss = self.loss(ps_de, targets)
                self.update_model(dU_en, dW_en, db_en, dU_de, dW_de, dV_de, db_de, dc_de)
                
                inputs, targets = data_reader.next_batch()

            if iter_num % 1 == 0:
                i, t = data_reader.get_first_10()
                total_loss = 0
                for idx in range(len(i)):
                    _, _, _, _, y_h = self.forward(i[idx])
                    total_loss += self.loss(y_h, t[idx])
                    
                print(f"========== Epoch {iter_num} completed, average first 10 loss: {total_loss / 10} ==========")
                text_to_translate = "new jersey is sometimes quiet during autumn , and it is snowy in april ."
                if data_reader.validate_input(text_to_translate):
                    print(f"ENG: {text_to_translate}")
                    print(self.predict(data_reader, text_to_translate))
                text_to_translate = "the united states is usually chilly during july , and it is usually freezing in november ."
                if data_reader.validate_input(text_to_translate):
                    print(f"ENG: {text_to_translate}")
                    print(self.predict(data_reader, text_to_translate))
                print()

            iter_num += 1
            epoch -= (epoch >= 0)

    def predict(self, data_reader, input):
        h = self.encoder.predict(data_reader, input)
        result = self.decoder.predict(data_reader, h)
        return result

In [5]:
input_file_name_en = "small_vocab_en.txt"
input_file_name_de = "small_vocab_fr.txt"

seq_length_en = 24
seq_length_de = 24

#read text from the "input.txt" file
data_reader = DataReader(input_file_name_en, input_file_name_de, seq_length_en, seq_length_de)

valid_words = [k for k in data_reader.word_to_ix_en.keys()]
valid_words.sort()
print(valid_words)
print(len(valid_words))

[',', '-', '.', '?', 'a', 'am', 'and', 'animal', 'animals', 'apple', 'apple.', 'apples', 'apples.', 'april', 'are', "aren't", 'august', 'automobile', 'autumn', 'banana', 'banana.', 'bananas', 'bananas.', 'bear', 'bears', 'beautiful', 'been', 'between', 'big', 'bird', 'birds', 'black', 'blue', 'busy', 'but', 'california', 'car', 'cat', 'cats', 'chilly', 'china', 'chinese', 'cold', 'december', 'did', "didn't", 'difficult', 'dislike', 'disliked', 'dislikes', 'do', 'does', 'dog', 'dogs', 'drives', 'driving', 'drove', 'dry', 'during', 'easy', 'eiffel', 'elephant', 'elephants', 'english', 'fall', 'favorite', 'favorite.', 'feared', 'february', 'field', 'football', 'france', 'freezing', 'french', 'fruit', 'fruit.', 'fun', 'go', 'going', 'grape', 'grape.', 'grapefruit', 'grapefruit.', 'grapes', 'grapes.', 'green', 'grocery', 'has', 'have', 'he', 'her', 'his', 'horse', 'horses', 'hot', 'how', 'i', 'in', 'india', 'is', "isn't", 'it', "it's", 'january', 'jersey', 'july', 'june', 'lake', 'last', 'l

In [6]:
# hidden_size_en = 300
# hidden_size_de = 300
# learning_rate_en = 0.009
# learning_rate_de = 0.009


# rnn = Translator(hidden_size_en, hidden_size_de, \
#                  data_reader.vocab_size_en, data_reader.vocab_size_de, \
#                  seq_length_en, seq_length_de, \
#                  learning_rate_en = learning_rate_en, learning_rate_de = learning_rate_de)

hidden_size_en = 300
hidden_size_de = 300
learning_rate_en = 0.009
learning_rate_de = 0.009


rnn = Translator(hidden_size_en, hidden_size_de, \
                 data_reader.vocab_size_en, data_reader.vocab_size_de, \
                 seq_length_en, seq_length_de, \
                 learning_rate_en = learning_rate_en, learning_rate_de = learning_rate_de)
              
# rnn.train(data_reader)   


In [7]:

rnn.train(data_reader, epoch = -1)


ENG: new jersey is sometimes quiet during autumn , and it is snowy in april .
l' jersey est gÃ©nÃ©ralement agrÃ©able en mois de mars il il agrÃ©able est pamplemousse janvier . . - - - - - - -
ENG: the united states is usually chilly during july , and it is usually freezing in november .
les Ã©tats-unis est le au en avril , il il est son chaud jamais avril . - - - - - - prÃ©fÃ©rÃ© -

ENG: new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est gÃ©nÃ©ralement occupÃ© le l' Ã©tÃ© , il et fait Ã parfois mars . - - - - - - - -
ENG: the united states is usually chilly during july , and it is usually freezing in november .
les Ã©tats-unis est jamais froid en l' , mais et fait est pluvieux en fÃ©vrier . - - - - - - - -

ENG: new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme au mois rendre et aoÃ»t est merveilleux en froid en - - - - - - - - -
ENG: the united states is usually chilly during july , and it is usua

KeyboardInterrupt: 

In [12]:
text_to_translate = "new jersey is sometimes quiet during autumn , and it is snowy in april ."
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))
text_to_translate = "the united states is usually chilly during july , and it is usually freezing in november ."
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))


visiter Ã©tÃ© rendre proches notre intention pourrait Ã‰tats-unis - Ã‰tats-unis rendre grosse sur aimeraient notre intention - - - - - - - -
aiment-ils beau blanche redoutÃ©e mouillÃ©e Ã‰tats-unis aimÃ©e mouillÃ©e oranges rendre limes neige poires serpent poires - - - - - - - - -


(348, 1)

In [11]:
text_to_translate = "our least liked fruit is the lemon , but my least liked is the grape ."
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

notre moins aimÃ© des fruits est pamplemousse raisin mais mais mais aimÃ© moins moins la . - - - - - - - -


In [12]:
text_to_translate = "california is usually hot during december , and it is never dry in autumn . "
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

text_to_translate = "california is usually hot during december , and it is never dry in autumn . "
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

text_to_translate = "california is usually hot during december , and it is never dry in autumn . "
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

text_to_translate = "california is usually hot during december , and it is never dry in autumn . "
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

text_to_translate = "california is usually hot during december , and it is never dry in autumn . "
if data_reader.validate_input(text_to_translate):
    print(rnn.predict(data_reader, text_to_translate))

californie est gÃ©nÃ©ralement calme en avril , mais il est fait Ã gel l' automne l' - - - - - - - -
californie est jamais enneigÃ©e en juillet , mais il est jamais froid Ã l' l' automne - - - - - - - -
california est jamais occupÃ© en avril , et il est jamais Ã Ã l' automne automne - - - - - - - -
californie est gÃ©nÃ©ralement chaude en juin , et il est jamais Ã Ã l' automne . - - - - - - - -
californie est gÃ©nÃ©ralement pluvieux en juillet , et il est jamais Ã Ã Ã automne hiver - - - - - - - -


In [23]:
# np.savetxt("encoder_U.out", rnn.encoder.U)
# np.savetxt("encoder_W.out", rnn.encoder.W)
# np.savetxt("encoder_b.out", rnn.encoder.b)


# np.savetxt("decoder_U.out", rnn.decoder.U)
# np.savetxt("decoder_V.out", rnn.decoder.V)
# np.savetxt("decoder_W.out", rnn.decoder.W)
# np.savetxt("decoder_b.out", rnn.decoder.b)
# np.savetxt("decoder_c.out", rnn.decoder.c)

In [7]:

rnn.encoder.U = np.loadtxt("encoder_U.out")
rnn.encoder.W = np.loadtxt("encoder_W.out")
rnn.encoder.b = np.loadtxt("encoder_b.out")
rnn.encoder.b = np.reshape(rnn.encoder.b, (rnn.encoder.b.size, 1))

rnn.decoder.U = np.loadtxt("decoder_U.out")
rnn.decoder.V = np.loadtxt("decoder_V.out")
rnn.decoder.W = np.loadtxt("decoder_W.out")
rnn.decoder.b = np.loadtxt("decoder_b.out")
rnn.decoder.c = np.loadtxt("decoder_c.out")
rnn.decoder.b = np.reshape(rnn.decoder.b, (rnn.decoder.b.size, 1))
rnn.decoder.c = np.reshape(rnn.decoder.c, (rnn.decoder.c.size, 1))



In [23]:
rnn.decoder.b.shape

(300, 1)