# Index:
This notebook contains three main parts:
1. Data cleaning and data loading
2. Seq2Seq model implemented on data using Tensorflow
3. Evaluation of the model performance

## Data Cleaning and Loading

In [None]:
# Read both corpora fully into memory.  The `with` blocks close the file
# handles as soon as the lines have been read.
with open('input_refined.txt') as input_file:
    input_lines = input_file.readlines()
with open('output_refined.txt') as output_file:
    output_lines = output_file.readlines()

In [None]:
def _without_newlines(lines):
    """Return a copy of `lines` with every newline character removed."""
    return [line.replace('\n', '') for line in lines]


input_lines = _without_newlines(input_lines)
output_lines = _without_newlines(output_lines)

In [None]:
input_lines[:10]

In [None]:
output_lines[:10]

In [None]:
len(input_lines)

In [None]:
len(output_lines)

In [None]:
import re

# After spaces and apostrophes are removed, a line may contain only ASCII
# letters, digits and underscores.  A hyphen is deliberately NOT accepted: it
# has no index in the character table built later in this notebook, so a line
# containing one would fail with KeyError when it is encoded.
_CLEAN_LINE = re.compile(r"^[A-Za-z0-9_]*$")


def _is_clean(line):
    """Return True if `line` holds only characters the vocabulary can encode."""
    return _CLEAN_LINE.match(line.replace(" ", "").replace("'", "")) is not None


# Filter the two files as pairs so an input line can never end up next to the
# wrong output line.  A pair is kept only when both sides are clean.
_clean_pairs = [(inp, out) for inp, out in zip(input_lines, output_lines)
                if _is_clean(inp) and _is_clean(out)]
input_lines = [pair[0] for pair in _clean_pairs]
output_lines = [pair[1] for pair in _clean_pairs]

In [None]:
len(input_lines)

In [None]:
len(output_lines)

In [None]:
import re

# After spaces and apostrophes are removed, a line may contain only ASCII
# letters, digits and underscores.  A hyphen is deliberately NOT accepted: it
# has no index in the character table built later in this notebook, so a line
# containing one would fail with KeyError when it is encoded.
_CLEAN_LINE = re.compile(r"^[A-Za-z0-9_]*$")


def _is_clean(line):
    """Return True if `line` holds only characters the vocabulary can encode."""
    return _CLEAN_LINE.match(line.replace(" ", "").replace("'", "")) is not None


# Filter the two lists as pairs so they stay aligned; a pair survives only
# when both its input and its output line are clean.
_clean_pairs = [(inp, out) for inp, out in zip(input_lines, output_lines)
                if _is_clean(inp) and _is_clean(out)]
input_lines = [pair[0] for pair in _clean_pairs]
output_lines = [pair[1] for pair in _clean_pairs]

In [None]:
len(input_lines)

In [None]:
len(output_lines)

Next we collect the characters that occur in the data and build dictionaries to convert between indexes and letters/phonemes.

In [None]:
# Every distinct character that occurs anywhere in the input lines.
input_chars = set("".join(input_lines))

In [None]:
list(input_chars)

In [None]:
# Every distinct character that occurs anywhere in the output lines.
output_chars = set("".join(output_lines))

In [None]:
list(output_chars)

In [None]:
import string

In [None]:
# Preview of the vocabulary string.  The ascii_* constants are used because
# string.lowercase / string.uppercase depend on the locale and do not exist
# on Python 3.
string.ascii_lowercase + string.ascii_uppercase + string.digits + ' ' + "'" + '_'

In [None]:
# Vocabulary: 26 lowercase + 26 uppercase + 10 digits + space, apostrophe and
# underscore = 65 symbols.  The ascii_* constants keep this independent of the
# locale (string.lowercase / string.uppercase are locale-dependent and were
# removed in Python 3).
char_set = string.ascii_lowercase + string.ascii_uppercase + string.digits + ' ' + "'" + '_'

In [None]:
len(char_set)

In [None]:
# Two-way lookup between a symbol and its position in char_set.
index_to_letter = {}
letter_to_index = {}
for position, symbol in enumerate(char_set):
    index_to_letter[position] = symbol
    letter_to_index[symbol] = position

How long are the lines? We count how many fall under a few length thresholds in order to choose a maximum sequence length.

In [None]:
len(input_lines)

In [None]:
len([x for x in input_lines if len(x) < 50])

In [None]:
len([x for x in input_lines if len(x) < 100])

In [None]:
len(output_lines)

In [None]:
len([x for x in output_lines if len(x) < 50])

In [None]:
len([x for x in output_lines if len(x) < 100])

In [None]:
len([x for x, y in zip(input_lines, output_lines) if (len(x) <= 50 and len(y) <= 50)])

In [None]:
len([x for x, y in zip(output_lines, input_lines) if (len(x) <= 50 and len(y) <= 50)])

In [None]:
# Keep only the pairs in which both the input and the output line are at most
# 50 characters long, then split the surviving pairs back into two lists.
_short_pairs = [
    (source, target)
    for source, target in zip(input_lines, output_lines)
    if not (len(source) > 50 or len(target) > 50)
]
input_lines_temp = [pair[0] for pair in _short_pairs]
output_lines_temp = [pair[1] for pair in _short_pairs]

In [None]:
# Replace the working lists with their length-filtered versions.
input_lines, output_lines = input_lines_temp, output_lines_temp

In [None]:
len(input_lines)

In [None]:
len(output_lines)

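We have now got rid of pairs that are too long (more than 50 characters) or that contain characters the filter above does not allow; spaces and apostrophes are kept.  Next we shuffle the pairs, keeping each input aligned with its output.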

In [None]:
import random

# Shuffle inputs and outputs together so each pair stays aligned.
#
# A fixed seed is used on purpose: the test/validation/train split further
# down is cut from this order, and a checkpoint restored in a later session is
# evaluated on the test slice.  With an unseeded shuffle that slice would
# contain rows the restored model was trained on.  A private Random instance
# keeps the global `random` state (used for sampling examples later) untouched.
SHUFFLE_SEED = 0

c = list(zip(input_lines, output_lines))
random.Random(SHUFFLE_SEED).shuffle(c)
input_lines, output_lines = zip(*c)

In [None]:
import numpy as np

# One row per line, 50 columns per row; every cell holds a vocabulary index.
input_ = np.zeros((len(input_lines), 50))
labels_ = np.zeros((len(output_lines), 50))

for row, (source, target) in enumerate(zip(input_lines, output_lines)):
    # Right-pad with "_" up to 50 characters, then look up each character.
    for col, symbol in enumerate(source.ljust(50, "_")):
        input_[row][col] = letter_to_index[symbol]
    for col, symbol in enumerate(target.ljust(50, "_")):
        labels_[row][col] = letter_to_index[symbol]

In [None]:
input_.shape

In [None]:
# Vocabulary indexes are integers; the arrays were created as floats.
input_ = input_.astype(np.int32)
labels_ = labels_.astype(np.int32)

# The first 3000 rows form the test set, the next 3000 the validation set and
# everything after that the training set.
input_test   = input_[:3000]
input_val    = input_[3000:6000]
input_train  = input_[6000:]
labels_test  = labels_[:3000]
labels_val   = labels_[3000:6000]
labels_train = labels_[6000:]

# The batch iterator calls len() on these and indexes into them, so they must
# be real lists rather than one-shot zip iterators.
data_test  = list(zip(input_test, labels_test))
data_val   = list(zip(input_val, labels_val))
data_train = list(zip(input_train, labels_train))

In [None]:
labels_train.shape

## Seq2Seq model for Sanskrit segmentation

In [None]:
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import rnn_cell, seq2seq

This cell resets the graphs and session

In [None]:
# Start from an empty graph and a fresh session so this cell can be re-run.
ops.reset_default_graph()
try:
    sess.close()
except NameError:
    # First run of this cell: no session exists yet, so there is nothing to
    # close.  Any other error is left to surface instead of being hidden.
    pass
sess = tf.InteractiveSession()

In [None]:
# Number of time steps on the encoder and decoder side.  These match the
# width of 50 that every line is padded to when the data arrays are built.
input_seq_length = 50
output_seq_length = 50
# Number of examples per batch.
batch_size = 128

# Number of distinct symbols on each side: 26 lowercase + 26 uppercase +
# 10 digits + space, apostrophe and underscore.
input_vocab_size = 65
output_vocab_size = 65
# Passed to BasicLSTMCell below, i.e. the number of units in each LSTM layer.
embedding_dim = 256

As in the TensorFlow seq2seq documentation (figure below), we take our Seq2Seq learner to have the following shape:

![alt text](https://www.tensorflow.org/versions/r0.7/images/basic_seq2seq.png "Seq2Seq")

This means the decode_input has to be shifted along by one from the labels

In [None]:
def _step_placeholders(name_format, num_steps):
    """Create one int32 placeholder of shape (None,) for each time step."""
    return [
        tf.placeholder(tf.int32, shape=(None,), name=name_format % step)
        for step in range(num_steps)
    ]


# One placeholder per encoder time step and one per target time step.
encode_input = _step_placeholders("ei_%i", input_seq_length)
labels = _step_placeholders("l_%i", output_seq_length)

# The decoder is fed an all-zero "GO" step followed by the labels shifted one
# step to the right (the last label is never fed back in).
go_step = tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")
decode_input = [go_step] + labels[:-1]

This cell is the meat of the model, and a lot is happening here under the hood.  We take our cells to be LSTM recurrent units, with dropout applied to each layer's output.  We stack 3 of these as our neural network.  We then run this using the seq2seq.embedding_rnn_seq2seq pattern - this lets us hand the neural network sequences of integer ids like 1,2,3,2,1 - and the network embeds them for us, so we never have to build one-hot tensors ourselves.  

Note that we build two networks within the 'decoders' scope.  One of these is using feed_previous = True, the other not.  We set this to False during training, so that even if the learner makes a mistake on a letter - we still give it the correct label in the decoder_inputs.  Since we don't have the real label for the test set, this is set to True, and the decoder takes the letter with maximum probability from the last step of the decoder output.  

decode_outputs is a list with one tensor per output time step, each of shape (batch_size, output_vocab_size).  These are unnormalized scores (logits); running softmax on them gives a probability for each letter.

In [None]:
# Dropout keep-probability, supplied through the feed dict: the training step
# feeds 0.5 and the evaluation step feeds 1.
keep_prob = tf.placeholder("float")

# Three LSTM layers of embedding_dim units each, with dropout applied to the
# output of every layer.
cells = [rnn_cell.DropoutWrapper(
        rnn_cell.BasicLSTMCell(embedding_dim), output_keep_prob=keep_prob
    ) for i in range(3)]

stacked_lstm = rnn_cell.MultiRNNCell(cells)

# Both graphs are built inside the same variable scope; reuse_variables() makes
# the second call share the variables created by the first one.
with tf.variable_scope("decoders") as scope:
    # Training graph: the decoder is fed decode_input (the shifted labels).
    # NOTE(review): the sixth positional argument `1` is presumably the
    # embedding size in this TensorFlow release - confirm against the
    # installed version's signature.
    decode_outputs, decode_state = seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, 1)
    
    scope.reuse_variables()
    
    # Inference graph: same variables, but with feed_previous=True.
    decode_outputs_test, decode_state_test = seq2seq.embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, 1, 
    feed_previous=True)

sequence_loss is cross-entropy on the softmax of the decode outputs.  Every position is given a weight of 1, so the padding positions count towards the loss as well.

In [None]:
# One weight tensor per output step, all ones: every position, including the
# "_" padding, contributes equally to the loss.
loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in labels]
# The loss is computed on decode_outputs, i.e. the graph that is fed the
# shifted labels.
# NOTE(review): the positional arguments of sequence_loss differ between
# TensorFlow releases - confirm that output_vocab_size is accepted here.
loss = seq2seq.sequence_loss(decode_outputs, labels, loss_weights, output_vocab_size)
# Adam with a learning rate of 1e-4.
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(loss)

In [None]:
# Run the initializer for all variables defined in the graph so far.
sess.run(tf.initialize_all_variables())

## Training model

Simple class for getting random batches and reshaping them properly for the model.

In [None]:
class DataIterator(object):
    """Serve random, fixed-size batches from a sequence of (x, y) pairs.

    The data is reshuffled each time it has been exhausted.  Only full
    batches are served; the rows left over after the last full batch of an
    epoch are skipped for that epoch.
    """

    def __init__(self, data, batch_size):
        """Store the data and prepare the first shuffled epoch.

        Args:
            data: sequence of (x, y) pairs supporting len() and integer
                indexing.
            batch_size: number of pairs per batch; must be positive and no
                larger than len(data).

        Raises:
            ValueError: if batch_size is not positive, or if data holds fewer
                than batch_size pairs.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got %r" % (batch_size,))
        self.data = data
        self.batch_size = batch_size
        self.iter = self.make_random_iter()

    def next_batch(self):
        """Return the next batch as a pair of arrays (X, Y).

        Each array is transposed so that its last axis indexes the examples
        of the batch.
        """
        try:
            idxs = next(self.iter)
        except StopIteration:
            # Epoch finished: reshuffle and start the next one.
            self.iter = self.make_random_iter()
            idxs = next(self.iter)
        X, Y = zip(*[self.data[i] for i in idxs])
        X = np.array(X).T
        Y = np.array(Y).T
        return X, Y

    def make_random_iter(self):
        """Return an iterator over index arrays, one per full batch."""
        num_rows = len(self.data)
        num_batches = num_rows // self.batch_size
        if num_batches == 0:
            raise ValueError(
                "need at least batch_size=%d rows to form a batch, got %d"
                % (self.batch_size, num_rows))
        order = np.random.permutation(num_rows)
        # Cut the permutation down to a whole number of batches before
        # splitting, so that every chunk has exactly batch_size indexes and no
        # full batch is thrown away when num_rows is an exact multiple.
        usable = order[:num_batches * self.batch_size]
        return iter(np.split(usable, num_batches))
    
# Use the shared batch_size setting rather than repeating its value, so the
# iterators cannot drift away from the rest of the configuration.
train_iter = DataIterator(data_train, batch_size)
val_iter = DataIterator(data_val, batch_size)
test_iter = DataIterator(data_test, batch_size)

Our evaluation scores are based on the seq2seq loss, and on the exact-match accuracy - the fraction of words that the model spells perfectly (the code calls this "predict").

In [None]:
import sys

def get_feed(X, Y):
    """Map every encoder and label placeholder to its time-step slice.

    X[t] is fed to the t-th encoder placeholder and Y[t] to the t-th label
    placeholder.
    """
    feed_dict = {}
    for step in range(input_seq_length):
        feed_dict[encode_input[step]] = X[step]
    for step in range(output_seq_length):
        feed_dict[labels[step]] = Y[step]
    return feed_dict

def train_batch(data_iter):
    """Run one optimizer step on the next batch and return its loss."""
    batch_inputs, batch_labels = data_iter.next_batch()
    feeds = get_feed(batch_inputs, batch_labels)
    # Training runs with a dropout keep-probability of 0.5.
    feeds[keep_prob] = 0.5
    results = sess.run([train_op, loss], feeds)
    return results[1]

def get_eval_batch_data(data_iter):
    """Evaluate the next batch without dropout.

    Returns:
        (eval_loss, decode_output, X, Y): the loss value, the outputs of the
        feed_previous decoder with the batch axis moved to the front, and the
        batch inputs and labels as returned by the iterator.
    """
    X, Y = data_iter.next_batch()
    feeds = get_feed(X, Y)
    # Keep every unit: dropout is switched off for evaluation.
    feeds[keep_prob] = 1.
    fetched = sess.run([loss] + decode_outputs_test, feeds)
    eval_loss, per_step_outputs = fetched[0], fetched[1:]
    # Stacking the per-step outputs puts the time axis first; swap it with the
    # batch axis.
    decode_output = np.swapaxes(np.array(per_step_outputs), 0, 1)
    return eval_loss, decode_output, X, Y

def eval_batch(data_iter, num_batches):
    """Average the loss and the exact-match rate over several batches.

    Args:
        data_iter: iterator object providing next_batch().
        num_batches: number of batches to evaluate.

    Returns:
        (mean_loss, mean_exact_match): the mean of the per-batch losses, and
        the fraction of examples whose predicted sequence equals the label
        sequence at every position (padding included).
    """
    losses = []
    exact_matches = []
    for _ in range(num_batches):
        eval_loss, output, X, Y = get_eval_batch_data(data_iter)
        losses.append(eval_loss)

        # Pick the highest-scoring symbol at every step, once for the whole
        # batch instead of once per example.
        predictions = np.argmax(output, axis=2)
        # Y has time on its first axis; transpose it to line up with the
        # batch-first predictions, then require a match at every step.
        exact_matches.extend((predictions == Y.T).all(axis=1))
    return np.mean(losses), np.mean(exact_matches)

In [None]:
# The checkpoint calls below need a Saver object, which is not created
# anywhere else in this notebook.
saver = tf.train.Saver()
# To continue from a pretrained model, uncomment the next line:
#saver.restore(sess, "skt.ckpt")

try:
    for i in range(100000):
        train_batch(train_iter)
        if i % 1000 == 0:
            # Every 1000 steps: report loss and exact-match rate on 16
            # validation batches and 16 training batches, then checkpoint.
            val_loss, val_predict = eval_batch(val_iter, 16)
            train_loss, train_predict = eval_batch(train_iter, 16)
            print("val loss   : %f, val predict   = %.1f%%" % (val_loss, val_predict * 100))
            print("train loss : %f, train predict = %.1f%%" % (train_loss, train_predict * 100))
            print("")
            sys.stdout.flush()

            saver.save(sess, "skt.ckpt")  # Saving the model to skt.ckpt file
except KeyboardInterrupt:
    # Interrupting the kernel stops training cleanly.
    print("interrupted by user")

## Examining model outputs

In [None]:
# Build the Saver here as well, so the checkpoint can be loaded in a fresh
# session in which the training cell has not been run.
saver = tf.train.Saver()
saver.restore(sess, "skt.ckpt")

In [None]:
# Evaluate one batch from the test iterator: `output` holds the decoder
# outputs (batch first), X and Y the batch inputs and labels.
eval_loss, output, X, Y = get_eval_batch_data(test_iter)

In [None]:
# Highest-scoring symbol at every step, computed once for the whole batch.
predictions = np.argmax(output, axis=2)

# Show up to 10 randomly chosen examples (fewer if the batch is smaller).
for index in random.sample(range(len(output)), min(10, len(output))):
    inp = [index_to_letter[l] for l in X.T[index]]
    real = [index_to_letter[l] for l in Y.T[index]]
    predict = [index_to_letter[l] for l in predictions[index]]

    # Only the text before the first "_" (the padding symbol) is displayed;
    # correctness is judged on the full padded sequences.
    print("input :        " + "".join(inp).split("_")[0])
    print("real output :  " + "".join(real).split("_")[0])
    print("model output : " + "".join(predict).split("_")[0])
    print("is correct :   " + str(real == predict))
    print("")

Print only the examples from this batch that the model got exactly right.

In [None]:
# Highest-scoring symbol at every step, computed once for the whole batch.
predictions = np.argmax(output, axis=2)

for index in range(len(output)):
    inp = [index_to_letter[l] for l in X.T[index]]
    real = [index_to_letter[l] for l in Y.T[index]]
    predict = [index_to_letter[l] for l in predictions[index]]

    # Skip every example whose prediction differs from the label anywhere.
    if real != predict:
        continue

    # Only the text before the first "_" (the padding symbol) is displayed.
    print("input :        " + "".join(inp).split("_")[0])
    print("real output :  " + "".join(real).split("_")[0])
    print("model output : " + "".join(predict).split("_")[0])
    print("is correct :   " + str(real == predict))
    print("")