In [0]:
from __future__ import print_function,division
import random 
import tensorflow as tf
import numpy as np
import os

In [0]:
# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

print 'Files in Drive:'
!ls drive/

# Create a file in Drive.
!echo "This newly created file will appear in your Drive file list." > drive/created.txt

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option
Files in Drive:
created.txt


In [0]:
# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

!ls drive/

# Create a file in Drive.
!echo "This newly created file will appear in your Drive file list." > drive/created.txt

A3 (1).ipynb   class_pos.txt		 Getting started
class_neg.txt  Drive_FUSE_example.ipynb


In [0]:
import sys
if sys.version_info >= (3, 0):
  from builtins import map as m
  def map(f,l):
    return list(m(f,l))

## Sentiment Classification - dataset analysis

We will use movie review dataset taken from http://www.cs.cornell.edu/people/pabo/movie-review-data/. The exact dataset we will use is the Sentence-polarity dataset.

In [0]:
data = []
for file_,label in zip(["drive/class_neg.txt","drive/class_pos.txt"],[0,1]):
    lines = open(file_).readlines()
    lines = list(map(lambda x:x.strip().replace("-"," ").split(),lines))
    for line in lines:
        data.append([line,label])
    print("Number of reviews of {} = {}".format(file_[:-4],len(lines)))
    print("\tMax number of tokens in a sentence = {}".format(max(map(lambda x:len(x),lines))))
    print("\tMin number of tokens in a sentence = {}".format(min(map(lambda x:len(x),lines))))
random.Random(5).shuffle(data)

Number of reviews of drive/class_neg = 5331
	Max number of tokens in a sentence = 56
	Min number of tokens in a sentence = 1
Number of reviews of drive/class_pos = 5331
	Max number of tokens in a sentence = 59
	Min number of tokens in a sentence = 2


Observe that the lengths of sentences are different. In case, we need to vectorize the operations, we need all sentences to be of equal length. Therefore, we will pad all sentences to be of equal length and substitute the padded parts of sentence with zeros. 

In [0]:
# See some randomly sampled sentences
print(" ".join(data[random.randint(0,len(data))][0]))

tries too hard to be funny in a way that's too loud , too goofy and too short of an attention span .


We will work with the sentence as given and not remove any stop-words or punctuation marks. 

In [0]:
sents = map(lambda x:x[0],data) # all sentences
all_words = set()
for sent in sents:
    all_words |= set(sent)
all_words = sorted(list(all_words))
vocab = {all_words[i]:i for i in range(len(all_words))}
print("Number of words : ",len(vocab))
train = data[:int(0.8*len(data))]
test = data[int(0.8*len(data)):]
train_data = []
train_targets = []
test_data = []
test_targets = []
for list_all,list_data,list_target,label_list in zip([train,test],[train_data,test_data],[train_targets,test_targets],["train","test"]):
    for datum,label in list_all:
        list_data.append([vocab[w] for w in datum])
        list_target.append([label])
    print(label_list)
    print("\tNumber of positive examples : ",list_target.count([1]))
    print("\tNumber of negative examples : ",list_target.count([0]))

Number of words :  19757
train
	Number of positive examples :  4288
	Number of negative examples :  4241
test
	Number of positive examples :  1043
	Number of negative examples :  1090


For implementation purposes, we will need an index for the padded word and we will use the index 19757.
Note: For a dataset of this <i>small</i> size, we will need to do K-Fold Cross-validation to evaluate the performance. However, we will work with this train-test split for the rest of this assignment. 

## Simple Classifier

<img src="https://web.cs.dal.ca/~sastry/cnn_simple.jpg"/>

The above image shows the architecture of the simple model that we will implement for text classification. We are interested in the following hyperparameters apart from the number of filters (which we will set to 1 for this problem):
* The span of the filter/the number of words considered for making the prediction.
* The size of the stride.
* The number of activations selected for feeding into softmax classifier.


First, we will write code which can select k top elements in the order they appeared. 

In [0]:
def k_max_pool(A,k):
    """
    A = 2 dimensional array (assume that the length of last dimension of A will be always more than k)
    k = number of elements.
    Return: For every row of A, top k elements in the order they appear.
    """
    assert len(A.get_shape())==2
    def func(row):
        """
        Hint : I used top_k and reverse.
        I am not sure whether the order of the indices are retained when sorted = False in top_k. (did not find any documentation)
        Therefore, I suggest that you sort the indices before selecting the elements from the array(Trick: use top_k again!)"""
        b=tf.nn.top_k(row,k)
        c=tf.nn.top_k(b.indices,k)[0]
        c=tf.reverse(c,[0])
        ret_tensor = tf.gather_nd(row,tf.reshape(c,[k,1]))
        ## your code here to compute ret_tensor ##
        return ret_tensor
    return tf.map_fn(func,A)

In [0]:
A = tf.placeholder(shape=[None,None],dtype=tf.float64)
top = k_max_pool(A,5)
sess = tf.Session()
for i in range(1,6):
    np.random.seed(5)
    l = np.random.randn(i*10,i*10)
    top_elements = sess.run(top,feed_dict={A:l})
    l = l.tolist()
    top_elements2 = np.array(map(lambda x: [x[i] for i in range(len(x)) if x[i]>sorted(x,reverse=True)[5]],l))
    # Note that this test assumes that the 6th largest element and 5th largest element are different.
    print(((top_elements - top_elements2)<10**-10).all())

True
True
True
True
True


In [0]:
def initializer(shape):
    xavier = tf.contrib.layers.xavier_initializer(seed=1)
    return xavier(shape)

In [0]:
class CNN_simple:
    def __init__(self,num_words,embedding_size = 30,span=2,k=5):
        self.num_words = num_words

        # The batch of text documents. Let's assume that it is always padded to length 100. 
        # We could use [None,None], but we'll use [None,100] for simplicity. 
        self.input = tf.placeholder(shape=[None,100],dtype=tf.int32)
        self.expected_output = tf.placeholder(shape=[None,1],dtype=tf.float32)
        

        embedding_matrix = tf.Variable(initializer((num_words, embedding_size)), name="embeddings")
        # Add an additional row of zeros to denote padded words.
        paddings = tf.constant([[0, 1], [0, 0]])
        self.embedding_matrix = tf.pad(embedding_matrix, paddings, "CONSTANT")
        
        
        # Extract the vectors from the embedding matrix. The dimensions should be None x 100 x embedding_size. 
        # Use embedding lookup
        vectors = tf.nn.embedding_lookup(self.embedding_matrix,self.input) # None x 100 x embedding_size
        
        # In order to use conv2d, we need vectors to be 4 dimensional.
        # The convention is NHWC - None (Batch Size) x Height(Height of image) x Width(Width of image) x Channel(Depth - similar to RGB).
        # For text, let's consider Height = 1, width = number of words, channel = embedding_size.
        # Use expand-dims to modify. 
        vectors2d = tf.expand_dims(vectors, 1) # None x 1 x 100 x embedding_size
        
        # Conv2d needs a filter bank.
        # The dimensions of the filter bank = Height, Width, in-channels, out-channels(Number-of-Filters).
        # We are creating a single filter of size = span. 
        # So, height = 1, width = span, in-channels = embedding_size ,out-channels = 1. 
        single_filter = tf.Variable(initializer((1, span, embedding_size, 1)), name="filter")  
        bias = tf.Variable(0.0,name="bias") # You need a bias for each filter.
        conv_span = tf.nn.conv2d(
            input=vectors2d,
            filter=single_filter,
            # Note that the first and last elements SHOULD be 1. 
            strides=[1, 1, 1, 1], 
            # This means that we are ok with input size being reduced during the process of convolution.
            padding="VALID"
        ) # Shape = None x 1 x 99 x 1
        acts = tf.nn.leaky_relu(conv_span+bias)
        
        # Now, let us extract the top k activations. 
        # But, we need to first convert acts this into 2-dimensional.  
        # Use tf.squeeze. Be sure to specify the squeeze-dimensions
        acts_2d = tf.squeeze(acts,[1,3])
        
        # Use k_max_pool to extract top-k activations
        input_fully_connected = k_max_pool(acts_2d,k) # None x k
        
        # Initialize the weight and bias needed for softmax classifier.
        self.softmax_weight = tf.Variable(dtype=tf.float32,initial_value=initializer((k,2)))
        self.softmax_bias = tf.Variable(dtype=tf.float32,initial_value=np.zeros(shape=[2]))
        
        # Write out the equation for computing the logits.
        self.output = tf.nn.softmax(tf.matmul(input_fully_connected, self.softmax_weight) + self.softmax_bias, axis=1) # Shape = Nonex2
        
        # Compute the cross-entropy cost. 
        # You might either sum or take mean of all the costs across all the examples. 
        # It is your choice as the test case is on Stochastic Training. 
        self.cost = tf.reduce_mean(-((self.expected_output*tf.log(self.output[:,1])+((1-self.expected_output)*tf.log(self.output[:,0])))))
        
        correct_prediction = tf.equal(tf.reshape(tf.argmax(self.output, 1),[-1,1]), tf.cast(self.expected_output, dtype=tf.int64))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))        
        
        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.cost)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

    def pad(self,data,pad_word,pad_length=100):
        for datum in data:
            datum.extend([pad_word]*(pad_length-len(datum)))
        return data
    
    def train(self,train_data,test_data,train_targets,test_targets,batch_size=1,epochs=1,verbose=False):
        sess = self.session
        self.pad(train_data,self.num_words)
        self.pad(test_data,self.num_words)
        print("Starting training...")
        #print(sess.run(tf.shape(acts)))
        for epoch in range(epochs):
            cost_epoch = 0
            c = 0
            for datum,target in zip([train_data[i:i+batch_size] for i in range(0,len(train_data),batch_size)],
                                   [train_targets[i:i+batch_size] for i in range(0,len(train_targets),batch_size)]):
                _,cost = sess.run([self.train_op,self.cost],feed_dict={self.input:datum,self.expected_output:target})
                cost_epoch += cost
                c += 1
                if c%100 == 0 and verbose:
                    print("\t{} batches finished. Cost : {}".format(c,cost_epoch/c))
            print("Epoch {}: {}".format(epoch,cost_epoch/len(train_data)))
            print("\tTrain accuracy: {}".format(self.compute_accuracy(train_data,train_targets)))
            print("\tTest accuracy: {}".format(self.compute_accuracy(test_data,test_targets)))
    
    def compute_accuracy(self,data,targets):
        return self.session.run(self.accuracy,feed_dict={self.input:data,self.expected_output:targets})

In [0]:
c=CNN_simple(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,epochs=1,verbose=True)

Starting training...
	100 batches finished. Cost : 0.688363182545
	200 batches finished. Cost : 0.695461704433
	300 batches finished. Cost : 0.6959020708
	400 batches finished. Cost : 0.697339072824
	500 batches finished. Cost : 0.698220447183
	600 batches finished. Cost : 0.697462852498
	700 batches finished. Cost : 0.697759581293
	800 batches finished. Cost : 0.697326156646
	900 batches finished. Cost : 0.696654122339
	1000 batches finished. Cost : 0.696934172928
	1100 batches finished. Cost : 0.696292648017
	1200 batches finished. Cost : 0.695106213714
	1300 batches finished. Cost : 0.695746991932
	1400 batches finished. Cost : 0.69638582832
	1500 batches finished. Cost : 0.69563915801
	1600 batches finished. Cost : 0.695392685775
	1700 batches finished. Cost : 0.695754958128
	1800 batches finished. Cost : 0.695977654043
	1900 batches finished. Cost : 0.695701952247
	2000 batches finished. Cost : 0.695247689158
	2100 batches finished. Cost : 0.694428743805
	2200 batches finished. Co

	6600 batches finished. Cost : 0.683325836033
	6700 batches finished. Cost : 0.682583970579
	6800 batches finished. Cost : 0.681868460882
	6900 batches finished. Cost : 0.682024431525
	7000 batches finished. Cost : 0.681632122029
	7100 batches finished. Cost : 0.680960628168
	7200 batches finished. Cost : 0.680770801177
	7300 batches finished. Cost : 0.680385479517
	7400 batches finished. Cost : 0.679857558429
	7500 batches finished. Cost : 0.679272243466
	7600 batches finished. Cost : 0.678875331777
	7700 batches finished. Cost : 0.67837705911
	7800 batches finished. Cost : 0.678056778775
	7900 batches finished. Cost : 0.678070614286
	8000 batches finished. Cost : 0.677414160635
	8100 batches finished. Cost : 0.676894019406
	8200 batches finished. Cost : 0.676402020847
	8300 batches finished. Cost : 0.675500193958
	8400 batches finished. Cost : 0.674959771196
	8500 batches finished. Cost : 0.67482759468
Epoch 0: 0.675099702244
	Train accuracy: 0.718958854675
	Test accuracy: 0.66432255

The expected output for the above snippet is
<pre>
Starting training...
	100 batches finished. Cost : 0.688363179564
	200 batches finished. Cost : 0.695461705327
	300 batches finished. Cost : 0.695902070602
	400 batches finished. Cost : 0.697339072227
	500 batches finished. Cost : 0.698220448136
    ...
Epoch 0: 0.675099702418
	Train accuracy: 0.718958854675
	Test accuracy: 0.664322555065   
</pre>
If you get any other output and you feel you are correct, you can proceed (However, I cannot think of any case where you can get a different output). 

## ConvNet 

### Architecture

<img src="https://web.cs.dal.ca/~sastry/cnn.png" style="height:40%;width:40%">

Essentially, there are 2 kind of hyper-parameters - the filter size and number of filters of each size. In the image shown, there are 3 filter-sizes - 2,3,4 and number of filters of each size is 2. Once the convolution is obtained, 1-max pooling is done - it basically involves extracting 1 activation from the list of activations which is the maximum activation. The reason we need to do this is to construct the inputs to the softmax layer which are of a fixed size.
Read more at https://arxiv.org/pdf/1510.03820.pdf. 

In [0]:
class CNN:
    def __init__(self,num_words,embedding_size = 30):
        self.num_words = num_words

        # The batch of text documents. Let's assume that it is always padded to length 100. 
        # We could use [None,None], but we'll use [None,100] for simplicity. 
        self.input = tf.placeholder(shape=[None,100],dtype=tf.int32)
        self.expected_output = tf.placeholder(shape=[None,1],dtype=tf.float32)
        

        embedding_matrix = tf.Variable(initializer((num_words, embedding_size)), name="embeddings")
        # Add an additional row of zeros to denote padded words.
        paddings = tf.constant([[0, 1], [0, 0]])
        self.embedding_matrix = tf.pad(embedding_matrix, paddings, "CONSTANT")
        
        # Extract the vectors from the embedding matrix. The dimensions should be None x 100 x embedding_size. 
        # Use embedding lookup
        vectors = tf.nn.embedding_lookup(self.embedding_matrix,self.input) # None x 100 x embedding_size
        
        # In order to use conv2d, we need vectors to be 4 dimensional.
        # The convention is NHWC - None (Batch Size) x Height(Height of image) x Width(Width of image) x Channel(Depth - similar to RGB).
        # For text, let's consider Height = 1, width = number of words, channel = embedding_size.
        # Use expand-dims to modify. 
        vectors2d = tf.expand_dims(vectors, 1) # None x 1 x 100 x embedding_size
        
        # Create 50 filters with span of 3 words. You need 1 bias for each filter.
        filter_tri = tf.Variable(initializer((1,3,embedding_size,50)), name="weight3")  
        bias_tri = tf.Variable(tf.zeros((1,50)), name="bias3")  
        conv1 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_tri,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?
        A1 = tf.nn.leaky_relu(conv1+bias_tri)

        # Create 50 filters with span of 4 words. You need 1 bias for each filter.
        filter_4 = tf.Variable(initializer((1,4,embedding_size,50)), name="weight4")  
        bias_4 = tf.Variable(tf.zeros((1,50)), name="bias4")
        conv2 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_4,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?

        A2 = tf.nn.leaky_relu(conv2+bias_4)

        # Create 50 filters with span of 5 words. You need 1 bias for each filter.
        filter_5 = tf.Variable(initializer((1,5,embedding_size,50)), name="weight5")  
        bias_5 = tf.Variable(tf.zeros((1,50)), name="bias5")
        conv3 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_5,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?

        A3 = tf.nn.leaky_relu(conv3+bias_5)
        
        A1_2d = tf.squeeze(A1,[1])
        A2_2d = tf.squeeze(A2,[1])
        A3_2d = tf.squeeze(A3,[1])
        
        # Now extract the maximum activations for each of the filters. The shapes are listed alongside. 
        max_A1 = tf.reduce_max(A1_2d,reduction_indices=[1])  # None x 50
        max_A2 =  tf.reduce_max(A2_2d,reduction_indices=[1])  # None x 50
        max_A3 =  tf.reduce_max(A3_2d,reduction_indices=[1])  # None x 50
        
        concat = tf.concat([max_A1, max_A2, max_A3], axis=1) # None x 150
        
        # Initialize the weight and bias needed for softmax classifier. 
        self.softmax_weight = tf.Variable(dtype=tf.float32,initial_value=initializer((150,2)))
        self.softmax_bias = tf.Variable(dtype=tf.float32,initial_value=np.zeros(shape=[2]))
        
        # Write out the equation for computing the logits.
        self.output = tf.nn.softmax(tf.matmul(concat, self.softmax_weight) + self.softmax_bias, axis=1) # Shape = ?
        
        # Compute the cross-entropy cost. 
        # You might either sum or take mean of all the costs across all the examples. 
        # It is your choice as the test case is on Stochastic Training. 
        self.cost = tf.reduce_mean(-((self.expected_output*tf.log(self.output[:,1])+((1-self.expected_output)*tf.log(self.output[:,0])))))
        
        correct_prediction = tf.equal(tf.reshape(tf.argmax(self.output, 1),[-1,1]), tf.cast(self.expected_output, dtype=tf.int64))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.cost)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())

    def pad(self,data,pad_word,pad_length=100):
        for datum in data:
            datum.extend([pad_word]*(pad_length-len(datum)))
        return data
    
    def train(self,train_data,test_data,train_targets,test_targets,batch_size=1,epochs=1,verbose=False):
        sess = self.session
        self.pad(train_data,self.num_words)
        self.pad(test_data,self.num_words)
        print("Starting training...")
        for epoch in range(epochs):
            cost_epoch = 0
            c = 0
            for datum,target in zip([train_data[i:i+batch_size] for i in range(0,len(train_data),batch_size)],
                                   [train_targets[i:i+batch_size] for i in range(0,len(train_targets),batch_size)]):
                _,cost = sess.run([self.train_op,self.cost],feed_dict={self.input:datum,self.expected_output:target})
                cost_epoch += cost
                c += 1
                if c%100 == 0 and verbose:
                    print("\t{} batches finished. Cost : {}".format(c,cost_epoch/c))
            print("Epoch {}: {}".format(epoch,cost_epoch/len(train_data)))
            print("\tTrain accuracy: {}".format(self.compute_accuracy(train_data,train_targets)))
            print("\tTest accuracy: {}".format(self.compute_accuracy(test_data,test_targets)))
    
    def compute_accuracy(self,data,targets):
        return self.session.run(self.accuracy,feed_dict={self.input:data,self.expected_output:targets})

In [0]:
c=CNN(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,epochs=1,verbose=True)

Starting training...
	100 batches finished. Cost : 0.692921407223
	200 batches finished. Cost : 0.694593489766
	300 batches finished. Cost : 0.69501670599
	400 batches finished. Cost : 0.695035843849
	500 batches finished. Cost : 0.693291912556
	600 batches finished. Cost : 0.692537607749
	700 batches finished. Cost : 0.692302573579
	800 batches finished. Cost : 0.688138688877
	900 batches finished. Cost : 0.684975062774
	1000 batches finished. Cost : 0.673650282636
	1100 batches finished. Cost : 0.66513232009
	1200 batches finished. Cost : 0.658742382556
	1300 batches finished. Cost : 0.657242691471
	1400 batches finished. Cost : 0.659336286146
	1500 batches finished. Cost : 0.656265231178
	1600 batches finished. Cost : 0.654087435502
	1700 batches finished. Cost : 0.652741391861
	1800 batches finished. Cost : 0.65330711311
	1900 batches finished. Cost : 0.651790104513
	2000 batches finished. Cost : 0.649372568377
	2100 batches finished. Cost : 0.64534604844
	2200 batches finished. Co

	6600 batches finished. Cost : 0.588505509759
	6700 batches finished. Cost : 0.58700422433
	6800 batches finished. Cost : 0.586377448646
	6900 batches finished. Cost : 0.586005952792
	7000 batches finished. Cost : 0.58512639721
	7100 batches finished. Cost : 0.583860160822
	7200 batches finished. Cost : 0.5825608631
	7300 batches finished. Cost : 0.582344473591
	7400 batches finished. Cost : 0.581754032691
	7500 batches finished. Cost : 0.580579865641
	7600 batches finished. Cost : 0.579680734467
	7700 batches finished. Cost : 0.579085824643
	7800 batches finished. Cost : 0.578347276748
	7900 batches finished. Cost : 0.577519263627
	8000 batches finished. Cost : 0.578204833062
	8100 batches finished. Cost : 0.577526843726
	8200 batches finished. Cost : 0.577229891062
	8300 batches finished. Cost : 0.575433941908
	8400 batches finished. Cost : 0.573412873759
	8500 batches finished. Cost : 0.572945575809
Epoch 0: 0.5740312021
	Train accuracy: 0.894360423088
	Test accuracy: 0.761837780476

The expected output for the above snippet is
<pre>
Starting training...
	100 batches finished. Cost : 0.692921404839
	200 batches finished. Cost : 0.694593518078
	300 batches finished. Cost : 0.695016788642
	400 batches finished. Cost : 0.695038306713
	500 batches finished. Cost : 0.693231915712
    ...
Epoch 0: 0.571991487547
	Train accuracy: 0.895532906055
	Test accuracy: 0.759962499142 
</pre>
If you get any other output and you feel you are correct, you can proceed (However, I cannot think of any case where you can get a different output). 

### Effect of Batch Size on Training

Study the effects of changing batch size. Just run the various experiments and observe the results (Run it in non-verbose mode). No need to make any comments here.

In [0]:
c=CNN(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,4,epochs=1,verbose=False)

Starting training...
Epoch 0: 0.173166305431
	Train accuracy: 0.719662308693
	Test accuracy: 0.679793715477


In [0]:
c=CNN(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,6,epochs=1,verbose=False)

Starting training...
Epoch 0: 0.115678352248
	Train accuracy: 0.689998805523
	Test accuracy: 0.661040782928


In [0]:
c=CNN(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,10,epochs=1,verbose=False)

Starting training...
Epoch 0: 0.0693999522176
	Train accuracy: 0.649431347847
	Test accuracy: 0.639006078243


In [0]:
c=CNN(len(vocab))
c.train(train_data,test_data,train_targets,test_targets,20,epochs=1,verbose=False)

Starting training...
Epoch 0: 0.0347383128533
	Train accuracy: 0.570055127144
	Test accuracy: 0.541959702969


### Embeddings

2 functions - get_distance and get_most_similar to the CNN class (the big one). 
* get_distance(word1,word2) - should return the cosine distance between the 2 words.
* get_most_similar(word) - should return top 10 most similar words to the word passed.



In [0]:
class CNN2:
    def __init__(self,num_words,embedding_size = 30):
        self.num_words = num_words
        self.embedding_size = embedding_size
        # The batch of text documents. Let's assume that it is always padded to length 100. 
        # We could use [None,None], but we'll use [None,100] for simplicity. 
        self.input = tf.placeholder(shape=[None,100],dtype=tf.int32)
        self.expected_output = tf.placeholder(shape=[None,1],dtype=tf.float32)
        

        embedding_matrix = tf.Variable(initializer((num_words, embedding_size)), name="embeddings")
        # Add an additional row of zeros to denote padded words.
        paddings = tf.constant([[0, 1], [0, 0]])
        self.embedding_matrix = tf.pad(embedding_matrix, paddings, "CONSTANT")
        
        # Extract the vectors from the embedding matrix. The dimensions should be None x 100 x embedding_size. 
        # Use embedding lookup
        vectors = tf.nn.embedding_lookup(self.embedding_matrix,self.input) # None x 100 x embedding_size
        
        # In order to use conv2d, we need vectors to be 4 dimensional.
        # The convention is NHWC - None (Batch Size) x Height(Height of image) x Width(Width of image) x Channel(Depth - similar to RGB).
        # For text, let's consider Height = 1, width = number of words, channel = embedding_size.
        # Use expand-dims to modify. 
        vectors2d = tf.expand_dims(vectors, 1) # None x 1 x 100 x embedding_size
        
        # Create 50 filters with span of 3 words. You need 1 bias for each filter.
        filter_tri = tf.Variable(initializer((1,3,embedding_size,50)), name="weight3")  
        bias_tri = tf.Variable(tf.zeros((1,50)), name="bias3")  
        conv1 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_tri,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?
        A1 = tf.nn.leaky_relu(conv1+bias_tri)

        # Create 50 filters with span of 4 words. You need 1 bias for each filter.
        filter_4 = tf.Variable(initializer((1,4,embedding_size,50)), name="weight4")  
        bias_4 = tf.Variable(tf.zeros((1,50)), name="bias4")
        conv2 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_4,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?

        A2 = tf.nn.leaky_relu(conv2+bias_4)

        # Create 50 filters with span of 5 words. You need 1 bias for each filter.
        filter_5 = tf.Variable(initializer((1,5,embedding_size,50)), name="weight5")  
        bias_5 = tf.Variable(tf.zeros((1,50)), name="bias5")
        conv3 = tf.nn.conv2d(
            input=vectors2d,
            filter=filter_5,
            strides=[1, 1, 1, 1],
            padding="VALID"
        )  # Shape = ?

        A3 = tf.nn.leaky_relu(conv3+bias_5)
        
        A1_2d = tf.squeeze(A1,[1])
        A2_2d = tf.squeeze(A2,[1])
        A3_2d = tf.squeeze(A3,[1])
        
        # Now extract the maximum activations for each of the filters. The shapes are listed alongside. 
        max_A1 = tf.reduce_max(A1_2d,reduction_indices=[1])  # None x 50
        max_A2 =  tf.reduce_max(A2_2d,reduction_indices=[1])  # None x 50
        max_A3 =  tf.reduce_max(A3_2d,reduction_indices=[1])  # None x 50
        
        concat = tf.concat([max_A1, max_A2, max_A3], axis=1) # None x 150
        
        # Initialize the weight and bias needed for softmax classifier. 
        self.softmax_weight = tf.Variable(dtype=tf.float32,initial_value=initializer((150,2)))
        self.softmax_bias = tf.Variable(dtype=tf.float32,initial_value=np.zeros(shape=[2]))
        
        # Write out the equation for computing the logits.
        self.output = tf.nn.softmax(tf.matmul(concat, self.softmax_weight) + self.softmax_bias, axis=1) # Shape = ?
        
        # Compute the cross-entropy cost. 
        # You might either sum or take mean of all the costs across all the examples. 
        # It is your choice as the test case is on Stochastic Training. 
        self.cost = tf.reduce_mean(-((self.expected_output*tf.log(self.output[:,1])+((1-self.expected_output)*tf.log(self.output[:,0])))))
        
        correct_prediction = tf.equal(tf.reshape(tf.argmax(self.output, 1),[-1,1]), tf.cast(self.expected_output, dtype=tf.int64))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.cost)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
    
    def get_distance(self,word1,word2):
      sess1=self.session
      #First let's load meta graph and restore weights
      saver = tf.train.import_meta_graph('drive/my_cnn_model.meta')
      saver.restore(sess1,tf.train.latest_checkpoint('drive/./'))

      graph = tf.get_default_graph()
      embedding_matrix = graph.get_tensor_by_name("self.embedding_matrix:0")

      index1=vocab[word1]
      index2=vocab[word2]
      embedding1=embedding_matrix[index1,:]
      embedding2=embedding_matrix[index2,:]
      l1=tf.nn.l2_normalize(embedding1,axis=0)
      l2=tf.nn.l2_normalize(embedding2,axis=0)
      distance=tf.reduce_sum(tf.multiply(l1,l2))
      return self.session.run(distance)
    
    def get_most_similar_indices(self,word):
      length=len(vocab)+1
      t=tf.nn.l2_normalize(self.embedding_matrix,axis=1)
      index2=vocab[word]
      ts=self.embedding_matrix[index2,:]
      p=tf.nn.l2_normalize(ts,axis=0)
      p=tf.reshape(p,[1,self.embedding_size])
      p=tf.tile(p,[length,1])
      #s = 1-tf.losses.cosine_distance(p, t, dim=0)
      s=1-tf.reduce_sum(tf.multiply(p,t),reduction_indices=[1])
      s=tf.reshape(s,[1,length])
      r=tf.nn.top_k(s,11).indices
      return r
    
    def get_most_similar(self,word):
      sess2 = self.session
      #sess2.run(tf.global_variables_initializer())
      #print(sess.run(tf.cast(k_max_pool(s,11),tf.int32)))
      tp=sess2.run(self.get_most_similar_indices(word))
      words=[]
      for i in tp[0,:]:
          words.append(all_words[i])
      return words

    def pad(self,data,pad_word,pad_length=100):
        for datum in data:
            datum.extend([pad_word]*(pad_length-len(datum)))
        return data
    
    def train(self,train_data,test_data,train_targets,test_targets,batch_size=1,epochs=1,verbose=False):
        sess = self.session
        self.pad(train_data,self.num_words)
        self.pad(test_data,self.num_words)
        print("Starting training...")
        for epoch in range(epochs):
            cost_epoch = 0
            c = 0
            for datum,target in zip([train_data[i:i+batch_size] for i in range(0,len(train_data),batch_size)],
                                   [train_targets[i:i+batch_size] for i in range(0,len(train_targets),batch_size)]):
                saver = tf.train.Saver()
                _,cost = sess.run([self.train_op,self.cost],feed_dict={self.input:datum,self.expected_output:target})
                cost_epoch += cost
                c += 1
                if c%5==0:

                  #Run the operation by feeding input
                  #Prints 24 which is sum of (w1+w2)*b1 

                  #Now, save the graph
                  saver.save(sess, 'drive/my_cnn_model')
                  print(self.get_distance('good','bad'))
                if c%100 == 0 and verbose:
                    print("\t{} batches finished. Cost : {}".format(c,cost_epoch/c))
            print("Epoch {}: {}".format(epoch,cost_epoch/len(train_data)))
            print("\tTrain accuracy: {}".format(self.compute_accuracy(train_data,train_targets)))
            print("\tTest accuracy: {}".format(self.compute_accuracy(test_data,test_targets)))
    
    def compute_accuracy(self,data,targets):
        return self.session.run(self.accuracy,feed_dict={self.input:data,self.expected_output:targets})

### Learnings:

List out the observations and conclusions you made from the various experiments. 