# Importing libraries 

In [14]:
from ml_models.sent2vec import Sent2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import tensorflow as tf

## Loading training and test 

In [15]:
training_data = pd.read_pickle("resources/pickle/training.pkl")
testing_data = pd.read_pickle("resources/pickle/test.pkl")

# Neural network construction 

In [16]:
BATCH_SIZE = 64
EPOCHES = 50
LEARNING_RATE = 0.0001
L2_LAMBDA = 10
KEEP_PROB = 0.8

In [17]:
def padding(sent,vec_size):
    while len(sent)<20:
        sent.append(np.zeros(vec_size))
    return sent

In [18]:
def pre_process(sent_df):
    
    try:
        all_sents = sent_df["Sentence"].values.tolist()
    except:
        all_sents = sent_df["sentences"].values.tolist()

    try:
        all_labels = sent_df["Label"].values.tolist()
    except:
        all_labels = sent_df["label"].values.tolist()

    all_labels = np.array([[1,0,0] if x == "positive" else ([0,1,0] if x == "negative" else [0,0,1]) for x in all_labels])

    sent2vec = Sent2Vec()
    all_sents = [sent2vec.transform_text_to_vec_matrix(x) for x in all_sents]
    vec_size = len(all_sents[0][0])
    
    padded_sents = [padding(sent,vec_size) if len(sent) < 20
                   else sent[:20] for sent in all_sents]
    # for sent in all_sents:
    #     if len(sent) < 20:
    #         sent = padding(sent)
    #     if len(sent) > 20:
    #         sent = sent[:19]
    padded_sents = np.array(padded_sents)
    padded_sents = np.expand_dims(padded_sents,axis = -1)
    
    return padded_sents, all_labels,vec_size

In [19]:
def get_batch(x, y, batch_size=BATCH_SIZE, shuffle=True):
    
    if shuffle:
        shuffled_index = np.random.permutation(range(x.shape[0]))
        x = x[shuffled_index]
        y = y[shuffled_index]
    
    n_batches = int(x.shape[0] / batch_size)
    
    for i in range(n_batches - 1):
        x_batch = x[i*batch_size: (i+1)*batch_size]
        y_batch = y[i*batch_size: (i+1)*batch_size]
        yield x_batch, y_batch

In [None]:
padded_sents,all_labels, vec_size= pre_process(training_data)
test_padded_sents,test_all_labels,_ = pre_process(testing_data)


In [8]:
filter_size = [2,3,4,5,6]
filter_num = 100
vec_size = vec_size
sent_size = 20

In [9]:
x_input = tf.placeholder(tf.float32,[None,20,50,1])
y_input = tf.placeholder(tf.float32,[None,3])
pooling_output=[]

for i, flt_size in enumerate(filter_size):
    flt_shape = [flt_size, vec_size, 1, filter_num]
    W = tf.Variable(tf.truncated_normal(flt_shape,stddev = 0.1))
    b = tf.Variable(tf.zeros(filter_num))
    
    conv = tf.nn.conv2d(x_input, W, [1,1,1,1], "VALID")
    active = tf.nn.relu(tf.nn.bias_add(conv,b))
    max_pool = tf.nn.max_pool(active, [1,sent_size-flt_size+1,1,1],
                              [1,1,1,1],"VALID")
    pooling_output.append(max_pool)

total_pooling = tf.concat(pooling_output, 3)
total_flt = filter_num*len(filter_size)
flattern = tf.reshape(total_pooling, (-1, total_flt))

dropout = tf.nn.dropout(flattern, KEEP_PROB)

W = tf.Variable(tf.truncated_normal([total_flt,3],stddev = 0.1))
b = tf.Variable(tf.zeros(3)) 

full_nn = tf.add(tf.matmul(dropout, W), b)
pred = tf.nn.softmax(full_nn)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y_input, logits = full_nn))
loss = loss+L2_LAMBDA*tf.nn.l2_loss(W)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

correct_pred = tf.equal(tf.argmax(pred,1), tf.argmax(y_input,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
saver = tf.train.Saver()    

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



## Training of Neural Network 

In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(501):
        for x_train, y_train in get_batch(padded_sents, all_labels):
            sess.run(optimizer, feed_dict={x_input:x_train,y_input:y_train})
        train_accuracy = sess.run(accuracy, feed_dict={x_input:padded_sents,y_input:all_labels})
        test_accuracy = sess.run(accuracy, feed_dict={x_input:test_padded_sents,y_input:test_all_labels})
        
        if epoch%100 == 0:
            print("Epoch:", epoch)
            print("train_accuracy is", train_accuracy)
            print("test_accuracy is", test_accuracy, "\n")
    save_path = saver.save(sess, "saved_cnn_models/cnn_model.ckpt")
    print("Model saved in path: %s" % save_path)        

Epoch: 0
train_accuracy is 0.6015901
test_accuracy is 0.47368422 

Epoch: 100
train_accuracy is 0.73277384
test_accuracy is 0.4868421 

Epoch: 200
train_accuracy is 0.7632509
test_accuracy is 0.62171054 

Epoch: 300
train_accuracy is 0.8701413
test_accuracy is 0.68421054 

Epoch: 400
train_accuracy is 0.8860424
test_accuracy is 0.6875 

Epoch: 500
train_accuracy is 0.8909011
test_accuracy is 0.69736844 

Model saved in path: saved_cnn_models/cnn_model.ckpt
