# MLCL Assignment - 3
## Submitted by Chetan Patil

### Both tasks are trained on the polarity dataset: http://www.cs.cornell.edu/people/pabo/movie-review-data/
## Task 1:
### Part a: Text Classification using CNNs on polarity Dataset

In [206]:
import tensorflow as tf
import numpy as np
import os, glob

import pandas as pd
import keras
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Activation, Conv1D, GlobalMaxPooling1D
from keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder

In [207]:
sentence = []
labels = []


def read_train_dataset(child, sentence_list, labels_list, label, path="./txt_sentoken/"):
    """Append every line of every ``.txt`` file under ``path + child`` to the lists.

    Args:
        child: Sub-directory of ``path`` to read (e.g. ``"pos"`` or ``"neg"``).
        sentence_list: List extended in place with one entry per line read;
            trailing newlines are kept.
        labels_list: List extended in place with ``label`` once per line, so
            it stays index-aligned with ``sentence_list``.
        label: Class label recorded for every line read from ``child``.
        path: Dataset root. It is concatenated directly with ``child``, so it
            should end with a path separator.
    """
    for root, _dirs, files in os.walk(path + child):
        for file in files:
            if file.endswith(".txt"):
                # Join with the directory currently being walked so files in
                # nested folders resolve; the context manager closes the
                # handle even if reading fails.
                with open(os.path.join(root, file), "r") as f:
                    for line in f:
                        sentence_list.append(line)
                        labels_list.append(label)

In [208]:
# Load the positive reviews first, then the negative ones, into the shared lists.
read_train_dataset(child="pos", sentence_list=sentence, labels_list=labels, label="pos")
read_train_dataset(child="neg", sentence_list=sentence, labels_list=labels, label="neg")

In [209]:
# Total number of samples: read_train_dataset adds one entry per line of each review file.
print(len(sentence))

64720


In [210]:
# Vocabulary cap: used as Tokenizer num_words and as the Embedding input_dim.
vocab_dictionary = 10000
# Length passed to pad_sequences(maxlen=...) and to Embedding(input_length=...).
max_seq_len = 20
# Embedding output_dim (size of each word vector).
embedding_size = 50
# Mini-batch size passed to model.fit.
batch_size = 64
# Number of training epochs passed to model.fit.
epochs = 10
# Conv1D kernel_size (tokens covered by each filter).
filter_size = 3
# Number of Conv1D filters.
filters = 250
# Units in the hidden Dense layer of the CNN.
hidden_dims = 256

In [211]:
#preprocessing
# Turn each line into a fixed-length vector of word indices.
token = Tokenizer(num_words=vocab_dictionary)
token.fit_on_texts(sentence)
x = sequence.pad_sequences(token.texts_to_sequences(sentence), maxlen=max_seq_len)

# Encode the string labels as integers, then one-hot them.
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(labels))

print(x.shape)
print(y.shape)

(64720, 20)
(64720, 2)


In [212]:
#model
# Embedding -> dropout -> 1-D convolution -> global max pooling -> dense head.
model = Sequential([
    Embedding(input_dim=vocab_dictionary, output_dim=embedding_size, input_length=max_seq_len),
    Dropout(0.2),
    Conv1D(filters=filters, kernel_size=filter_size, strides=1, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=hidden_dims, activation='relu', kernel_regularizer=l2(0.2)),
    Dropout(0.2),
    # Two-way softmax over the one-hot label columns.
    Dense(units=2, activation='softmax'),
])

In [215]:
#compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 20, 50)            500000    
_________________________________________________________________
dropout_51 (Dropout)         (None, 20, 50)            0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 18, 250)           37750     
_________________________________________________________________
global_max_pooling1d_23 (Glo (None, 250)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 256)               64256     
_________________________________________________________________
dropout_52 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_45 (Dense)             (None, 2)                 514       
Total para

In [216]:
#model fitting
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling, and x/y are ordered all "pos" then all "neg". Shuffle both
# with the same fixed-seed permutation so the held-out 20% contains both classes.
shuffle_idx = np.random.RandomState(42).permutation(len(x))
history = model.fit(x[shuffle_idx], y[shuffle_idx], batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 51776 samples, validate on 12944 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [217]:
# Spot-check the trained CNN on a few hand-written sentences.
input_vec = sequence.pad_sequences(
    token.texts_to_sequences(
        ["I am good", "Very bad", "That is so awesome", "it stinks", "enjoying but distressed"]
    ),
    maxlen=max_seq_len,
)
label_prob = model.predict(input_vec)
# Pick the most probable class per sentence and map it back to its string label.
label_pred = np.argmax(label_prob, axis=-1)
print(label_encoder.inverse_transform(label_pred))

['neg' 'pos' 'pos' 'neg' 'neg']


  if diff:


## Part c: Extending the approaches 

We can also use named entities and part-of-speech (POS) tags of the words, alongside the raw text, to enrich the features. For sentiment analysis, tf-idf values can additionally be included in the feature vector. Punctuation can be handled as well: keep the informative marks in both the text and the feature vector, and drop the irrelevant ones.

## Task 2
### Part a: Text Classification using RNNs (LSTM) on the polarity dataset
The same data as above is used in this task.

In [218]:
from keras.layers import LSTM

In [219]:
#preprocessing
# Rebuild the tokenizer and the padded index sequences for the LSTM run.
token = Tokenizer(num_words=vocab_dictionary)
token.fit_on_texts(sentence)
x = token.texts_to_sequences(sentence)
x = sequence.pad_sequences(x, maxlen=max_seq_len)

# fit() returns the encoder itself, so it can be chained onto the constructor.
label_encoder = LabelEncoder().fit(labels)
y = to_categorical(label_encoder.transform(labels))

for arr in (x, y):
    print(arr.shape)

(64720, 20)
(64720, 2)


In [223]:
# Embedding -> regularised LSTM -> dropout -> 2-unit dense layer -> softmax.
model = Sequential([
    Embedding(input_dim=vocab_dictionary, output_dim=embedding_size, input_length=max_seq_len),
    LSTM(units=100, recurrent_dropout=0.3, kernel_regularizer=l2(0.02), recurrent_regularizer=l2(0.02)),
    Dropout(0.2),
    Dense(2),
    # Softmax kept as its own layer, as in the layer-by-layer construction.
    Activation('softmax'),
])

In [224]:
#compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 20, 50)            500000    
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               60400     
_________________________________________________________________
dropout_54 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 2)                 202       
_________________________________________________________________
activation_7 (Activation)    (None, 2)                 0         
Total params: 560,602
Trainable params: 560,602
Non-trainable params: 0
_________________________________________________________________
None


In [225]:
# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling, and x/y are ordered all "pos" then all "neg". Shuffle both
# with the same fixed-seed permutation so the held-out 20% contains both classes.
shuffle_idx = np.random.RandomState(42).permutation(len(x))
history = model.fit(x[shuffle_idx], y[shuffle_idx], batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 51776 samples, validate on 12944 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [226]:
# Spot-check the trained LSTM on the same hand-written sentences.
input_vec = sequence.pad_sequences(
    token.texts_to_sequences(
        ["I am good", "Very bad", "That is so awesome", "it stinks", "enjoying but distressed"]
    ),
    maxlen=max_seq_len,
)
label_prob = model.predict(input_vec)
# Pick the most probable class per sentence and map it back to its string label.
label_pred = np.argmax(label_prob, axis=-1)
print(label_encoder.inverse_transform(label_pred))

['pos' 'neg' 'pos' 'neg' 'neg']


  if diff:


### Comparison of CNN and RNN results

Both models were trained and tested on the same polarity dataset. The RNN (LSTM) reached an accuracy of around 92%, higher than the roughly 88% obtained with the CNN. In the sample predictions, the CNN labelled the first sentence ("I am good") as negative even though it is explicitly positive, whereas the LSTM classified it correctly.

References:

https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
https://cambridgespark.com/content/tutorials/convolutional-neural-networks-with-keras/index.html
