In [1]:
import os
os.chdir('D:/Datasets/Hackerearth Problem Setting/8')

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
df=pd.read_csv('amazon_review.csv')
df.head()

Unnamed: 0,Review_no,reviewText,Sentiment
0,1,I enjoy vintage books and movies so I enjoyed ...,Happy
1,2,This book is a reissue of an old one; the auth...,Happy
2,3,This was a fairly interesting read. It had ol...,Happy
3,4,I'd never read any of the Amy Brewster mysteri...,Happy
4,5,"If you like period pieces - clothing, lingo, y...",Happy


In [4]:
df['Sentiment'].value_counts() # Highly imbalanced Dataset

Happy      925123
Unhappy     57144
Name: Sentiment, dtype: int64

In [5]:
df.shape

(982267, 3)

In [58]:
df['words']=df['reviewText'].str.split().apply(len).value_counts()

In [64]:
print("Total no. of words present including all reviews",df['words'].sum())

Total no. of words present including all reviews 4224864.0


In [60]:
def count_fx(s):
    return s.count('.')

In [62]:
df['sentences'] = df['reviewText'].apply(count_fx)

In [65]:
print("Total no. of sentencecs present including all reviews",df['sentences'].sum())

Total no. of sentencecs present including all reviews 11772209


In [6]:
msk = np.random.rand(len(df)) < 0.7 # Splitting into train(70%) and test(30%) randomly

In [7]:
train_df=df[msk]
test_df=df[~msk]

In [8]:
print(train_df.shape)
print(test_df.shape)

(687258, 3)
(295009, 3)


In [9]:
print('Checking target values for train data:\n')
print(train_df['Sentiment'].value_counts(),'\n')
print('Checking target values for test data:\n')
print(test_df['Sentiment'].value_counts())

Checking target values for train data:

Happy      647353
Unhappy     39905
Name: Sentiment, dtype: int64 

Checking target values for test data:

Happy      277770
Unhappy     17239
Name: Sentiment, dtype: int64


### Upsampling the training dataset to fix imbalance

In [10]:
from sklearn.utils import resample

In [11]:
df_majority = train_df[train_df.Sentiment=="Happy"]
df_minority_1 = train_df[train_df.Sentiment=="Unhappy"]

In [12]:
df_minority_1_upsampled = resample(df_minority_1, 
                                 replace=True,     # sample with replacement
                                 n_samples=647569,    # to match majority class
                                 random_state=1000) 

In [13]:
train_df = pd.concat([df_majority, df_minority_1_upsampled])

In [14]:
train_df['Sentiment'].value_counts()

Unhappy    647569
Happy      647353
Name: Sentiment, dtype: int64

In [15]:
x_train=train_df['reviewText']
y_train=train_df['Sentiment']
x_test=test_df['reviewText']
y_test=test_df['Sentiment']

In [16]:
MAX_NB_WORDS = 20000
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [17]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

# Normal Embedding Method

In [18]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

In [19]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 271831 unique tokens.


In [20]:
MAX_SEQUENCE_LENGTH = 200
#pad sequences are used to bring all sentences to same size.
# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (1294922, 200)
Shape of data test tensor: (295009, 200)


In [21]:
y_train = y_train.map({"Happy": 1, "Unhappy" : 0 })
y_test = y_test.map({"Happy": 1, "Unhappy" : 0 })

In [22]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [23]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [24]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(x_test, y_test))
          
#score, acc = model.evaluate(x_test, y_test,
#                            batch_size=batch_size)

Train on 1294922 samples, validate on 295009 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x229c35d52b0>

# Word2Vec Embedding

In [31]:
df=train_df.append(test_df)

In [32]:
from gensim.models import Word2Vec
word_model = Word2Vec(df['reviewText'], size=200, min_count = 1, window = 5,sg=0, negative=5)

In [33]:
word_vectors = word_model.wv

In [34]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 103


In [35]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

In [36]:
x_train=df["reviewText"][:train_df.shape[0]]
y_train=df["Sentiment"][:train_df.shape[0]]
x_test=df["reviewText"][train_df.shape[0]:]
y_test=df["Sentiment"][train_df.shape[0]:]

In [37]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)

In [38]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")

In [39]:
print('Shape of data tensor:', data.shape)
print('Shape of test_data tensor:', test_data.shape)

Shape of data tensor: (1294922, 200)
Shape of test_data tensor: (295009, 200)


In [40]:
word_index = tokenizer.word_index

In [41]:
WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass        

In [42]:
y_train = y_train.map({"Happy": 1, "Unhappy" : 0 })
y_test = y_test.map({"Happy": 1, "Unhappy" : 0 })

In [43]:
model = Sequential()
model.add(Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model.add(Dense(1, activation='sigmoid'))

In [44]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [45]:
model.fit(data, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(test_data, y_test))
          
#score, acc = model.evaluate(x_test, y_test,
#                            batch_size=batch_size)

Train on 1294922 samples, validate on 295009 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2298bc28ba8>

# GLoVe Embedding

In [46]:
x_train=train_df['reviewText']
y_train=train_df['Sentiment']
x_test=test_df['reviewText']
y_test=test_df['Sentiment']

In [47]:
MAX_NB_WORDS = 20000
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [48]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [49]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

In [50]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 271831 unique tokens.


In [51]:
MAX_SEQUENCE_LENGTH = 200
#pad sequences are used to bring all sentences to same size.
# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (1294922, 200)
Shape of data test tensor: (295009, 200)


In [52]:
y_train = y_train.map({"Happy": 1, "Unhappy" : 0 })
y_test = y_test.map({"Happy": 1, "Unhappy" : 0 })

In [53]:
# load the whole embedding into memory
embeddings_index = {}
f = open('D:/Datasets/glove.6B/glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [54]:
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [55]:
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            100,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model.add(Dense(1, activation='sigmoid'))

In [56]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [57]:
model.fit(data, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(test_data, y_test))
          
#score, acc = model.evaluate(x_test, y_test,
#                            batch_size=batch_size)

Train on 1294922 samples, validate on 295009 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2298feb4a20>