In [123]:
import pandas as pd
import numpy as np

from nltk.util import skipgrams
from nltk.corpus import stopwords, movie_reviews

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [106]:
reviews = list(zip(*[(" ".join(list(movie_reviews.words(fileid))), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]))

text = reviews[0]
category = [0 if cat == "neg" else 1 for cat in reviews[1]]

## TF-IDF approach

A simple approach using n-grams or skip-grams

In [48]:
def grams_by_word(wordlist, n=3, k=1):
    output = []
    strings = list(filter(lambda word: word not in stopWords, wordlist))
    for string in strings:
        string = string.ljust(n)
        output.extend(skipgrams(string,n,k))
    return output

vectorizer = TfidfVectorizer(min_df=1, analyzer=grams_by_word)
vectorizer.fit([review[0] for review in train])

TfidfVectorizer(analyzer=<function grams_by_word at 0x11497be18>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), norm='l2',
        preprocessor=None, smooth_idf=True, stop_words=None,
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [33]:
stopWords = set(stopwords.words('english'))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(text, category, test_size=0.25, stratify=category)

In [49]:
X_train_tfidf = vectorizer.transform(X_train)

In [50]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [65]:
parameters = {'C':[0.01,0.1,1,10,100]}
model=GridSearchCV(LogisticRegression(), parameters, cv=5)
model.fit(X_train_tfidf, y_train);

In [66]:
model.cv_results_['mean_test_score']

array([ 0.73466667,  0.76733333,  0.79733333,  0.81133333,  0.812     ])

## LSTM approach

#### GloVe embeddings

In [76]:
embeddings_index = dict()
glove_path = "word_embeddings/glove.6B/glove.6B.50d.txt"
with open(glove_path) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


#### Tokenize text

In [99]:
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1

In [115]:
encoded_text = t.texts_to_sequences(text)
# pad documents to a max length of 4 words
max_length = 100
padded_docs = pad_sequences(encoded_text, maxlen=max_length, padding='post')

In [116]:
# train-test split 
X_train, X_test, y_train, y_test = train_test_split(padded_docs, category, test_size=0.25, stratify=category)

In [117]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [124]:
# define model
model = Sequential()
e = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=100, trainable=False)
model.add(e)
model.add(LSTM(20, input_shape=(100, 1)))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 50)           1970400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 20)                5680      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 21        
Total params: 1,976,101
Trainable params: 5,701
Non-trainable params: 1,970,400
_________________________________________________________________
None
Train on 1200 samples, validate on 300 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
E

#### Training Embedding layer from scratch

In [125]:
# define model
model = Sequential()
e = Embedding(vocab_size, 50, input_length=100)
model.add(e)
model.add(LSTM(20, input_shape=(100, 1)))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 100, 50)           1970400   
_________________________________________________________________
lstm_2 (LSTM)                (None, 20)                5680      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 21        
Total params: 1,976,101
Trainable params: 1,976,101
Non-trainable params: 0
_________________________________________________________________
None
Train on 1200 samples, validate on 300 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

KeyboardInterrupt: 

In [126]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 69.600000
