In [1]:
import matplotlib
matplotlib.use('TkAgg')  # Or any other X11 back-ends

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


%matplotlib inline

In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
import h5py
from collections import Counter
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
nltk.download("stopwords") # uncomment this line if you already have stopwords downloaded
from nltk.corpus import stopwords

stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
np.set_printoptions(threshold=np.nan)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Load data
IMDB_train = pd.read_csv('./Dataset/Input/IMDB-train.txt', sep='\t', encoding='latin-1', header=None)
IMDB_train_y = IMDB_train[:][1]
IMDB_valid = pd.read_csv('./Dataset/Input/IMDB-valid.txt', sep='\t', encoding='latin-1', header=None)
IMDB_valid_y = IMDB_valid[:][1]
IMDB_test = pd.read_csv('./Dataset/Input/IMDB-test.txt', sep='\t', encoding='latin-1', header=None)
IMDB_test_y = IMDB_test[:][1]
stemmer = SnowballStemmer("english", ignore_stopwords=True)

print("Data loaded.")

Data loaded.


In [22]:
print(IMDB_train.shape)
print(IMDB_valid.shape)
print(IMDB_test.shape)

(15000, 2)
(10000, 2)
(25000, 2)


In [23]:
frames = [IMDB_train, IMDB_valid]
frames_y = [IMDB_train_y, IMDB_valid_y]
IMDB_train = pd.concat(frames)
IMDB_train_y = pd.concat(frames_y)

In [24]:
print(IMDB_train.shape)
print(IMDB_test.shape)

(25000, 2)
(25000, 2)


In [25]:
print(IMDB_train_y.shape)
print(IMDB_test_y.shape)

(25000,)
(25000,)


# Preprocessing

In [26]:
# function to remove words with low occurence from vocabulary

from collections import Counter


In [27]:
# function responsible for initial preprocessing: removal of HTML tags, punctuation, conversion to lowercase
def preprocessing(data):
    new_data = []
    #i = 0
    for sentence in (data[:][0]):
        #clean = re.compile('<.*?>')
        new_sentence = re.sub('<.*?>', '', sentence) # remove HTML tags
        new_sentence = re.sub(r'[^\w\s]', '', new_sentence) # remove punctuation
        new_sentence = new_sentence.lower() # convert to lower case
        if new_sentence != '':
            new_data.append(new_sentence)
    return new_data

In [28]:
# obtain processed train and test

IMDB_train = preprocessing(IMDB_train)
IMDB_test = preprocessing(IMDB_test)

# Bag of n-gram 

In [29]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [10]:
# limit vocabulary to top 30,000 words to actaully be able to process data WITHOUT stop words
unigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words='english', max_features =30000)
bigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(2, 2), stop_words='english', max_features =30000)
trigram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(3, 3), stop_words='english', max_features =30000)



In [11]:
all_gram = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 3), stop_words='english', max_features =30000)

### Below loading n-gram representation w/o stop words (only relevant one is trigrams and all_grams)

In [12]:
import nltk
# comment out below two lines if you already have these downloaded
nltk.download('punkt') 
nltk.download('wordnet')

train_unigram = unigram.fit_transform(IMDB_train).toarray()
test_unigram = unigram.transform(IMDB_test).toarray()


[nltk_data] Downloading package punkt to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  sorted(inconsistent))


In [13]:
import nltk
# comment out below two lines if you already have these downloaded
nltk.download('punkt') 
nltk.download('wordnet')

train_bigram = bigram.fit_transform(IMDB_train).toarray()
test_bigram = bigram.transform(IMDB_test).toarray()

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  sorted(inconsistent))


In [14]:
import nltk
# comment out below two lines if you already have these downloaded
nltk.download('punkt') 
nltk.download('wordnet')

train_trigram = trigram.fit_transform(IMDB_train).toarray()
test_trigram = trigram.transform(IMDB_test).toarray()

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  sorted(inconsistent))


In [12]:
import nltk
# comment out below two lines if you already have these downloaded
nltk.download('punkt') 
nltk.download('wordnet')

train_allgram = all_gram.fit_transform(IMDB_train).toarray()
test_allgram = all_gram.transform(IMDB_test).toarray()

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/sadhvi_mehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  sorted(inconsistent))


### Below, loading n-gram representations w/ stop words (only relevant one is trigrams)

In [10]:

unigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 1), stop_words=None, max_features =30000)
bigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(2, 2), stop_words=None, max_features =30000)
trigram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=3, 3), stop_words=None, max_features =30000)


In [30]:
allgram_w_sw = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', ngram_range=(1, 3), stop_words=None, max_features =30000)

In [18]:
train_unigram_w_sw = unigram_w_sw.fit_transform(IMDB_train).toarray()
test_unigram_w_sw = unigram_w_sw.transform(IMDB_test).toarray()


In [19]:
train_bigram_w_sw = bigram_w_sw.fit_transform(IMDB_train).toarray()
test_bigram_w_sw = bigram_w_sw.transform(IMDB_test).toarray()


In [12]:
train_trigram_w_sw = trigram_w_sw.fit_transform(IMDB_train).toarray()
test_trigram_w_sw = trigram_w_sw.transform(IMDB_test).toarray()

In [31]:
train_allgram_w_sw = allgram_w_sw.fit_transform(IMDB_train).toarray()
test_allgram_w_sw = allgram_w_sw.transform(IMDB_test).toarray()

In [32]:
# function that removes empty sentences

def rm_sents(data, target):
    new_data = []
    new_target = []
    for i in range(0,len(data)):
        if len(list(set(data[i]))) is not 1:
            new_data.append(data[i])
            new_target.append(target[i])
    return new_data, new_target

In [33]:
# function to shuffle labels and data at same time
import numpy as np
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    images = [a[i] for i in p]
    lbls = [b.iloc[i] for i in p]
    return images, lbls

### Neural Network Code

In [34]:
import numpy
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import np_utils
from keras.layers import GlobalAveragePooling1D

Using TensorFlow backend.


In [35]:
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier

# function that tries three different initializations of weights to get average performance
def cv_model(epoch_num, x_arr, y_arr):
    validation_scores = list()
    for i in range(3):
        model = baseline_model()
        # fit network
        history = model.fit(x_arr[:15000], y_arr[:15000], 
          validation_data=(x_arr[15000:], y_arr[15000:]), batch_size=200, epochs=epoch_num, verbose=0)
        validation_scores.append(history.history['val_acc'][:-1])
    return validation_scores

### First attempted architecture

In [42]:
# model definition
def baseline_model():
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [28]:
# train trigram model w/o stopwords
first_archi_tri_scores = cv_model(10, train_trigram, IMDB_train_y)

In [36]:
print('First Architecture Trigram Scores:')
temp = []
for arr in first_archi_tri_scores:
    temp.append(arr[8])
print(temp)

First Architecture Trigram Scores:
[0.7035999953746795, 0.7018999993801117, 0.7029999947547912]


In [37]:
# train trigram (uni, bi, and tri) model w/o stopwords
first_archi_all_scores = cv_model(10, train_allgram, IMDB_train_y)

In [38]:
print('First Architecture Allgram Scores:')
temp = []
for arr in first_archi_all_scores:
    temp.append(arr[8])
print(temp)

First Architecture Allgram Scores:
[0.8874999964237213, 0.8879999959468842, 0.8879999935626983]


In [19]:
# train trigram model w/ stopwords
first_archi_all_scores_sw = cv_model(10, train_allgram_w_sw, IMDB_train_y)

In [20]:
print('First Architecture Allgram Scores W/ sw:')
temp = []
for arr in first_archi_all_scores_sw:
    temp.append(arr[8])
print(temp)

First Architecture Allgram Scores W/ sw:
[0.890799994468689, 0.8890999972820282, 0.8911999940872193]


In [61]:
# UNIGRAM, BIGRAM, AND TRIGRAM

# train trigram model w/o stop words
allgram_model = baseline_model()
allgram_model.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/10
 - 11s - loss: 0.3808 - acc: 0.8487 - val_loss: 0.2892 - val_acc: 0.8891
Epoch 2/10
 - 10s - loss: 0.1316 - acc: 0.9633 - val_loss: 0.2861 - val_acc: 0.8931
Epoch 3/10
 - 10s - loss: 0.0603 - acc: 0.9907 - val_loss: 0.3087 - val_acc: 0.8917
Epoch 4/10
 - 10s - loss: 0.0297 - acc: 0.9975 - val_loss: 0.3338 - val_acc: 0.8900
Epoch 5/10
 - 10s - loss: 0.0168 - acc: 0.9995 - val_loss: 0.3567 - val_acc: 0.8897
Epoch 6/10
 - 10s - loss: 0.0106 - acc: 0.9997 - val_loss: 0.3782 - val_acc: 0.8901
Epoch 7/10
 - 10s - loss: 0.0073 - acc: 0.9999 - val_loss: 0.3954 - val_acc: 0.8886
Epoch 8/10
 - 10s - loss: 0.0053 - acc: 0.9999 - val_loss: 0.4109 - val_acc: 0.8889
Epoch 9/10
 - 10s - loss: 0.0040 - acc: 0.9999 - val_loss: 0.4248 - val_acc: 0.8884
Epoch 10/10
 - 10s - loss: 0.0032 - acc: 0.9999 - val_loss: 0.4376 - val_acc: 0.8885


<keras.callbacks.History at 0x7fe3a61bb198>

In [22]:
# UNIGRAM, BIGRAM, AND TRIGRAM

# train trigram model w/ stop words
allgram_sw_model = baseline_model()
allgram_sw_model.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/10
 - 9s - loss: 0.3536 - acc: 0.8580 - val_loss: 0.2853 - val_acc: 0.8901
Epoch 2/10
 - 8s - loss: 0.1077 - acc: 0.9720 - val_loss: 0.2887 - val_acc: 0.8935
Epoch 3/10
 - 7s - loss: 0.0450 - acc: 0.9950 - val_loss: 0.3121 - val_acc: 0.8922
Epoch 4/10
 - 7s - loss: 0.0198 - acc: 0.9992 - val_loss: 0.3418 - val_acc: 0.8925
Epoch 5/10
 - 7s - loss: 0.0097 - acc: 0.9997 - val_loss: 0.3726 - val_acc: 0.8923
Epoch 6/10
 - 7s - loss: 0.0060 - acc: 0.9997 - val_loss: 0.3928 - val_acc: 0.8916
Epoch 7/10
 - 7s - loss: 0.0035 - acc: 0.9998 - val_loss: 0.4142 - val_acc: 0.8913
Epoch 8/10
 - 7s - loss: 0.0024 - acc: 1.0000 - val_loss: 0.4318 - val_acc: 0.8912
Epoch 9/10
 - 7s - loss: 0.0018 - acc: 1.0000 - val_loss: 0.4480 - val_acc: 0.8913
Epoch 10/10
 - 7s - loss: 0.0014 - acc: 1.0000 - val_loss: 0.4606 - val_acc: 0.8906


<keras.callbacks.History at 0x7fcc4d324f60>

In [44]:
# UNIGRAM, BIGRAM, AND TRIGRAM

# train trigram model w/ stop words (and 20 epochs just for testing)
bs = baseline_model()
bs.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=30, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 3s - loss: 0.4124 - acc: 0.8247 - val_loss: 0.3296 - val_acc: 0.8658
Epoch 2/20
 - 2s - loss: 0.2854 - acc: 0.8885 - val_loss: 0.3318 - val_acc: 0.8691
Epoch 3/20
 - 2s - loss: 0.2488 - acc: 0.9022 - val_loss: 0.3492 - val_acc: 0.8644
Epoch 4/20
 - 2s - loss: 0.2280 - acc: 0.9107 - val_loss: 0.3581 - val_acc: 0.8643
Epoch 5/20
 - 2s - loss: 0.2109 - acc: 0.9173 - val_loss: 0.3772 - val_acc: 0.8612
Epoch 6/20
 - 2s - loss: 0.1957 - acc: 0.9242 - val_loss: 0.3847 - val_acc: 0.8576
Epoch 7/20
 - 2s - loss: 0.1760 - acc: 0.9339 - val_loss: 0.4046 - val_acc: 0.8551
Epoch 8/20
 - 2s - loss: 0.1550 - acc: 0.9401 - val_loss: 0.4313 - val_acc: 0.8525
Epoch 9/20
 - 2s - loss: 0.1367 - acc: 0.9509 - val_loss: 0.4536 - val_acc: 0.8517
Epoch 10/20
 - 2s - loss: 0.1204 - acc: 0.9571 - val_loss: 0.4917 - val_acc: 0.8465
Epoch 11/20
 - 2s - loss: 0.1061 - acc: 0.9625 - val_loss: 0.5047 - val_acc: 0.8438
Epoch 12/20
 - 2s - loss: 0.0903 - 

<keras.callbacks.History at 0x7fcc42e021d0>

### Second attempted architecture: added dropout for overfitting

In [44]:
def baseline_model(dropout_rate=0.0):
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [43]:
# TRIGRAM:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier

# below, dictionary of params being modified
dropout_range = [0, 0.1, 0.2, 0.3]
param_grid = dict(dropout_rate=dropout_range) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=3)

start_time = time.time()
random_result = random.fit(train_trigram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_trigram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=0)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

SyntaxError: invalid syntax (<ipython-input-43-55bb5e8b01db>, line 10)

In [96]:
# train trigram model w/o stop words w/ best dropout
trigram_model = baseline_model()
trigram_model.fit(train_trigram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_trigram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/10
 - 13s - loss: 0.0749 - acc: 0.9488 - val_loss: 1.4123 - val_acc: 0.6787
Epoch 2/10
 - 14s - loss: 0.0748 - acc: 0.9505 - val_loss: 1.4205 - val_acc: 0.6788
Epoch 3/10
 - 14s - loss: 0.0748 - acc: 0.9493 - val_loss: 1.4290 - val_acc: 0.6791
Epoch 4/10
 - 14s - loss: 0.0747 - acc: 0.9501 - val_loss: 1.4366 - val_acc: 0.6789
Epoch 5/10
 - 14s - loss: 0.0748 - acc: 0.9483 - val_loss: 1.4447 - val_acc: 0.6797
Epoch 6/10
 - 14s - loss: 0.0747 - acc: 0.9497 - val_loss: 1.4513 - val_acc: 0.6792
Epoch 7/10
 - 12s - loss: 0.0747 - acc: 0.9487 - val_loss: 1.4584 - val_acc: 0.6785
Epoch 8/10
 - 14s - loss: 0.0746 - acc: 0.9505 - val_loss: 1.4658 - val_acc: 0.6778
Epoch 9/10
 - 13s - loss: 0.0743 - acc: 0.9506 - val_loss: 1.4728 - val_acc: 0.6784
Epoch 10/10
 - 14s - loss: 0.0744 - acc: 0.9501 - val_loss: 1.4799 - val_acc: 0.6788


<keras.callbacks.History at 0x7fd8bcee54a8>

In [47]:
# ALLGRAM:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
dropout_range = [0, 0.1, 0.2, 0.3]
param_grid = dict(dropout_rate=dropout_range, epochs=[10], batch_size=[200]) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=4)

start_time = time.time()
random_result = random.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')



Train on 10000 samples, validate on 10000 samples
Epoch 1/20
 - 12s - loss: 0.4652 - acc: 0.7807 - val_loss: 0.4780 - val_acc: 0.7946
Epoch 2/20
 - 11s - loss: 0.2121 - acc: 0.9419 - val_loss: 0.3504 - val_acc: 0.8624
Epoch 3/20
 - 11s - loss: 0.1032 - acc: 0.9802 - val_loss: 0.4062 - val_acc: 0.8383
Epoch 4/20
 - 11s - loss: 0.0494 - acc: 0.9956 - val_loss: 0.4273 - val_acc: 0.8447
Epoch 5/20
 - 11s - loss: 0.0237 - acc: 0.9987 - val_loss: 0.4723 - val_acc: 0.8422
Epoch 6/20
 - 12s - loss: 0.0127 - acc: 0.9993 - val_loss: 0.4893 - val_acc: 0.8467
Epoch 7/20
 - 11s - loss: 0.0077 - acc: 0.9997 - val_loss: 0.5108 - val_acc: 0.8468
Epoch 8/20
 - 11s - loss: 0.0050 - acc: 0.9997 - val_loss: 0.5515 - val_acc: 0.8434
Epoch 9/20
 - 11s - loss: 0.0036 - acc: 0.9997 - val_loss: 0.5627 - val_acc: 0.8463
Epoch 10/20
 - 11s - loss: 0.0027 - acc: 0.9998 - val_loss: 0.5858 - val_acc: 0.8448
Epoch 11/20
 - 11s - loss: 0.0021 - acc: 0.9999 - val_loss: 0.6083 - val_acc: 0.8438
Epoch 12/20
 - 11s - los

In [53]:
best_dropout = random_result.best_params_['dropout_rate']

In [76]:
# train allgram model (uni, bi, tri) w/o stop words w/ best dropout
allgram_model = baseline_model(best_dropout)
allgram_model.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 18s - loss: 0.4047 - acc: 0.8385 - val_loss: 0.2950 - val_acc: 0.8890
Epoch 2/20
 - 15s - loss: 0.1579 - acc: 0.9549 - val_loss: 0.2876 - val_acc: 0.8895
Epoch 3/20
 - 14s - loss: 0.0806 - acc: 0.9826 - val_loss: 0.3055 - val_acc: 0.8880
Epoch 4/20
 - 13s - loss: 0.0441 - acc: 0.9947 - val_loss: 0.3315 - val_acc: 0.8850
Epoch 5/20
 - 13s - loss: 0.0258 - acc: 0.9982 - val_loss: 0.3554 - val_acc: 0.8842
Epoch 6/20
 - 13s - loss: 0.0170 - acc: 0.9993 - val_loss: 0.3803 - val_acc: 0.8860
Epoch 7/20
 - 13s - loss: 0.0116 - acc: 0.9997 - val_loss: 0.4004 - val_acc: 0.8859
Epoch 8/20
 - 12s - loss: 0.0087 - acc: 0.9998 - val_loss: 0.4191 - val_acc: 0.8841
Epoch 9/20
 - 12s - loss: 0.0068 - acc: 0.9998 - val_loss: 0.4355 - val_acc: 0.8833
Epoch 10/20
 - 12s - loss: 0.0053 - acc: 0.9999 - val_loss: 0.4576 - val_acc: 0.8796
Epoch 11/20
 - 13s - loss: 0.0042 - acc: 0.9999 - val_loss: 0.4715 - val_acc: 0.8817
Epoch 12/20
 - 13s - los

<keras.callbacks.History at 0x7f8c5ec9b748>

In [77]:
# apply on test set to see performance:
score = allgram_model.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [78]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.6158240246343613
Test accuracy:  0.86176


In [27]:
# ALLGRAM W/ SW:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
dropout_range = [0, 0.1, 0.2, 0.3]
param_grid = dict(dropout_rate=dropout_range, epochs=[10], batch_size=[200]) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=4)

start_time = time.time()
random_result = random.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Train on 10000 samples, validate on 10000 samples
Epoch 1/20
 - 19s - loss: 0.3756 - acc: 0.8511 - val_loss: 0.3164 - val_acc: 0.8785
Epoch 2/20
 - 16s - loss: 0.0870 - acc: 0.9760 - val_loss: 0.4537 - val_acc: 0.8387
Epoch 3/20
 - 16s - loss: 0.0223 - acc: 0.9980 - val_loss: 0.4925 - val_acc: 0.8492
Epoch 4/20
 - 16s - loss: 0.0070 - acc: 0.9997 - val_loss: 0.5528 - val_acc: 0.8484
Epoch 5/20
 - 17s - loss: 0.0032 - acc: 0.9998 - val_loss: 0.5629 - val_acc: 0.8544
Epoch 6/20
 - 17s - loss: 0.0017 - acc: 1.0000 - val_loss: 0.6275 - val_acc: 0.8513
Epoch 7/20
 - 17s - loss: 9.5332e-04 - acc: 1.0000 - val_loss: 0.6661 - val_acc: 0.8481
Epoch 8/20
 - 17s - loss: 6.1358e-04 - acc: 1.0000 - val_loss: 0.6814 - val_acc: 0.8505
Epoch 9/20
 - 17s - loss: 4.2970e-04 - acc: 1.0000 - val_loss: 0.6992 - val_acc: 0.8512
Epoch 10/20
 - 17s - loss: 3.1995e-04 - acc: 1.0000 - val_loss: 0.7238 - val_acc: 0.8505
Epoch 11/20
 - 17s - loss: 2.4431e-04 - acc: 1.0000 - val_loss: 0.7483 - val_acc: 0.8492
Epoc

In [29]:
best_dropout = random_result.best_params_['dropout_rate']

In [45]:
# train allgram model (uni, bi, tri) w/o stop words w/ best dropout
allgram_model = baseline_model(best_dropout)
allgram_model.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 35s - loss: 0.3579 - acc: 0.8472 - val_loss: 0.2794 - val_acc: 0.8928
Epoch 2/20
 - 25s - loss: 0.0901 - acc: 0.9747 - val_loss: 0.3195 - val_acc: 0.8899
Epoch 3/20
 - 26s - loss: 0.0280 - acc: 0.9950 - val_loss: 0.3974 - val_acc: 0.8809
Epoch 4/20
 - 26s - loss: 0.0101 - acc: 0.9988 - val_loss: 0.4223 - val_acc: 0.8901
Epoch 5/20
 - 27s - loss: 0.0041 - acc: 0.9997 - val_loss: 0.4579 - val_acc: 0.8907
Epoch 6/20
 - 25s - loss: 0.0017 - acc: 1.0000 - val_loss: 0.4806 - val_acc: 0.8901
Epoch 7/20
 - 26s - loss: 0.0010 - acc: 1.0000 - val_loss: 0.5013 - val_acc: 0.8904
Epoch 8/20
 - 26s - loss: 6.7226e-04 - acc: 1.0000 - val_loss: 0.5146 - val_acc: 0.8897
Epoch 9/20
 - 27s - loss: 4.5617e-04 - acc: 1.0000 - val_loss: 0.5313 - val_acc: 0.8897
Epoch 10/20
 - 25s - loss: 3.5621e-04 - acc: 1.0000 - val_loss: 0.5429 - val_acc: 0.8907
Epoch 11/20
 - 26s - loss: 2.6735e-04 - acc: 1.0000 - val_loss: 0.5532 - val_acc: 0.8896
Epoch 12

<keras.callbacks.History at 0x7f1dd29f0470>

In [34]:
# apply on test set to see performance:
score = allgram_model.evaluate(test_allgram_w_sw, IMDB_test_y, verbose=2)

In [35]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.7464952216601372
Test accuracy:  0.87388


### Third Attempted Architecture: L2 Regularization for overfitting

In [42]:
# used L2 b/c suggested that this was best regularizer for MLP
from keras.regularizers import l2

def baseline_model(l2_reg=1e-03):
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [57]:
# ALLGRAM W/O SW:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
l2_range = [1e-1, 1e-3, 1e-5]
param_grid = dict(l2_reg=l2_range) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=3)

start_time = time.time()
random_result = random.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=0)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.781400 using {'l2_reg': 1e-05}
Execution time: 4916.787871599197 ms


In [64]:
train_allgram, IMDB_train_y = unison_shuffled_copies(train_allgram, IMDB_train_y)

In [67]:
import numpy as np
train_allgram = np.asarray(train_allgram)
IMDB_train_y = np.asarray(IMDB_train_y)

In [None]:
best_l2 = random_result.best_params_['l2_reg']

In [68]:
allgram_model = baseline_model(best_l2)
# train trigram model (uni, bi, tri) w/o stop words w/ best l2
allgram_model.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 34s - loss: 0.3686 - acc: 0.8534 - val_loss: 0.3168 - val_acc: 0.8866
Epoch 2/20
 - 28s - loss: 0.0951 - acc: 0.9783 - val_loss: 0.3641 - val_acc: 0.8852
Epoch 3/20
 - 29s - loss: 0.0395 - acc: 0.9979 - val_loss: 0.4300 - val_acc: 0.8849
Epoch 4/20
 - 29s - loss: 0.0255 - acc: 0.9999 - val_loss: 0.4665 - val_acc: 0.8848
Epoch 5/20
 - 29s - loss: 0.0208 - acc: 0.9999 - val_loss: 0.4886 - val_acc: 0.8836
Epoch 6/20
 - 29s - loss: 0.0181 - acc: 1.0000 - val_loss: 0.5054 - val_acc: 0.8825
Epoch 7/20
 - 29s - loss: 0.0160 - acc: 1.0000 - val_loss: 0.5165 - val_acc: 0.8819
Epoch 8/20
 - 29s - loss: 0.0144 - acc: 1.0000 - val_loss: 0.5248 - val_acc: 0.8813
Epoch 9/20
 - 29s - loss: 0.0130 - acc: 1.0000 - val_loss: 0.5317 - val_acc: 0.8807
Epoch 10/20
 - 29s - loss: 0.0118 - acc: 1.0000 - val_loss: 0.5416 - val_acc: 0.8802
Epoch 11/20
 - 29s - loss: 0.0107 - acc: 1.0000 - val_loss: 0.5527 - val_acc: 0.8799
Epoch 12/20
 - 29s - los

<keras.callbacks.History at 0x7f8c50505630>

In [69]:
print(allgram_model.metrics)

['accuracy']


In [71]:
# apply on test set to see performance:
score = allgram_model.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [74]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.52534321767807
Test accuracy:  0.85004


In [38]:
# ALLGRAM W/ SW:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
l2_range = [1e-1, 1e-3, 1e-5]
param_grid = dict(l2_reg=l2_range) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=3)

start_time = time.time()
random_result = random.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=0)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.777467 using {'l2_reg': 1e-05}
Execution time: 4223.002087831497 ms


In [39]:
train_allgram_w_sw, IMDB_train_y = unison_shuffled_copies(train_allgram_w_sw, IMDB_train_y)

In [40]:
import numpy as np
train_allgram_w_sw = np.asarray(train_allgram_w_sw)
IMDB_train_y = np.asarray(IMDB_train_y)

In [45]:
best_l2 = random_result.best_params_['l2_reg']

1e-05


In [46]:
allgram_w_sw_model = baseline_model(best_l2)
# train trigram model (uni, bi, tri) w/ stop words w/ best l2
allgram_w_sw_model.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 45s - loss: 0.3561 - acc: 0.8507 - val_loss: 0.3015 - val_acc: 0.8876
Epoch 2/20
 - 27s - loss: 0.0906 - acc: 0.9796 - val_loss: 0.3460 - val_acc: 0.8867
Epoch 3/20
 - 27s - loss: 0.0386 - acc: 0.9980 - val_loss: 0.3956 - val_acc: 0.8900
Epoch 4/20
 - 26s - loss: 0.0248 - acc: 0.9999 - val_loss: 0.4398 - val_acc: 0.8910
Epoch 5/20
 - 26s - loss: 0.0205 - acc: 1.0000 - val_loss: 0.4586 - val_acc: 0.8908
Epoch 6/20
 - 27s - loss: 0.0180 - acc: 1.0000 - val_loss: 0.4663 - val_acc: 0.8905
Epoch 7/20
 - 27s - loss: 0.0161 - acc: 1.0000 - val_loss: 0.4715 - val_acc: 0.8899
Epoch 8/20
 - 26s - loss: 0.0144 - acc: 1.0000 - val_loss: 0.4725 - val_acc: 0.8893
Epoch 9/20
 - 32s - loss: 0.0130 - acc: 1.0000 - val_loss: 0.4743 - val_acc: 0.8894
Epoch 10/20
 - 27s - loss: 0.0118 - acc: 1.0000 - val_loss: 0.4779 - val_acc: 0.8889
Epoch 11/20
 - 27s - loss: 0.0107 - acc: 1.0000 - val_loss: 0.4815 - val_acc: 0.8870
Epoch 12/20
 - 27s - los

<keras.callbacks.History at 0x7fcfd9ebecc0>

### Fourth Attempt: Adding early stop from best of (L2 Reg and Dropout) to reduce overfitting

In [79]:
# DROPOUT MODEL PERFORMED BETTER -> USING THAT (w/o sw)
from keras.callbacks import EarlyStopping

# create early-stopping callback
earlystop = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=2, 
                          mode='auto', baseline=None, restore_best_weights=True)

# train trigram model (uni, bi, tri) w/o stop words w/ best dropout
allgram_model = baseline_model(best_dropout)
allgram_model.fit(train_allgram[:15000], IMDB_train_y[:15000], validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), 
                  batch_size=200, epochs=20, verbose=2, callbacks=[earlystop])

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 18s - loss: 0.4148 - acc: 0.8353 - val_loss: 0.2999 - val_acc: 0.8870
Epoch 2/20
 - 15s - loss: 0.1672 - acc: 0.9505 - val_loss: 0.2865 - val_acc: 0.8914
Epoch 3/20
 - 15s - loss: 0.0862 - acc: 0.9809 - val_loss: 0.3066 - val_acc: 0.8890
Epoch 4/20
 - 15s - loss: 0.0483 - acc: 0.9935 - val_loss: 0.3288 - val_acc: 0.8885
Epoch 5/20
 - 15s - loss: 0.0279 - acc: 0.9979 - val_loss: 0.3556 - val_acc: 0.8877
Epoch 6/20
 - 15s - loss: 0.0179 - acc: 0.9991 - val_loss: 0.3811 - val_acc: 0.8860
Epoch 7/20
 - 15s - loss: 0.0121 - acc: 0.9995 - val_loss: 0.3998 - val_acc: 0.8848
Restoring model weights from the end of the best epoch
Epoch 00007: early stopping


<keras.callbacks.History at 0x7f8c666ea940>

In [80]:
# apply on test set to see performance:
score = allgram_model.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [81]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.308475244140625
Test accuracy:  0.87748


In [22]:
# DROPOUT MODEL PERFORMED BETTER -> USING THAT (MAKE SURE NOT TO USE L2 MODEL)
from keras.callbacks import EarlyStopping

# create early-stopping callback
earlystop = EarlyStopping(monitor='val_acc', min_delta=0, patience=7, verbose=2, 
                          mode='auto', baseline=None, restore_best_weights=True)

# train trigram model (uni, bi, tri) w/ stop words w/ best dropout
allgram_w_sw_model = baseline_model(best_dropout)
allgram_w_sw_model.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
                       validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), 
                  batch_size=200, epochs=20, verbose=2, callbacks=[earlystop])

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 29s - loss: 0.3486 - acc: 0.8556 - val_loss: 0.2761 - val_acc: 0.8950
Epoch 2/20
 - 26s - loss: 0.0820 - acc: 0.9751 - val_loss: 0.3200 - val_acc: 0.8917
Epoch 3/20
 - 26s - loss: 0.0222 - acc: 0.9969 - val_loss: 0.3776 - val_acc: 0.8883
Epoch 4/20
 - 27s - loss: 0.0073 - acc: 0.9996 - val_loss: 0.4308 - val_acc: 0.8876
Epoch 5/20
 - 26s - loss: 0.0033 - acc: 0.9998 - val_loss: 0.4490 - val_acc: 0.8905
Epoch 6/20
 - 26s - loss: 0.0020 - acc: 1.0000 - val_loss: 0.4817 - val_acc: 0.8908
Epoch 7/20
 - 27s - loss: 0.0010 - acc: 1.0000 - val_loss: 0.5016 - val_acc: 0.8910
Epoch 8/20
 - 27s - loss: 6.4982e-04 - acc: 1.0000 - val_loss: 0.5201 - val_acc: 0.8913
Restoring model weights from the end of the best epoch
Epoch 00008: early stopping


<keras.callbacks.History at 0x7f5a248d6160>

In [24]:
# apply on test set to see performance:
score = allgram_w_sw_model.evaluate(test_allgram_w_sw, IMDB_test_y, verbose=2)

In [25]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.2778815042543411
Test accuracy:  0.88912


### Fifth Attempt: Change Optimizer & Learning Rate

##### Adam Optimizer

In [38]:
from keras.regularizers import l2
from keras.optimizers import Adam

def baseline_model(learn_rate=0.01):
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(lr=learn_rate)
    # compile network
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [2]:
# ALLGRAM:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
lr_range = [1e-4, 1e-2, 1e-1, 1]
param_grid = dict(learn_rate=lr_range) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=3)

start_time = time.time()
random_result = random.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=0)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Using TensorFlow backend.


NameError: name 'baseline_model' is not defined

In [104]:
import numpy as np
train_allgram, IMDB_train_y = unison_shuffled_copies(train_allgram, IMDB_train_y)
train_allgram = np.asarray(train_allgram)
IMDB_train_y = np.asarray(IMDB_train_y)

In [39]:
# train with best learning rate
from keras.callbacks import EarlyStopping

# create early-stopping callback
earlystop = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=2, 
                          mode='auto', baseline=None, restore_best_weights=True)

best_lr = random_result.best_params_['learn_rate']
allgram_model = baseline_model(best_lr)
allgram_model.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=30, verbose=2, callbacks=[earlystop])

Train on 15000 samples, validate on 10000 samples
Epoch 1/30
 - 26s - loss: 0.5627 - acc: 0.7778 - val_loss: 0.4414 - val_acc: 0.8566
Epoch 2/30
 - 26s - loss: 0.3349 - acc: 0.9049 - val_loss: 0.3328 - val_acc: 0.8859
Epoch 3/30
 - 26s - loss: 0.2301 - acc: 0.9397 - val_loss: 0.2953 - val_acc: 0.8922
Epoch 4/30
 - 24s - loss: 0.1701 - acc: 0.9612 - val_loss: 0.2804 - val_acc: 0.8957
Epoch 5/30
 - 25s - loss: 0.1294 - acc: 0.9745 - val_loss: 0.2739 - val_acc: 0.8962
Epoch 6/30
 - 26s - loss: 0.1016 - acc: 0.9829 - val_loss: 0.2750 - val_acc: 0.8958
Epoch 7/30
 - 26s - loss: 0.0798 - acc: 0.9888 - val_loss: 0.2771 - val_acc: 0.8954
Epoch 8/30
 - 26s - loss: 0.0638 - acc: 0.9933 - val_loss: 0.2813 - val_acc: 0.8962
Epoch 9/30
 - 24s - loss: 0.0514 - acc: 0.9950 - val_loss: 0.2878 - val_acc: 0.8959
Epoch 10/30
 - 24s - loss: 0.0420 - acc: 0.9971 - val_loss: 0.2942 - val_acc: 0.8954
Epoch 11/30
 - 24s - loss: 0.0340 - acc: 0.9983 - val_loss: 0.3009 - val_acc: 0.8953
Epoch 12/30
 - 24s - los

<keras.callbacks.History at 0x7fe63707e7f0>

In [42]:
# train over both validation and train

allgram_model.fit(train_allgram, IMDB_train_y, 
          validation_data=None, batch_size=200, epochs=5, verbose=1, callbacks=[earlystop], shuffle=True)

Epoch 1/5
Epoch 2/5




Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe62c844710>

In [75]:
allgram_model.save('adam_model.h5')

In [44]:
# apply on test set to see performance:
score = allgram_model.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [25]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.3021923314857483
Test accuracy:  0.88028


##### RMSprop Optimizer

In [27]:
from keras.regularizers import l2
from keras.optimizers import RMSprop

def baseline_model(learn_rate=0.01):
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = RMSprop(lr=learn_rate)
    # compile network
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [28]:
# ALLGRAM:

from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
import time
from scipy.stats import expon
from keras.wrappers.scikit_learn import KerasClassifier


# below, dictionary of params being modified
lr_range = [1e-4, 1e-2, 1e-1, 1]
param_grid = dict(learn_rate=lr_range) 

# random search
model = KerasClassifier(build_fn=baseline_model) 
random = RandomizedSearchCV(estimator=model, param_distributions=param_grid, cv=3, n_iter=3)

start_time = time.time()
random_result = random.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=10, verbose=0)

# Summarize results
print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.777733 using {'learn_rate': 0.0001}
Execution time: 1928.99249625206 ms


In [None]:
import numpy as np

train_allgram, IMDB_train_y = unison_shuffled_copies(train_allgram, IMDB_train_y)
train_allgram = np.asarray(train_allgram)
IMDB_train_y = np.asarray(IMDB_train_y)

In [30]:
# train with best learning rate
best_lr = random_result.best_params_['learn_rate']
allgram_model_test = baseline_model(best_lr)

# create early-stopping callback
earlystop = EarlyStopping(monitor='val_acc', min_delta=0, patience=5, verbose=2, 
                          mode='auto', baseline=None, restore_best_weights=True)

allgram_model_test.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2, callbacks=[earlystop])

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 23s - loss: 0.5369 - acc: 0.8105 - val_loss: 0.4261 - val_acc: 0.8621
Epoch 2/20
 - 24s - loss: 0.3368 - acc: 0.9002 - val_loss: 0.3334 - val_acc: 0.8869
Epoch 3/20
 - 23s - loss: 0.2450 - acc: 0.9285 - val_loss: 0.2978 - val_acc: 0.8915
Epoch 4/20
 - 23s - loss: 0.1906 - acc: 0.9449 - val_loss: 0.2812 - val_acc: 0.8956
Epoch 5/20
 - 24s - loss: 0.1540 - acc: 0.9569 - val_loss: 0.2741 - val_acc: 0.8962
Epoch 6/20
 - 24s - loss: 0.1261 - acc: 0.9665 - val_loss: 0.2735 - val_acc: 0.8957
Epoch 7/20
 - 23s - loss: 0.1040 - acc: 0.9731 - val_loss: 0.2756 - val_acc: 0.8959
Epoch 8/20
 - 23s - loss: 0.0863 - acc: 0.9790 - val_loss: 0.2815 - val_acc: 0.8959
Epoch 9/20
 - 24s - loss: 0.0720 - acc: 0.9824 - val_loss: 0.2879 - val_acc: 0.8951
Epoch 10/20
 - 24s - loss: 0.0597 - acc: 0.9857 - val_loss: 0.2969 - val_acc: 0.8957
Restoring model weights from the end of the best epoch
Epoch 00010: early stopping


<keras.callbacks.History at 0x7fe6399ab9b0>

In [33]:
# train over both validation and train

allgram_model_test.fit(train_allgram, IMDB_train_y, 
          validation_data=None, batch_size=200, epochs=10, verbose=1, callbacks=[earlystop], shuffle=True)

Epoch 1/2
Epoch 2/2






<keras.callbacks.History at 0x7fe641a0e630>

In [76]:
allgram_model_test.save('rmsprop_model.h5')

In [35]:
# apply on test set to see performance:
score = allgram_model_test.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [36]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.41618166644096377
Test accuracy:  0.86836


### Sixth Attempt: Change architecture

In [37]:
from keras.regularizers import l2

def baseline_model():
    model = Sequential()
    model.add(Dense(500, input_shape=(30000,), activation='relu', kernel_regularizer=l2(1e-1)))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [51]:
four_layer_model = baseline_model()
# train unigram model w/o stop words
four_layer_model.fit(train_allgram[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 30s - loss: 7.2849 - acc: 0.7903 - val_loss: 0.9111 - val_acc: 0.8195
Epoch 2/20
 - 26s - loss: 0.9164 - acc: 0.8289 - val_loss: 0.9520 - val_acc: 0.8382
Epoch 3/20
 - 27s - loss: 0.9126 - acc: 0.8394 - val_loss: 0.9040 - val_acc: 0.8424
Epoch 4/20
 - 26s - loss: 0.8716 - acc: 0.8511 - val_loss: 0.8277 - val_acc: 0.8534
Epoch 5/20
 - 27s - loss: 0.8045 - acc: 0.8576 - val_loss: 0.7767 - val_acc: 0.8543
Epoch 6/20
 - 26s - loss: 0.7469 - acc: 0.8703 - val_loss: 0.7942 - val_acc: 0.8563
Epoch 7/20
 - 27s - loss: 0.6973 - acc: 0.8807 - val_loss: 0.7788 - val_acc: 0.8675
Epoch 8/20
 - 26s - loss: 0.6691 - acc: 0.8874 - val_loss: 0.7280 - val_acc: 0.8651
Epoch 9/20
 - 27s - loss: 0.6363 - acc: 0.8977 - val_loss: 0.7243 - val_acc: 0.8663
Epoch 10/20
 - 27s - loss: 0.5791 - acc: 0.9073 - val_loss: 0.6943 - val_acc: 0.8676
Epoch 11/20
 - 27s - loss: 0.5443 - acc: 0.9183 - val_loss: 0.7375 - val_acc: 0.8609
Epoch 12/20
 - 26s - los

<keras.callbacks.History at 0x7fe63f248be0>

In [61]:
# train on both validation and test
four_layer_model.fit(train_allgram, IMDB_train_y, batch_size=200, epochs=20, verbose=2)

Epoch 1/5
 - 34s - loss: 0.4139 - acc: 0.9214
Epoch 2/5
 - 34s - loss: 0.4186 - acc: 0.9235
Epoch 3/5
 - 34s - loss: 0.4066 - acc: 0.9244
Epoch 4/5
 - 34s - loss: 0.4120 - acc: 0.9237
Epoch 5/5
 - 34s - loss: 0.4053 - acc: 0.9226


<keras.callbacks.History at 0x7fe63f468a90>

In [70]:
# update regularization

four_layer_model.layers[0].kernel_regularizer.l2 = 1e-3

In [71]:
four_layer_model.layers[0].kernel_regularizer.l2

0.001

In [72]:
# train on both validation and test
four_layer_model.fit(train_allgram, IMDB_train_y, batch_size=200, epochs=5, verbose=2)

Epoch 1/5
 - 34s - loss: 0.4001 - acc: 0.9257
Epoch 2/5
 - 34s - loss: 0.3992 - acc: 0.9236
Epoch 3/5
 - 34s - loss: 0.4041 - acc: 0.9219
Epoch 4/5
 - 34s - loss: 0.3970 - acc: 0.9255
Epoch 5/5
 - 34s - loss: 0.4055 - acc: 0.9220


<keras.callbacks.History at 0x7fe6288f2c50>

In [73]:
# apply on test set to see performance:
score = four_layer_model.evaluate(test_allgram, IMDB_test_y, verbose=2)

In [74]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.41618166644096377
Test accuracy:  0.86836


In [38]:
four_layer_model = baseline_model()
# train unigram model w/ stop words
four_layer_model.fit(train_allgram_w_sw[:15000], IMDB_train_y[:15000], 
          validation_data=(train_allgram_w_sw[15000:], IMDB_train_y[15000:]), batch_size=200, epochs=20, verbose=2)

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
 - 32s - loss: 7.3090 - acc: 0.7878 - val_loss: 0.7889 - val_acc: 0.8338
Epoch 2/20
 - 27s - loss: 0.8378 - acc: 0.8275 - val_loss: 0.9127 - val_acc: 0.7980
Epoch 3/20
 - 27s - loss: 0.7939 - acc: 0.8347 - val_loss: 0.7426 - val_acc: 0.8490
Epoch 4/20
 - 27s - loss: 0.7824 - acc: 0.8432 - val_loss: 0.7718 - val_acc: 0.8377
Epoch 5/20
 - 27s - loss: 0.7911 - acc: 0.8439 - val_loss: 0.7493 - val_acc: 0.8358
Epoch 6/20
 - 27s - loss: 0.7444 - acc: 0.8633 - val_loss: 0.7569 - val_acc: 0.8563
Epoch 7/20
 - 27s - loss: 0.7366 - acc: 0.8592 - val_loss: 0.7125 - val_acc: 0.8505
Epoch 8/20
 - 27s - loss: 0.6861 - acc: 0.8679 - val_loss: 0.7203 - val_acc: 0.8556
Epoch 9/20
 - 27s - loss: 0.6728 - acc: 0.8747 - val_loss: 0.7226 - val_acc: 0.8594
Epoch 10/20
 - 27s - loss: 0.6419 - acc: 0.8781 - val_loss: 0.6485 - val_acc: 0.8708
Epoch 11/20
 - 27s - loss: 0.6137 - acc: 0.8859 - val_loss: 0.6396 - val_acc: 0.8630
Epoch 12/20
 - 28s - los

<keras.callbacks.History at 0x7f1dd2de73c8>

In [40]:
# train on both validation and test
four_layer_model.fit(train_allgram_w_sw, IMDB_train_y, batch_size=200, epochs=20, verbose=2)

Epoch 1/20
 - 38s - loss: 0.5182 - acc: 0.8999
Epoch 2/20
 - 37s - loss: 0.4849 - acc: 0.9070
Epoch 3/20
 - 36s - loss: 0.4675 - acc: 0.9064
Epoch 4/20
 - 36s - loss: 0.4618 - acc: 0.9092
Epoch 5/20
 - 37s - loss: 0.4530 - acc: 0.9124
Epoch 6/20
 - 36s - loss: 0.4568 - acc: 0.9098
Epoch 7/20
 - 36s - loss: 0.4433 - acc: 0.9134
Epoch 8/20
 - 36s - loss: 0.4508 - acc: 0.9125
Epoch 9/20
 - 36s - loss: 0.4400 - acc: 0.9168
Epoch 10/20
 - 37s - loss: 0.4372 - acc: 0.9172
Epoch 11/20
 - 36s - loss: 0.4263 - acc: 0.9154
Epoch 12/20
 - 36s - loss: 0.4196 - acc: 0.9187
Epoch 13/20
 - 36s - loss: 0.4253 - acc: 0.9193
Epoch 14/20
 - 36s - loss: 0.4168 - acc: 0.9155
Epoch 15/20
 - 36s - loss: 0.4143 - acc: 0.9217
Epoch 16/20
 - 36s - loss: 0.4212 - acc: 0.9182
Epoch 17/20
 - 36s - loss: 0.4220 - acc: 0.9214
Epoch 18/20
 - 36s - loss: 0.4192 - acc: 0.9227
Epoch 19/20
 - 37s - loss: 0.4068 - acc: 0.9231
Epoch 20/20
 - 36s - loss: 0.4159 - acc: 0.9208


<keras.callbacks.History at 0x7f1dcf4b5cc0>

In [None]:
# update regularization

four_layer_model.layers[0].kernel_regularizer.l2 = 1e-3

In [None]:
four_layer_model.layers[0].kernel_regularizer.l2

In [None]:
# train on both validation and test
four_layer_model.fit(train_allgram, IMDB_train_y, batch_size=200, epochs=5, verbose=2)

In [42]:
# apply on test set to see performance:
score = four_layer_model.evaluate(test_allgram_w_sw, IMDB_test_y, verbose=2)

In [43]:
print('Test loss: ', score[0])
print('Test accuracy: ', score[1])

Test loss:  0.5208094476318359
Test accuracy:  0.87516
