# Deep Learning Arch #1
## Embedding->CNN->LSTM->Dense

In [1]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import os
import nltk

Using TensorFlow backend.
  return f(*args, **kwds)


# data load and preparation

In [2]:
dataset_path = '../data/aclImdb'
train_positive_files = ['train/pos/'+f for f in os.listdir(dataset_path+'/train/pos') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/pos', f))]
train_negative_files = ['train/neg/'+f for f in os.listdir(dataset_path+'/train/neg') \
                        if os.path.isfile(os.path.join(dataset_path+'/train/neg', f))]
test_positive_files = ['test/pos/'+f for f in os.listdir(dataset_path+'/test/pos') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/pos', f))]
test_negative_files = ['test/neg/'+f for f in os.listdir(dataset_path+'/test/neg') \
                       if os.path.isfile(os.path.join(dataset_path+'/test/neg', f))]
all_files = list(set().union(train_positive_files,train_negative_files, test_positive_files, test_negative_files))

dataset = {'trainset':[], 
           'polarity':[], 
           'review':[]}

for file in all_files:
    polarity = file.split('.')[0].split('_')[1]
    with open(os.path.join(dataset_path, file), 'r') as text_file:
        dataset['trainset'].append(file.split('/')[0])
        dataset['polarity'].append(polarity)
        dataset['review'].append(text_file.readlines()[0])
dataframe = pd.DataFrame(data=dataset)

dataframe.head()

Unnamed: 0,trainset,polarity,review
0,test,4,"It must be remembered that the Gammera movies,..."
1,test,1,No Holds Barred is a movie that should in no w...
2,train,10,I consider myself a huge movie buff. I was sic...
3,test,9,I caught this one on cable and I was very surp...
4,train,1,Turkish Cinema has a big problem. Directors ar...


# preparation

In [3]:
stemmer = nltk.stem.porter.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords = [word for word in stopwords if word not in ['very', 'no', 'nor', 'not', 'few', 'more',
                                                        'most', 'just', 'doesn',  'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]]
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 's', 't', 'can', 'will']


In [4]:
def preprocess_reviews(review):
    tokens = nltk.tokenize.word_tokenize(review.lower())
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords]
    return " ".join(stemmed_tokens)

dataframe['clean_review'] = dataframe['review'].apply(preprocess_reviews)
dataframe.head()

Unnamed: 0,trainset,polarity,review,clean_review
0,test,4,"It must be remembered that the Gammera movies,...","must rememb gammera movi , like mani first-ser..."
1,test,1,No Holds Barred is a movie that should in no w...,no hold bar movi should no way ever taken seri...
2,train,10,I consider myself a huge movie buff. I was sic...,consid huge movi buff . sick couch pop film . ...
3,test,9,I caught this one on cable and I was very surp...,caught one cabl veri surpris . steadi direct g...
4,train,1,Turkish Cinema has a big problem. Directors ar...,turkish cinema big problem . director n't inte...


In [5]:
dataframe.to_pickle('../data/dataframe_processed_reviews.pkl')

In [6]:
train_reviews = dataframe[dataframe['trainset'] == 'train']['clean_review'].values.tolist()
train_classes = dataframe[dataframe['trainset'] == 'train']['polarity'].values.tolist()
labels_train = np.array(train_classes)
test_reviews = dataframe[dataframe['trainset'] == 'test']['clean_review'].values.tolist()
test_classes = dataframe[dataframe['trainset'] == 'test']['polarity'].values.tolist()
labels_test = np.array(test_classes)

labels_categorical_train = to_categorical(labels_train, num_classes=11)
labels_categorical_test = to_categorical(labels_test, num_classes=11)

In [7]:
# find the vocab size
vocab = {}
max_length = 0
for review in train_reviews:
    tokens = review.split()
    
    if len(tokens) > max_length:
        max_length = len(tokens)
    
    for token in tokens:
        if token in vocab:
            vocab[token] += 1
        else:
            vocab[token] = 0

print(len(vocab))
print(max_length)

89401
1796


In [12]:
vocab_size = len(vocab) + 100
encoded_docs = [one_hot(d, vocab_size) for d in train_reviews]
print(encoded_docs)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[75882 44527 52217 ...     0     0     0]
 [28224 70033 50182 ...     0     0     0]
 [82201 81466 44722 ...     0     0     0]
 ...
 [54567  4434 12497 ...     0     0     0]
 [ 5162 59188 11044 ...     0     0     0]
 [50936 67188 85032 ...     0     0     0]]


In [13]:
vocab_size = len(vocab) + 100
encoded_docs_test = [one_hot(d, vocab_size) for d in test_reviews]
print(encoded_docs_test)
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
print(padded_docs_test)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[[88110 39503 66478 ...     0     0     0]
 [60865 51889 30522 ...     0     0     0]
 [20294  6003 82966 ...     0     0     0]
 ...
 [71454 52925 22192 ...     0     0     0]
 [ 6451  6003 58122 ...     0     0     0]
 [84333 31870 83348 ...     0     0     0]]


In [14]:
model1 = Sequential()
model1.add(Embedding(vocab_size, 128, input_length=max_length))
model1.add(Flatten())
model1.add(Dense(32, activation='relu'))
model1.add(Dense(11, activation='softmax'))
# compile the model
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summarize the model
print('################# Model 1 ###############')
print(model1.summary())

################# Model 1 ###############
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1796, 128)         11456128  
_________________________________________________________________
flatten_1 (Flatten)          (None, 229888)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                7356448   
_________________________________________________________________
dense_2 (Dense)              (None, 11)                363       
Total params: 18,812,939
Trainable params: 18,812,939
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
model1.fit(padded_docs, labels_categorical_train, epochs=5, verbose=1, validation_split=0.2)
# evaluate the model
loss, accuracy = model1.evaluate(padded_docs_test, labels_categorical_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 27.252000


In [17]:
model2 = Sequential()
model2.add(Embedding(vocab_size, 128, input_length=max_length))
model2.add(Conv1D(64, 5, activation='relu'))
model2.add(MaxPooling1D(pool_size=4))
model2.add(LSTM(100))
model2.add(Dense(11, activation='softmax'))
# compile the model
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print('################# Model 2 ###############')
print(model2.summary())

################# Model 2 ###############
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1796, 128)         11456128  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1792, 64)          41024     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 448, 64)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_4 (Dense)              (None, 11)                1111      
Total params: 11,564,263
Trainable params: 11,564,263
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
model2.fit(padded_docs, labels_categorical_train, epochs=5, verbose=1, validation_split=0.2)
# evaluate the model
loss, accuracy = model2.evaluate(padded_docs_test, labels_categorical_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 20.088000


In [None]:
model1.save()