# sentiment analysis(opinion mining) on IMDb(Internet Movie Database) comment

## Global setting

In [None]:
import logging

# Global setting
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Constants
IMDB_MLP_MODEL_NAME = 'imdb_mlp.model'
IMDB_MLP_MODEL_WEIG = 'imdb_mlp.h5'

logging.basicConfig(format=LOG_FORMAT)
logger = logging.getLogger('IMDBb')
logger.setLevel(logging.INFO)  

In [None]:
from urllib.request import urlretrieve
import os
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"  
# 84.1 mb num:50,000, train/test 25,000
# path = "aclImdb_v1.tar.gz"

filepath = 'datas/aclImdb_v1.tar.gz'
dataPath = 'datas/aclImdb'

if not os.path.isfile(filepath):
    print('Downloading from {}...'.format(url))
    result = urlretrieve(url, filepath)
    print('download: {}'.format(result))

if not os.path.isdir(dataPath):
    print('Extracting {} to datas...'.format(filepath))
    tfile = tarfile.open(filepath, 'r:gz')
    result = tfile.extractall('datas/')

```
├── README
├── imdb.vocab
├── imdbEr.txt
├── test
│   ├── labeledBow.feat
│   ├── neg
│   ├── pos
│   ├── urls_neg.txt
│   └── urls_pos.txt
└── train
    ├── labeledBow.feat
    ├── neg
    ├── pos
    ├── unsup
    ├── unsupBow.feat
    ├── urls_neg.txt
    ├── urls_pos.txt
    └── urls_unsup.txt

```

## data preprocessing

In [7]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [None]:
import re


def remove_tag(text):
    # Remove HTML markers
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)


'''
    Read data from IMDb folders

    @param filetype(str):
        "train" or "test"

    @return:
        Tuple(List of labels, List of articles)
'''
def read_files(filetype):
    file_list = []
    positive_path = os.path.join(os.path.join(dataPath, filetype), 'pos')
    for f in os.listdir(positive_path):
        file_list.append(os.path.join(positive_path, f))

    negative_path = os.path.join(os.path.join(dataPath, filetype), 'neg')
    for f in os.listdir(negative_path):
        file_list.append(os.path.join(negative_path, f))

    logger.debug('Read {} with {} files...'.format(filetype, len(file_list)))
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    for fi in file_list:
        logger.debug('Read {}...'.format(fi))
        with open(fi, encoding='utf8') as fh:
            all_texts += [remove_tag(" ".join(fh.readlines()))]

    return all_labels, all_texts

logger.info('Reading training data...')
train_labels, train_text = read_files('train')
logger.info('Reading testing data...')
test_labels, test_text = read_files('test')


In [11]:
# check data 
print("training data size:%d \n" % (len(train_text)))
print("testing data size:%d \n" % (len(test_text)))

print("feature :%s \n" % (test_text[0]))
print("label :%s \n" % (test_labels[0]))


training data size:25000
testing data size:25000
feature :Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as "Teachers" My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers" The scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled  at  High A classic line: INSPECTOR: I'm here to sack one of your teachers STUDENT: Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn't!
label :1


### Tokenize

In [15]:
MAX_LEN_OF_TOKEN = 100

logger.info('Tokenizing document...')

# Create a dictionary of 2,000 words
token = Tokenizer(num_words=2000)
# Read in all training text and select top 2,000 words according to frequency sorting descendingly
token.fit_on_texts(train_text)

logger.info('Total {} document being handled...'.format(token.document_count))
logger.info('Top 10 word index:')
c = 0
for t, i in token.word_index.items():
    print("\t'{}'\t{}".format(t, i))
    c += 1
    if c == 10:
        break
print("")
logger.info('Translating raw text into token number list...')
# convert text to vector
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

logger.info('Padding/Trimming the token number list to length={}...'.format(MAX_LEN_OF_TOKEN))
# padding 
x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN_OF_TOKEN)
x_test = sequence.pad_sequences(x_test_seq, maxlen=MAX_LEN_OF_TOKEN)


In [23]:
# compare the text and vector
print(train_text[0])
print('\n')
print(x_train_seq[0])
print('\n')

# show padding result
print("before length: %d" % (len(x_train_seq[0])))
print("before sequence: %s" % (x_train_seq[0]))
print("\n")
print("after length: %d" % (len(x_train[0])))
print("after sequence: %s" % (x_train[0]))

Bromwell High is a cartoon comedy It ran at the same time as some other programs about school life such as "Teachers" My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers" The scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools I knew and their students When I saw the episode in which a student repeatedly tried to burn down the school I immediately recalled  at  High A classic line: INSPECTOR: I'm here to sack one of your teachers STUDENT: Welcome to Bromwell High I expect that many adults of my age think that Bromwell High is far fetched What a pity that it isn't!


[301, 6, 3, 1070, 212, 8, 29, 1, 168, 56, 13, 45, 81, 40, 388, 113, 134, 13, 58, 149, 7, 1, 472, 68, 5, 256, 11, 1984, 6, 72, 5, 636, 70, 6, 1, 5, 1, 1515, 33, 66, 64, 203, 140, 63, 1248, 1, 4, 1, 218, 915, 28, 68, 4, 1, 10

before length: 107
before sequence: [301, 6, 3, 1070, 212, 8, 29, 1, 168, 56, 13, 45, 81, 40, 388, 113, 134, 13, 58, 149, 7, 1, 472, 68, 5, 256, 11, 1984, 6, 72, 5, 636, 70, 6, 1, 5, 1, 1515, 33, 66, 64, 203, 140, 63, 1248, 1, 4, 1, 218, 915, 28, 68, 4, 1, 10, 683, 2, 63, 1515, 51, 10, 209, 1, 391, 7, 59, 3, 1463, 789, 5, 178, 1, 388, 10, 1223, 29, 301, 3, 354, 341, 146, 132, 5, 27, 4, 124, 1463, 5, 301, 10, 525, 11, 106, 1487, 4, 58, 555, 100, 11, 301, 6, 225, 46, 3, 11, 8, 210]


after length: 100
after sequence: [   1  168   56   13   45   81   40  388  113  134   13   58  149    7
    1  472   68    5  256   11 1984    6   72    5  636   70    6    1
    5    1 1515   33   66   64  203  140   63 1248    1    4    1  218
  915   28   68    4    1   10  683    2   63 1515   51   10  209    1
  391    7   59    3 1463  789    5  178    1  388   10 1223   29  301
    3  354  341  146  132    5   27    4  124 1463    5  301   10  525
   11  106 1487    4   58  555  100   11  301    6  2

## build pure RNN model

In [24]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.models import model_from_json

In [25]:
MODEL_TYPE = 'mlp'
IS_RELOAD = False
if MODEL_TYPE == 'mlp':
    if os.path.isfile(IMDB_MLP_MODEL_NAME):
        # Reload model
        logger.debug('Reloading model from {}...'.format(IMDB_MLP_MODEL_NAME))
        IS_RELOAD = True
        with open(IMDB_MLP_MODEL_NAME, 'r') as f:
            loaded_model_json = f.read()
        model = model_from_json(loaded_model_json)
        model.load_weights(IMDB_MLP_MODEL_WEIG)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    else:
        model = Sequential()
        model.add(Embedding(output_dim=32,
                            input_dim=2000,
                            input_length=100))
        model.add(Dropout(0.2))
        '''Drop 20% neuron during training '''
        model.add(Flatten())
        model.add(Dense(units=256, activation='relu'))
        ''' Total 256 neuron in hidden layers'''
        model.add(Dropout(0.35))
        model.add(Dense(units=1, activation='sigmoid'))
        ''' Define output layer with 'sigmoid activation' '''

logger.info('Model summary:\n{}\n'.format(model.summary()))


## train model

In [28]:
if not IS_RELOAD:
    logger.info('Start training process...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    train_history = model.fit(x_train, train_labels, batch_size=100, epochs=10, verbose=2, validation_split=0.2)
    print("")
    # Serialized model
    print("\t[Info] Serialized Keras model to %s..." % (IMDB_MLP_MODEL_NAME))
    with open(IMDB_MLP_MODEL_NAME, 'w') as f:
        f.write(model.to_json())
    model.save_weights(IMDB_MLP_MODEL_WEIG)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10


 - 38s - loss: 0.5487 - acc: 0.7169 - val_loss: 0.4495 - val_acc: 0.8188


Epoch 2/10


 - 15s - loss: 0.3755 - acc: 0.8394 - val_loss: 0.5606 - val_acc: 0.7502


Epoch 3/10


 - 15s - loss: 0.3280 - acc: 0.8635 - val_loss: 0.4177 - val_acc: 0.8174


Epoch 4/10


 - 16s - loss: 0.2867 - acc: 0.8827 - val_loss: 0.4529 - val_acc: 0.8180


Epoch 5/10


 - 15s - loss: 0.2566 - acc: 0.8958 - val_loss: 0.5137 - val_acc: 0.8096


Epoch 6/10


 - 15s - loss: 0.2302 - acc: 0.9080 - val_loss: 0.6379 - val_acc: 0.7386


Epoch 7/10


 - 15s - loss: 0.2027 - acc: 0.9195 - val_loss: 0.8288 - val_acc: 0.7586


Epoch 8/10


 - 15s - loss: 0.1843 - acc: 0.9279 - val_loss: 0.5798 - val_acc: 0.8348


Epoch 9/10


 - 15s - loss: 0.1704 - acc: 0.9339 - val_loss: 0.6810 - val_acc: 0.7698


Epoch 10/10


 - 15s - loss: 0.1478 - acc: 0.9424 - val_loss: 0.8034 - val_acc: 0.7438


## Evaluation

In [None]:
logger.info('Start evaluation...')
scores = model.evaluate(x_test, test_labels, verbose=1)
print("")
logger.info('Score={}'.format(scores[1]))

predict_classes = model.predict_classes(x_test).reshape(-1)
print("")
sentiDict = {1: 'Pos', 0: 'Neg'}


def display_test_Sentiment(i):
    r'''
    Show prediction on i'th test data
    '''
    logger.debug('{}\'th test data:\n{}\n'.format(i, test_text[i]))
    logger.info(
        'Ground truth: {}; prediction result: {}'.format(sentiDict[test_labels[i]], sentiDict[predict_classes[i]]))


logger.info('Show prediction on 2\'th test data:')
display_test_Sentiment(2)
