# sentiment analysis(opinion mining) on IMDb(Internet Movie Database) comment

## Global setting

In [3]:
import logging

# Global setting
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Constants
IMDB_MLP_MODEL_NAME = 'imdb_mlp.model'
IMDB_MLP_MODEL_WEIG = 'imdb_mlp.h5'

logging.basicConfig(format=LOG_FORMAT)
logger = logging.getLogger('IMDBb')
logger.setLevel(logging.INFO)  

In [4]:
from urllib.request import urlretrieve
import os
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"  
# 84.1 mb num:50,000, train/test 25,000
# path = "aclImdb_v1.tar.gz"

filepath = 'datas/aclImdb_v1.tar.gz'
dataPath = 'datas/aclImdb'

if not os.path.isfile(filepath):
    print('Downloading from {}...'.format(url))
    result = urlretrieve(url, filepath)
    print('download: {}'.format(result))

if not os.path.isdir(dataPath):
    print('Extracting {} to datas...'.format(filepath))
    tfile = tarfile.open(filepath, 'r:gz')
    result = tfile.extractall('datas/')

Downloading from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz...
download: ('datas/aclImdb_v1.tar.gz', <http.client.HTTPMessage object at 0x000001A07A9D4518>)
Extracting datas/aclImdb_v1.tar.gz to datas...


```
├── README
├── imdb.vocab
├── imdbEr.txt
├── test
│   ├── labeledBow.feat
│   ├── neg
│   ├── pos
│   ├── urls_neg.txt
│   └── urls_pos.txt
└── train
    ├── labeledBow.feat
    ├── neg
    ├── pos
    ├── unsup
    ├── unsupBow.feat
    ├── urls_neg.txt
    ├── urls_pos.txt
    └── urls_unsup.txt

```

## data preprocessing

In [5]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
import re


def remove_tag(text):
    # Remove HTML markers
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)


'''
    Read data from IMDb folders

    @param filetype(str):
        "train" or "test"

    @return:
        Tuple(List of labels, List of articles)
'''
def read_files(filetype):
    file_list = []
    positive_path = os.path.join(os.path.join(dataPath, filetype), 'pos')
    for f in os.listdir(positive_path):
        file_list.append(os.path.join(positive_path, f))

    negative_path = os.path.join(os.path.join(dataPath, filetype), 'neg')
    for f in os.listdir(negative_path):
        file_list.append(os.path.join(negative_path, f))

    logger.debug('Read {} with {} files...'.format(filetype, len(file_list)))
    all_labels = ([1] * 12500 + [0] * 12500)
    all_texts = []
    for fi in file_list:
        logger.debug('Read {}...'.format(fi))
        with open(fi, encoding='utf8') as fh:
            all_texts += [remove_tag(" ".join(fh.readlines()))]

    return all_labels, all_texts

logger.info('Reading training data...')
train_labels, train_text = read_files('train')
logger.info('Reading testing data...')
test_labels, test_text = read_files('test')


2018-09-16 23:42:14,455 - IMDBb - INFO - Reading training data...
2018-09-16 23:43:42,961 - IMDBb - INFO - Reading testing data...


In [7]:
# check data 
print("training data size:%d \n" % (len(train_text)))
print("testing data size:%d \n" % (len(test_text)))

print("feature :%s \n" % (test_text[0]))
print("label :%s \n" % (test_labels[0]))


training data size:25000 

testing data size:25000 

feature :I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge. 

label :1 



### Tokenize

In [8]:
MAX_LEN_OF_TOKEN = 100

logger.info('Tokenizing document...')

# Create a dictionary of 2,000 words
token = Tokenizer(num_words=2000)
# Read in all training text and select top 2,000 words according to frequency sorting descendingly
token.fit_on_texts(train_text)

logger.info('Total {} document being handled...'.format(token.document_count))
logger.info('Top 10 word index:')
c = 0
for t, i in token.word_index.items():
    print("\t'{}'\t{}".format(t, i))
    c += 1
    if c == 10:
        break
print("")
logger.info('Translating raw text into token number list...')
# convert text to vector
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

logger.info('Padding/Trimming the token number list to length={}...'.format(MAX_LEN_OF_TOKEN))
# padding 
x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN_OF_TOKEN)
x_test = sequence.pad_sequences(x_test_seq, maxlen=MAX_LEN_OF_TOKEN)


2018-09-16 23:44:31,001 - IMDBb - INFO - Tokenizing document...
2018-09-16 23:44:35,570 - IMDBb - INFO - Total 25000 document being handled...
2018-09-16 23:44:35,571 - IMDBb - INFO - Top 10 word index:
2018-09-16 23:44:35,572 - IMDBb - INFO - Translating raw text into token number list...


	'the'	1
	'and'	2
	'a'	3
	'of'	4
	'to'	5
	'is'	6
	'in'	7
	'it'	8
	'i'	9
	'this'	10



2018-09-16 23:44:42,424 - IMDBb - INFO - Padding/Trimming the token number list to length=100...


In [9]:
# compare the text and vector
print(train_text[0])
print('\n')
print(x_train_seq[0])
print('\n')

# show padding result
print("before length: %d" % (len(x_train_seq[0])))
print("before sequence: %s" % (x_train_seq[0]))
print("\n")
print("after length: %d" % (len(x_train[0])))
print("after sequence: %s" % (x_train[0]))

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 

## build pure RNN model

In [10]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.models import model_from_json

In [11]:
MODEL_TYPE = 'mlp'
IS_RELOAD = False
if MODEL_TYPE == 'mlp':
    if os.path.isfile(IMDB_MLP_MODEL_NAME):
        # Reload model
        logger.debug('Reloading model from {}...'.format(IMDB_MLP_MODEL_NAME))
        IS_RELOAD = True
        with open(IMDB_MLP_MODEL_NAME, 'r') as f:
            loaded_model_json = f.read()
        model = model_from_json(loaded_model_json)
        model.load_weights(IMDB_MLP_MODEL_WEIG)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    else:
        model = Sequential()
        model.add(Embedding(output_dim=32,
                            input_dim=2000,
                            input_length=100))
        model.add(Dropout(0.2))
        '''Drop 20% neuron during training '''
        model.add(Flatten())
        model.add(Dense(units=256, activation='relu'))
        ''' Total 256 neuron in hidden layers'''
        model.add(Dropout(0.35))
        model.add(Dense(units=1, activation='sigmoid'))
        ''' Define output layer with 'sigmoid activation' '''

logger.info('Model summary:\n{}\n'.format(model.summary()))


2018-09-16 23:44:45,275 - IMDBb - INFO - Model summary:
None



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


## train model

In [12]:
if not IS_RELOAD:
    logger.info('Start training process...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    train_history = model.fit(x_train, train_labels, batch_size=100, epochs=10, verbose=2, validation_split=0.2)
    print("")
    # Serialized model
    print("\t[Info] Serialized Keras model to %s..." % (IMDB_MLP_MODEL_NAME))
    with open(IMDB_MLP_MODEL_NAME, 'w') as f:
        f.write(model.to_json())
    model.save_weights(IMDB_MLP_MODEL_WEIG)


2018-09-16 23:44:45,283 - IMDBb - INFO - Start training process...


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 22s - loss: 0.4783 - acc: 0.7592 - val_loss: 0.5382 - val_acc: 0.7432
Epoch 2/10
 - 1s - loss: 0.2662 - acc: 0.8902 - val_loss: 0.5053 - val_acc: 0.7796
Epoch 3/10
 - 1s - loss: 0.1545 - acc: 0.9456 - val_loss: 0.6859 - val_acc: 0.7410
Epoch 4/10
 - 1s - loss: 0.0818 - acc: 0.9724 - val_loss: 0.8090 - val_acc: 0.7594
Epoch 5/10
 - 1s - loss: 0.0484 - acc: 0.9835 - val_loss: 1.0459 - val_acc: 0.7334
Epoch 6/10
 - 1s - loss: 0.0379 - acc: 0.9861 - val_loss: 0.9912 - val_acc: 0.7714
Epoch 7/10
 - 1s - loss: 0.0298 - acc: 0.9895 - val_loss: 1.1950 - val_acc: 0.7488
Epoch 8/10
 - 1s - loss: 0.0262 - acc: 0.9901 - val_loss: 1.1282 - val_acc: 0.7694
Epoch 9/10
 - 1s - loss: 0.0302 - acc: 0.9886 - val_loss: 1.2108 - val_acc: 0.7614
Epoch 10/10
 - 1s - loss: 0.0238 - acc: 0.9914 - val_loss: 1.0297 - val_acc: 0.7950

	[Info] Serialized Keras model to imdb_mlp.model...


## Evaluation

In [13]:
logger.info('Start evaluation...')
scores = model.evaluate(x_test, test_labels, verbose=1)
print("")
logger.info('Score={}'.format(scores[1]))

predict_classes = model.predict_classes(x_test).reshape(-1)
print("")
sentiDict = {1: 'Pos', 0: 'Neg'}


def display_test_Sentiment(i):
    r'''
    Show prediction on i'th test data
    '''
    logger.debug('{}\'th test data:\n{}\n'.format(i, test_text[i]))
    logger.info(
        'Ground truth: {}; prediction result: {}'.format(sentiDict[test_labels[i]], sentiDict[predict_classes[i]]))


logger.info('Show prediction on 2\'th test data:')
display_test_Sentiment(2)


2018-09-16 23:45:19,139 - IMDBb - INFO - Start evaluation...




2018-09-16 23:45:20,369 - IMDBb - INFO - Score=0.81884





2018-09-16 23:45:21,238 - IMDBb - INFO - Show prediction on 2'th test data:
2018-09-16 23:45:21,239 - IMDBb - INFO - Ground truth: Pos; prediction result: Pos



