## Importing libraries and packages

In [1]:
import numpy as np
np.random.seed(1337)  # for reproducibility

In [2]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D
from keras.datasets import imdb
from keras import backend as K

Using Theano backend.


In [3]:
#embedding
max_features = 5000
maxlen = 400

embedding_dims = 50

#convolution
nb_filter = 250
filter_length = 3

hidden_dims = 250

#training
nb_epoch = 2
batch_size = 32

## Loading Data

In [4]:
import pandas as pd
data = pd.read_csv( "labeledTrainData.tsv", header=0, 
 delimiter="\t", quoting=3, encoding="utf-8" )

In [5]:
data['review'][0:5], data['sentiment'][0:5]

(0    "With all this stuff going down at the moment ...
 1    "\"The Classic War of the Worlds\" by Timothy ...
 2    "The film starts with a manager (Nicholas Bell...
 3    "It must be assumed that those who praised thi...
 4    "Superbly trashy and wondrously unpretentious ...
 Name: review, dtype: object, 0    1
 1    1
 2    0
 3    0
 4    1
 Name: sentiment, dtype: int64)

In [6]:
data['review'][0]

u'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finall

## Cleaning data

In [7]:
import re
k = []
for i in data['review']:
    k.append(i.encode('utf-8'))

In [8]:
from keras.preprocessing.text import Tokenizer, base_filter

max_vocab_size = 50000

# `filters` specify what characters to get rid of
# `base_filter()` includes basic punctuation;
# I like to extend it with common unicode punctuation
tokenizer = Tokenizer(nb_words=max_vocab_size,
                     filters=base_filter()+'“”–')

# fit the tokenizer
tokenizer.fit_on_texts(k)
X_train = []
# we also want to keep track of the actual vocab size
# we'll need this later
# note: we add one because `0` is a reserved index in keras' tokenizer
vocab_size = len(tokenizer.word_index) + 1
for seq in tokenizer.texts_to_sequences(k):
       X_train.append(seq)

In [9]:
type(X_train)

list

In [10]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=100)

Pad sequences (samples x time)


In [11]:
type(X_train)

numpy.ndarray

In [12]:
Y_train = np.asarray(list(data['sentiment']))
print(Y_train)

[1 1 0 ..., 0 0 1]


## Building model

In [13]:
print('Build model...')
model = Sequential()

Build model...


In [14]:
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(50000,
                    embedding_dims,
                    input_length=100,
                    dropout=0.2))

In [15]:
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter=32,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))

In [16]:
# we use max pooling:
model.add(MaxPooling1D(pool_length=model.output_shape[1]))

In [17]:
# We flatten the output of the conv layer,
# so that we can add a vanilla dense layer:
model.add(Flatten())


In [18]:
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))


In [19]:
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, Y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x129335510>

## Prediction

In [20]:
k1=model.predict(X_train)

In [21]:
k1

array([[ 0.70430762],
       [ 0.65062225],
       [ 0.08837233],
       ..., 
       [ 0.01092899],
       [ 0.4599663 ],
       [ 0.97542989]], dtype=float32)

In [22]:
k2=model.predict_classes(X_train)



In [23]:
k2

array([[1],
       [1],
       [0],
       ..., 
       [0],
       [0],
       [1]], dtype=int32)

In [None]:
# def predict_classes(model, x, batch_size=32, verbose=1):
#     '''Generate class predictions for the input samples
#     batch by batch.
#     # Arguments
#         x: input data, as a Numpy array or list of Numpy arrays
#             (if the model has multiple inputs).
#         batch_size: integer.
#         verbose: verbosity mode, 0 or 1.
#     # Returns
#         A numpy array of class predictions.
#     '''
#     proba = model.predict(x, batch_size=batch_size, verbose=verbose)
#     if proba.shape[-1] > 1:
#         return proba.argmax(axis=-1)
#     else:
#         return (proba > 0.8).astype('int32')

In [None]:
# k3 = predict_classes(model, X_train)

In [None]:
# k3

In [None]:
# X_train[1]

In [None]:
# data['review'][1]

## Test set

In [25]:
import re
def run():
	file = open("all.txt")
 	ret = []
	while 1:
		line = file.readline()
		if not line:
			break
		if line[:8] == 'Customer':
			line = line[8:]
			line = line.strip()
			line = line.replace(',',';')
			line = line.replace('.',';')
			line = line.replace(':',';')
			ret.extend(line.split(';'))
	res = []
	for k in ret:
		if k.strip() != "":
			res.append(k.strip())
	print res
	return res

def main():
	run()



if __name__ == '__main__':
	main()

['hello', 'I bought simcity and told I cannot install it due to the product code already being used', 'is there anyway to get the code reset or am i out the money paid?', 'this is a very bad policy as I bought it NEW', 'and very bad customer service', 'I already bought it! What does a 15% discount do when I already paid $60 for it?', 'may i speak with your supervisor', 'please do', 'EA has just lost a customer for life', 'I will gladly post to every forum social networking site about this issue', 'and considering I asked to SPEAK to your supervisor and you come back with a reply', 'that is very bad customer service', 'shows where EA rates its customers', 'really?', 'i need the phone number for your corporate office and your employee id number', 'Wanting to ask about upgrades', 'I am doing ok', 'yourself?', 'I am wanting to inquire about adding HD service and/or DVR to my existing plan', 'I am deciding whether to stay with comcast or move to AT&T', 'How much extra would it be to add HD 

In [26]:
op = run()

['hello', 'I bought simcity and told I cannot install it due to the product code already being used', 'is there anyway to get the code reset or am i out the money paid?', 'this is a very bad policy as I bought it NEW', 'and very bad customer service', 'I already bought it! What does a 15% discount do when I already paid $60 for it?', 'may i speak with your supervisor', 'please do', 'EA has just lost a customer for life', 'I will gladly post to every forum social networking site about this issue', 'and considering I asked to SPEAK to your supervisor and you come back with a reply', 'that is very bad customer service', 'shows where EA rates its customers', 'really?', 'i need the phone number for your corporate office and your employee id number', 'Wanting to ask about upgrades', 'I am doing ok', 'yourself?', 'I am wanting to inquire about adding HD service and/or DVR to my existing plan', 'I am deciding whether to stay with comcast or move to AT&T', 'How much extra would it be to add HD 

In [27]:
import re
l = []
for i in op:
    l.append(i)#.encode('utf-8'))

In [28]:
from keras.preprocessing.text import Tokenizer, base_filter

max_vocab_size = 50000

# `filters` specify what characters to get rid of
# `base_filter()` includes basic punctuation;
# I like to extend it with common unicode punctuation
tokenizer = Tokenizer(nb_words=max_vocab_size,
                     filters=base_filter()+'“”–')

# fit the tokenizer
tokenizer.fit_on_texts(l)
test2 = []
# we also want to keep track of the actual vocab size
# we'll need this later
# note: we add one because `0` is a reserved index in keras' tokenizer
vocab_size = len(tokenizer.word_index) + 1
for seq in tokenizer.texts_to_sequences(l):
       test2.append(seq)

In [29]:
print('Pad sequences (samples x time)')
test2 = sequence.pad_sequences(test2, maxlen=100)

Pad sequences (samples x time)


In [30]:
k5=model.predict(test2)

In [31]:
k5

array([[ 0.6150341 ],
       [ 0.93774295],
       [ 0.82692462],
       [ 0.26927882],
       [ 0.35284719],
       [ 0.68346202],
       [ 0.5176912 ],
       [ 0.87635094],
       [ 0.70256692],
       [ 0.68085206],
       [ 0.90621799],
       [ 0.40417346],
       [ 0.59264529],
       [ 0.49847844],
       [ 0.88083476],
       [ 0.76779288],
       [ 0.32717764],
       [ 0.67836237],
       [ 0.89299691],
       [ 0.459539  ],
       [ 0.90199977],
       [ 0.45718169],
       [ 0.76030016],
       [ 0.69038332],
       [ 0.25563088],
       [ 0.92387486],
       [ 0.79054242],
       [ 0.69038332],
       [ 0.8355037 ],
       [ 0.61352098],
       [ 0.51107204],
       [ 0.8019473 ],
       [ 0.65174818],
       [ 0.59006035],
       [ 0.70040226],
       [ 0.71724451],
       [ 0.81642812],
       [ 0.37185359],
       [ 0.52999276],
       [ 0.59694958],
       [ 0.68980432],
       [ 0.81612951],
       [ 0.89074898],
       [ 0.64480108],
       [ 0.68236721],
       [ 0

In [32]:
k5 = model.predict_classes(test2)



In [33]:
k5

array([[1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
    

In [34]:
def bin2Scaled(polarity, magnitude):
	# This function converts a binary measurement (of sentiment analysis)
	# into a 7-point assesment as is used currently at Google.
	if polarity==1:
		return 5 + magnitude*2
	else:
		return 5 - magnitude*5

In [44]:
bin2Scaled(1,k5)

array([[7],
       [7],
       [7],
       [5],
       [5],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [5],
       [7],
       [5],
       [7],
       [7],
       [5],
       [7],
       [7],
       [5],
       [7],
       [5],
       [7],
       [7],
       [5],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [5],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [5],
       [7],
       [7],
       [7],
       [5],
       [7],
       [5],
       [5],
       [7],
       [7],
       [7],
       [7],
       [5],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [7],
       [5],
       [7],
       [7],
       [7],
    