In [17]:
import pandas as pd

# 0	label	critical/trivial
# 1	title	string
# 2	review	string
# 3	Overall rating	1-5
# 4	aspect rating-Value	1-5
# 5	aspect rating-Location	1-5
# 6	aspect rating-Rooms	1-5
# 7	aspect rating-Cleanliness	1-5
# 8	aspect rating-Service	1-5
# 9	aspect rating-Sleep Quality	1-5
# 10	aspect rating-Check in / front desk	1-5
# 11	aspect rating-Business service (e.g., internet access)
# 12	travel_type	no fill in : 0
        # traveled solo：1
        # traveled as a couple：2
        # traveled with family：3
        # traveled with friends：4
        # traveled on business：5

def get_hotelreview_dataframe(filepath):
    with open(filepath,'r', encoding='UTF-8') as f :
        label = []
        title = []
        content = []
        rating =[]
        aspect_value = []
        aspect_location = []
        aspect_rooms = []
        aspect_cleaness = []
        aspect_service = []
        aspect_sleep = []
        traveler_type = []
        response_day = []
        text = []
        
        for line in f.read().splitlines() :
            items = line.split('\t')
            
            label.append(items[0])
            title.append(items[1])
            content.append(items[2])
            rating.append(items[3])
            aspect_value.append(items[4])
            aspect_location.append(items[5])
            aspect_rooms.append(items[6])
            aspect_cleaness.append(items[7])
            aspect_service.append(items[8])
            aspect_sleep.append(items[9])
            traveler_type.append(items[10])
            response_day.append(items[11])
            text.append(items[1] + " " + items[2])
            
        hotel_review = {
            "label" : label, "title" : title, "content" : content, "rating" : rating, 
            "aspect_value" : aspect_value,
            "aspect_location" : aspect_location,
            "aspect_rooms" : aspect_rooms,
            "aspect_cleaness" : aspect_cleaness,
            "aspect_service" : aspect_service,
            "aspect_sleep" : aspect_sleep,
            "traveler_type" : traveler_type,
            "text" : text
        }
        return pd.DataFrame(hotel_review)
    
def preprocess_data(raw_df):
    raw_df['label'] =  raw_df['label'].map({'critical' : 0, 'trivial' : 1}).astype(int)
    #raw_df['traveler_type'] =  raw_df['traveler_type'].map({0 : 'no fill in', 1 : 'traveled with family', 2 : 'traveled with friends', 3 : 'traveled as a couple', 4 : 'traveled on business' , 5 : 'traveled solo'}).astype(int)
    return raw_df

In [25]:
df_training =  get_hotelreview_dataframe('review_train_0608.txt')
df_test =  get_hotelreview_dataframe('review_test_0608.txt')

cols = ['label', 'title', 'content', 'rating', 'aspect_value', 'aspect_location',  'aspect_rooms', 'aspect_cleaness',  'aspect_service', 'aspect_sleep', 'traveler_type'] #define df order
print(len(df_training))
df_training = df_training[cols]


trainingData = preprocess_data(df_training)
testData = preprocess_data(df_test)


print(trainingData[:5])

9745
   label                         title  \
0      0  Exceptional customer service   
1      0                   Outstanding   
2      1                          stay   
3      0                  Weekend away   
4      0    Convenient docklands hotel   

                                             content rating aspect_value  \
0  This stay hotel rating anniversary stay perfec...      5            5   
1  We stayed nights June room product excellent d...      3            3   
2  Stayed June Sun 7th June concert o2 love moder...      5            4   
3  Spent enjoyable weekend Hilton Canary Wharf st...      5            0   
4  We stayed going concert O2 comfortable hotel e...      5            5   

  aspect_location aspect_rooms aspect_cleaness aspect_service aspect_sleep  \
0               5            4               5              5            0   
1               4            5               4              2            4   
2               4            5               5     

In [19]:
import numpy as np
embeddings_index = dict()
f = open('./glove.6B.300d.txt', encoding='UTF-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [21]:
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras import optimizers
from keras.utils import np_utils

from sklearn.model_selection import KFold
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape
from keras.layers import Convolution1D, GlobalMaxPooling1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional, BatchNormalization
from keras.layers.merge import concatenate


m_max_seq_len =30
m_max_num_vocab = 20000
tokenizer = Tokenizer(num_words = m_max_num_vocab, lower=True, split=" ", char_level=False)
EMBEDDING_DIM = 100

predicted = []
expected = []

tokenizer.fit_on_texts(trainingData['text'])
x_train_seq = tokenizer.texts_to_sequences(trainingData['text'])
x_train = sequence.pad_sequences(x_train_seq, maxlen = m_max_seq_len)
y_trainOneHot = np_utils.to_categorical(trainingData['label'])

x_test_seq = tokenizer.texts_to_sequences(testData['text'])
x_test = sequence.pad_sequences(x_test_seq, maxlen = m_max_seq_len)
y_testOneHot = np_utils.to_categorical(testData['label'])




# prepare embedding matrix
word_index = tokenizer.word_index
num_words = min(m_max_num_vocab, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= m_max_num_vocab:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector



main_input = Input(shape=(m_max_seq_len,), dtype='float64')
embedder = Embedding(m_max_num_vocab, 
                    EMBEDDING_DIM, 
                    input_length=m_max_seq_len, 
                    weights=[embedding_matrix], 
                    trainable=False)


embed = embedder(main_input)
# cnn1，kernel_size = 3
conv1_1 = Convolution1D(256, 3, padding='same')(embed)
bn1_1 = BatchNormalization()(conv1_1)
relu1_1 = Activation('relu')(bn1_1)
drop1_1 = Dropout(0.35)(relu1_1 )
conv1_2 = Convolution1D(128, 3, padding='same')(drop1_1)
bn1_2 = BatchNormalization()(conv1_2)
relu1_2 = Activation('relu')(bn1_2)
drop1_2 = Dropout(0.35)(relu1_2 )
cnn1 = MaxPool1D(pool_size=4)(drop1_2)

# cnn2，kernel_size = 4
conv2_1 = Convolution1D(256, 4, padding='same')(embed)
bn2_1 = BatchNormalization()(conv2_1)
relu2_1 = Activation('relu')(bn2_1)
drop2_1 = Dropout(0.35)(relu2_1 )
conv2_2 = Convolution1D(128, 4, padding='same')(drop2_1)
bn2_2 = BatchNormalization()(conv2_2)
relu2_2 = Activation('relu')(bn2_2)
drop2_2 = Dropout(0.35)(relu2_2 )
cnn2 = MaxPool1D(pool_size=4)(drop2_2)

# cnn3，kernel_size = 5
conv3_1 = Convolution1D(256, 5, padding='same')(embed)
bn3_1 = BatchNormalization()(conv3_1)
relu3_1 = Activation('relu')(bn3_1)
drop3_1 = Dropout(0.35)(relu3_1 )
conv3_2 = Convolution1D(128, 5, padding='same')(drop3_1)
bn3_2 = BatchNormalization()(conv3_2)
relu3_2 = Activation('relu')(bn3_2)
drop3_2 = Dropout(0.35)(relu3_2 )
cnn3 = MaxPool1D(pool_size=4)(drop3_2)

# concatenate above 3 convolution layers
cnn = concatenate([cnn1,cnn2,cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.5)(flat)
fc = Dense(512)(drop)
bn = BatchNormalization()(fc)
main_output = Dense(2, activation='sigmoid')(bn)
model = Model(inputs = main_input, outputs = main_output)



main_input = Input(shape=(m_max_seq_len,), dtype='float64')
embed = Embedding(m_max_num_vocab, 
                    EMBEDDING_DIM, 
                    input_length=m_max_seq_len, 
                    weights=[embedding_matrix], 
                    trainable=False)(main_input)

cnn = Convolution1D(256, 3, padding='same', strides = 1, activation='relu')(embed)
cnn = MaxPool1D(pool_size=4)(cnn)
cnn = Flatten()(cnn)
cnn = Dense(256)(cnn)
rnn = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embed)
rnn = Dense(256)(rnn)
con = concatenate([cnn,rnn], axis=-1)
main_output = Dense(2, activation='softmax')(con)
model = Model(inputs = main_input, outputs = main_output)



model.summary()
optmzr = optimizers.Adam(lr=0.001)
model.compile(loss = 'categorical_crossentropy', optimizer=optmzr, metrics = ['accuracy'])
train_history = model.fit(x_train, y_trainOneHot, batch_size = 32, epochs = 10, verbose = 2, validation_data=(x_test, y_testOneHot))
print('model training completed!!')

KeyError: 'text'

In [None]:

pre_probability = model.predict(x_test_review)
prediction = pre_probability.argmax(axis=-1)


from sklearn import metrics
print("Classification report for classifier:\n%s\n"
    % ( metrics.classification_report(y_test_review, prediction)))

import pandas_ml
from pandas_ml import ConfusionMatrix
confusion_matrix = ConfusionMatrix(y_test_review, prediction)
print("Confusion matrix:\n%s" % confusion_matrix)

In [None]:
from sklearn.metrics import accuracy_score
print('accuracy = ', accuracy_score(y_test_review, prediction))

In [None]:
from keras import backend as K  
from keras.layers import Layer  
from keras import initializers, regularizers, constraints  
  
def dot_product(x, kernel):  
    """ 
    Wrapper for dot product operation, in order to be compatible with both 
    Theano and Tensorflow 
    Args: 
        x (): input 
        kernel (): weights 
    Returns: 
    """  
    if K.backend() == 'tensorflow':  
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)  
    else:  
        return K.dot(x, kernel)  
  
  
class AttentionWithContext(Layer):  
    """ 
    Attention operation, with a context/query vector, for temporal data. 
    Supports Masking. 
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf] 
    "Hierarchical Attention Networks for Document Classification" 
    by using a context vector to assist the attention 
    # Input shape 
        3D tensor with shape: `(samples, steps, features)`. 
    # Output shape 
        2D tensor with shape: `(samples, features)`. 
    How to use: 
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. 
    The dimensions are inferred based on the output shape of the RNN. 
    Note: The layer has been tested with Keras 2.0.6 
    Example: 
        model.add(LSTM(64, return_sequences=True)) 
        model.add(AttentionWithContext()) 
        # next add a Dense layer (for classification/regression) or whatever... 
    """  
  
    def __init__(self,  
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,  
                 W_constraint=None, u_constraint=None, b_constraint=None,  
                 bias=True, **kwargs):  
  
        self.supports_masking = True  
        self.init = initializers.get('glorot_uniform')  
  
        self.W_regularizer = regularizers.get(W_regularizer)  
        self.u_regularizer = regularizers.get(u_regularizer)  
        self.b_regularizer = regularizers.get(b_regularizer)  
  
        self.W_constraint = constraints.get(W_constraint)  
        self.u_constraint = constraints.get(u_constraint)  
        self.b_constraint = constraints.get(b_constraint)  
  
        self.bias = bias  
        super(AttentionWithContext, self).__init__(**kwargs)  
  
    def build(self, input_shape):  
        assert len(input_shape) == 3  
  
        self.W = self.add_weight((input_shape[-1], input_shape[-1],),  
                                 initializer=self.init,  
                                 name='{}_W'.format(self.name),  
                                 regularizer=self.W_regularizer,  
                                 constraint=self.W_constraint)  
        if self.bias:  
            self.b = self.add_weight((input_shape[-1],),  
                                     initializer='zero',  
                                     name='{}_b'.format(self.name),  
                                     regularizer=self.b_regularizer,  
                                     constraint=self.b_constraint)  
  
        self.u = self.add_weight((input_shape[-1],),  
                                 initializer=self.init,  
                                 name='{}_u'.format(self.name),  
                                 regularizer=self.u_regularizer,  
                                 constraint=self.u_constraint)  
  
        super(AttentionWithContext, self).build(input_shape)  
  
    def compute_mask(self, input, input_mask=None):  
        # do not pass the mask to the next layers  
        return None  
  
    def call(self, x, mask=None):  
        uit = dot_product(x, self.W)  
  
        if self.bias:  
            uit += self.b  
  
        uit = K.tanh(uit)  
        ait = dot_product(uit, self.u)  
  
        a = K.exp(ait)  
  
        # apply mask after the exp. will be re-normalized next  
        if mask is not None:  
            # Cast the mask to floatX to avoid float64 upcasting in theano  
            a *= K.cast(mask, K.floatx())  
  
        # in some cases especially in the early stages of training the sum may be almost zero  
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.  
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())  
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())  
  
        a = K.expand_dims(a)  
        weighted_input = x * a  
        return K.sum(weighted_input, axis=1)  
  
    def compute_output_shape(self, input_shape):  
        return input_shape[0], input_shape[-1] 