In [79]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import *
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.metrics import categorical_accuracy

In [11]:
data = pd.read_csv('dataset/preProcessSentiment.csv', index_col=0)
data = data[['text', 'FOOD', 'PRICE', 'SERVICE', 'AMBIENCE']]

def convert(elem):
    elem[0] = "FOOD_" + elem[0]
    elem[1] = "PRICE_" + elem[1]
    elem[2] = "SERVICE_" + elem[2]
    elem[3] = "AMBIENCE_" + elem[3]
    return elem

corpus = data.values[:,0]
label = list(map(convert,data.values[:,1:].tolist()))
label

[['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_POSITIVE'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_POSITIVE'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_POSITIVE', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_UNKNOWN', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_POSITIVE'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_POSITIVE', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_NEGATIVE', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_NEGATIVE', 'AMBIENCE_NEGATIVE'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_UNKNOWN'],
 ['FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN', 'AMBIENCE_POSITIVE'],
 ['F

In [42]:
encoder  = MultiLabelBinarizer()
labels = encoder.fit_transform(label)

In [60]:
test = pd.read_csv('dataset/gold_sentiment.csv', index_col=0)
test = test[['text', 'FOOD', 'PRICE', 'SERVICE', 'AMBIENCE']]

test_label = list(map(convert,test.values[:,1:].tolist()))
test_corpus = test.values[:,0]

test_label = encoder.transform(test_label)

In [61]:
vocab_size = 20000
max_length = 150
encoded_docs = [one_hot(d, vocab_size) for d in corpus]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

encoded_test = [one_hot(d, vocab_size) for d in test_corpus]
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

In [80]:
model = Sequential()
model.add(Embedding(vocab_size, 512, input_length=max_length))
model.add(CuDNNLSTM(128))
model.add(Dense(12, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc', categorical_accuracy])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 150, 512)          10240000  
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 128)               328704    
_________________________________________________________________
dense_10 (Dense)             (None, 12)                1548      
Total params: 10,570,252
Trainable params: 10,570,252
Non-trainable params: 0
_________________________________________________________________
None


In [81]:
model.fit(padded_docs, np.array(labels), epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x25941d49828>

In [99]:
model.evaluate(padded_test, np.array(test_label))



[0.5276323922475179, 0.7727777711550394, 0.06]

In [112]:
result = model.predict(padded_test)
result

array([[0.07373809, 0.67728895, 0.14945637, ..., 0.03831172, 0.01302223,
        0.95357513],
       [0.2307323 , 0.10283428, 0.6246008 , ..., 0.05008041, 0.00980868,
        0.985068  ],
       [0.16644225, 0.12394237, 0.7075251 , ..., 0.40931708, 0.74638486,
        0.01592148],
       ...,
       [0.04349829, 0.9497142 , 0.03308911, ..., 0.07488237, 0.9564393 ,
        0.0051264 ],
       [0.11160324, 0.38139918, 0.5535289 , ..., 0.3862949 , 0.90450525,
        0.00458602],
       [0.00577857, 0.06332741, 0.9666051 , ..., 0.05448318, 0.46222126,
        0.17723331]], dtype=float32)

In [121]:
normalize = lambda x: 1 if x >= 0.5 else 0
normalize_arr = lambda arr : [normalize(elem) for elem in arr]
result_normal = np.array(list(map(normalize_arr, result)))
result_normal = encoder.inverse_transform(result_normal)

In [122]:
result_normal

[('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_UNKNOWN', 'FOOD_POSITIVE', 'PRICE_POSITIVE', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_UNKNOWN', 'FOOD_POSITIVE', 'PRICE_POSITIVE', 'SERVICE_POSITIVE'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_POSITIVE'),
 ('AMBIENCE_UNKNOWN', 'FOOD_POSITIVE', 'PRICE_POSITIVE'),
 ('AMBIENCE_UNKNOWN', 'FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_POSITIVE', 'SERVICE_POSITIVE'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_POSITIVE', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN'),
 ('PRICE_UNKNOWN', 'SERVICE_NEGATIVE'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'PRICE_POSITIVE', 'SERVICE_POSITIVE'),
 ('AMBIENCE_POSITIVE', 'FOOD_POSITIVE', 'SERVICE_NEGATIVE'),
 ('FOOD_POSITIVE', 'PRICE_UNKNOWN', 'SERVICE_UNKNOWN'),
 ('AMBIENCE_UNKNOWN', 'PRI