In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
# fix random seed for reproducibility
np.random.seed(7)

In [6]:
yelp_reviews=pd.read_csv("/contents/My Drive/review_sample.csv",usecols=["useful","text", "cuisine"])

In [7]:
yelp_reviews.head(10)

Unnamed: 0,cuisine,useful,text
0,Asian,0,We've tried a few different Chinese delivery p...
1,Asian,6,My expectations of chinese delivery places in ...
2,Asian,2,This place only gets one star because the syst...
3,Asian,0,This place is exactly why I rarely eat Chinese...
4,Asian,0,I could only give this place a 3 out of 5. Th...
5,Asian,3,This hole in the wall is more than meets the e...
6,Asian,2,"Ok, 4 stars because of the food alone on this ..."
7,Asian,1,Possibly the worst chinese I've ever had. Very...
8,Asian,0,"For Chinese take out food, this is my place. O..."
9,Asian,0,Really REALLY bad food! This is NOT a good chi...


In [8]:
yelp_reviews['cuisine'].value_counts()

Asian    203189
Name: cuisine, dtype: int64

In [None]:
#yelp_reviews[yelp_reviews['cuisine'] == 'American'].to_csv('/home/jia_lu/project/review_data_american.csv',index=False, header=True)

In [None]:
#yelp_reviews[yelp_reviews['cuisine'] == 'Asian'].to_csv('/home/jia_lu/project/review_data_asian.csv',index=False, header=True)

In [None]:
#yelp_reviews[yelp_reviews['cuisine'] == 'Mexican'].to_csv('/home/jia_lu/project/review_data_mexican.csv',index=False, header=True)

In [9]:
yelp_reviews['useful'].value_counts()

0     115926
1      44747
2      19069
3       9158
4       4833
       ...  
88         1
89         1
90         1
91         1
84         1
Name: useful, Length: 119, dtype: int64

In [10]:
yelp_reviews.text[5]

"This hole in the wall is more than meets the eye.  I''ve tried my luck at nearly every chinese takeout spot in the East Valley, and I can definitively say the food at China Gourmet impressed me the most.\n\nPROS: Made from scratch.  Heaping portions at very reasonable prices.  Quality far exceeds expectations.  Single-handedly raises the bar for local take-out.\n\nCONS:  Made from scratch.  So the wait time is fairly substantial (20-25 minutes in my case). Doesn't look like much from the outside.  Very small, only three tables and no restroom. Definitely intended to be a take-out only kind of place.\n\nRecommendation: \nThe teriyaki beef was incredible, and not at all what I was expecting.  Instead of chopped beef in the thick teriyaki glaze I'm accustomed to, theirs was a thinner marinade that soaked completely through the meat.  Every piece was consistently tender and loaded with flavor.  I'd never had teriyaki of this caliber, and definitely didn't expect it from a little hole in t

In [11]:
yelp_reviews.isnull().any()

cuisine    False
useful     False
text       False
dtype: bool

In [12]:
yelp_reviews["labels"]= yelp_reviews["useful"].apply(lambda x: 1 if x >= 1  else 0)

In [13]:
yelp_reviews['labels'].value_counts()

0    115926
1     87263
Name: labels, dtype: int64

In [14]:
yelp_reviews.head(10)

Unnamed: 0,cuisine,useful,text,labels
0,Asian,0,We've tried a few different Chinese delivery p...,0
1,Asian,6,My expectations of chinese delivery places in ...,1
2,Asian,2,This place only gets one star because the syst...,1
3,Asian,0,This place is exactly why I rarely eat Chinese...,0
4,Asian,0,I could only give this place a 3 out of 5. Th...,0
5,Asian,3,This hole in the wall is more than meets the e...,1
6,Asian,2,"Ok, 4 stars because of the food alone on this ...",1
7,Asian,1,Possibly the worst chinese I've ever had. Very...,1
8,Asian,0,"For Chinese take out food, this is my place. O...",0
9,Asian,0,Really REALLY bad food! This is NOT a good chi...,0


In [15]:
texts = yelp_reviews["text"].values
labels = yelp_reviews["labels"].values

In [16]:
yelp_reviews["labels"].values[:5]

array([0, 1, 1, 0, 0])

In [17]:
print(texts.shape)
print(labels.shape)

(203189,)
(203189,)


In [18]:
vocab_size=10000
max_len=500

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 73129 unique tokens.


In [19]:
print(sequences[:1])

[[502, 152, 4, 192, 230, 108, 405, 195, 318, 1478, 100, 21, 4, 329, 471, 9, 108, 14, 16, 13, 22, 25, 68, 23, 14, 3, 41, 1, 134, 720, 498, 2, 713, 2949, 134, 1, 134, 6, 1515, 1561, 68, 86, 195, 12, 152, 8, 7, 71, 4, 23, 176, 17, 329, 259, 93, 62, 1, 6596, 6596, 1365, 228, 1027, 9, 1, 309, 11, 8, 28, 23]]


In [20]:
print(list(word_index.items())[0])
print(list(word_index.items())[73128])

('the', 1)
('schzwan', 73129)


In [21]:
min(list(word_index.values()))

1

In [22]:
data = sequence.pad_sequences(sequences, 
                              maxlen=max_len,
                              padding='post', 
                              truncating='post'
                             )

In [23]:
data[0:5]

array([[ 502,  152,    4, ...,    0,    0,    0],
       [  15, 1071,    9, ...,    0,    0,    0],
       [  13,   22,   79, ...,    0,    0,    0],
       [  13,   22,    7, ...,    0,    0,    0],
       [   3,  138,   79, ...,    0,    0,    0]], dtype=int32)

In [24]:
labels_b = to_categorical(np.asarray(labels))
print('Shape of data:', data.shape)
print('Shape of label:', labels.shape)
print('Shape of label:', labels_b.shape)

Shape of data: (203189, 500)
Shape of label: (203189,)
Shape of label: (203189, 2)


In [37]:
VALIDATION_SPLIT=0.2

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
labels_b = labels_b[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
y_train_b = labels_b[:-nb_validation_samples]
x_test = data[-nb_validation_samples:]
y_test = labels[-nb_validation_samples:]
y_test_b = labels_b[-nb_validation_samples:]

In [38]:
print('Shape of x_train:', x_train.shape)
print('Shape of x_test:', x_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_train_b:', y_train_b.shape)
print('Shape of y_test:', y_test.shape)
print('Shape of y_test_b:', y_test_b.shape)

Shape of x_train: (162552, 500)
Shape of x_test: (40637, 500)
Shape of y_train: (162552,)
Shape of y_train_b: (162552, 2)
Shape of y_test: (40637,)
Shape of y_test_b: (40637, 2)


In [28]:
print(x_train[0:3])
print(y_train[0:3])

[[   3  189 1986 ...    0    0    0]
 [  31   35   26 ...    0    0    0]
 [ 376   49 4926 ...    0    0    0]]
[0 1 0]


### Simple LSTM Classifier

In [39]:
#Simple LSTM Classifier
embedding_units = 30
rnn_units = 256
#attn_units=128

#Simple LSTM Classifier
sequence_input = layers.Input(shape=(max_len,),name="input_layer", dtype='int32')
embeddings = keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_units, input_length=max_len,name="embedding_layer")(sequence_input)
rnn_output=tf.keras.layers.LSTM(rnn_units,name='LSTM')(embeddings)
output = keras.layers.Dense(1, activation='sigmoid',name='output_layer')(rnn_output)
model = keras.Model(inputs=sequence_input, outputs=output)

model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 500)]             0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 500, 30)           300000    
_________________________________________________________________
LSTM (LSTM)                  (None, 256)               293888    
_________________________________________________________________
output_layer (Dense)         (None, 1)                 257       
Total params: 594,145
Trainable params: 594,145
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

early_stopping_callback = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                        min_delta=0,
                                                        patience=3,
                                                        verbose=0, mode='auto')

In [41]:
history = model.fit(x_train,
                    y_train,
                    epochs=10,
                    batch_size=200,
                    validation_split=.3, verbose=1, callbacks=[early_stopping_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 63.88%


### Bidirectional LSTM Classifier

In [43]:
from tensorflow.keras.layers import Bidirectional


sequence_input = layers.Input(shape=(max_len,),name="input_layer", dtype='int32')
embeddings = keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_units, input_length=max_len,name="embedding_layer")(sequence_input)
rnn_output=Bidirectional(tf.keras.layers.LSTM(rnn_units,name='LSTM'))(embeddings)
output = keras.layers.Dense(1, activation='sigmoid',name='output_layer')(rnn_output)
model = keras.Model(inputs=sequence_input, outputs=output)

model.summary()



In [44]:
model.fit(x_train, y_train_b, validation_split=.3,
          epochs=5, batch_size=200);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [46]:
scores = model.evaluate(x_test, y_test_b, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 63.61%


### LSTM Classifier with attention

In [47]:
#our attention layer, uses Bahdanau Attention from 2015 paper that's essentially weighted sum, also known as "additive attention"
class BahdanauAttention(layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        self._name="Attention"
 
    def call(self, features, hidden):
    # hidden state shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # values shape == (batch_size, max_len, hidden size)
    # we are doing this to broadcast addition along the time axis to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
    # score shape == (batch_size, max_length, 1)
    # we get 1 at the last axis because we are applying score to self.V
    # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        
    # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
    # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
 
        return context_vector, attention_weights

In [49]:
#LSTM Classifier with Attention
attn_units=128

sequence_input = layers.Input(shape=(max_len,),name="input_layer", dtype='int32')
embeddings = keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_units, input_length=max_len,name="embedding_layer")(sequence_input)
lstm_output,hidden_h, hidden_c=tf.keras.layers.LSTM(rnn_units,name='LSTM',return_sequences=True,
                                      return_state=True)(embeddings)
context_vector, attention_weights = BahdanauAttention(attn_units)(lstm_output, hidden_h)
output = keras.layers.Dense(1, activation='sigmoid',name='output_layer')(context_vector)
attn_model = keras.Model(inputs=sequence_input, outputs=output)
 
# summarize layers
model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 500)]             0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 500, 30)           300000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 500, 100)          32400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 102 

In [50]:
attn_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

early_stopping_callback = keras.callbacks.EarlyStopping(monitor='val_loss',
                                                        min_delta=0,
                                                        patience=3,
                                                        verbose=0, mode='auto')

In [51]:
history = attn_model.fit(x_train,
                    y_train,
                    epochs=2,
                    batch_size=200,
                    validation_split=.3, verbose=1, callbacks=[early_stopping_callback])

Epoch 1/2
Epoch 2/2


In [52]:
result = attn_model.evaluate(x_test, y_test)
print(result)

[0.6286572217941284, 0.6521150469779968]


### Bidirectional LSTM Classifier with attention and GloVe embedding