# LSTMs

Background reading, "Understanding LSTM Networks" (Christopher Olah): http://colah.github.io/posts/2015-08-Understanding-LSTMs/

Yelp's dataset: https://www.yelp.com/dataset/download

Reference tutorial (LSTM sentiment analysis in Keras): https://towardsdatascience.com/understanding-lstm-and-its-quick-implementation-in-keras-for-sentiment-analysis-af410fd85b47

In [1]:
import re

import pandas as pd
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.models import Sequential

In [2]:
# Relative path to the Yelp review dump that the notebook loads.
filename = (
    'data/yelp_dataset/'
    'yelp_academic_dataset_review.json'
)

In [3]:
# !cat $filename

In [4]:
# Load the reviews; lines=True makes pandas read one JSON record per line.
data = pd.read_json(path_or_buf=filename, lines=True)

In [5]:
# Show every column of the first review record.
data.iloc[0]

business_id                               iCQpiavjjPzJ5_3gPD5Ebg
cool                                                           0
date                                         2011-02-25 00:00:00
funny                                                          0
review_id                                 x7mDIiDB3jEiPGPHOmDzyw
stars                                                          2
text           The pizza was okay. Not the best I've had. I p...
useful                                                         0
user_id                                   msQe1u7Z_XuqjGoqhB0J5g
Name: 0, dtype: object

In [6]:
# Binary sentiment label: 1 (positive) for ratings above 3 stars,
# 0 (negative) for ratings of 3 stars or fewer.
# A vectorised comparison replaces the Python-level loop over every review;
# the resulting values and int64 dtype are the same.
data['sentiment'] = (data['stars'] > 3).astype('int64')

In [7]:
# Class balance: how many reviews carry each sentiment label.
data['sentiment'].value_counts()

1    3977837
0    2019159
Name: sentiment, dtype: int64

In [8]:
# Preview the text of the first five reviews.
data['text'].head(5)

0    The pizza was okay. Not the best I've had. I p...
1    I love this place! My fiance And I go here atl...
2    Terrible. Dry corn bread. Rib tips were all fa...
3    Back in 2005-2007 this place was my FAVORITE t...
4    Delicious healthy food. The steak is amazing. ...
Name: text, dtype: object

In [9]:
# Keep the text of the first review as a worked example.
example = data['text'].iloc[0]

In [10]:
# Display the complete text of the example review.
example

u"The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say..."

In [11]:
# First three review texts as a plain NumPy array.
data['text'].head(3).values

array([ u"The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...",
       u'I love this place! My fiance And I go here atleast once a week. The portions are huge! Food is amazing. I love their carne asada. They have great lunch specials... Leticia is super nice and cares about what you think of her restaurant. You have to try their cheese enchiladas too the sauce is different And amazing!!!',
       u'Terrible. Dry corn bread. Rib tips were all fat and mushy and had no flavor. If you want bbq in this neighborhood go to john mulls roadkill grill. Trust me.',
       ...,
       u'Loved the location, 

In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [None]:
# Vocabulary size: passed to the Tokenizer as num_words and to the Embedding
# layer as its first argument.
NUM_WORDS = 2500

In [13]:
# Word-level tokenizer: lower-cases the text, splits on spaces and is capped
# at NUM_WORDS.
tokenizer = Tokenizer(num_words=NUM_WORDS, lower=True, split=' ')



In [None]:
# Fit the tokenizer (build its word index) on every review text.
tokenizer.fit_on_texts(data['text'].values)

In [16]:
# word_index is the fitted word -> integer id dictionary; look up one word
# as a sanity check.
tokenizer.word_index['awesome']

204

In [17]:
# Convert each review text into a sequence of integer word ids.
X = tokenizer.texts_to_sequences(data['text'].values)

In [23]:
# Render the first encoded review as a comma-separated string of word ids.
', '.join(map(str, X[0]))

'1, 170, 6, 510, 22, 1, 85, 92, 23, 3, 1144, 20, 1, 730, 36, 72, 119, 4, 113, 131, 1921, 490, 170, 1, 6, 111, 613, 10, 1, 222, 2, 1016, 7, 2446, 11, 1, 28, 9, 4, 113, 131, 503, 39, 125, 10, 674, 272, 1502, 203, 118, 108, 13, 275, 281, 39, 19, 43, 48, 28, 19, 1221, 46, 13, 26, 4, 11, 4, 770, 3, 1221, 281, 62, 19, 148'

In [25]:
# pad_sequences?

In [None]:
# Pad the sequences into one array; X.shape[1] is later used as the model's
# input_length.
# NOTE(review): no maxlen is given, so presumably every row is padded to the
# longest review. On the full dataset this may need a lot of memory; confirm
# and consider passing maxlen.
X = pad_sequences(X)

In [1]:
# Inspect the first three padded sequences.
X[:3]

NameError: name 'X' is not defined

In [None]:
# Hyperparameters: embedding width, LSTM units, and training mini-batch size.
EMBED_DIM, LSTM_OUT, batch_size = 128, 200, 32

In [None]:
# Sentiment classifier: embedding -> LSTM -> 2-way softmax.
model = Sequential()
# Embed each of the NUM_WORDS token ids as an EMBED_DIM-dimensional vector;
# input_length is the padded sequence length (number of columns of X).
model.add(Embedding(NUM_WORDS, EMBED_DIM, input_length=X.shape[1]))
# Keras 2 no longer supports Embedding's `dropout` argument (it is dropped
# with a warning); SpatialDropout1D is the replacement that warning recommends.
model.add(SpatialDropout1D(0.2))
# Keras 2 argument names: `dropout` replaces dropout_W (input connections),
# `recurrent_dropout` replaces dropout_U (recurrent connections).
model.add(LSTM(LSTM_OUT, dropout=0.2, recurrent_dropout=0.2))
# Two output units, one per sentiment class, to match the one-hot labels.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model.summary()

In [None]:
# One-hot encode the 0/1 sentiment labels (one column per class) to match the
# model's two-unit softmax output.
Y = pd.get_dummies(data['sentiment']).values

In [None]:
# Hold out 20% of the samples for validation; random_state fixes the shuffle.
# NOTE(review): train_test_split is not imported in any visible cell.
# Presumably it is sklearn.model_selection.train_test_split; confirm and import it.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size = 0.20, random_state = 36)

In [None]:
# Train for a single epoch on the training split.
# `epochs` is the Keras 2 name for the legacy `nb_epoch` argument.
# NOTE(review): Keras documents verbose as 0, 1 or 2; the original value 5 is
# kept unchanged, so confirm the intended logging level.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=1, verbose=5)

In [None]:
# Sample review text. It is not used by any visible cell; presumably meant
# for a manual prediction check (TODO confirm).
test = 'This place was NOT great'

In [None]:
# Another sample review text; this assignment replaces any earlier value of `test`.
test = 'Amazing restaurant'

In [39]:
# Preview the reviews with every character other than letters (a-z, A-Z),
# digits and whitespace removed. The result is displayed, not stored.
# A raw string is used so that `\s` is not an invalid string escape; the
# pattern value is unchanged.
data['text'].apply(lambda text: re.sub(r'[^a-zA-Z0-9\s]', '', text))

0          The pizza was okay Not the best Ive had I pref...
1          I love this place My fiance And I go here atle...
2          Terrible Dry corn bread Rib tips were all fat ...
3          Back in 20052007 this place was my FAVORITE th...
4          Delicious healthy food The steak is amazing Fi...
5          This place sucks The customer service is horri...
6          If you like Thai food you have to try the orig...
7          AMAZING\n\n I was referred here by a friend an...
8          Ribs  amazing\n2 hour wait time not so amazing...
9          Food is pretty good not gonna lie BUT you have...
10         I have been an Emerald Club member for a numbe...
11         The score should be negative Its HORRIBLE The ...
12         I went there twice and I am pretty happy with ...
13         Great for the price only 25 for long hair Was ...
14         Sansotei serves some top notch ramen They take...
15         Of the several places to get bagels in Oakland...
16         Cool little p

In [31]:
# Peek at what iterrows() yields for the first row only. head(1) limits the
# iteration to one row, so no loop-and-break is needed.
for idx, row in data.head(1).iterrows():
    # row.iloc[0] is the first column by position. %-formatting prints the
    # same "index value" text under both the Python 2 print statement and
    # the print() function.
    print('%s %s' % (idx, row.iloc[0]))

0 iCQpiavjjPzJ5_3gPD5Ebg


In [None]:

# Strip every character other than letters (a-z, A-Z), digits and whitespace
# from each review. The raw string keeps `\s` from being an invalid string
# escape; the pattern value is unchanged.
data['text'] = data['text'].apply(lambda text: re.sub(r'[^a-zA-Z0-9\s]', '', text))
# The former `for idx, row in data.iterrows(): row[0] = row[0].replace('rt', ' ')`
# loop was removed. It never changed `data`: column 0 is business_id, not the
# review text, and iterrows() yields row copies for a mixed-dtype frame, so
# the assignment was discarded. It was a full pass over every row for nothing.
# Under Python 2 this turns the (now ASCII-only) unicode strings into byte strings.
# NOTE(review): on Python 3 encode() would produce bytes objects, so revisit
# this line if the notebook is ported.
data['text'] = [x.encode('ascii') for x in data['text']]

In [None]:
# Re-fit the tokenizer on the cleaned text. `num_words` is the Keras 2 name
# for the legacy `nb_words` argument, and NUM_WORDS replaces the hard-coded
# 2500 so the vocabulary size stays consistent with the Embedding layer.
tokenizer = Tokenizer(num_words=NUM_WORDS, lower=True, split=' ')
tokenizer.fit_on_texts(data['text'].values)
# print(tokenizer.word_index)  # uncomment to inspect the dictionary
# Encode every review as a sequence of integer word ids, then pad the
# sequences into a single array.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [None]:
# Rebuild the sentiment classifier for the re-tokenized data:
# embedding -> LSTM -> 2-way softmax.
model = Sequential()
# The original referenced undefined lower-case names (embed_dim, lstm_out)
# and a hard-coded 2500; the notebook's constants are NUM_WORDS, EMBED_DIM
# and LSTM_OUT.
model.add(Embedding(NUM_WORDS, EMBED_DIM, input_length=X.shape[1]))
# Keras 2 no longer supports Embedding's `dropout` argument (it is dropped
# with a warning); SpatialDropout1D is the replacement that warning recommends.
model.add(SpatialDropout1D(0.2))
# Keras 2 argument names: `dropout` replaces dropout_W (input connections),
# `recurrent_dropout` replaces dropout_U (recurrent connections).
model.add(LSTM(LSTM_OUT, dropout=0.2, recurrent_dropout=0.2))
# Two output units, one per sentiment class, to match the one-hot labels.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would add a stray "None" line).
model.summary()

In [None]:
# One-hot encode the 0/1 sentiment labels (one column per class) to match the
# model's two-unit softmax output.
Y = pd.get_dummies(data['sentiment']).values
# Hold out 20% of the samples for validation; random_state fixes the shuffle.
# NOTE(review): train_test_split is not imported in any visible cell.
# Presumably it is sklearn.model_selection.train_test_split; confirm and import it.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.20, random_state=36)

# Here we train the network for a single epoch.
# `epochs` is the Keras 2 name for the legacy `nb_epoch` argument.
# NOTE(review): Keras documents verbose as 0, 1 or 2; the original value 5 is
# kept unchanged, so confirm the intended logging level.
model.fit(X_train, Y_train, batch_size=batch_size, epochs=1, verbose=5)