In this notebook we'll try out an LSTM on the same data as our baseline model, and see how it performs. Our hypothesis is that it won't perform any better, yet, becuase the baseline does such a good job at keying off pronouns to do dialect classification. 

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

import nltk
import numpy as np
import pandas as pd
import csv

Using TensorFlow backend.


In [55]:
# Load data from CSV file
reviews = pd.read_csv('./data/reviews.csv', 
                      names=[
                          'business_id',
                          'review_id',
                          'user_id',
                          'latitude',
                          'longitude',
                          'region',
                          'name',
                          'postal_code',
                          'city',
                          'state',
                          'neighborhood',
                          'text',
                      ],
                      dtype={
                          'business_id': str,
                          'review_id': str,
                          'text': str,
                          'user_id': str,
                          'city': str,
                          'latitude': np.float32,
                          'longitude': np.float32,
                          'region': str,
                          'name': str,
                          'neighborhood': str,
                          'postal_code': str,
                          'state': str
                      },
                      header=None, 
                      encoding='utf-8',
                      nrows=1000000,
#                       skiprows=3000000,
                      sep='|',
                      quoting=csv.QUOTE_MINIMAL,
                      index_col=None,
                      error_bad_lines=False
                     )
print(reviews.shape)
reviews = reviews.reset_index()

(100000, 12)


In [58]:
reviews.dropna(subset=['text'], inplace=True)
print(reviews.shape)

(100000, 13)


In [59]:
import re
import string
import nltk
from nltk.corpus import stopwords

In [60]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=<>]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

In [61]:
%%time

# Tokenize reviews for POS tagging
sents = []
for i, review in enumerate(reviews['text'].values):
    sents.append(nltk.word_tokenize(review))
    if i % 10000 == 0:
        print('Completed {0} rows'.format(str(i)))

Completed 0 rows
Completed 10000 rows
Completed 20000 rows
Completed 30000 rows
Completed 40000 rows
Completed 50000 rows
Completed 60000 rows
Completed 70000 rows
Completed 80000 rows
Completed 90000 rows
CPU times: user 1min 55s, sys: 0 ns, total: 1min 55s
Wall time: 1min 55s


In [62]:
len(sents)

100000

In [63]:
# Tag all words with their parts of speech using nltk
tags = []
for i in range(0, len(sents), 10000):
    end = i+10000
    if end > len(sents):
        end = len(sents)
    batch = sents[i:end]
    tags.extend(nltk.pos_tag_sents(batch))
    print('Completed {0} rows'.format(str(i+10000)))

Completed 10000 rows
Completed 20000 rows
Completed 30000 rows
Completed 40000 rows
Completed 50000 rows
Completed 60000 rows
Completed 70000 rows
Completed 80000 rows
Completed 90000 rows
Completed 100000 rows


In [64]:
len(tags)

100000

In [65]:
# Remove all proper nouns and clean reviews
cleaned_sents = []
stops = set(stopwords.words("english"))
for i, review in enumerate(tags):
    
    ## Remove stop words
    review = [w for w in review if not w[0].lower() in stops and len(w[0]) >= 3]
    
    ## Replace proper nouns with <NNP>
    review = [w[0] if not (w[1] in ['NNP', 'NNPS']) else '<NNP>' for w in review]
    
    cleaned_sents.append(clean_text(" ".join(review)))
    
    if i % 10000 == 0:
        print(i)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [66]:
len(cleaned_sents)

100000

In [70]:
reviews = reviews.join(pd.DataFrame(cleaned_sents, columns=['clean_text']))

In [71]:
reviews.head()

Unnamed: 0,index,business_id,review_id,user_id,latitude,longitude,region,name,postal_code,city,state,neighborhood,text,clean_text
0,0,0W4lkclzZThpx3V65bVgig,v0i_UHJMo_hPBq9bxWvW4w,bv2nCi5Qv5vroFiqKGopiw,45.516373,-73.577538,canada,Schwartz's,H2W 1X9,Montréal,QC,Plateau-Mont-Royal,"Love the staff, love the meat, love the place....",Love staff love meat love place <NNP> long lin...
1,1,AEx2SYEUJmTxVVB18LlCwA,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,45.523335,-73.594856,canada,Wilensky's,H2T 2M1,Montréal,QC,Plateau-Mont-Royal,Super simple place but amazing nonetheless. It...,<NNP> simple place amazing nonetheless around ...
2,2,VR6GpWIda3SfvPC-lg9H3w,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,45.4729,-73.588318,canada,Tuck Shop,H4C 1S7,Montréal,QC,Sud-Ouest,Small unassuming place that changes their menu...,Small unassuming place changes menu every ofte...
3,3,CKC0-MOWMqoeWf6s-szl8g,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,45.522144,-73.607079,canada,Lester's Deli,H2V 1V1,Outremont,QC,Outremont,Lester's is located in a beautiful neighborhoo...,<NNP> located beautiful neighborhood since 195...
4,4,ACFtxLv8pGrrxMm6EgjreA,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,45.50251,-73.570122,canada,Five Guys,H3B 1B9,Montréal,QC,Ville-Marie,Love coming here. Yes the place always needs t...,Love coming Yes place always needs floor swept...


In [72]:
### Create sequence
vocabulary_size = 20000 # only includes 20K most frequently occurring words
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(reviews['clean_text'])
sequences = tokenizer.texts_to_sequences(reviews['clean_text'])

# Longest review is 589 words
data = pad_sequences(sequences)
data = pd.DataFrame(data)

In [73]:
# Keras requires categorical data type 
reviews['region'] = reviews.region.astype('category')
categorical_labels = to_categorical(reviews['region'].cat.codes, num_classes=None)

In [74]:
labels_train, labels_test, features_train, features_test = train_test_split(categorical_labels, data, test_size=0.20, random_state=42)

In [77]:
## Network architecture, default batch_size is 32

model = Sequential()
model.add(Embedding(20000, 100, input_length=508))
model.add(Conv1D(64, 3, activation='relu'))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(128, 3, activation='relu'))
model.add(Conv1D(128, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(5, activation='sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [78]:
%%time

## Fit the model
model.fit(features_train, labels_train, validation_split=0.4, epochs=3)

Train on 48000 samples, validate on 32000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
CPU times: user 46min 9s, sys: 13min 12s, total: 59min 22s
Wall time: 5min 29s


<keras.callbacks.History at 0x7f03a04e4518>

In [79]:
score, acc = model.evaluate(features_test, labels_test)
print(score)
print(acc)

0.945718603706
0.71225


In [80]:
predictions = model.predict(features_test)
predictions_df = pd.DataFrame(predictions, index=features_test.index)
predictions_df.head()

Unnamed: 0,0,1,2,3,4
75721,0.033879,0.012018,0.084861,0.058903,0.984009
80184,0.074692,0.039797,0.127749,0.115569,0.922808
19864,0.099427,0.064776,0.189058,0.163031,0.861757
76699,0.049512,0.015762,0.075506,0.063349,0.96392
92991,0.119332,0.052644,0.152893,0.134075,0.855669


In [89]:
predictions_df.head()

Unnamed: 0,0,1,2,3,4
75721,0.033879,0.012018,0.084861,0.058903,0.984009
80184,0.074692,0.039797,0.127749,0.115569,0.922808
19864,0.099427,0.064776,0.189058,0.163031,0.861757
76699,0.049512,0.015762,0.075506,0.063349,0.96392
92991,0.119332,0.052644,0.152893,0.134075,0.855669


In [120]:
predictions_df.idxmax(axis=1).head()

75721    4
80184    4
19864    4
76699    4
92991    4
dtype: int64

In [125]:
predictions_col_numbers = predictions_df.idxmax(axis=1)

predictions_ca = predictions_col_numbers[predictions_col_numbers == 0]
predictions_mw = predictions_col_numbers[predictions_col_numbers == 1]
predictions_so = predictions_col_numbers[predictions_col_numbers == 2]
predictions_ne = predictions_col_numbers[predictions_col_numbers == 3]
predictions_we = predictions_col_numbers[predictions_col_numbers == 4]

print(predictions_ca.shape)

predictions_ca = predictions_df.loc[predictions_ca.index, 0].sort_values(ascending=False).index.values[0]

print(predictions_ca.shape)
predictions_ca

(1878,)
()


68361

In [126]:
print(predictions_col_numbers[predictions_col_numbers == 0].shape)
print(predictions_col_numbers[predictions_col_numbers == 1].shape)
print(predictions_col_numbers[predictions_col_numbers == 2].shape)
print(predictions_col_numbers[predictions_col_numbers == 3].shape)
print(predictions_col_numbers[predictions_col_numbers == 4].shape)

(1878,)
(0,)
(55,)
(0,)
(18067,)


In [154]:
predictions_col_numbers = predictions_df.idxmax(axis=1)

predictions_ca = predictions_col_numbers[predictions_col_numbers == 0]
predictions_mw = predictions_col_numbers[predictions_col_numbers == 1]
predictions_ne = predictions_col_numbers[predictions_col_numbers == 2]
predictions_so = predictions_col_numbers[predictions_col_numbers == 3]
predictions_we = predictions_col_numbers[predictions_col_numbers == 4]

predictions_ca_idx = predictions_df.loc[predictions_ca.index, 0].sort_values(ascending=False).index.values[3]
# predictions_mw_idx = predictions_df.loc[predictions_mw.index, 1].sort_values(ascending=False).index.values[0]
predictions_ne_idx = predictions_df.loc[predictions_ne.index, 2].sort_values(ascending=False).index.values[3]
# predictions_so_idx = predictions_df.loc[predictions_ne.index, 3].sort_values(ascending=False).index.values[0]
predictions_we_idx = predictions_df.loc[predictions_we.index, 4].sort_values(ascending=False).index.values[3]

print(predictions_ca_idx)
# print(top_prediction_mw_idx)
print(predictions_ne_idx)
# print(top_prediction_so_idx)
print(predictions_we_idx)

71214
88984
66552


In [155]:
top_prediction_ca = reviews.loc[predictions_ca_idx, 'clean_text']
# top_prediction_mw = reviews.loc[predictions_mw_idx, 'clean_text']
top_prediction_ne = reviews.loc[predictions_ne_idx, 'clean_text']
# top_prediction_so = reviews.loc[predictions_so_idx, 'clean_text']
top_prediction_we = reviews.loc[predictions_we_idx, 'clean_text']

print(top_prediction_ca)
print()
# print(top_prediction_mw)
# print()
print(top_prediction_ne)
print()
# print(top_prediction_so)
# print()
print(top_prediction_we)

<NNP> <NNP> <NNP> completely shut <NNP> <NNP> wandering street kept eye <NNP> local recommended not open first walked window shopped hard Tropical ice cream meaning Asian flavours say <NNP> kind random Closed <NNP> everything else 1 : 00pm 8 : 00pm assume will change sort kinks <NNP> Vietnamese iced coffees get ice cream Sounds like magical combo <NNP> recon beforehand raspberry lychee not display case came back trying samples asked one workers turns prepping new batch careful deliberation settled two lots interesting Asian flavours <NNP> yuzu corossol ginger name <NNP> lychee passionfruit 3 85 double scoop cup shared double scoop raspberry lychee passionfruit light refreshing full flavour Good consistency although started melt headed towards metro favourite two raspberry lychee Next time would try soft serve great spot grab quick cone <NNP> dessert are busy boulevard <NNP> <NNP>

<NNP> <NNP> sticks <NNP> <NNP> neighborhood right reasons - laid back atmosphere pleasant service casual f

In [119]:
reviews.loc[178, 'region']

'northeast'