In this notebook we'll try out an LSTM on the same data as our baseline model and see how it performs. Our hypothesis is that it won't perform any better yet, because the baseline already does such a good job of keying off pronouns to do dialect classification.

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split


import nltk
import numpy as np
import pandas as pd
import csv

In [6]:
# Load the pipe-delimited, header-less review dump into a DataFrame.
review_columns = [
    'business_id',
    'review_id',
    'user_id',
    'latitude',
    'longitude',
    'region',
    'name',
    'postal_code',
    'city',
    'state',
    'neighborhood',
    'text',
]

# Every column is parsed as text except the two coordinates, which are float32.
review_dtypes = {column: str for column in review_columns}
review_dtypes['latitude'] = np.float32
review_dtypes['longitude'] = np.float32

reviews = pd.read_csv(
    './data/reviews.csv',
    sep='|',
    header=None,                 # the file has no header row; names are supplied below
    names=review_columns,
    dtype=review_dtypes,
    encoding='utf-8',
    quoting=csv.QUOTE_MINIMAL,
    nrows=1000000,               # only the first million rows are read
    # skiprows=3000000,          # (disabled) offset for reading a later slice of the file
    error_bad_lines=False,       # malformed lines are skipped rather than raising
)
print(reviews.shape)

(1000000, 12)


In [7]:
# Discard rows whose review text is missing, then report the remaining size.
# The frame is modified in place so later cells keep working on `reviews`.
reviews.dropna(axis=0, subset=['text'], inplace=True)
print(reviews.shape)

(999992, 12)


In [8]:
### Create sequence
vocabulary_size = 20000  # only includes 20K most frequently occurring words
review_texts = reviews['text']

# Fit the word index on the review text, then map each review to a list of word ids.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)

# No maxlen is passed, so every sequence is padded to the length of the longest one.
# (Original note: longest review is 589 words; the tutorial reduced this to 50.)
data = pad_sequences(sequences)

In [18]:
# Turn the region column into a pandas categorical and one-hot encode its integer codes.
reviews['region'] = reviews['region'].astype('category')
region_codes = reviews['region'].cat.codes
# NOTE(review): pandas assigns code -1 to missing categories — confirm `region` has no NaNs.
# The number of classes is inferred from the codes (num_classes left at its default).
categorical_labels = to_categorical(region_codes)

In [21]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
# Both arrays are split with the same row indices, so features and labels stay aligned.
features_train, features_test, labels_train, labels_test = train_test_split(
    data,
    categorical_labels,
    test_size=0.20,
    random_state=42,
)

In [27]:
## Network architecture, default batch_size is 32
# Layer sizes are read from the prepared arrays instead of being hard-coded, so the
# network stays consistent with the tokenizer, padding and label-encoding cells.
sequence_length = features_train.shape[1]  # width of the padded review sequences
num_classes = labels_train.shape[1]        # width of the one-hot region labels

model = Sequential()
# The embedding table must have a row for every word id the tokenizer can emit.
model.add(Embedding(vocabulary_size, 100, input_length=sequence_length))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Each review belongs to exactly one region, so the output is a softmax distribution
# over classes — the activation that pairs with categorical_crossentropy on one-hot
# targets (sigmoid would score every class independently).
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
%%time

## Fit the model
model.fit(features_train, labels_train, validation_split=0.4, epochs=3)\

In [None]:
# Score the trained model on the held-out test split.
# `score` is the loss value and `acc` the accuracy metric requested when compiling.
score, acc = model.evaluate(x=features_test, y=labels_test)
print(score, acc, sep='\n')