In [1]:
import numpy as np
import pandas as pd
import json
import nltk
nltk.download("stopwords") # add downloader for stopwords
nltk.download("wordnet") # add downloader for wordnet
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

import tqdm
import keras_metrics # for recall and precision metrics
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
import time
import pickle

import tensorflow as tf
tf.compat.v1.reset_default_graph()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/esther/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/esther/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using TensorFlow backend.


In [2]:
with open("reviewSelected100.json", 'r') as read_file:
    data = [json.loads(line) for line in read_file]

In [3]:
#Store all reviews in a Pandas DataFrame 
reviews = []
for rev in range(len(data)):
    reviews.append(data[rev]['text'])
    
rev_df = pd.DataFrame(reviews, columns = ['Reviews'])
rev_df.head()

Unnamed: 0,Reviews
0,We had my Mother's Birthday Party here on 10/2...
1,Good Korean grill near Eaton Centre. The marin...
2,Was recommended to try this place by few peopl...
3,Ambience: Would not expect something this nice...
4,Absolutely the WORST pool company that I have ...


In [4]:
#Clean data 
import re

#1. Removes Punctuations
def remove_punctuations(data):
    punct_tag=re.compile(r'[^\w\s]')
    data=punct_tag.sub(r'',data)
    return data

#2. Removes HTML syntaxesn(if reviews provide business links)
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#3. Removes URL data (if reviews provide business links)
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#4. Removes Emojis (if reviews contains expressions)
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#5. Lemmatize the corpus
def lemma_traincorpus(data):
    lemmatizer=WordNetLemmatizer()
    out_data=""
    for words in data:
        out_data+= lemmatizer.lemmatize(words)
    return out_data

rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_punctuations(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_html(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_url(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: remove_emoji(z))
rev_df['Reviews']=rev_df['Reviews'].apply(lambda z: lemma_traincorpus(z))

In [5]:
rev_df_clean = rev_df
rev_df_clean.head()

Unnamed: 0,Reviews
0,We had my Mothers Birthday Party here on 10291...
1,Good Korean grill near Eaton Centre The marina...
2,Was recommended to try this place by few peopl...
3,Ambience Would not expect something this nice ...
4,Absolutely the WORST pool company that I have ...


In [6]:
#Feature Engineering 

'''
Separate the reviews into positive and negative using nltk sentiment vader
'''
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk_sent = SentimentIntensityAnalyzer()

rev_df_clean['ratings'] = rev_df_clean["Reviews"].apply(lambda x: nltk_sent.polarity_scores(x))
rev_df_new = pd.concat([rev_df_clean.drop(['ratings'], axis=1), rev_df_clean['ratings'].apply(pd.Series)], axis=1)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/esther/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
rev_df_new['comp_rating'] = rev_df_new['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')

rev_df_new.head()

Unnamed: 0,Reviews,neg,neu,pos,compound,comp_rating
0,We had my Mothers Birthday Party here on 10291...,0.0,0.592,0.408,0.93,pos
1,Good Korean grill near Eaton Centre The marina...,0.055,0.736,0.208,0.9448,pos
2,Was recommended to try this place by few peopl...,0.006,0.687,0.307,0.9975,pos
3,Ambience Would not expect something this nice ...,0.094,0.759,0.148,0.8318,pos
4,Absolutely the WORST pool company that I have ...,0.09,0.885,0.026,-0.9402,neg


In [8]:
upd_df = rev_df_new.drop(columns=['neg','neu','pos','compound'])

In [9]:
upd_df.head()

Unnamed: 0,Reviews,comp_rating
0,We had my Mothers Birthday Party here on 10291...,pos
1,Good Korean grill near Eaton Centre The marina...,pos
2,Was recommended to try this place by few peopl...,pos
3,Ambience Would not expect something this nice ...,pos
4,Absolutely the WORST pool company that I have ...,neg


In [10]:
from keras.preprocessing.text import Tokenizer

X = np.array(upd_df['Reviews'])
y = upd_df['comp_rating']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

In [11]:
X = np.array(X)
y = np.array(y)

  """Entry point for launching an IPython kernel.


In [12]:
# to convert labels to integers and vice-versa
label2int = {"pos": 1, "neg": 0}
int2label = {1: "pos", 0: "neg"}

In [13]:
max_len = []
for i in range(len(X)):
    max_len.append(len(X[i]))
max_value = max(max_len)
max_value

967

In [17]:
y = [ label2int[label] for label in y ]
y = to_categorical(y)

In [18]:
SEQUENCE_LENGTH = 300
EMBEDDING_SIZE = 300
TEST_SIZE = 0.25
BATCH_SIZE = 64
EPOCHS = 20

#ensure all sentences are same size 
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

In [19]:
train_x,test_x,train_y,test_y=train_test_split(X,y,test_size=TEST_SIZE,random_state=42)

In [20]:
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((11475, 300), (11475, 2), (3825, 300), (3825, 2))

In [21]:
def get_embedding_vectors(tokenizer, dim=300):
    embedding_index = {}
    #cannot use f to open txt file -> use r instead
    with open(r"glove.6B.300d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading GloVe"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # words not found will be 0s
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [22]:
def get_model(tokenizer, lstm_units):
    """
    Constructs the model,
    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation
    """
    # get the GloVe embedding vectors
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
              EMBEDDING_SIZE,
              weights=[embedding_matrix],
              trainable=False,
              input_length=SEQUENCE_LENGTH))

    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation="softmax"))
    # compile as rmsprop optimizer
    # aswell as with recall metric
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                  metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall()])
    model.summary()
    return model

In [23]:
# constructs the model with 128 LSTM units
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading GloVe: 400001it [00:27, 14774.47it/s]


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          10670100  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 10,890,006
Trainable params: 219,906
Non-trainable params: 10,670,100
_________________________________________________________________


In [24]:
# train the model
model.fit(train_x, train_y, validation_data=(test_x, test_y),
          batch_size=BATCH_SIZE, epochs=EPOCHS,
          verbose=1)

Epoch 1/20


AttributeError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:543 train_step  **
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/engine/compile_utils.py:411 update_state
        metric_obj.update_state(y_t, y_p)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/utils/metrics_utils.py:90 decorated
        update_op = update_state_fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/tensorflow/python/keras/metrics.py:603 update_state
        matches = self._fn(y_true, y_pred, **self._fn_kwargs)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/keras_metrics/metrics.py:192 __call__
        tp = self.tp(y_true, y_pred)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/keras_metrics/metrics.py:50 __call__
        tp_update = K.update_add(self.tp, tp)
    /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:977 update_add
        return tf.assign_add(x, increment)

    AttributeError: module 'tensorflow' has no attribute 'assign_add'
