In [1]:
import pandas as pd
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import twokenize
import unidecode

# Load the Santa Barbara reviews; the NLP pipeline steps in the following cells operate on this frame
reviews = pd.read_csv(filepath_or_buffer='csv_data/santa_barbara_reviews.csv')

In [2]:
# Normalise the review text in place: strip accents with unidecode, then lowercase
reviews['text'] = reviews['text'].map(lambda raw_text: unidecode.unidecode(raw_text).lower())

# Tokenise each review with spaCy's English language object, keeping only the token strings
spacy_tokenizer = English()
reviews['spacy_token'] = reviews['text'].map(
    lambda clean_text: [token.text for token in spacy_tokenizer(clean_text)]
)

# Tokenise the same normalised text with twokenize for comparison
reviews['twokenize_token'] = reviews['text'].map(twokenize.tokenizeRawTweetText)

reviews[['text', 'spacy_token', 'twokenize_token']].head()  # check results

Unnamed: 0,text,spacy_token,twokenize_token
0,this easter instead of going to lopez lake we ...,"[this, easter, instead, of, going, to, lopez, ...","[this, easter, instead, of, going, to, lopez, ..."
1,had a party of 6 here for hibachi. our waitres...,"[had, a, party, of, 6, here, for, hibachi, ., ...","[had, a, party, of, 6, here, for, hibachi, ., ..."
2,what a great addition to the funk zone! grab ...,"[what, a, great, addition, to, the, funk, zone...","[what, a, great, addition, to, the, funk, zone..."
3,"farmhouse, rustic, chic.helpful staff with gre...","[farmhouse, ,, rustic, ,, chic.helpful, staff,...","[farmhouse, ,, rustic, ,, chic, ., helpful, st..."
4,we were a bit weary about trying the shellfish...,"[we, were, a, bit, weary, about, trying, the, ...","[we, were, a, bit, weary, about, trying, the, ..."


In [3]:
# Drop every token found in spaCy's STOP_WORDS set from both token columns
for token_column in ('spacy_token', 'twokenize_token'):
    reviews[token_column] = [
        [token for token in tokens if token not in STOP_WORDS]
        for tokens in reviews[token_column]
    ]
reviews[['text', 'spacy_token', 'twokenize_token']].head()

Unnamed: 0,text,spacy_token,twokenize_token
0,this easter instead of going to lopez lake we ...,"[easter, instead, going, lopez, lake, went, lo...","[easter, instead, going, lopez, lake, went, lo..."
1,had a party of 6 here for hibachi. our waitres...,"[party, 6, hibachi, ., waitress, brought, sepa...","[party, 6, hibachi, ., waitress, brought, sepa..."
2,what a great addition to the funk zone! grab ...,"[great, addition, funk, zone, !, , grab, bite...","[great, addition, funk, zone, !, grab, bite, ,..."
3,"farmhouse, rustic, chic.helpful staff with gre...","[farmhouse, ,, rustic, ,, chic.helpful, staff,...","[farmhouse, ,, rustic, ,, chic, ., helpful, st..."
4,we were a bit weary about trying the shellfish...,"[bit, weary, trying, shellfish, company, wharf...","[bit, weary, trying, shellfish, company, wharf..."


In [4]:
# First vectorization attempt: bag of words (unigrams + bigrams) over the filtered spaCy tokens
import numpy as np  # imported locally: the notebook's own numpy import only appears in a later cell
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(max_features=1024, ngram_range=(1,2))
# Join each token list back into a single space-separated string, one per review
bow_input = [' '.join(review) for review in reviews['spacy_token'].tolist()]
bow_representation = count_vec.fit_transform(bow_input)   # fitting the model
bow_array = bow_representation.toarray()

# Scale every row so its counts sum to 1. This is done in one vectorized call instead of
# calling the Python builtin sum() twice per row; rows whose counts are all zero are left
# as all-zero vectors (the `where` mask skips the division and `out` supplies the zeros).
row_totals = bow_array.sum(axis=1, keepdims=True)
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

In [13]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import CosineSimilarity
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model

# Fixed seed so the train/validation partition is identical on every run
SPLIT_SEED = 42

# The autoencoder reconstructs its own input, so features and targets are the same matrix
bow_features = np.array(normalized_bow)
# Width of the vectorized input; the vectorizer caps it at max_features but may produce fewer
# columns, so it is read from the data instead of being hard-coded
n_features = bow_features.shape[1]

# Dense autoencoder: n_features -> 1024 -> 256 -> 16 (bottleneck) -> 256 -> 1024 -> n_features
model = Sequential()
model.add(Dense(1024, input_dim=n_features, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(1024, activation="relu"))
model.add(Dense(n_features, activation="relu"))  # output layer must match the input width
model.summary()

# Same array passed as X and y: each split's targets are exactly its inputs
X_train, X_test, y_train, y_test = train_test_split(
    bow_features, bow_features, test_size=0.25, random_state=SPLIT_SEED
)
model.compile(loss=CosineSimilarity(axis=1), optimizer='sgd', metrics=['mse'])
plot_model(model, to_file='autoencoder.png', show_shapes=True, show_layer_names=True)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 (Dense)            (None, 1024)              1049600   
                                                                 
 dense_25 (Dense)            (None, 256)               262400    
                                                                 
 dense_26 (Dense)            (None, 16)                4112      
                                                                 
 dense_27 (Dense)            (None, 256)               4352      
                                                                 
 dense_28 (Dense)            (None, 1024)              263168    
                                                                 
 dense_29 (Dense)            (None, 1024)              1049600   
                                                                 
Total params: 2,633,232
Trainable params: 2,633,232
No

In [6]:
# Train the autoencoder and evaluate on the held-out split at the end of each epoch.
# NOTE(review): the output recorded below reports "Epoch N/50" while this call requests
# 35 epochs -- the saved output appears to come from a different run; re-run to refresh it.
model.fit(
    x=X_train,
    y=y_train,
    epochs=35,
    validation_data=(X_test, y_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe6e48db3d0>

In [7]:
# Persist the trained autoencoder under the 'autoencoder_v1' path
model.save(filepath='autoencoder_v1')

INFO:tensorflow:Assets written to: autoencoder_v1/assets


In [8]:
# next step: TF-IDF vectorization (currently disabled -- the code below is commented out and does not run)
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer()
# tfidf_representation = tfidf.fit_transform(bow_input)   # fitting the model
# tfidf_array = tfidf_representation.toarray()
# normalized_tfidf = [vector/sum(vector) if sum(vector) != 0 else vector for vector in tfidf_array]

In [9]:
# normalized_tfidf - normalized_bow