In [1]:
import keras
from keras import layers

original_dimension = 1024
encoded_dimension = 64

# create the layers of the model
input_vector = keras.Input(shape=(original_dimension,))
encoded = layers.Dense(encoded_dimension, activation='relu')(input_vector)
decoded = layers.Dense(original_dimension, activation='sigmoid')(encoded)

# create the model mapping the input and its reconstruction
autoencoder = keras.Model(input_vector, decoded)
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1024)]            0         
                                                                 
 dense (Dense)               (None, 64)                65600     
                                                                 
 dense_1 (Dense)             (None, 1024)              66560     
                                                                 
Total params: 132,160
Trainable params: 132,160
Non-trainable params: 0
_________________________________________________________________


In [2]:
# create a model for the encoder
encoder = keras.Model(input_vector, encoded)

# and one for the decoder
encoded_input = keras.Input(shape=(encoded_dimension,))
decoder_layer = autoencoder.layers[-1]
decoder = keras.Model(encoded_input, decoder_layer(encoded_input))

# now we compile the model with the optimizer and loss
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [3]:
import pandas as pd

# read file with reviews data for restaurants
reviews = pd.read_csv('../datasets/reviews_restaurants.csv')

# filter out restaurants with price range of 2
reviews = reviews[reviews['stars'] != 3]

# add categories based on price
reviews['quality'] = ''
reviews.loc[reviews['stars'] <= 2, 'quality'] = 'bad'
reviews.loc[reviews['stars'] >= 4, 'quality'] = 'good'

# reduce the dataset to 10000 reviews of each category
reviews = reviews.groupby('quality').apply(lambda x: x.sample(10000, random_state=0).reset_index(drop=True))
reviews = reviews.droplevel(level=0)

# check if the sampling went well
reviews.groupby('quality').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,price_range,state
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bad,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
good,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [4]:
from sklearn.model_selection import train_test_split
import numpy as np
from deep_learning.text_representation import get_processed_inputs, sum_to_one
from sklearn.feature_extraction.text import CountVectorizer

review_group = reviews['quality'].tolist()
X_train, X_test, y_train, y_test = train_test_split(reviews[['text']], np.array(review_group), test_size=0.25, random_state=0)

# pre-process the inputs
bow_train = get_processed_inputs(X_train)

# create the BoW representation for the set
bow_vec = CountVectorizer(max_features=1024, ngram_range=(1,2))
bow_train = np.array(sum_to_one(bow_vec.fit_transform(bow_train).toarray()))
bow_test = np.array(sum_to_one(bow_vec.transform(get_processed_inputs(X_test)).toarray()))

In [5]:
# and now we train the autoencoder
autoencoder.fit(bow_train, bow_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(bow_test, bow_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ff36d04cca0>

In [6]:
# we now obtain the encoded vectors
encoded_train = encoder.predict(bow_train)
encoded_test = encoder.predict(bow_test)

In [7]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# # using SVMs for classification
# svm = SVC(C=0.5).fit(bow_train, y_train)
# print("SVM test score:", svm.score(bow_test, y_test))
# print("SVM train score:", svm.score(bow_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=5, random_state=0).fit(bow_train, y_train)
print("RF test score:", rfc.score(bow_test, y_test))
print("RF train score:", rfc.score(bow_train, y_train))

# using logistic regression for classification
lrc = LogisticRegression(random_state=0, max_iter=125, C=0.5).fit(bow_train, y_train)
print("LR test score:", lrc.score(bow_test, y_test))
print("LR train score:", lrc.score(bow_train, y_train))

RF test score: 0.8754
RF train score: 0.8818666666666667
LR test score: 0.8806
LR train score: 0.8773333333333333


In [8]:
# using SVMs for classification
svm = SVC(C=0.5).fit(encoded_train, y_train)
print("SVM test score:", svm.score(encoded_test, y_test))
print("SVM train score:", svm.score(encoded_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=5, random_state=0).fit(encoded_train, y_train)
print("RF test score:", rfc.score(encoded_test, y_test))
print("RF train score:", rfc.score(encoded_train, y_train))

# using logistic regression for classification
lrc = LogisticRegression(random_state=0, max_iter=125, C=0.5).fit(encoded_train, y_train)
print("LR test score:", lrc.score(encoded_test, y_test))
print("LR train score:", lrc.score(encoded_train, y_train))

SVM test score: 0.7774
SVM train score: 0.8095333333333333
RF test score: 0.6942
RF train score: 0.7269333333333333
LR test score: 0.7162
LR train score: 0.7248666666666667


In [9]:
from keras.models import Sequential

autoencoder = Sequential(
    [
        layers.Input(1024),
        layers.Dense(256, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu', name='hidden'),
        layers.Dense(128, activation='relu'),
        layers.Dense(256, activation='relu'),
        layers.Dense(1024, activation='sigmoid', name='output')
    ]
)

autoencoder.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 256)               262400    
                                                                 
 dense_3 (Dense)             (None, 128)               32896     
                                                                 
 hidden (Dense)              (None, 64)                8256      
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 output (Dense)              (None, 1024)              263168    
                                                                 
Total params: 608,064
Trainable params: 608,064
Non-trai

In [10]:
# now we compile the model with the optimizer and loss
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [15]:
# and now we train the autoencoder
autoencoder.fit(bow_train, bow_train,
                epochs=150,
                batch_size=256,
                shuffle=True,
                validation_data=(bow_test, bow_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7ff350d119a0>

In [16]:
from keras.models import Model

encoder_2 = Model(
    inputs=autoencoder.inputs,
    outputs=autoencoder.get_layer(name='hidden').output,
)

decoder_2 = Model(
    inputs=autoencoder.get_layer(name='hidden').output,
    outputs=autoencoder.get_layer(name='output').output,
)

In [17]:
# we now obtain the encoded vectors
encoded_train = encoder_2.predict(bow_train)
encoded_test = encoder_2.predict(bow_test)

In [18]:
# using SVMs for classification
svm = SVC(C=0.5).fit(encoded_train, y_train)
print("SVM test score:", svm.score(encoded_test, y_test))
print("SVM train score:", svm.score(encoded_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=5, random_state=0).fit(encoded_train, y_train)
print("RF test score:", rfc.score(encoded_test, y_test))
print("RF train score:", rfc.score(encoded_train, y_train))

# using logistic regression for classification
lrc = LogisticRegression(random_state=0, max_iter=125, C=0.5).fit(encoded_train, y_train)
print("LR test score:", lrc.score(encoded_test, y_test))
print("LR train score:", lrc.score(encoded_train, y_train))

SVM test score: 0.9146
SVM train score: 0.9227333333333333
RF test score: 0.8788
RF train score: 0.8883333333333333
LR test score: 0.9118
LR train score: 0.9171333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
