In [3]:
import pandas as pd

# read file with reviews data for restaurants
reviews = pd.read_csv('datasets/reviews_restaurants.csv')

# filter out restaurants with price range of 2
reviews = reviews[reviews['stars'] != 3]

# add categories based on price
reviews['quality'] = ''
reviews.loc[reviews['stars'] <= 2, 'quality'] = 0
reviews.loc[reviews['stars'] >= 4, 'quality'] = 1

# reduce the dataset to 10000 reviews of each category
reviews = reviews.groupby('quality').apply(lambda x: x.sample(10000, random_state=0).reset_index(drop=True))
reviews = reviews.droplevel(level=0)

# check if the sampling went well
reviews.groupby('quality').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,price_range,state
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
1,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [1]:
import gensim.downloader as api

# download the glove pre-trained model
glove = api.load('glove-twitter-50')



In [29]:
import numpy as np
from text_representation import get_processed_inputs

# processing inputs
glove_inputs = get_processed_inputs(reviews, column='text', mode='list_of_lists')

# create the word2vec list of vectors
glove_array = []

for text in glove_inputs:
    text_matrix = []
    count = 0

    if len(text) >= 50:
        text = text[:50]
        for token in text:
            if token in glove:
                text_matrix.append(glove[token])
            else:
                count += 1
    else:

        for token in text:
            if token in glove:
                text_matrix.append(glove[token])
            else:
                count += 1
        for i in range(50 - len(text)):
            text_matrix.append(np.zeros(50))

    for i in range(count):
        text_matrix.append(np.zeros(50))

    glove_array.append(np.array(text_matrix))

glove_array = np.array(glove_array)

# check shape
glove_array.shape

(20000, 50, 50)

In [30]:
import keras
from keras import layers

input_matrix = keras.Input(shape=(50, 50, 1))

x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(input_matrix)
x = layers.MaxPooling2D((2, 2), padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D((3, 3), padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = layers.MaxPooling2D((3, 3), padding='same')(x)

# at this point the representation is (3, 3, 8) i.e. 128-dimensional

x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = layers.UpSampling2D((3, 3))(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.UpSampling2D((3, 3))(x)
x = layers.Conv2D(16, (3, 3), activation='relu')(x)
x = layers.UpSampling2D((2, 2))(x)
decoded = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = keras.Model(input_matrix, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

encoder = keras.Model(input_matrix, encoded)

autoencoder.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 50, 50, 1)]       0         
                                                                 
 conv2d_28 (Conv2D)          (None, 50, 50, 16)        160       
                                                                 
 max_pooling2d_12 (MaxPoolin  (None, 25, 25, 16)       0         
 g2D)                                                            
                                                                 
 conv2d_29 (Conv2D)          (None, 25, 25, 8)         1160      
                                                                 
 max_pooling2d_13 (MaxPoolin  (None, 9, 9, 8)          0         
 g2D)                                                            
                                                                 
 conv2d_30 (Conv2D)          (None, 9, 9, 8)           584 

In [31]:
from sklearn.model_selection import train_test_split

# produce the X and y for training and testing
quality = np.array(reviews['quality'].tolist())
X_train, X_test, y_train, y_test = train_test_split(glove_array, quality, test_size = 0.25, random_state=0)

# check shape of the test set
X_train.shape

(15000, 50, 50)

In [43]:
autoencoder.fit(X_train, X_train,
                epochs=15,
                batch_size=128,
                shuffle=True,
                validation_data=(X_test, X_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fc6da077e20>

In [44]:
encoded_train = encoder.predict(X_train)
encoded_test = encoder.predict(X_test)

encoded_train.shape

(15000, 3, 3, 8)

In [45]:
# flatten the matrices for each document in the train set
temp = []
for encoded_matrix in encoded_train:
    temp.append(encoded_matrix.flatten())
embedding_train = np.array(temp)

# flatten the matrices for each document in the test set
temp = []
for encoded_matrix in encoded_test:
    temp.append(encoded_matrix.flatten())
embedding_test = np.array(temp)

# check results
embedding_train.shape, embedding_test.shape

((15000, 72), (5000, 72))

In [46]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# using SVMs for classification
svm = SVC(C=0.5).fit(embedding_train, y_train)
print("SVM test score:", svm.score(embedding_test, y_test))
print("SVM train score:", svm.score(embedding_train, y_train))

# using random forests for classification
rfc = RandomForestClassifier(max_depth=5, random_state=0).fit(embedding_train, y_train)
print("RF test score:", rfc.score(embedding_test, y_test))
print("RF train score:", rfc.score(embedding_train, y_train))

# using logistic regression for classification
lrc = LogisticRegression(random_state=0, max_iter=125, C=0.5).fit(embedding_train, y_train)
print("LR test score:", lrc.score(embedding_test, y_test))
print("LR train score:", lrc.score(embedding_train, y_train))

SVM test score: 0.6814
SVM train score: 0.6802666666666667
RF test score: 0.6546
RF train score: 0.6750666666666667
LR test score: 0.7022
LR train score: 0.7006666666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
