In [9]:
import pandas as pd

# Number of reviews kept per quality class (0 = negative, 1 = positive).
SAMPLES_PER_CLASS = 10000

# read file with reviews data for restaurants
reviews = pd.read_csv('../datasets/reviews_restaurants.csv')

# Keep only clearly negative (<= 2 stars) or clearly positive (>= 4 stars)
# reviews. Assuming stars are whole numbers this is the same as dropping the
# 3-star reviews -- TODO confirm against the dataset.
# .copy() gives an independent frame, so adding the label column below does
# not raise SettingWithCopyWarning.
reviews = reviews[(reviews['stars'] <= 2) | (reviews['stars'] >= 4)].copy()

# Binary label derived from the star rating (not from price_range):
# 0 for <= 2 stars, 1 for >= 4 stars. Stored as an integer column rather than
# an object column seeded with empty strings.
reviews['quality'] = (reviews['stars'] >= 4).astype(int)

# Reduce the dataset to SAMPLES_PER_CLASS reviews of each class.
# Each group is sampled with its own random_state=0, which draws the same rows
# as groupby(...).apply(lambda x: x.sample(...)) did, but does not depend on
# the grouping column being passed into apply. ignore_index=True yields a
# unique 0..N-1 index instead of repeating 0..9999 once per class.
reviews = pd.concat(
    [
        group.sample(n=SAMPLES_PER_CLASS, random_state=0)
        for _, group in reviews.groupby('quality')
    ],
    ignore_index=True,
)

# check if the sampling went well: every column should count
# SAMPLES_PER_CLASS rows for both classes
reviews.groupby('quality').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,price_range,state
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
1,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [10]:
from deep_learning.text_representation import word2vec
import numpy as np

# Run the project's word2vec helper in 'matrix' mode over the reviews and
# stack whatever it returns into a single NumPy array in one step.
matrix_inputs = np.array(word2vec(reviews, mode='matrix'))

# sanity check: dimensions of the stacked array
matrix_inputs.shape

(20000, 300, 300)

In [11]:
from sklearn.model_selection import train_test_split

# Labels as a NumPy array, built from a plain Python list of the 'quality'
# column so the rows stay in the same order as `reviews`.
quality = np.asarray(reviews['quality'].to_list())

# Hold out 25% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    matrix_inputs,
    quality,
    test_size=0.25,
    random_state=0,
)

# check shape of the training set
X_train.shape

(15000, 300, 300)

In [14]:
from keras import layers
from keras.models import Sequential

# Assemble the network one layer at a time.
cnn = Sequential()

# Convolutional part: three 3x3 convolutions, with 2x2 max pooling after the
# first two. The first layer declares the expected input as 300x300 with a
# single channel.
cnn.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(300, 300, 1)))
cnn.add(layers.MaxPooling2D((2, 2)))
cnn.add(layers.Conv2D(64, (3, 3), activation='relu'))
cnn.add(layers.MaxPooling2D((2, 2)))
cnn.add(layers.Conv2D(64, (3, 3), activation='relu'))

# Dense part: flatten the feature maps, one hidden sigmoid layer, then two
# output units with no activation (raw scores, one per class).
cnn.add(layers.Flatten())
cnn.add(layers.Dense(64, activation='sigmoid'))
cnn.add(layers.Dense(2))

# print the layer-by-layer overview of the model
cnn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_15 (Conv2D)          (None, 298, 298, 32)      320       
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 149, 149, 32)     0         
 g2D)                                                            
                                                                 
 conv2d_16 (Conv2D)          (None, 147, 147, 64)      18496     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 73, 73, 64)       0         
 g2D)                                                            
                                                                 
 conv2d_17 (Conv2D)          (None, 71, 71, 64)        36928     
                                                                 
 flatten_5 (Flatten)         (None, 322624)           

In [15]:
from keras.losses import SparseCategoricalCrossentropy

# The last layer of the model emits raw scores (Dense(2) without an
# activation), so the loss is configured with from_logits=True; labels are
# integer class ids, hence the sparse variant.
cnn.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for two epochs, evaluating on the held-out split after each epoch;
# the returned History object is kept for later inspection.
history = cnn.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2
