This notebook tackles the star-rating classification problem on the Yelp dataset. The objective is to predict a business's star rating from its photos.

In [132]:
!pip install --upgrade tensorflow
!pip install pillow
!pip install h5py
!pip install keras

Requirement already up-to-date: tensorflow in /opt/conda/lib/python3.7/site-packages (1.13.1)


In [133]:
import json
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import operator
from random import shuffle
import os, sys
from PIL import Image

import random

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, precision_recall_curve, precision_score, recall_score, average_precision_score 

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential, load_model, Model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.callbacks import ReduceLROnPlateau
from keras.layers.normalization import BatchNormalization
from keras import callbacks, applications, optimizers

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# Only NumPy is seeded here; Python's `random` module is not seeded in this cell.
np.random.seed(42)

## 1. Load Data

In [None]:
# Both Yelp dumps are newline-delimited JSON (one record per line), hence lines=True.
df_business = pd.read_json(path_or_buf='../../data/business.json', lines=True)
df_photo = pd.read_json(path_or_buf='../../data/photo.json', lines=True)

## 2. Exploratory Data Analysis (EDA) and Data Preprocessing

In [None]:
# Keep only businesses without a missing value in any column.
# NOTE(review): this discards a row if *any* field is NaN, not only `stars` or
# `review_count` - confirm that this is intended.
# `.copy()` yields an independent frame, so the column assignment below cannot
# trigger pandas' SettingWithCopyWarning.
df_business = df_business.dropna().copy()

# No NaN is left after `dropna()`, so no `fillna` is needed for `review_count`
# or `stars`.
# Collapse half-star ratings into integer classes. `Series.round()` rounds halves
# to the nearest even integer (1.5 -> 2, 2.5 -> 2, 3.5 -> 4, 4.5 -> 4), which is
# the same result the built-in `round(series)` gives.
df_business['stars'] = df_business['stars'].round().astype('int64')

In [None]:
# Distinct integer star classes present after rounding; a later cell reuses this
# array as the label set (`class_names`).
# NOTE(review): the variable name looks like a typo for "list_stars".
list_starts = df_business['stars'].unique()

In [None]:
# Show the star classes that were found.
list_starts

In [None]:
# Summary statistics of the numeric columns of the business table.
df_business.describe()

In [None]:
# Summary statistics of the photo table.
df_photo.describe()

In [None]:
# Peek at the first photo records.
df_photo.head()

In [None]:
# Left-join the photo table onto the business table on `business_id`: every
# business row is kept, and businesses without a matching photo get NaN in the
# photo columns.
df_merged = pd.merge(df_business, df_photo, on='business_id', how='left')

In [None]:
# (rows, columns) of the merged table.
df_merged.shape

- Due to limited computational resources, only a small random sample (0.1% of the merged rows) is used from here on.

In [None]:
# Draw a reproducible 0.1% sample of the merged rows, without replacement.
df_merged_new = df_merged.sample(frac=0.001, replace=False, random_state=42)

In [None]:
# (rows, columns) of the sample before rows with missing values are removed.
df_merged_new.shape

In [None]:
# Drop sampled rows that contain any missing value, e.g. businesses that received
# no photo in the left join. The frame is mutated in place, and because this
# happens after sampling the final sample can be smaller than 0.1% of the rows.
df_merged_new.dropna(inplace=True)

In [None]:
# Peek at the cleaned sample.
df_merged_new.head()

In [None]:
# Photo ids of the sampled rows, in row order; the image loading loop iterates
# over this array.
list_photo_ids = df_merged_new['photo_id'].values

In [None]:
# First sampled photo id.
df_merged_new['photo_id'].values[0]

In [None]:
# Open the photo file that belongs to the first sampled photo id.
im = Image.open('../../data/yelp_photos/photos/{}.jpg'.format(df_merged_new['photo_id'].values[0])) 


In [None]:
# Render the image inline.
im

In [None]:
# Original (width, height) of the image in pixels.
im.size

In [None]:
# Same preview for the second sampled photo id; `im` is overwritten.
im = Image.open('../../data/yelp_photos/photos/{}.jpg'.format(df_merged_new['photo_id'].values[1])) 


In [None]:
# Render the image inline.
im

In [None]:
# Original (width, height) of the image in pixels.
im.size

In [None]:
# Resize the second sampled photo to a fixed thumbnail to preview how much detail
# survives down-sampling.
im1 = Image.open('../../data/yelp_photos/photos/{}.jpg'.format(df_merged_new['photo_id'].values[1]))
# adjust width and height to your needs
width = 128
height = 128
# Available resampling filters:
#   Image.NEAREST   nearest neighbour
#   Image.BILINEAR  linear interpolation in a 2x2 environment
#   Image.BICUBIC   cubic spline interpolation in a 4x4 environment
#   Image.LANCZOS   best down-sizing filter
# Image.LANCZOS is the same filter as the former Image.ANTIALIAS alias, which was
# removed in Pillow 10; this notebook installs Pillow without a version pin.
im5 = im1.resize((width, height), Image.LANCZOS)


In [None]:
# Render the 128x128 thumbnail inline.
im5

In [None]:
# Label set: the distinct integer star classes computed during EDA.
class_names = list_starts

# Note due to computation cost images are resized to smaller sizes
img_width, img_height = 100, 100

# ------------
# Get training and test images

img_dir_path = "../../data/yelp_photos/photos/"

# Build the photo id -> star class lookup once instead of filtering the whole
# frame for every photo, which is quadratic in the number of photos. Keeping the
# first row per photo id gives the same value as
# `df_merged_new[df_merged_new['photo_id'] == photo_id]['stars'].values[0]`.
stars_by_photo_id = (
    df_merged_new
    .drop_duplicates(subset='photo_id')
    .set_index('photo_id')['stars']
    .to_dict()
)

img_x = []
img_y = []

for photo_id in list_photo_ids:
    # os.path.join does not double the separator that img_dir_path already ends with.
    file_path = os.path.join(img_dir_path, '{}.jpg'.format(photo_id))
    img = load_img(file_path)         # this is a PIL image
    # Image.LANCZOS is the same filter as the former Image.ANTIALIAS alias, which
    # was removed in Pillow 10.
    img_resize = img.resize((img_width, img_height), Image.LANCZOS)
    # Numpy array of shape (img_height, img_width, 3) under Keras' default
    # channels_last format. Pixel values are left unscaled.
    img_x.append(img_to_array(img_resize))
    img_y.append(stars_by_photo_id[photo_id])

# 70/30 train/test split with a fixed seed (not stratified by class).
train_img_x, test_img_x, train_img_y, test_img_y = train_test_split(img_x, img_y, test_size=0.3, random_state=42)

train_img_x = np.array(train_img_x)
train_img_y = np.array(train_img_y)
test_img_x = np.array(test_img_x)
test_img_y = np.array(test_img_y)

print(train_img_x.shape)
print(train_img_y.shape)
print(test_img_x.shape)
print(test_img_y.shape)

In [None]:
# ----------
# Encode the star classes as consecutive integer indices
# (the one-hot encoding itself is applied in the following cell)

le = preprocessing.LabelEncoder()
le.fit(class_names)
# NOTE: the raw star labels are overwritten with their encoded indices, so this
# cell must not be re-run without first rebuilding train_img_y / test_img_y.
train_img_y = le.transform(train_img_y)
test_img_y = le.transform(test_img_y)

# One bin per class, centred on the class index. Using range(min, max + 1) as bin
# edges would produce one bin too few, because the last bin is closed on both
# sides and therefore merges the two highest classes.
n_classes = len(le.classes_)
plt.hist(train_img_y, bins=np.arange(n_classes + 1) - 0.5)
plt.xticks(np.arange(n_classes), le.classes_)
plt.xlabel('star class')
plt.ylabel('number of training images')
plt.show()
print(set(train_img_y))

In [None]:
# One-hot encode the integer class indices of both label arrays.
train_img_y, test_img_y = (
    to_categorical(labels, num_classes=len(class_names))
    for labels in (train_img_y, test_img_y)
)

# Split the train and the validation set for the fitting
train_img_x, val_img_x, train_img_y, val_img_y = train_test_split(
    train_img_x,
    train_img_y,
    test_size=0.1,
    random_state=42,
)

# Report the array shapes of each subset: features first, then labels.
for heading, subset in (('train set shape', (train_img_x, train_img_y)),
                        ('validation set shape', (val_img_x, val_img_y))):
    print(heading)
    for array in subset:
        print(np.array(array).shape)

In [None]:
# Distinct values in the test label array.
# NOTE(review): test_img_y has already been one-hot encoded by to_categorical at
# this point, so this lists the indicator values rather than the star classes
# present in the test set.
np.unique(test_img_y)

In [None]:
# ----------------------
# CNN hyperparameters

epochs = 2
batch_size = 32
filters = [8, 8]          # number of filters of the 1st / 2nd convolution layer
kernel_sizes = [4, 4]     # square kernel edge of the 1st / 2nd convolution layer
strides = [2, 2]          # stride of the 1st / 2nd convolution layer
pooling_sizes = [2]       # square max-pooling window edge

# Tag that encodes the settings above; list-valued settings are joined with "_".
# It is embedded in the file names of the saved CNN model and weights.
str_parameters = '[epochs]{}-[batch_size]{}-[filters]{}-[kernel_sizes]{}-[strides]{}-[pooling_sizes]{}'.format(
    epochs,
    batch_size,
    *('_'.join(map(str, values)) for values in (filters, kernel_sizes, strides, pooling_sizes))
)

# Architecture switch read by the model-definition and model-saving cells.
model_name = 'CNN'
#model_name = 'VGG16'  # require GPU

In [None]:
# ----------------
# Define the CNN models

# PIL's resize((width, height)) followed by img_to_array yields arrays laid out as
# (height, width, channels) under Keras' default channels_last format.
input_shape = (img_height, img_width, 3)

if model_name == 'CNN':
    model = Sequential()

    # Convolution block 1: strided convolution -> batch normalisation -> ReLU
    model.add(Conv2D(filters=filters[0], kernel_size=(kernel_sizes[0], kernel_sizes[0]),
                     padding='same', strides=strides[0], input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    # Convolution block 2: same layout as block 1
    model.add(Conv2D(filters=filters[1], kernel_size=(kernel_sizes[1], kernel_sizes[1]),
                     padding='same', strides=strides[1]))
    model.add(BatchNormalization())
    model.add(Activation('relu'))

    model.add(MaxPool2D(pool_size=(pooling_sizes[0], pooling_sizes[0])))
    model.add(Dropout(0.2))

    # Classification head: dense -> batch normalisation -> ReLU -> dropout -> softmax
    model.add(Flatten())
    model.add(Dense(128, kernel_initializer='glorot_uniform'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))

    model.add(Dense(len(class_names), activation="softmax"))

elif model_name == 'VGG16':
    # use pre-trained VGG16
    # NOTE(review): the base layers are not frozen and the inputs are not passed
    # through VGG16's preprocess_input anywhere in this notebook - confirm intent.
    base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

    add_model = Sequential()
    add_model.add(Flatten(input_shape=base_model.output_shape[1:]))

    add_model.add(Dense(256, kernel_initializer='glorot_uniform'))
    add_model.add(BatchNormalization())
    add_model.add(Activation('relu'))
    add_model.add(Dropout(0.2))

    add_model.add(Dense(len(class_names), activation="softmax"))

    model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

else:
    # Fail here with a clear message instead of later with an AttributeError on None.
    raise ValueError("Unsupported model_name {!r}; expected 'CNN' or 'VGG16'".format(model_name))

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model.summary()

optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
#optimizer = optimizers.SGD(lr=1e-3, momentum=0.9)

model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

# Halve the learning rate when validation accuracy stalls for `patience` epochs.
# NOTE(review): 'val_acc' is the metric name used by Keras 2.2.x; newer releases
# log 'val_accuracy'. With patience=2 the callback needs more than two epochs
# before it can fire.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc',
                                            patience=2,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

In [None]:
# -----------
# Data augmentation
# Only mild geometric jitter is switched on; every normalisation, whitening and
# flip option is explicitly disabled.
augmentation_options = dict(
    featurewise_center=False,               # do not zero the dataset mean
    samplewise_center=False,                # do not zero each sample's mean
    featurewise_std_normalization=False,    # do not divide by the dataset std
    samplewise_std_normalization=False,     # do not divide each sample by its std
    zca_whitening=False,                    # no ZCA whitening
    rotation_range=10,                      # random rotation, in degrees
    zoom_range=0.1,                         # random zoom
    width_shift_range=0.1,                  # random horizontal shift (fraction of total width)
    height_shift_range=0.1,                 # random vertical shift (fraction of total height)
    horizontal_flip=False,                  # no random horizontal flips
    vertical_flip=False,                    # no random vertical flips
)
datagen = ImageDataGenerator(**augmentation_options)

# NOTE(review): presumably only needed for the feature-wise / ZCA options, all of
# which are disabled above - confirm whether this call can be dropped.
datagen.fit(train_img_x)

In [None]:
# -------------
# Train the CNN model
# With a small sample the training set can hold fewer images than one batch; the
# floor division would then request zero steps per epoch.
steps_per_epoch = max(1, train_img_x.shape[0] // batch_size)

history = model.fit_generator(datagen.flow(train_img_x, train_img_y, batch_size=batch_size),
                              epochs=epochs, validation_data=(val_img_x, val_img_y),
                              verbose=2, steps_per_epoch=steps_per_epoch,
                              callbacks=[learning_rate_reduction])


# Training and validation curves
# Keras 2.2.x logs accuracy as 'acc' / 'val_acc'; newer releases use
# 'accuracy' / 'val_accuracy'. Pick whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Plot the loss and accuracy curves for training and validation
fig, ax = plt.subplots(2, 1, sharex=True)
ax[0].plot(history.history['loss'], color='b', label="Training loss")
ax[0].plot(history.history['val_loss'], color='r', label="Validation loss")
ax[0].set_ylabel('loss')
ax[0].legend(loc='best', shadow=True)

ax[1].plot(history.history[acc_key], color='b', label="Training accuracy")
ax[1].plot(history.history['val_' + acc_key], color='r', label="Validation accuracy")
ax[1].set_xlabel('epoch')
ax[1].set_ylabel('accuracy')
ax[1].legend(loc='best', shadow=True)

plt.show()

In [None]:
# Save the model
model_dir = './models/'
# makedirs(exist_ok=True) creates missing parent directories and does not fail
# when the directory already exists.
os.makedirs(model_dir, exist_ok=True)

# File names carry the settings the model was trained with: the full
# hyperparameter tag for the CNN, epochs and batch size for VGG16.
if model_name == 'CNN':
    model_path = '{}cat-cnn-model-{}.h5'.format(model_dir, str_parameters)
    model_weights_path = '{}cat-cnn-weights-{}.h5'.format(model_dir, str_parameters)
elif model_name == 'VGG16':
    model_path = '{}cat-vgg16-{}-{}.h5'.format(model_dir, epochs, batch_size)
    model_weights_path = '{}cat-vgg16-weights-{}-{}.h5'.format(model_dir, epochs, batch_size)
else:
    # An empty path would only fail later, inside the save call.
    raise ValueError("Unsupported model_name {!r}; expected 'CNN' or 'VGG16'".format(model_name))

model.save(model_path)
print('save model to {}'.format(model_path))

model.save_weights(model_weights_path)
print('save weights to {}'.format(model_weights_path))

In [None]:
# Reload the model from the file written by the saving cell; this replaces the
# in-memory `model`.
model = load_model(model_path)
# NOTE(review): the full-model file presumably already contains the weights -
# confirm whether loading the separate weights file is still required.
model.load_weights(model_weights_path)

# Model output for every test image; the last layer is a softmax over the classes.
test_pred = model.predict(test_img_x)

In [None]:
# Index of the highest-scoring class for each test image.
test_pred_classes = np.argmax(test_pred, axis = 1)

In [None]:
# Show the predicted class indices.
test_pred_classes

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current pyplot figure.

    Adapted from
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are indexed by true label, columns by predicted label.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum before drawing, so colours and
        cell text both show per-true-class fractions.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The figure is drawn through pyplot; call `plt.show()` afterwards.
    """
    cm = np.asarray(cm)

    if normalize:
        # Normalise before drawing so the heat map and the cell text agree.
        # Rows without any true sample stay all-zero instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, fractions with two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Use white text on dark cells and black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Convert the predicted class scores to class indices
test_pred_classes = np.argmax(test_pred, axis = 1)

# Convert the one-hot encoded test labels back to class indices
test_true_classes = np.argmax(test_img_y, axis = 1)

acc_score = accuracy_score(test_true_classes, test_pred_classes)
print('accuracy {}'.format(acc_score))

# Map the class indices back to the original star values
test_true_classes = le.inverse_transform(test_true_classes)
test_pred_classes = le.inverse_transform(test_pred_classes)

# Report every known class in the encoder's fixed order. Deriving the names from
# set(test_true_classes) has no guaranteed order and misses classes that occur
# only among the predictions, so the names could mismatch the reported labels.
report_labels = list(le.classes_)
report_names = [str(label) for label in report_labels]

print(classification_report(test_true_classes, test_pred_classes,
                            labels=report_labels, target_names=report_names))

# Rows are true classes, columns are predicted classes, both in report_labels order.
confusion_mtx = confusion_matrix(test_true_classes, test_pred_classes, labels=report_labels)

# Per-class scores, aligned with report_labels. A class that never occurs yields 0.
precision = precision_score(test_true_classes, test_pred_classes, labels=report_labels, average=None)
recall = recall_score(test_true_classes, test_pred_classes, labels=report_labels, average=None)
f1 = f1_score(test_true_classes, test_pred_classes, labels=report_labels, average=None)

print('precision')
print(precision)

print('recall')
print(recall)

print('f1')
print(f1)