In [1]:
import gzip
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Layer, Input, Flatten, Dense, Embedding, Dropout, Multiply, Concatenate
# from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Mean, RootMeanSquaredError, MeanAbsoluteError

# Data Generate

In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [5]:
df = getDF('Industrial_and_Scientific_5.json.gz')

In [6]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"11 27, 2017",A1JB7HFWHRYHT7,B0000223SI,{'Size:': ' 1-(Pack)'},Alex W.,This worked really well for what I used it for...,Couldn't have been happier with it's performance,1511740800,,
1,5.0,True,"11 4, 2017",A2FCLJG5GV8SD6,B0000223SI,{'Size:': ' 1-(Pack)'},Randall Harris,Fast cutting and good adheasive.,Good paper.,1509753600,,
2,5.0,False,"10 27, 2017",A3IT9B33NWYQSL,B0000223SI,{'Size:': ' 1-(Pack)'},A. C.,Worked great for my lapping bench. I would li...,Handy!,1509062400,,
3,4.0,True,"01 13, 2018",AUL5LCV4TT73P,B0000223SK,{'Size:': ' 1-Pack'},TnT,As advertised,As advertised,1515801600,,
4,5.0,True,"10 7, 2017",A1V3I3L5JKO7TM,B0000223SK,{'Size:': ' 1-Pack'},John Jones,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,1507334400,,


In [7]:
df = df[["reviewerID", "asin", "overall"]]

In [8]:
le = LabelEncoder()
df["reviewerID"] = le.fit_transform(df["reviewerID"].values)
df["asin"] = le.fit_transform(df["asin"].values)

In [9]:
df.head()

Unnamed: 0,reviewerID,asin,overall
0,1557,0,5.0
1,4282,0,5.0
2,7415,0,5.0
3,10602,1,4.0
4,2574,1,5.0


In [10]:
user_num = df["reviewerID"].max() + 1
item_num = df["asin"].max() + 1
print(f"user_num: {user_num}")
print(f"item_num: {item_num}")

user_num: 11041
item_num: 5334


In [11]:
def preprocessing_data(df): # Split data into 8/2
    TRAIN_SIZE = 0.8
    VALIDADTION_SIZE = 0.5
    ratings = shuffle(df, random_state=1)
    cutoff = int(TRAIN_SIZE * len(ratings))
    train_df = ratings.iloc[:cutoff]
    validation_test_df = ratings.iloc[cutoff:]
    validation_cutoff = int(VALIDADTION_SIZE * len(validation_test_df))
    validation_df = validation_test_df.iloc[:validation_cutoff]
    test_df = validation_test_df.iloc[validation_cutoff:]

    return train_df, validation_df, test_df

In [12]:
train_df, validation_df, test_df = preprocessing_data(df)

In [13]:
print(f"train_df.shape: {train_df.shape}")
print(f"validation.shape: {validation_df.shape}")
print(f"test_df.shape: {test_df.shape}")

train_df.shape: (61656, 3)
validation.shape: (7707, 3)
test_df.shape: (7708, 3)


In [14]:
train_dict = {"User": train_df["reviewerID"].values.astype(np.float32),
              "Item": train_df["asin"].values.astype(np.float32),
              "Rating": train_df["overall"].values.astype(np.float32)}

validation_dict = {"User": validation_df["reviewerID"].values.astype(np.float32),
              "Item": validation_df["asin"].values.astype(np.float32),
              "Rating": validation_df["overall"].values.astype(np.float32)}

test_dict = {"User": test_df["reviewerID"].values.astype(np.float32),
              "Item": test_df["asin"].values.astype(np.float32),
              "Rating": test_df["overall"].values.astype(np.float32)}

In [15]:
print(train_dict["User"])
print(train_dict["Item"])
print(train_dict["Rating"])

[ 6572. 10810. 10883. ...  4520.  2452.  5626.]
[5262.  871.  842. ... 1069. 4059. 2690.]
[5. 5. 5. ... 4. 5. 5.]


In [16]:
print(validation_dict["User"])
print(validation_dict["Item"])
print(validation_dict["Rating"])

[7273.  779. 6659. ... 6855. 6185.  551.]
[4186. 4333.  623. ... 3324. 5269. 1056.]
[5. 5. 5. ... 5. 5. 5.]


In [17]:
print(test_dict["User"])
print(test_dict["Item"])
print(test_dict["Rating"])

[7793.  229. 2730. ... 4466.   22. 6359.]
[ 477. 4659. 5289. ... 3109. 3659.  330.]
[5. 5. 5. ... 5. 5. 5.]


In [18]:
train_ds = tf.data.Dataset.from_tensor_slices((train_dict["User"], train_dict["Item"], train_dict["Rating"]))
train_ds = train_ds.shuffle(1000).batch(1024)

validation_ds = tf.data.Dataset.from_tensor_slices((validation_dict["User"], validation_dict["Item"], validation_dict["Rating"]))
validation_ds = validation_ds.shuffle(1000).batch(512)

test_ds = tf.data.Dataset.from_tensor_slices((test_dict["User"], test_dict["Item"], test_dict["Rating"]))
test_ds = test_ds.shuffle(1000).batch(512)

# Utils

In [39]:
class MLP(Layer):
    def __init__(self, first_node, n_layer):
        super(MLP, self).__init__()
        n_node = first_node
        self.mlp_layer = Sequential()
        for i in range(n_layer):
            self.mlp_layer.add(Dense(units = n_node, activation = "relu"))
            self.mlp_layer.add(Dropout(0.1))
            n_node //= 2

    def call(self, input):
        x = self.mlp_layer(input)
        return x

In [40]:
class NeuralCF(Model):
    def __init__(self, N, M, K):
        super(NeuralCF, self).__init__()
        self.user_embedding = Embedding(N, K)
        self.item_embedding = Embedding(M, K)
        self.user_flatten = Flatten()
        self.item_flatten = Flatten()
        self.GMF = Multiply()
        self.first_Concat = Concatenate()
        self.MLP = MLP(32, 3)
        self.second_Concat = Concatenate()
        self.prediction_layer = MLP(32, 3)
        self.output_layer = Dense(1,activation = "linear")

    def call(self, inputs):
        # User, Item Embedding
        user_emb = self.user_embedding(inputs[0])
        item_emb = self.item_embedding(inputs[1])
        user_emb = self.user_flatten(user_emb)
        item_emb = self.item_flatten(item_emb)

        # GMF Layer
        gmf_layer = self.GMF([user_emb, item_emb])

        # MLP Layer
        MLP_layer_input = self.first_Concat([user_emb, item_emb])
        MLP_layer = self.MLP(MLP_layer_input)

        # Prediction Layer
        NeuMF_input = self.second_Concat([gmf_layer, MLP_layer])
        NeuMF_layer = self.prediction_layer(NeuMF_input)
        output = self.output_layer(NeuMF_layer)
        return output

In [41]:
def load_metrics():
    global train_loss, train_acc
    global validation_loss, validation_acc
    global test_loss, test_rmse, test_mae

    train_loss = Mean()
    validation_loss = Mean()
    test_loss = Mean()

    test_rmse = RootMeanSquaredError()
    test_mae = MeanAbsoluteError()

In [42]:
@tf.function
def trainer():
    global train_ds, train_loss, model
    global optimizer, loss_object
    # cardinality = np.sum([1 for i in train_ds])

    for x, y, z in train_ds:
        with tf.GradientTape() as tape:
            predictions = model([x, y])
            # print(f"predictions: {predictions}")
            loss = loss_object(z, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)

@tf.function
def validation():
    global validation_ds, model, loss_object
    global validation_loss, validation_acc
    # cardinality = np.sum([1 for i in validation_ds])
    for x, y, z in validation_ds:
        predictions = model([x, y])
        loss = loss_object(z, predictions)

        validation_loss(loss)

@tf.function
def tester():
    global test_ds, best_model, loss_object
    global test_loss, test_rmse, test_mae
    # cardinality = np.sum([1 for i in test_ds])
    for x, y, z in test_ds:
        predictions = best_model([x, y])
        loss = loss_object(z, predictions)

        test_loss(loss)
        test_rmse(z, predictions)
        test_mae(z, predictions)

# Initiation

In [44]:
EPOCHS = 100
LR = 0.001

loss_object = tf.keras.losses.MeanSquaredError()
optimizer = Adam(learning_rate = LR)

load_metrics()
loss_list = []
num = 0

# Model

In [45]:
model = NeuralCF(user_num, item_num, 32)

In [46]:
for epoch in tqdm(range(EPOCHS)):
    trainer()
    validation()
    print(f"\nEpoch: {epoch}")
    print(f'train_loss: {(train_loss.result()):.3f}, validation_loss: {(validation_loss.result()):.3f}\n')

    loss_list.append(validation_loss.result())
    min_loss = np.min(loss_list)

    if validation_loss.result() <= min_loss:
        print("Model Save!")
        num = 0
        best_model = model
    else:
        num += 1
        print(f"num: {num}")
        if num == 5:
            print("early stopping")
            break

    train_loss.reset_states()
    validation_loss.reset_states()

  1%|          | 1/100 [00:10<16:46, 10.17s/it]


Epoch: 0
train_loss: 14.256, validation_loss: 1.438

Model Save!


  2%|▏         | 2/100 [00:12<08:49,  5.41s/it]


Epoch: 1
train_loss: 1.048, validation_loss: 0.800

Model Save!


  3%|▎         | 3/100 [00:13<06:01,  3.73s/it]


Epoch: 2
train_loss: 0.629, validation_loss: 0.795

Model Save!


  4%|▍         | 4/100 [00:15<04:46,  2.98s/it]


Epoch: 3
train_loss: 0.568, validation_loss: 0.812

num: 1


  5%|▌         | 5/100 [00:17<03:49,  2.41s/it]


Epoch: 4
train_loss: 0.546, validation_loss: 0.811

num: 2


  6%|▌         | 6/100 [00:18<03:14,  2.07s/it]


Epoch: 5
train_loss: 0.535, validation_loss: 0.802

num: 3


  7%|▋         | 7/100 [00:20<02:56,  1.90s/it]


Epoch: 6
train_loss: 0.517, validation_loss: 0.787

Model Save!


  8%|▊         | 8/100 [00:21<02:51,  1.86s/it]


Epoch: 7
train_loss: 0.495, validation_loss: 0.840

num: 1


  9%|▉         | 9/100 [00:23<02:32,  1.67s/it]


Epoch: 8
train_loss: 0.463, validation_loss: 0.818

num: 2


 10%|█         | 10/100 [00:24<02:25,  1.62s/it]


Epoch: 9
train_loss: 0.430, validation_loss: 0.828

num: 3


 11%|█         | 11/100 [00:25<02:10,  1.46s/it]


Epoch: 10
train_loss: 0.393, validation_loss: 0.918

num: 4


 11%|█         | 11/100 [00:27<03:40,  2.48s/it]


Epoch: 11
train_loss: 0.361, validation_loss: 0.903

num: 5
early stopping





In [49]:
tester()
print(f'Test_loss: {(test_loss.result()):.3f}')
print(f'Test_rmse: {(test_rmse.result()):.3f}')
print(f'Test_mae: {(test_mae.result()):.3f}')

Test_loss: 0.904
Test_rmse: 0.933
Test_mae: 0.587
