# Data Path

In [None]:
%cd # Data Path

In [None]:
path = "" # path for save pickle data

# Package

In [8]:
# Basic
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import gzip
import pickle

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Concatenate, Dense, Input, Embedding, Flatten, Dropout, Multiply
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.metrics import Mean, RootMeanSquaredError
from sklearn.metrics import mean_absolute_error, mean_squared_error

# BERT, RoBERTa
import torch
from transformers import AutoTokenizer, AutoModel


# Function

In [9]:
# For Amazon Data Load
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [10]:
def Load_data(df, user, item, rating, text): # insert variable's name of Data coresponding each variable of function
    df = df[[user, item, rating, text]] # extract user ID, item ID, rating, reviewtext
    df.rename(columns = {user: "user",
                        item: "item",
                        rating: "rating",
                        text: "text"},
             inplace = True)


    le = LabelEncoder()
    df["user"] = le.fit_transform(df["user"].values)
    df["item"] = le.fit_transform(df["item"].values)

    USER_LEN = df["user"].max() + 1 # number of users
    ITEM_LEN = df["item"].max() + 1 # number of items
    return df, USER_LEN, ITEM_LEN

In [11]:
def Tokenize(data, model_ckpt, batch_size): # function of extracting [CLS] Token embedding from BERT-based model

    """
    model_ckpt: verion of BERT-based model
    batch_size: recommend that the value of this variable be 2 or 4
    """

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = AutoModel.from_pretrained(model_ckpt).to(device)

    embeddings = []
    text_list = data['text'].tolist()

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True) # default of max_length is 512
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            embedding = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings.append(embedding.last_hidden_state[:, 0, :])  # append CLS token embedding data

    # Stack embeddings into a tensor
    stacked_embeddings = torch.cat(embeddings, dim=0)

    stacked_embeddings = stacked_embeddings.cpu().numpy()

    result = stacked_embeddings.tolist()

    return result

In [12]:
def preprocessing_data(df): # Split data into 8/2
    TRAIN_SIZE = 0.8
    ratings = shuffle(df, random_state=1)
    cutoff = int(TRAIN_SIZE * len(ratings))
    train_df = ratings.iloc[:cutoff]
    test_df = ratings.iloc[cutoff:]

    return train_df, test_df

# Load Data

In [13]:
df = getDF('Industrial_and_Scientific_5.json.gz')

In [14]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"11 27, 2017",A1JB7HFWHRYHT7,B0000223SI,{'Size:': ' 1-(Pack)'},Alex W.,This worked really well for what I used it for...,Couldn't have been happier with it's performance,1511740800,,
1,5.0,True,"11 4, 2017",A2FCLJG5GV8SD6,B0000223SI,{'Size:': ' 1-(Pack)'},Randall Harris,Fast cutting and good adheasive.,Good paper.,1509753600,,
2,5.0,False,"10 27, 2017",A3IT9B33NWYQSL,B0000223SI,{'Size:': ' 1-(Pack)'},A. C.,Worked great for my lapping bench. I would li...,Handy!,1509062400,,
3,4.0,True,"01 13, 2018",AUL5LCV4TT73P,B0000223SK,{'Size:': ' 1-Pack'},TnT,As advertised,As advertised,1515801600,,
4,5.0,True,"10 7, 2017",A1V3I3L5JKO7TM,B0000223SK,{'Size:': ' 1-Pack'},John Jones,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,1507334400,,


In [15]:
df, USER_LEN, ITEM_LEN = Load_data(df, "reviewerID", "asin", "overall", "reviewText")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {user: "user",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user"] = le.fit_transform(df["user"].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["item"] = le.fit_transform(df["item"].values)


In [16]:
df.head()

Unnamed: 0,user,item,rating,text
0,1557,0,5.0,This worked really well for what I used it for...
1,4282,0,5.0,Fast cutting and good adheasive.
2,7415,0,5.0,Worked great for my lapping bench. I would li...
3,10602,1,4.0,As advertised
4,2574,1,5.0,seems like a pretty good value as opposed to b...


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77071 entries, 0 to 77070
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    77071 non-null  int64  
 1   item    77071 non-null  int64  
 2   rating  77071 non-null  float64
 3   text    77060 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 2.9+ MB


# Data Preprocessing

In [18]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77060 entries, 0 to 77070
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    77060 non-null  int64  
 1   item    77060 non-null  int64  
 2   rating  77060 non-null  float64
 3   text    77060 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 2.9+ MB


In [19]:
df["bert"] = Tokenize(df, "bert-base-uncased", batch_size = 1)
train_df, test_df = preprocessing_data(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 77060/77060 [14:33<00:00, 88.26it/s]


In [20]:
df.head()

Unnamed: 0,user,item,rating,text,bert
0,1557,0,5.0,This worked really well for what I used it for...,"[-0.1370992660522461, -0.017150066792964935, 0..."
1,4282,0,5.0,Fast cutting and good adheasive.,"[-0.744834303855896, 0.1333298236131668, -0.22..."
2,7415,0,5.0,Worked great for my lapping bench. I would li...,"[-0.2748042047023773, -0.2355891317129135, 0.3..."
3,10602,1,4.0,As advertised,"[-0.4205509424209595, 0.10773316025733948, 0.1..."
4,2574,1,5.0,seems like a pretty good value as opposed to b...,"[0.1854289174079895, 0.12263260036706924, 0.14..."


In [22]:
train_dict = {"train_user": train_df["user"].values,
              "train_item": train_df["item"].values,
              "train_bert": np.array(train_df["bert"].tolist()),
              "train_y": train_df["rating"].values}

test_dict = {"test_user": test_df["user"].values,
              "test_item": test_df["item"].values,
              "test_bert": np.array(test_df["bert"].tolist()),
              "test_y": test_df["rating"].values}

In [24]:
with open(f'{path}/train_data.pkl', 'wb') as f:
    pickle.dump(train_dict, f)
with open(f'{path}/test_data.pkl', 'wb') as f:
    pickle.dump(test_dict, f)

In [39]:
# with open(f'{path}/train_data.pkl', 'rb') as f:
#     train_dict = pickle.load(f)
# with open(f'{path}/test_data.pkl', 'rb') as f:
#     test_dict = pickle.load(f)

In [41]:
train_dict

{'train_user': array([1197,  311, 7656, ..., 4790, 9362, 8195]),
 'train_item': array([2137, 3891, 3887, ..., 3184, 1553, 3922]),
 'train_bert': array([[-0.41393363,  0.30360237,  0.07349061, ..., -0.6887368 ,
          0.23235393,  0.61506504],
        [-0.14099571, -0.17753848,  0.42032188, ...,  0.06730706,
          0.14154784,  0.77117544],
        [-0.28676859, -0.29080957,  0.12472063, ...,  0.07263104,
          0.18267663,  0.59948742],
        ...,
        [-0.12415875, -0.46020871,  0.11656187, ..., -0.18436259,
          0.22264254,  0.48278621],
        [-0.4364177 , -0.04279586,  0.32396358, ..., -0.58488452,
          0.24035648,  0.26697546],
        [-0.18593341, -0.41321024,  1.04692078, ..., -0.42816842,
          0.13796398,  0.77542245]]),
 'train_y': array([5., 5., 3., ..., 5., 1., 5.])}

In [42]:
test_dict

{'test_user': array([ 2187,  4738,  7919, ...,  3559, 10535,  2123]),
 'test_item': array([1826, 2372, 2910, ..., 3110, 3660,  330]),
 'test_bert': array([[-0.0589914 ,  0.42729297,  0.35319245, ..., -0.18437329,
          0.38862976,  0.59597689],
        [ 0.03176583,  0.15274081, -0.08728351, ..., -0.03907591,
          0.13137487,  0.1992085 ],
        [-0.0991699 , -0.02575368,  0.26011905, ..., -0.1640372 ,
          0.29268631,  0.43925187],
        ...,
        [-0.35736123,  0.10893188,  0.31066653, ..., -0.37273636,
         -0.21455623,  0.47709242],
        [-0.28263956, -0.0651039 ,  0.54759252, ..., -0.06900347,
          0.18225716,  0.62800646],
        [-0.25962028,  0.0445751 ,  0.26437986, ..., -0.25715223,
         -0.41800642,  0.38067904]]),
 'test_y': array([4., 4., 5., ..., 5., 5., 3.])}

In [26]:
train_user = train_dict["train_user"]
train_item = train_dict["train_item"]
train_bert = train_dict["train_bert"]
train_y = train_dict["train_y"]


test_user = test_dict["test_user"]
test_item = test_dict["test_item"]
test_bert = test_dict["test_bert"]
test_y = test_dict["test_y"]

In [27]:
print(f"train_user shape: {train_user.shape}")
print(f"train_item shape: {train_item.shape}")
print(f"train_user_bert shape: {train_bert.shape}")

train_user shape: (61648,)
train_item shape: (61648,)
train_user_bert shape: (61648, 768)


In [28]:
print(f"test_user shape: {test_user.shape}")
print(f"test_item shape: {test_item.shape}")
print(f"test_user_bert shape: {test_bert.shape}")

test_user shape: (15412,)
test_item shape: (15412,)
test_user_bert shape: (15412, 768)


# Model

In [29]:
def ModelBuild_Full(user_num, item_num, id_dims):

    # user
    user_input = Input(shape=(1,), dtype='int32', name='UserInput')
    user_embedding = Embedding(user_num, id_dims, input_length=user_input.shape[1], name='UserIDEmb')(user_input)
    user_embedding = Flatten(name='UserFlatten')(user_embedding)

    # item
    item_input = Input(shape=(1,), dtype='int32', name='ItemInput')
    item_embedding = Embedding(item_num, id_dims, input_length=item_input.shape[1], name='ItemIDEmb')(item_input)
    item_embedding = Flatten(name='itemFlatten')(item_embedding)

    # Bert
    bert = Input(shape=(768,), name='Bert')

    # GMF Layer
    GMF = Multiply()([user_embedding, item_embedding])

    # MLP Layer
    MLP_input = Concatenate(name = "MLP_input")([user_embedding, item_embedding, bert])


    MLP_1 = Dense(64, activation = "relu")(MLP_input)
    MLP_1_dropout = Dropout(rate=0.1)(MLP_1)
    MLP_2 = Dense(32, activation = "relu")(MLP_1_dropout)
    MLP_2_dropout = Dropout(rate=0.1)(MLP_2)

    # NeuMF Layer

    NeuMF_input = Concatenate(name = "NeuMF_input")([GMF, MLP_2_dropout])

    MeuMF_1 = Dense(512, activation = "relu")(NeuMF_input)
    MeuMF_1_dropout = Dropout(rate=0.1)(MeuMF_1)
    MeuMF_2 = Dense(128, activation = "relu")(MeuMF_1_dropout)
    MeuMF_2_dropout = Dropout(rate=0.1)(MeuMF_2)
    outputs = Dense(32, activation='linear', name='outputs')(MeuMF_2_dropout)

    model = Model(inputs=[user_input, item_input, bert], outputs=outputs)

    return model

In [30]:
model_full = ModelBuild_Full(user_num = USER_LEN, item_num = ITEM_LEN, id_dims = 128)

In [31]:
model_full.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 UserInput (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 ItemInput (InputLayer)      [(None, 1)]                  0         []                            
                                                                                                  
 UserIDEmb (Embedding)       (None, 1, 128)               1413248   ['UserInput[0][0]']           
                                                                                                  
 ItemIDEmb (Embedding)       (None, 1, 128)               682752    ['ItemInput[0][0]']           
                                                                                              

# Train & Test

In [32]:
adam = Adam(learning_rate=0.0001)

model_full.compile(optimizer=adam, loss=tf.keras.losses.MeanSquaredError())
es = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience = 5, restore_best_weights = True)

In [36]:
history = model_full.fit([train_user, train_item, train_bert],
              train_y,
              batch_size = 32,
              epochs = 50,
              callbacks=[es],
              validation_split = 0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 26: early stopping


In [37]:
prediction = model_full.predict([test_user, test_item, test_bert])



In [38]:
MAE_temp = mean_absolute_error(test_y, prediction)
RMSE_temp = mean_squared_error(test_y, prediction, squared = False)
MSE_temp = mean_squared_error(test_y, prediction, squared = True)

print(f"RMSE : {RMSE_temp:.3f}, MSE: {MSE_temp:.3f}, MAE : {MAE_temp:.3f}")

RMSE : 0.717, MSE: 0.515, MAE : 0.450
