# Package

In [None]:
# Basic
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import gzip
import pickle

Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Model
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Layer, Input, Flatten, Dense, Embedding, Dropout, Concatenate
# from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Mean, RootMeanSquaredError, MeanAbsoluteError

# BERT, RoBERTa
import torch
from transformers import AutoTokenizer, AutoModel

# Utils

In [12]:
# For Amazon Data Load
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [7]:
def Load_data(df, user, item, rating, text): # insert variable's name of Data coresponding each variable of function
    df = df[[user, item, rating, text]] # extract user ID, item ID, rating, reviewtext
    df.rename(columns = {user: "user",
                        item: "item",
                        rating: "rating",
                        text: "text"},
             inplace = True)

    df = df.dropna()
    le = LabelEncoder()
    df["user"] = le.fit_transform(df["user"].values)
    df["item"] = le.fit_transform(df["item"].values)

    USER_LEN = df["user"].max() + 1 # number of users
    ITEM_LEN = df["item"].max() + 1 # number of items
    return df, USER_LEN, ITEM_LEN

In [8]:
def Tokenize(data, model_ckpt, batch_size): # function of extracting [CLS] Token embedding from BERT-based model

    """
    model_ckpt: verion of BERT or RoBERTa model
    col_name: append cls token embedding data column into dataframe
    batch_size: recommend that the value of this variable be 2 or 4
    """

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = AutoModel.from_pretrained(model_ckpt).to(device)

    embeddings = []
    text_list = data['text'].tolist()

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True) # default of max_length is 512
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            embedding = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings.append(embedding.last_hidden_state[:, 0, :])  # append CLS token embedding data

    # Stack embeddings into a tensor
    stacked_embeddings = torch.cat(embeddings, dim=0)

    stacked_embeddings = stacked_embeddings.cpu().numpy()

    result = stacked_embeddings.tolist()

    return result

In [9]:
def bert_roberta(train_df, test_df, batch_size = 1):
    global user_grouped, item_grouped
    user_grouped = train_df[["user", "text"]].groupby('user')["text"].apply(" ".join).reset_index()
    item_grouped = train_df[["item", "text"]].groupby('item')["text"].apply(" ".join).reset_index()

    user_grouped["user_bert"] = Tokenize(user_grouped, 'bert-base-uncased', batch_size=batch_size)
    item_grouped["item_bert"] = Tokenize(item_grouped, 'bert-base-uncased', batch_size=batch_size)

    user_grouped["user_roberta"] = Tokenize(user_grouped, 'roberta-base', batch_size=batch_size)
    item_grouped["item_roberta"] = Tokenize(item_grouped, 'roberta-base', batch_size=batch_size)

    def group_merge(user_df, item_df, df):
        bert_user = pd.merge(df, user_df[["user", "user_bert"]], how = "left", on  = "user")
        bert_user_item = pd.merge(bert_user, item_df[["item", "item_bert"]], how = "left", on  = "item")

        bert_roberta_user = pd.merge(bert_user_item, user_df[["user", "user_roberta"]], how = "left", on  = "user")
        final_df = pd.merge(bert_roberta_user, item_df[["item", "item_roberta"]], how = "left", on  = "item")

        return final_df

    train_dataset = group_merge(user_grouped, item_grouped, train_df)
    test_validation_dataset = group_merge(user_grouped, item_grouped, test_df)

    validation_dataset, test_dataset = train_test_split(test_validation_dataset, test_size=0.5, random_state=42)

    return train_dataset, validation_dataset, test_dataset

In [10]:
def train_validation_test(dataset):
    np.random.seed(0)

    # 학습 데이터셋과 테스트 데이터셋 초기화
    train_dataset = pd.DataFrame(columns=['user', 'item', 'rating', 'text'])
    test_dataset = pd.DataFrame(columns=['user', 'item', 'rating', 'text'])

    # 각 유저에 대해 아이템을 8:2로 분할
    for user in tqdm(dataset['user'].unique()):
        user_data = dataset[dataset['user'] == user]
        if len(user_data) > 1:
            train, test = train_test_split(user_data, test_size=0.2, random_state=42)
            train_dataset = pd.concat([train_dataset, train])
            test_dataset = pd.concat([test_dataset, test])
        else:
            train_dataset = pd.concat([train_dataset, user_data])
    test_dataset = test_dataset[test_dataset['item'].isin(train_dataset['item'])]
    test_dataset = test_dataset[test_dataset['user'].isin(train_dataset['user'])]
    # 결과 출력
    print(f"Train dataset shape: {train_dataset.shape}")
    print(f"Test dataset shape: {test_dataset.shape}")

    print(set(test_dataset.user).issubset(set(train_dataset.user)))
    print(set(test_dataset.item).issubset(set(train_dataset.item)))

    return train_dataset, test_dataset

In [11]:
class MLP(Layer):
    def __init__(self, first_node, n_layer):
        super(MLP, self).__init__()
        n_node = first_node
        self.mlp_layer = Sequential()
        for i in range(n_layer):
            self.mlp_layer.add(Dense(units = n_node, activation = "relu"))
            self.mlp_layer.add(Dropout(0.1))
            n_node //= 2

    def call(self, input):
        x = self.mlp_layer(input)
        return x

NameError: name 'Layer' is not defined

In [None]:
class MFNR(Model):
    def __init__(self, N, M, K):
        super(MFNR, self).__init__()
        self.user_embedding = Embedding(N, K)
        self.item_embedding = Embedding(M, K)
        self.user_flatten = Flatten()
        self.item_flatten = Flatten()
        self.user_nlp_concat = Concatenate()
        self.user_nlp_MLP = MLP(512, 4)
        self.item_nlp_concat = Concatenate()
        self.item_nlp_MLP = MLP(512, 4)
        self.user_concat = Concatenate()
        self.item_concat = Concatenate()
        self.rating_concat = Concatenate()
        self.rating_mlp = MLP(64, 3)
        self.output_layer =  Dense(1,activation = "linear")

    def call(self, inputs):
        # User, Item Embedding
        user_emb = self.user_embedding(inputs[0])
        item_emb = self.item_embedding(inputs[1])
        user_emb = self.user_flatten(user_emb)
        item_emb = self.item_flatten(item_emb)

        # User NLP & MLP
        user_nlp = self.user_nlp_concat([inputs[2], inputs[4]])
        user_nlp = self.user_nlp_MLP(user_nlp)
        # Item NLP
        item_nlp = self.item_nlp_concat([inputs[3], inputs[5]])
        item_nlp = self.item_nlp_MLP(item_nlp)

        # User Representation
        user_rep = self.user_concat([user_emb, user_nlp])
        # Item Representation
        item_rep = self.item_concat([item_emb, item_nlp])

        # Rating Prediction
        MLP = self.rating_concat([user_rep, item_rep])
        MLP = self.rating_mlp(MLP)
        output = self.output_layer(MLP)
        return output

In [None]:
def load_metrics():
    global train_loss, train_acc
    global validation_loss, validation_acc
    global test_loss, test_rmse, test_mae

    train_loss = Mean()
    validation_loss = Mean()
    test_loss = Mean()

    test_rmse = RootMeanSquaredError()
    test_mae = MeanAbsoluteError()

In [None]:
@tf.function
def trainer():
    global train_tfds, train_loss, model
    global optimizer, loss_object

    for user, item, user_bert, item_bert, user_roberta, item_roberta, y in train_tfds:
        with tf.GradientTape() as tape:
            predictions = model([user, item, user_bert, item_bert, user_roberta, item_roberta])
            loss = loss_object(y, predictions)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        train_loss(loss)

@tf.function
def validation():
    global validation_tfds, model, loss_object
    global validation_loss, validation_acc
    for user, item, user_bert, item_bert, user_roberta, item_roberta, y in validation_tfds:
        predictions = model([user, item, user_bert, item_bert, user_roberta, item_roberta])
        loss = loss_object(y, predictions)

        validation_loss(loss)

@tf.function
def tester():
    global test_tfds, best_model, loss_object
    global test_loss, test_rmse, test_mae
    for user, item, user_bert, item_bert, user_roberta, item_roberta, y in test_tfds:
        predictions = best_model([user, item, user_bert, item_bert, user_roberta, item_roberta])
        loss = loss_object(y, predictions)

        test_loss(loss)
        test_rmse(y, predictions)
        test_mae(y, predictions)

# Load Data

In [13]:
df = getDF('./Subscription_Boxes.jsonl.gz')

In [14]:
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1.0,USELESS,Absolutely useless nonsense and a complete was...,[],B07G584SHG,B09WC47S3V,AEMJ2EG5ODOCYUTI54NBXZHDJGSQ,1602133857705,2,True
1,2.0,Manufactured where?,"With a couple of the items, I wasn't quite sur...",[],B07QL1JRCN,B07QL1JRCN,AEEJBFZKUBEEMBZUZJV4UHFVEEBQ,1609110735433,20,True
2,1.0,Little bang for your buck.,Two SMALL stuffed animals and 2 little bags of...,[],B07RBYJN37,B08N5QKX1Y,AGSVZNZBTSGQBKZDZTQYEZHGDPCQ,1609937315319,4,True
3,5.0,New favorite box,"Although I don’t remember signing up for this,...",[],B07KM6T8GV,B07KM6T8GV,AFDERNB6BIR3U2DOR3S2KX7KJJXQ,1616156351887,1,True
4,5.0,Coctique,I loved every thing and could use it all. Thin...,[],B07NVL6TJG,B07NVKNVNM,AE6P2YJ6FKX332MD56GPJFSHXNJQ,1559533206066,0,True


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77071 entries, 0 to 77070
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         77071 non-null  float64
 1   verified        77071 non-null  bool   
 2   reviewTime      77071 non-null  object 
 3   reviewerID      77071 non-null  object 
 4   asin            77071 non-null  object 
 5   style           36037 non-null  object 
 6   reviewerName    77044 non-null  object 
 7   reviewText      77060 non-null  object 
 8   summary         77061 non-null  object 
 9   unixReviewTime  77071 non-null  int64  
 10  vote            9620 non-null   object 
 11  image           1719 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 7.1+ MB


In [11]:
df, USER_LEN, ITEM_LEN = Load_data(df, "reviewerID", "asin", "overall", "reviewText")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {user: "user",


In [12]:
df.head()

Unnamed: 0,user,item,rating,text
0,1557,0,5.0,This worked really well for what I used it for...
1,4282,0,5.0,Fast cutting and good adheasive.
2,7415,0,5.0,Worked great for my lapping bench. I would li...
3,10602,1,4.0,As advertised
4,2574,1,5.0,seems like a pretty good value as opposed to b...


In [13]:
train_df, test_df = train_validation_test(df)

100%|██████████| 11041/11041 [00:46<00:00, 236.14it/s]

Train dataset shape: (57957, 4)
Test dataset shape: (19068, 4)
True
True





In [15]:
train_ds, validation_ds, test_ds = bert_roberta(train_df, test_df, batch_size = 2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


cuda


100%|██████████| 5521/5521 [03:46<00:00, 24.38it/s]


cuda


100%|██████████| 2662/2662 [02:27<00:00, 18.02it/s]


cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 5521/5521 [03:37<00:00, 25.44it/s]


cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2662/2662 [02:19<00:00, 19.14it/s]


In [18]:
train_dict = {"train_user": train_ds["user"].values.astype(np.float32),
              "train_item": train_ds["item"].values.astype(np.float32),
              "train_user_bert": np.array(train_ds["user_bert"].tolist()),
              "train_item_bert": np.array(train_ds["item_bert"].tolist()),
              "train_user_roberta": np.array(train_ds["user_roberta"].tolist()),
              "train_item_roberta": np.array(train_ds["item_roberta"].tolist()),
              "train_y": train_ds["rating"].values.astype(np.float32)}

validation_dict = {"validation_user": validation_ds["user"].values.astype(np.float32),
              "validation_item": validation_ds["item"].values.astype(np.float32),
              "validation_user_bert": np.array(validation_ds["user_bert"].tolist()),
              "validation_item_bert": np.array(validation_ds["item_bert"].tolist()),
              "validation_user_roberta": np.array(validation_ds["user_roberta"].tolist()),
              "validation_item_roberta": np.array(validation_ds["item_roberta"].tolist()),
              "validation_y": validation_ds["rating"].values.astype(np.float32)}

test_dict = {"test_user": test_ds["user"].values.astype(np.float32),
              "test_item": test_ds["item"].values.astype(np.float32),
              "test_user_bert": np.array(test_ds["user_bert"].tolist()),
              "test_item_bert": np.array(test_ds["item_bert"].tolist()),
              "test_user_roberta": np.array(test_ds["user_roberta"].tolist()),
              "test_item_roberta": np.array(test_ds["item_roberta"].tolist()),
              "test_y": test_ds["rating"].values.astype(np.float32)}

In [19]:
train_tfds = tf.data.Dataset.from_tensor_slices((train_dict["train_user"], train_dict["train_item"],
                                               train_dict["train_user_bert"], train_dict["train_item_bert"],
                                               train_dict["train_user_roberta"], train_dict["train_item_roberta"],
                                               train_dict["train_y"]))
train_tfds = train_tfds.shuffle(1000).batch(1024)

validation_tfds = tf.data.Dataset.from_tensor_slices((validation_dict["validation_user"], validation_dict["validation_item"],
                                                    validation_dict["validation_user_bert"], validation_dict["validation_item_bert"],
                                                    validation_dict["validation_user_roberta"], validation_dict["validation_item_roberta"],
                                                    validation_dict["validation_y"]))
validation_tfds = validation_tfds.shuffle(1000).batch(512)

test_tfds = tf.data.Dataset.from_tensor_slices((test_dict["test_user"], test_dict["test_item"],
                                              test_dict["test_user_bert"], test_dict["test_item_bert"],
                                              test_dict["test_user_roberta"], test_dict["test_item_roberta"],
                                              test_dict["test_y"]))
test_tfds = test_tfds.shuffle(1000).batch(512)

# Initiation

In [24]:
EPOCHS = 100
LR = 0.001

loss_object = tf.keras.losses.MeanSquaredError()
optimizer = Adam(learning_rate = LR)

load_metrics()

# Model Train & Test

In [25]:
model = MFNR(USER_LEN, ITEM_LEN, 64)

In [26]:
loss_list = []
num = 0

for epoch in range(EPOCHS):
    trainer()
    validation()
    print(f"Epoch: {epoch}")
    print(f'train_loss: {(train_loss.result()):.3f}, validation_loss: {(validation_loss.result()):.3f}\n')

    loss_list.append(validation_loss.result())
    min_loss = np.min(loss_list)

    if validation_loss.result() <= min_loss:
        print("Model Save!")
        num = 0
        best_model = model
    else:
        num += 1
        print(f"num: {num}")
        if num == 5:
            print("early stopping")
            break

    train_loss.reset_states()
    validation_loss.reset_states()


Epoch: 0
train_loss: 2.080, validation_loss: 0.855

Model Save!

Epoch: 1
train_loss: 0.781, validation_loss: 0.787

Model Save!

Epoch: 2
train_loss: 0.634, validation_loss: 0.773

Model Save!

Epoch: 3
train_loss: 0.556, validation_loss: 0.845

num: 1

Epoch: 4
train_loss: 0.626, validation_loss: 0.910

num: 2

Epoch: 5
train_loss: 0.678, validation_loss: 0.830

num: 3

Epoch: 6
train_loss: 0.813, validation_loss: 0.970

num: 4

Epoch: 7
train_loss: 0.981, validation_loss: 0.791

num: 5
early stopping


In [27]:
tester()
print(f'Test_loss: {(test_loss.result()):.3f}')
print(f'Test_rmse: {(test_rmse.result()):.3f}')
print(f'Test_mae: {(test_mae.result()):.3f}')

Test_loss: 0.777
Test_rmse: 0.883
Test_mae: 0.605
