# Data Path

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd  # Data Path

/content/drive/MyDrive/대학원/논문 주제/meta_embedding/실험/data/new_dataset/5_core


In [None]:
path = "" # path for save pickle data

# Package

In [None]:
# Basic
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import gzip
import pickle

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Model
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Concatenate, Dense, Input, Embedding, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.metrics import Mean, RootMeanSquaredError
from sklearn.metrics import mean_absolute_error, mean_squared_error

# BERT, RoBERTa
import torch
from transformers import AutoTokenizer, AutoModel


# Function

In [None]:
# For Amazon Data Load
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [None]:
def Load_data(df, user, item, rating, text): # insert variable's name of Data coresponding each variable of function
    df = df[[user, item, rating, text]] # extract user ID, item ID, rating, reviewtext
    df.rename(columns = {user: "user",
                        item: "item",
                        rating: "rating",
                        text: "text"},
             inplace = True)


    le = LabelEncoder()
    df["user"] = le.fit_transform(df["user"].values)
    df["item"] = le.fit_transform(df["item"].values)

    USER_LEN = df["user"].max() + 1 # number of users
    ITEM_LEN = df["item"].max() + 1 # number of items
    return df, USER_LEN, ITEM_LEN

In [None]:
def Tokenize(data, model_ckpt, batch_size): # function of extracting [CLS] Token embedding from BERT-based model

    """
    model_ckpt: verion of BERT or RoBERTa model
    col_name: append cls token embedding data column into dataframe
    batch_size: recommend that the value of this variable be 2 or 4
    """

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = AutoModel.from_pretrained(model_ckpt).to(device)

    embeddings = []
    text_list = data['text'].tolist()

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]

        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True) # default of max_length is 512
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.no_grad():
            embedding = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings.append(embedding.last_hidden_state[:, 0, :])  # append CLS token embedding data

    # Stack embeddings into a tensor
    stacked_embeddings = torch.cat(embeddings, dim=0)

    stacked_embeddings = stacked_embeddings.cpu().numpy()

    result = stacked_embeddings.tolist()

    return result

In [None]:
def bert_roberta(df, batch_size = 2):
    df["bert"] = Tokenize(df, "bert-base-uncased", batch_size)
    df["roberta"] = Tokenize(df, "roberta-base", batch_size)

    return df

In [None]:
def preprocessing_data(df): # Split data into 8/2
    TRAIN_SIZE = 0.8
    ratings = shuffle(df, random_state=1)
    cutoff = int(TRAIN_SIZE * len(ratings))
    train_df = ratings.iloc[:cutoff]
    test_df = ratings.iloc[cutoff:]

    return train_df, test_df

# Load Data

In [None]:
df = getDF('Industrial_and_Scientific_5.json.gz')

In [None]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"11 27, 2017",A1JB7HFWHRYHT7,B0000223SI,{'Size:': ' 1-(Pack)'},Alex W.,This worked really well for what I used it for...,Couldn't have been happier with it's performance,1511740800,,
1,5.0,True,"11 4, 2017",A2FCLJG5GV8SD6,B0000223SI,{'Size:': ' 1-(Pack)'},Randall Harris,Fast cutting and good adheasive.,Good paper.,1509753600,,
2,5.0,False,"10 27, 2017",A3IT9B33NWYQSL,B0000223SI,{'Size:': ' 1-(Pack)'},A. C.,Worked great for my lapping bench. I would li...,Handy!,1509062400,,
3,4.0,True,"01 13, 2018",AUL5LCV4TT73P,B0000223SK,{'Size:': ' 1-Pack'},TnT,As advertised,As advertised,1515801600,,
4,5.0,True,"10 7, 2017",A1V3I3L5JKO7TM,B0000223SK,{'Size:': ' 1-Pack'},John Jones,seems like a pretty good value as opposed to b...,seems like a pretty good value as opposed to b...,1507334400,,


In [None]:
df, USER_LEN, ITEM_LEN = Load_data(df, "reviewerID", "asin", "overall", "reviewText")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {user: "user",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["user"] = le.fit_transform(df["user"].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["item"] = le.fit_transform(df["item"].values)


In [None]:
df.head()

Unnamed: 0,user,item,rating,text
0,1557,0,5.0,This worked really well for what I used it for...
1,4282,0,5.0,Fast cutting and good adheasive.
2,7415,0,5.0,Worked great for my lapping bench. I would li...
3,10602,1,4.0,As advertised
4,2574,1,5.0,seems like a pretty good value as opposed to b...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77071 entries, 0 to 77070
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    77071 non-null  int64  
 1   item    77071 non-null  int64  
 2   rating  77071 non-null  float64
 3   text    77060 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 2.9+ MB


# Data Preprocessing

In [None]:
df = df.dropna()
bert_roberta_df = bert_roberta(df, batch_size = 1)
train_df, test_df = preprocessing_data(bert_roberta_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 77060/77060 [14:57<00:00, 85.90it/s]


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

cuda


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 77060/77060 [15:03<00:00, 85.29it/s]


In [None]:
train_dict = {"train_user": train_df["user"].values,
              "train_item": train_df["item"].values,
              "train_bert": np.array(train_df["bert"].tolist()),
              "train_roberta": np.array(train_df["roberta"].tolist()),
              "train_y": train_df["rating"].values}

test_dict = {"test_user": test_df["user"].values,
              "test_item": test_df["item"].values,
              "test_bert": np.array(test_df["bert"].tolist()),
              "test_roberta": np.array(test_df["roberta"].tolist()),
              "test_y": test_df["rating"].values}

In [None]:
with open(f'{path}/train_data.pkl', 'wb') as f: # Save train data
    pickle.dump(train_dict, f)
with open(f'{path}/test_data.pkl', 'wb') as f: # Save test data
    pickle.dump(test_dict, f)

In [None]:
# with open(f'{path}/train_data.pkl', 'rb') as f: # Load train data (Not essential)
#     train_dict = pickle.load(f)
# with open(f'{path}/test_data.pkl', 'rb') as f:  # Load test data (Not essential)
#     test_dict = pickle.load(f)

In [None]:
train_user = train_dict["train_user"]
train_item = train_dict["train_item"]
train_bert = train_dict["train_bert"]
train_roberta = train_dict["train_roberta"]
train_y = train_dict["train_y"]


test_user = test_dict["test_user"]
test_item = test_dict["test_item"]
test_bert = test_dict["test_bert"]
test_roberta = test_dict["test_roberta"]
test_y = test_dict["test_y"]

In [None]:
print(f"train_user shape: {train_user.shape}")
print(f"train_item shape: {train_item.shape}")
print(f"train_user_bert shape: {train_bert.shape}")
print(f"train_item_bert shape: {train_roberta.shape}")

train_user shape: (61648,)
train_item shape: (61648,)
train_user_bert shape: (61648, 768)
train_item_bert shape: (61648, 768)


In [None]:
print(f"test_user shape: {test_user.shape}")
print(f"test_item shape: {test_item.shape}")
print(f"test_user_bert shape: {test_bert.shape}")
print(f"test_item_bert shape: {test_roberta.shape}")

test_user shape: (15412,)
test_item shape: (15412,)
test_user_bert shape: (15412, 768)
test_item_bert shape: (15412, 768)


# Model

In [None]:
def ModelBuild_Full(user_num, item_num, id_dims):

    # user
    user_input = Input(shape=(1,), dtype='int32', name='UserInput')
    user_embedding = Embedding(user_num, id_dims, input_length=user_input.shape[1], name='UserIDEmb')(user_input)
    user_embedding = Flatten(name='UserFlatten')(user_embedding)

    # item
    item_input = Input(shape=(1,), dtype='int32', name='ItemInput')
    item_embedding = Embedding(item_num, id_dims, input_length=item_input.shape[1], name='ItemIDEmb')(item_input)
    item_embedding = Flatten(name='itemFlatten')(item_embedding)

    # Bert
    bert = Input(shape=(768,), name='Bert')
    # RoBERTa
    roberta = Input(shape=(768,), name='Roberta')

    # Concatenate Bert Roberta for multi embedding
    bert_roberta = Concatenate(name='BertRobertaConcat')([bert, roberta])

    # rating module
    user_mlp_1 = Dense(64, activation = "relu")(user_embedding)
    user_mlp_1_dropout = Dropout(rate=0.1)(user_mlp_1)
    user_mlp_2 = Dense(32, activation = "relu")(user_mlp_1_dropout)
    user_mlp_2_dropout = Dropout(rate=0.1)(user_mlp_2)


    item_mlp_1 = Dense(64, activation = "relu")(user_embedding)
    item_mlp_1_dropout = Dropout(rate=0.1)(item_mlp_1)
    item_mlp_2 = Dense(32, activation = "relu")(item_mlp_1_dropout)
    item_mlp_2_dropout = Dropout(rate=0.1)(item_mlp_2)

    user_item_output = Concatenate(name = "User_Item_Output")([user_mlp_2_dropout, item_mlp_2_dropout])

    #Reviews MLP
    bert_roberta_mlp_1 = Dense(128, activation = "relu")(bert_roberta)
    bert_roberta_mlp_1_dropout = Dropout(rate=0.1)(bert_roberta_mlp_1)
    bert_roberta_mlp_2 = Dense(64, activation = "relu")(bert_roberta_mlp_1_dropout)
    bert_roberta_mlp_2_dropout = Dropout(rate=0.1)(bert_roberta_mlp_2)

    # Concatenate Reviews User_Item
    user_item_review = Concatenate(name='User_Latent_Factor')([user_item_output, bert_roberta_mlp_2_dropout])

    user_item_review_1 = Dense(512, activation = "relu")(bert_roberta)
    user_item_review_1_dropout = Dropout(rate=0.1)(user_item_review_1)
    user_item_review_2 = Dense(128, activation = "relu")(user_item_review_1_dropout)
    user_item_review_2_dropout = Dropout(rate=0.1)(user_item_review_2)
    user_item_review_2 = Dense(32, activation = "relu")(user_item_review_1_dropout)
    user_item_review_2_dropout = Dropout(rate=0.1)(user_item_review_2)


    outputs = Dense(1, activation='linear', name='outputs')(user_item_review_2_dropout)

    model = Model(inputs=[user_input, item_input, bert, roberta], outputs=outputs)

    return model

In [None]:
model_full = ModelBuild_Full(user_num = USER_LEN, item_num = ITEM_LEN, id_dims = 32)

In [None]:
model_full.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Bert (InputLayer)           [(None, 768)]                0         []                            
                                                                                                  
 Roberta (InputLayer)        [(None, 768)]                0         []                            
                                                                                                  
 BertRobertaConcat (Concate  (None, 1536)                 0         ['Bert[0][0]',                
 nate)                                                               'Roberta[0][0]']             
                                                                                                  
 dense_6 (Dense)             (None, 64)                   98368     ['BertRobertaConcat[0][0]'

# Train & Test

In [None]:
adam = Adam(learning_rate=0.0001)

model_full.compile(optimizer=adam, loss=tf.keras.losses.MeanSquaredError())
es = EarlyStopping(monitor='val_loss', mode = 'min', verbose = 1, patience = 5, restore_best_weights = True)

In [None]:
history = model_full.fit([train_user, train_item, train_bert, train_roberta], # array형태로 바꿔서 하는 것이 효율적!
              train_y,
              batch_size = 32,
              epochs = 50,
              callbacks=[es],
              validation_split = 0.1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 27: early stopping


In [None]:
prediction = model_full.predict([test_user, test_item, test_bert, test_roberta])



In [None]:
MAE_temp = mean_absolute_error(test_y, prediction)
RMSE_temp = mean_squared_error(test_y, prediction, squared = False)
MSE_temp = mean_squared_error(test_y, prediction, squared = True)

print(f"RMSE : {RMSE_temp:.3f}, MSE: {MSE_temp:.3f}, MAE : {MAE_temp:.3f}")

RMSE : 0.643, MSE: 0.414, MAE : 0.440
