In [121]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.layers import Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam, Adamax
from tensorflow.keras.callbacks import EarlyStopping

In [122]:
rating=pd.read_csv('all_data_final.csv', encoding='utf-8')
rating=rating.iloc[:,[0,1,2,5,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]]
rating=rating.fillna(0)

In [123]:
ratings=rating[rating['dominant_R']!=0]
none_rgb=rating[rating.dominant_R==0]

In [124]:
#MF data processing
# User encoding
user_dict = {}
for i in set(ratings['User-ID']):
    user_dict[i] = len(user_dict)
n_user = len(user_dict)

# Item encoding
item_dict = {}
start_point = n_user
for i in set(ratings['ISBN']):
    item_dict[i] = start_point + len(item_dict)
n_item = len(item_dict)
start_point += n_item

# author rating group2 encoding
author_dict = {}
for i in set(ratings['Author_3cluster']):
    author_dict[i] = start_point + len(author_dict)
n_author = len(author_dict)
start_point += n_author

dominant_R_index=start_point
start_point += 1

dominant_G_index=start_point
start_point += 1

dominant_B_index=start_point
start_point += 1

num_x = start_point             # Total number of x

# DL data processing
ISBN_dict = dict()

for i, ISBN in enumerate(none_rgb['ISBN'].unique()):
    ISBN_dict[ISBN] = i
none_rgb['ISBN_idx'] = none_rgb['ISBN'].apply(lambda x : ISBN_dict[x])

# make User-ID to int
USER_dict = dict()

for i, USER in enumerate(none_rgb['User-ID'].unique()):
    USER_dict[USER] = i
none_rgb['USER_idx'] = none_rgb['User-ID'].apply(lambda x : USER_dict[x])

r_cols = ['USER_idx', 'ISBN_idx', 'Book-Rating']
u_cols = ['USER_idx', 'Age-Group']
b_cols = ['ISBN_idx', 'Author_3cluster']
y_cols=['ISBN_idx', 'Pub_Year_Group']

book_ratings = none_rgb[r_cols]
users = none_rgb[u_cols].drop_duplicates()
books = none_rgb[b_cols].drop_duplicates()
pub_years=none_rgb[y_cols].drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [125]:
# train test 분리
TRAIN_SIZE = 0.7
ratings = shuffle(ratings)

cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train = ratings.iloc[:cutoff]
ratings_test = ratings.iloc[cutoff:]
x = shuffle(ratings_train, random_state=1)
x_test = shuffle(ratings_test, random_state=1)

none_rgb = shuffle(none_rgb)
cutoff = int(TRAIN_SIZE * len(none_rgb))
none_rgb_train = none_rgb.iloc[:cutoff]
none_rgb_test = none_rgb.iloc[cutoff:]

test_data=pd.concat([x_test, none_rgb_test], axis=0)

In [126]:
#FM Generate X data
data = []
y = []

R_mean = np.mean(x['dominant_R'])
R_std = np.std(x['dominant_R'])
G_mean = np.mean(x['dominant_G'])
G_std = np.std(x['dominant_G'])
B_mean = np.mean(x['dominant_B'])
B_std = np.std(x['dominant_B'])

w0 = np.mean(x['Book-Rating'])
for i in range(len(x)):
    case = x.iloc[i]
    x_index = []
    x_value = []
    x_index.append(user_dict[case['User-ID']])     # User id encoding
    x_value.append(1.)
    x_index.append(item_dict[case['ISBN']])    # Movie id encoding
    x_value.append(1.)
    x_index.append(author_dict[case['Author_3cluster']])   # author group id encoding
    x_value.append(1.)
    x_index.append(dominant_R_index)
    x_value.append((case['dominant_R']-R_mean)/R_std)
    x_index.append(dominant_G_index)
    x_value.append((case['dominant_G']-G_mean)/G_std)
    x_index.append(dominant_B_index)
    x_value.append((case['dominant_B']-B_mean)/B_std)

    data.append([x_index, x_value])
    y.append(case['Book-Rating'] - w0)
    if (i % 10000) == 0:
        print('Encoding ', i, ' cases...')
        
        
#DL generate data
##GPU : 특정 GPU만 사용할 수 있도록##
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 세 번째 GPU만 사용하도록 제한
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU') #gpus 조정(0~3)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

# Keras model
##GPU : 특정 GPU만 사용할 수 있도록##
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # 텐서플로가 세 번째 GPU만 사용하도록 제한
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU') #gpus 조정(0~3)
    except RuntimeError as e:
        # 프로그램 시작시에 접근 가능한 장치가 설정되어야만 합니다
        print(e)

train_u = pd.merge(none_rgb_train, users, on = 'USER_idx')['Age-Group_x']
test_u = pd.merge(none_rgb_test, users, on = 'USER_idx')['Age-Group_x']

train_b = pd.merge(none_rgb_train, books, on = 'ISBN_idx')['Author_3cluster_x']
test_b = pd.merge(none_rgb_test, books, on = 'ISBN_idx')['Author_3cluster_x']

train_y = pd.merge(none_rgb_train, pub_years, on = 'ISBN_idx')['Pub_Year_Group_x']
test_y = pd.merge(none_rgb_test, pub_years, on = 'ISBN_idx')['Pub_Year_Group_x']

uL = len(users['Age-Group'])
aL = len(books['Author_3cluster'])
yL = len(pub_years['Pub_Year_Group'])
# Variable 초기화 
# Variable 초기화 
K = 20                             # Latent factor 수 
reg = 0.0001                        # Regularization penalty
mu = none_rgb_train['Book-Rating'].mean()    # 전체 평균 
M = none_rgb.USER_idx.max() + 1    # Number of users
N = none_rgb.ISBN_idx.max() + 1    # Number of movies


user = Input(shape=(1, ))
item = Input(shape=(1, ))
P_embedding = Embedding(M, K, embeddings_regularizer=l2(reg))(user) # 20
Q_embedding = Embedding(N, K, embeddings_regularizer=l2(reg))(item) # 20 + 20 = 40
user_bias = Embedding(M, 1, embeddings_regularizer=l2(reg))(user) # 40 + 1 = 41
item_bias = Embedding(N, 1, embeddings_regularizer=l2(reg))(item) # 41 + 1 = 42

# Concatenate layers
P_embedding = Flatten()(P_embedding)
Q_embedding = Flatten()(Q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)

age = Input(shape=(1, ))
age_embedding = Embedding(uL, 3, embeddings_regularizer = l2())(age) # 42 + 1 = 43
age_layer = Flatten()(age_embedding)

author = Input(shape=(1, ))
author_embedding = Embedding(aL, 3, embeddings_regularizer = l2())(author) # 43 + 1 = 44
author_layer = Flatten()(author_embedding)

year = Input(shape=(1, ))
year_embedding = Embedding(yL, 3, embeddings_regularizer = l2())(year) # 43 + 1 = 44
year_layer = Flatten()(year_embedding)

R = Concatenate()([P_embedding, Q_embedding, user_bias, item_bias, age_layer, author_layer, year_layer])


Encoding  0  cases...
Encoding  10000  cases...


In [127]:
# Defining RMSE measure
def RMSE(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
def RMSE2(y_true, y_pred):
      return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2)) 

In [128]:
# FM model
class FM():
    def __init__(self, N, K, data, y, alpha, beta, train_ratio=0.75, iterations=100, tolerance=0.005, l2_reg=True, verbose=True):
        self.K = K          # Number of latent factors
        self.N = N          # Number of x (variables)
        self.n_cases = len(data)            # N of observations
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.l2_reg = l2_reg
        self.tolerance = tolerance
        self.verbose = verbose
        # w 초기화
        self.w = np.random.normal(scale=1./self.N, size=(self.N))
        # v 초기화
        self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K))
        # Train/Test 분리
        cutoff = int(train_ratio * len(data))
        self.train_x = data[:cutoff]
        self.test_x = data[cutoff:]
        self.train_y = y[:cutoff]
        self.test_y = y[cutoff:]

    def test(self):                                     # Training 하면서 RMSE 계산 
        # SGD를 iterations 숫자만큼 수행
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):
            rmse1 = self.sgd(self.train_x, self.train_y)        # SGD & Train RMSE 계산
            rmse2 = self.test_rmse(self.test_x, self.test_y)    # Test RMSE 계산     
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                       # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance:  # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process
        
    # w, v 업데이트를 위한 Stochastic gradient descent 
    def sgd(self, x_data, y_data):
        y_pred = []
        for data, y in zip(x_data, y_data):
            x_idx = data[0]
            x_0 = np.array(data[1])     # xi axis=0 [1, 2, 3]
            x_1 = x_0.reshape(-1, 1)    # xi axis=1 [[1], [2], [3]]
    
            # biases
            bias_score = np.sum(self.w[x_idx] * x_0)
    
            # score 계산
            vx = self.v[x_idx] * (x_1)
            sum_vx = np.sum(vx, axis=0)
            sum_vx_2 = np.sum(vx * vx, axis=0)
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

            # 예측값 계산
            y_hat = bias_score + latent_score
            y_pred.append(y_hat)
            error = y - y_hat
            # w, v 업데이트
            if self.l2_reg:
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1) - self.beta * self.v[x_idx])
            else:
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1))
        return RMSE2(y_data, y_pred)
            
    def test_rmse(self, x_data, y_data):
        y_pred = []
        for data , y in zip(x_data, y_data):
            y_hat = self.predict(data[0], data[1])
            y_pred.append(y_hat)
        return RMSE2(y_data, y_pred)
    
    #hybrid 예측을 위해서 추가
    def test_rmse2(self, x_data, y_data):
        y_hat = self.predict(x_data[0], x_data[1])
        return y_data, y_hat

    def predict(self, idx, x):
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1, 1)

        # biases
        bias_score = np.sum(self.w[idx] * x_0)

        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx, axis=0)
        sum_vx_2 = np.sum(vx * vx, axis=0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

In [129]:
# DL model
R = Dense(2048)(R)
R = Activation('relu')(R)
R = Dense(512)(R)
R = Activation('linear')(R)
R = Dense(1)(R)

model = Model(inputs=[user, item, age, author, year], outputs=R)
model.compile(
  loss=RMSE,
  optimizer=Adam(lr=0.001),
  metrics=[RMSE]
)

In [130]:
#train FM
K = 200
fm1 = FM(num_x, K, data, y, alpha=0.000025, beta=0.007, train_ratio=0.8, iterations=100, tolerance=0.0001, l2_reg=True, verbose=True)
result = fm1.test()

Iteration: 10 ; Train RMSE = 1.619285 ; Test RMSE = 1.612121
Iteration: 20 ; Train RMSE = 1.536150 ; Test RMSE = 1.526587
Iteration: 30 ; Train RMSE = 1.506927 ; Test RMSE = 1.494261
Iteration: 40 ; Train RMSE = 1.496288 ; Test RMSE = 1.481489
Iteration: 50 ; Train RMSE = 1.491880 ; Test RMSE = 1.475969
Iteration: 60 ; Train RMSE = 1.489541 ; Test RMSE = 1.473254
Iteration: 70 ; Train RMSE = 1.487887 ; Test RMSE = 1.471697
Iteration: 80 ; Train RMSE = 1.486460 ; Test RMSE = 1.470654
Iteration: 90 ; Train RMSE = 1.485107 ; Test RMSE = 1.469858
Iteration: 100 ; Train RMSE = 1.483779 ; Test RMSE = 1.469188
99 1.4691875048945375


In [131]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=30)

result = model.fit(
  x=[none_rgb_train['USER_idx'].values, none_rgb_train['ISBN_idx'].values, train_u.values, train_b.values, train_y.values],
  y=none_rgb_train['Book-Rating'].values - mu,
  epochs=100,
  batch_size=256,
  validation_data=(
    [none_rgb_test['USER_idx'].values, none_rgb_test['ISBN_idx'].values, test_u.values, test_b.values, test_y.values],
    none_rgb_test['Book-Rating'].values - mu
  ),
  callbacks=[es]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 00031: early stopping


In [132]:
#test
fm_predictions=[]
dl_predictions=[]
weight=[0.3,0.7]
for i in range(len(test_data)):
    case=test_data.iloc[i]
    
    if case['dominant_R']==0.0:
        dl_predictions.append(case)
        
    else:
        x_index = []
        x_value = []
        x_index.append(user_dict[case['User-ID']])     # User id encoding
        x_value.append(1.)
        x_index.append(item_dict[case['ISBN']])    # Movie id encoding
        x_value.append(1.)
        x_index.append(author_dict[case['Author_3cluster']])   # author group id encoding
        x_value.append(1.)
        x_index.append(dominant_R_index)
        x_value.append((case['dominant_R']-R_mean)/R_std)
        x_index.append(dominant_G_index)
        x_value.append((case['dominant_G']-G_mean)/G_std)
        x_index.append(dominant_B_index)
        x_value.append((case['dominant_B']-B_mean)/B_std)
        
        x_data=[x_index, x_value]
        y_data=case['Book-Rating'] - w0
        y_true, y_pred=fm1.test_rmse2(x_data, y_data)
        temp=[y_true*weight[0],y_pred*weight[0]]
        fm_predictions.append(temp)
        
fm_predictions=np.array(fm_predictions).transpose()
fm_y_pred=fm_predictions[0]
fm_y_true=fm_predictions[1]
       
dl_predictions=pd.DataFrame(dl_predictions) 
user_ids = dl_predictions['USER_idx'].values
item_ids = dl_predictions['ISBN_idx'].values
age_ids = dl_predictions['Age-Group'].values
author_ids = dl_predictions['Author_3cluster'].values
year_ids=dl_predictions['Pub_Year_Group'].values
y_pred = model.predict([user_ids, item_ids, test_u, test_b, test_y]) + mu
y_pred = np.ravel(y_pred, order='C') * weight[1]
y_true = np.array(dl_predictions['Book-Rating']) * weight[1]

all_y_pred=np.concatenate((fm_y_pred,y_pred),axis=0)
all_y_true=np.concatenate((fm_y_true,y_true),axis=0)

res_rmse=RMSE2(all_y_pred, all_y_true)

In [133]:
res_rmse

1.0097066077565433

In [134]:
#test
fm_predictions=[]
dl_predictions=[]

for i in range(len(test_data)):
    case=test_data.iloc[i]
    
    if case['dominant_R']==0.0:
        dl_predictions.append(case)
        
    else:
        x_index = []
        x_value = []
        x_index.append(user_dict[case['User-ID']])     # User id encoding
        x_value.append(1.)
        x_index.append(item_dict[case['ISBN']])    # Movie id encoding
        x_value.append(1.)
        x_index.append(author_dict[case['Author_3cluster']])   # author group id encoding
        x_value.append(1.)
        x_index.append(dominant_R_index)
        x_value.append((case['dominant_R']-R_mean)/R_std)
        x_index.append(dominant_G_index)
        x_value.append((case['dominant_G']-G_mean)/G_std)
        x_index.append(dominant_B_index)
        x_value.append((case['dominant_B']-B_mean)/B_std)
        
        x_data=[x_index, x_value]
        y_data=case['Book-Rating'] - w0
        y_true, y_pred=fm1.test_rmse2(x_data, y_data)
        temp=[y_true,y_pred]
        fm_predictions.append(temp)
        
#rmse 계산 형태 만들어주기        
fm_predictions=np.array(fm_predictions).transpose()
fm_y_pred=fm_predictions[0]
fm_y_true=fm_predictions[1]

#dl 모델의 예측을 위한 y_pred, y_true 계산       
dl_predictions=pd.DataFrame(dl_predictions) 
user_ids = dl_predictions['USER_idx'].values
item_ids = dl_predictions['ISBN_idx'].values
age_ids = dl_predictions['Age-Group'].values
author_ids = dl_predictions['Author_3cluster'].values
year_ids=dl_predictions['Pub_Year_Group'].values
y_pred = model.predict([user_ids, item_ids, test_u, test_b, test_y]) + mu
y_pred = np.ravel(y_pred, order='C') 
y_true = np.array(dl_predictions['Book-Rating']) 

#fm+dl
all_y_pred=np.concatenate((fm_y_pred,y_pred),axis=0)
all_y_true=np.concatenate((fm_y_true,y_true),axis=0)

res_rmse=RMSE2(all_y_pred, all_y_true)

In [135]:
res_rmse

1.693770027750958