In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.utils import shuffle
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity

# Mount Google Drive at /content/drive; the ratings file read below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read the tab-separated ratings file (no header row) and keep only the
# integer-typed user / movie / rating columns.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '/content/drive/MyDrive/recomm_sample/u.data.csv',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings = ratings.loc[:, ['user_id', 'movie_id', 'rating']].astype(int)

# Shuffle with a fixed seed, then cut into a 75 % train / 25 % test split.
train_size = 0.75
ratings = shuffle(ratings, random_state=1)
cutoff = int(len(ratings) * train_size)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

In [20]:
def rmse_2(y_true, y_pred):
    """Return the root-mean-square error between two array-likes.

    Both arguments are converted with ``np.array`` before the element-wise
    difference is taken.
    """
    residual = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)
  
def recommender_0(recomm_list):
  """Dummy recommender: draw one random score in [1, 5) per entry.

  The entries themselves are ignored; only their count matters.
  Returns a NumPy array with one score per entry of ``recomm_list``.
  """
  return np.array([random.random() * 4 + 1 for _ in recomm_list])

def recommender_1(recomm_list):
  """Second dummy recommender: one random score in [1, 5) per entry.

  Uses the ``random`` module's global generator, one draw per entry, and
  returns the scores as a NumPy array.
  """
  scores = []
  for _ in recomm_list:
    draw = random.random()
    scores.append(draw * 4 + 1)
  return np.array(scores)

# Get the hybrid result: blend the two (random) recommenders with fixed weights
# and score the blend against the true test ratings.
weight = [0.8, 0.2]
recomm_list = np.array(ratings_test)
prediction_0 = recommender_0(recomm_list)
prediction_1 = recommender_1(recomm_list)
predictions = prediction_0 * weight[0] + prediction_1 * weight[1]  # weighted sum of the individual predictions gives the hybrid score

# Column 2 of recomm_list is the true rating; the bare expression is the cell's displayed output.
rmse_2(recomm_list[:,2],predictions)

1.5501506414743311

# 본격적인 하이브리드 추천시스템 : CF & MF 결합

In [46]:
# CF: build the user x movie rating matrix from the training split and the
# user-user cosine similarity computed on its zero-filled copy.

rating_matrix = ratings_train.pivot(values='rating', index='user_id', columns='movie_id')
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix.index,
    columns=rating_matrix.index,
)

# Per-user mean rating and each rating's deviation from its user's mean
# (missing ratings stay NaN).
rating_mean = rating_matrix.mean(axis=1)
rating_bias = rating_matrix.sub(rating_mean, axis=0)

def cf_knn_bias(user_id, movie_id, neighbor_size=0):
    """Predict a rating with bias-corrected, similarity-weighted user-based CF.

    Reads the module-level ``rating_bias`` (ratings minus each user's mean),
    ``user_similarity`` and ``rating_mean``.

    Args:
        user_id: label of the target user. It must be present in
            ``rating_mean`` / ``user_similarity``; an unknown label raises
            ``KeyError``.
        movie_id: label of the movie to score.
        neighbor_size: 0 uses every user who rated the movie; any other value
            keeps only that many most-similar raters.

    Returns:
        The user's mean rating plus the similarity-weighted average deviation
        of the selected raters. The user's mean alone is returned when the
        movie is not a column of ``rating_bias``, when ``neighbor_size`` is
        non-zero and at most one user rated the movie, or when the selected
        similarities sum to zero (the weighted average would otherwise be
        NaN/inf and contaminate any RMSE computed from it).
    """
    user_mean = rating_mean[user_id]
    if movie_id not in rating_bias:
        return user_mean

    # Keep only the users who actually rated this movie, and their
    # similarities to the target user, in matching order.
    movie_ratings = rating_bias[movie_id].dropna()
    sim_scores = user_similarity[user_id].loc[movie_ratings.index]

    sims = np.array(sim_scores)
    deviations = np.array(movie_ratings)

    if neighbor_size != 0:
        if len(sims) <= 1:
            return user_mean
        top_k = min(neighbor_size, len(sims))
        # argsort is ascending, so the last top_k positions are the most similar raters.
        nearest = np.argsort(sims)[-top_k:]
        sims = sims[nearest]
        deviations = deviations[nearest]

    total_similarity = sims.sum()
    if total_similarity == 0:
        # No usable weight: avoid 0/0 and fall back to the user's own mean.
        return user_mean
    return np.dot(sims, deviations) / total_similarity + user_mean

In [5]:
# MF
class mf():
  """Biased matrix factorization trained with stochastic gradient descent.

  A rating is modelled as ``b + b_u[i] + b_d[j] + p[i] . q[j]``. Entries of
  the rating table equal to 0 are treated as unobserved: training samples and
  the training RMSE are taken from the non-zero cells only.
  """

  def __init__(self,ratings,k,alpha,beta,iterations,verbose=True):
    """Store the rating table and hyper-parameters.

    Args:
      ratings: user x item table exposing ``.index`` (user ids) and
        ``.columns`` (item ids), e.g. a pivoted DataFrame with 0 for
        missing ratings.
      k: number of latent factors.
      alpha: SGD learning rate.
      beta: L2 regularization strength.
      iterations: number of SGD passes run by ``test()``.
      verbose: when true, ``test()`` prints progress every 10 iterations.
    """
    self.R = np.array(ratings)

    # Two-way maps between external ids and positional indices into self.R.
    self.item_id_index = {one_id: i for i, one_id in enumerate(ratings.columns)}
    self.index_item_id = {i: one_id for i, one_id in enumerate(ratings.columns)}
    self.user_id_index = {one_id: i for i, one_id in enumerate(ratings.index)}
    self.index_user_id = {i: one_id for i, one_id in enumerate(ratings.index)}

    self.num_users, self.num_items = np.shape(self.R)
    self.k = k
    self.alpha = alpha
    self.beta = beta
    self.iterations = iterations
    self.verbose = verbose
    # Defined up front so test()/test_rmse() work even if set_test() is never called.
    self.test_set = []

  def rmse(self):
    """Return the RMSE over the non-zero cells of ``self.R``.

    Also stores the per-cell predictions and errors on ``self.predictions``
    and ``self.errors``.
    """
    xs,ys = self.R.nonzero()
    self.predictions = []
    self.errors = []
    for x,y in zip(xs,ys):
      prediction = self.get_prediction(x,y)
      self.predictions.append(prediction)
      self.errors.append(self.R[x,y] - prediction)
    self.predictions = np.array(self.predictions)
    self.errors = np.array(self.errors)

    return np.sqrt(np.mean(self.errors ** 2))

  def get_prediction(self, i, j):
    """Predict the rating for user index ``i`` and item index ``j``."""
    prediction = self.b + self.b_u[i] + self.b_d[j] + self.p[i,:].dot(self.q[j,:].T)
    return prediction

  def sgd(self):
    """Run one SGD pass over ``self.samples`` (tuples of user idx, item idx, rating)."""
    for i, j, r in self.samples:
      e = r - self.get_prediction(i,j)

      self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
      self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

      # Snapshot p[i] first: the gradient for q[j] must use the value of p[i]
      # from before this step, otherwise the two updates are not simultaneous.
      p_i = self.p[i,:].copy()
      self.p[i,:] += self.alpha * (e * self.q[j,:] - self.beta * p_i)
      self.q[j,:] += self.alpha * (e * p_i - self.beta * self.q[j,:])

  def set_test(self,ratings_test):
    """Register held-out ratings and remove them from the training matrix.

    Args:
      ratings_test: table whose first three columns are user id, item id
        and rating. Ids missing from the training table raise ``KeyError``.

    Returns:
      List of ``[user index, item index, rating]`` triples, also stored on
      ``self.test_set``. The matching cells of ``self.R`` are set to 0.
    """
    test_set = []
    rows = ratings_test.iloc[:, :3].itertuples(index=False, name=None)
    for user_id, item_id, rating in rows:
      x = self.user_id_index[user_id]
      y = self.item_id_index[item_id]
      test_set.append([x,y,rating])
      self.R[x,y] = 0
    self.test_set = test_set
    return test_set

  def test_rmse(self):
    """Return the RMSE over ``self.test_set``, or NaN when it is empty."""
    if not self.test_set:
      return np.nan
    error = 0
    for one_set in self.test_set:
      predicted = self.get_prediction(one_set[0], one_set[1])
      error += pow(one_set[2] - predicted, 2)
    return np.sqrt(error/len(self.test_set))

  def test(self):
    """Initialize the parameters, train for ``self.iterations`` passes, and track RMSE.

    Returns:
      List of ``(iteration, train RMSE, test RMSE)`` tuples, one per pass.
    """
    self.p = np.random.normal(scale=1./self.k, size=(self.num_users, self.k))
    self.q = np.random.normal(scale=1./self.k, size=(self.num_items, self.k))
    self.b_u = np.zeros(self.num_users)
    self.b_d = np.zeros(self.num_items)
    # Global bias: mean of the observed (non-zero) ratings.
    self.b = np.mean(self.R[self.R.nonzero()])
    rows, columns = self.R.nonzero()
    self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]

    training_process = []
    for i in range(self.iterations):
      np.random.shuffle(self.samples)
      self.sgd()
      rmse1 = self.rmse()
      rmse2 = self.test_rmse()
      training_process.append((i+1, rmse1, rmse2))
      if self.verbose:
        if (i+1) % 10 == 0:
           print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))
    return training_process

  def get_one_prediction(self, user_id, item_id):
    """Predict the rating for external ``user_id`` / ``item_id`` (``KeyError`` if unknown)."""
    prediction = self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])
    return prediction

  def full_prediction(self):
    """Return the dense users x items matrix of predicted ratings."""
    return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.p.dot(self.q.T)


# Build the user x movie table from ALL ratings (missing -> 0); set_test() below
# zeroes the held-out test cells inside the model before training starts.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
# NOTE(review): this rebinds the name `mf` from the class to the trained instance,
# so re-running this cell fails until the class definition cell is executed again.
mf = mf(R_temp, k=200, alpha=0.001, beta=0.02, iterations=250, verbose=True)
test_set = mf.set_test(ratings_test)
# Trains for `iterations` SGD passes and returns (iteration, train RMSE, test RMSE) tuples.
result = mf.test()

Iteration: 10 ; Train RMSE = 0.9664 ; Test RMSE = 0.9834
Iteration: 20 ; Train RMSE = 0.9420 ; Test RMSE = 0.9644
Iteration: 30 ; Train RMSE = 0.9313 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9253 ; Test RMSE = 0.9524
Iteration: 50 ; Train RMSE = 0.9214 ; Test RMSE = 0.9497
Iteration: 60 ; Train RMSE = 0.9187 ; Test RMSE = 0.9480
Iteration: 70 ; Train RMSE = 0.9166 ; Test RMSE = 0.9468
Iteration: 80 ; Train RMSE = 0.9148 ; Test RMSE = 0.9459
Iteration: 90 ; Train RMSE = 0.9131 ; Test RMSE = 0.9452
Iteration: 100 ; Train RMSE = 0.9113 ; Test RMSE = 0.9444
Iteration: 110 ; Train RMSE = 0.9091 ; Test RMSE = 0.9436
Iteration: 120 ; Train RMSE = 0.9060 ; Test RMSE = 0.9424
Iteration: 130 ; Train RMSE = 0.9017 ; Test RMSE = 0.9408
Iteration: 140 ; Train RMSE = 0.8955 ; Test RMSE = 0.9384
Iteration: 150 ; Train RMSE = 0.8872 ; Test RMSE = 0.9353
Iteration: 160 ; Train RMSE = 0.8767 ; Test RMSE = 0.9316
Iteration: 170 ; Train RMSE = 0.8645 ; Test RMSE = 0.9277
Iteration: 180 ; Train 

In [47]:
def recommender_0(recomm_list,mf):
  """Score every (user, movie) pair with the given MF model.

  Calls ``mf.get_one_prediction(user, movie)`` once per pair, in order, and
  returns the results as a NumPy array.
  """
  scores = []
  for user, movie in recomm_list:
    scores.append(mf.get_one_prediction(user, movie))
  return np.array(scores)

def recommender_1(recomm_list,neighbor_size=0):
  """Score every (user, movie) pair with the CF predictor ``cf_knn_bias``.

  ``neighbor_size`` is forwarded unchanged to ``cf_knn_bias``. Returns the
  predictions as a NumPy array, one per pair, in input order.
  """
  scores = []
  for user, movie in recomm_list:
    scores.append(cf_knn_bias(user, movie, neighbor_size))
  return np.array(scores)

# (user_id, movie_id) pairs of the test split, as a 2-column array.
recomm_list = np.array(ratings_test.iloc[:,[0,1]])
# Test RMSE of each recommender on its own: MF first, then CF with 37 neighbours.
pred_0 = recommender_0(recomm_list,mf)
print(rmse_2(ratings_test.iloc[:, 2], pred_0))
pred_1 = recommender_1(recomm_list,37)
print(rmse_2(ratings_test.iloc[:, 2], pred_1))

# Hybrid: fixed 0.8 (MF) / 0.2 (CF) weighted sum of the two predictions.
weight = [0.8, 0.2]
predictions = pred_0 * weight[0] + pred_1 * weight[1]
print(rmse_2(ratings_test.iloc[:,2], predictions))


# Sweep the MF weight from 0.00 to 0.99 in steps of 0.01 (CF gets the remainder).
# NOTE: np.arange excludes the stop value, so the 1.00 : 0.00 (MF-only) mix is not
# part of the sweep. The weights are compared on ratings_test itself.
for i in np.arange(0,1,0.01):
  weight = [i, 1.0-i]
  predictions = pred_0 * weight[0] + pred_1 * weight[1]
  print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], rmse_2(ratings_test.iloc[:, 2], predictions)))

0.9095328485264699
0.9467199341641682
0.9092474981603625
Weights - 0.00 : 1.00 ; RMSE = 0.9467199
Weights - 0.01 : 0.99 ; RMSE = 0.9458869
Weights - 0.02 : 0.98 ; RMSE = 0.9450626
Weights - 0.03 : 0.97 ; RMSE = 0.9442470
Weights - 0.04 : 0.96 ; RMSE = 0.9434403
Weights - 0.05 : 0.95 ; RMSE = 0.9426423
Weights - 0.06 : 0.94 ; RMSE = 0.9418532
Weights - 0.07 : 0.93 ; RMSE = 0.9410729
Weights - 0.08 : 0.92 ; RMSE = 0.9403015
Weights - 0.09 : 0.91 ; RMSE = 0.9395390
Weights - 0.10 : 0.90 ; RMSE = 0.9387854
Weights - 0.11 : 0.89 ; RMSE = 0.9380408
Weights - 0.12 : 0.88 ; RMSE = 0.9373051
Weights - 0.13 : 0.87 ; RMSE = 0.9365784
Weights - 0.14 : 0.86 ; RMSE = 0.9358607
Weights - 0.15 : 0.85 ; RMSE = 0.9351520
Weights - 0.16 : 0.84 ; RMSE = 0.9344524
Weights - 0.17 : 0.83 ; RMSE = 0.9337618
Weights - 0.18 : 0.82 ; RMSE = 0.9330803
Weights - 0.19 : 0.81 ; RMSE = 0.9324080
Weights - 0.20 : 0.80 ; RMSE = 0.9317447
Weights - 0.21 : 0.79 ; RMSE = 0.9310906
Weights - 0.22 : 0.78 ; RMSE = 0.9304456


# 하이브리드 추천 시스템 : MF & DL

In [18]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Add, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adamax
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Concatenate, Activation

def rmse(y_true,y_pred):
  """Root-mean-square error built from TensorFlow ops (scoring function for TF)."""
  squared_error = tf.square(y_true - y_pred)
  return tf.sqrt(tf.reduce_mean(squared_error))
k = 200  # number of latent factors k
# Mean training rating; it is subtracted from the targets passed to fit() below.
mu = ratings_train['rating'].mean()
# Embedding input sizes are max id + 1 so the raw ids can be used directly as indices.
# NOTE(review): both maxima come from ratings_train only — an id that appears only in
# ratings_test and exceeds them would fall outside the embedding table; confirm.
m = ratings_train['user_id'].max() + 1  # number of users
n = ratings_train['movie_id'].max() + 1  # number of items

# One scalar id per sample for each of the two inputs.
user = Input(shape=(1, ))
item = Input(shape=(1, ))
# k-dimensional latent vectors and 1-dimensional bias terms, all L2-regularized.
p_embedding = Embedding(m, k, embeddings_regularizer=l2())(user)        
q_embedding = Embedding(n, k, embeddings_regularizer=l2())(item)        
user_bias = Embedding(m, 1, embeddings_regularizer=l2())(user)          
item_bias = Embedding(n, 1, embeddings_regularizer=l2())(item) 

# Flatten each embedding output and join them into a single feature vector.
p_embedding = Flatten()(p_embedding)
q_embedding = Flatten()(q_embedding)
user_bias = Flatten()(user_bias)
item_bias = Flatten()(item_bias)
r = Concatenate()([p_embedding,q_embedding,user_bias,item_bias])

# Dense stack 2048 -> 256 -> 1.
# NOTE(review): 'linear' is the identity activation, so these layers add no
# non-linearity — confirm this is intended.
r = Dense(2048)(r)
r = Activation('linear')(r)
r = Dense(256)(r)
r = Activation('linear')(r)
r = Dense(1)(r)

model = Model(inputs=[user,item], outputs=r)
# RMSE serves as both the training loss and the reported metric.
model.compile(loss=rmse, optimizer=SGD(), metrics=[rmse])
# Targets are mean-centred (rating - mu); the test split is used as validation data.
# NOTE: this rebinds `result`, which an earlier cell used for the MF training history.
result = model.fit(x=[ratings_train['user_id'].values, ratings_train['movie_id'].values],
                   y=ratings_train['rating'].values - mu,
                   epochs=65, batch_size=512,
                   validation_data = ([ratings_test['user_id'].values,ratings_test['movie_id'].values],
                                      ratings_test['rating'].values - mu)
                   )

Epoch 1/65
Epoch 2/65
Epoch 3/65
Epoch 4/65
Epoch 5/65
Epoch 6/65
Epoch 7/65
Epoch 8/65
Epoch 9/65
Epoch 10/65
Epoch 11/65
Epoch 12/65
Epoch 13/65
Epoch 14/65
Epoch 15/65
Epoch 16/65
Epoch 17/65
Epoch 18/65
Epoch 19/65
Epoch 20/65
Epoch 21/65
Epoch 22/65
Epoch 23/65
Epoch 24/65
Epoch 25/65
Epoch 26/65
Epoch 27/65
Epoch 28/65
Epoch 29/65
Epoch 30/65
Epoch 31/65
Epoch 32/65
Epoch 33/65
Epoch 34/65
Epoch 35/65
Epoch 36/65
Epoch 37/65
Epoch 38/65
Epoch 39/65
Epoch 40/65
Epoch 41/65
Epoch 42/65
Epoch 43/65
Epoch 44/65
Epoch 45/65
Epoch 46/65
Epoch 47/65
Epoch 48/65
Epoch 49/65
Epoch 50/65
Epoch 51/65
Epoch 52/65
Epoch 53/65
Epoch 54/65
Epoch 55/65
Epoch 56/65
Epoch 57/65
Epoch 58/65
Epoch 59/65
Epoch 60/65
Epoch 61/65
Epoch 62/65
Epoch 63/65
Epoch 64/65
Epoch 65/65


In [40]:
# Display the third column of the test split (the true ratings).
ratings_test.iloc[:,2]

53670    4
77110    2
69323    4
85968    2
30243    1
        ..
50057    2
98047    4
5192     4
77708    5
98539    3
Name: rating, Length: 25000, dtype: int64

In [48]:
# Display the first two columns of the test split (user_id, movie_id).
ratings_test.iloc[:,[0,1]]

Unnamed: 0,user_id,movie_id
53670,345,715
77110,92,998
69323,934,195
85968,586,423
30243,336,383
...,...,...
50057,26,840
98047,625,198
5192,56,568
77708,882,172


In [54]:
def recommender_0(recomm_list,mf):
  """Return MF predictions for each (user, movie) pair as a NumPy array.

  Each pair is passed to ``mf.get_one_prediction`` in input order.
  """
  predicted = []
  for user_id, movie_id in recomm_list:
    one_score = mf.get_one_prediction(user_id, movie_id)
    predicted.append(one_score)
  return np.array(predicted)

def recommender_2(recomm_list):
  """Predict ratings for the given (user, movie) pairs with the Keras ``model``.

  Args:
    recomm_list: array-like of (user_id, movie_id) pairs. The pairs actually
      passed in are scored, rather than always the global ``ratings_test``.

  Returns:
    Whatever ``model.predict`` returns for those pairs, shifted by the
    module-level ``mu`` (the mean that was subtracted from the training
    targets).
  """
  # reshape(-1, 2) keeps column slicing valid even for an empty list of pairs.
  pairs = np.asarray(recomm_list).reshape(-1, 2)
  recomm = model.predict([pairs[:, 0], pairs[:, 1]]) + mu
  return recomm

# (user_id, movie_id) pairs of the test split, as a 2-column array.
recomm_list = np.array(ratings_test.iloc[:,[0,1]])
# Test RMSE of each recommender on its own: MF first, then the Keras model.
pred_0 = recommender_0(recomm_list,mf)
print(rmse_2(ratings_test.iloc[:, 2], pred_0))
# Flatten the Keras output to 1-D so it combines element-wise with pred_0.
pred_1 = np.ravel(recommender_2(recomm_list),order='C')
print(rmse_2(ratings_test.iloc[:, 2], pred_1))

# Hybrid: fixed 0.8 (MF) / 0.2 (DL) weighted sum of the two predictions.
weight = [0.8, 0.2]
predictions = pred_0 * weight[0] + pred_1 * weight[1]
print(rmse_2(ratings_test.iloc[:,2], predictions))


# Sweep the MF weight from 0.00 to 0.99 in steps of 0.01 (DL gets the remainder).
# NOTE: np.arange excludes the stop value, so the 1.00 : 0.00 (MF-only) mix is not
# part of the sweep. The weights are compared on ratings_test itself.
for i in np.arange(0,1,0.01):
  weight = [i, 1.0-i]
  predictions = pred_0 * weight[0] + pred_1 * weight[1]
  print("Weights - %.2f : %.2f ; RMSE = %.7f" % (weight[0], weight[1], rmse_2(ratings_test.iloc[:, 2], predictions)))

0.9095328485264699
0.9437435923452187
0.9101849223748867
Weights - 0.00 : 1.00 ; RMSE = 0.9437436
Weights - 0.01 : 0.99 ; RMSE = 0.9430306
Weights - 0.02 : 0.98 ; RMSE = 0.9423248
Weights - 0.03 : 0.97 ; RMSE = 0.9416260
Weights - 0.04 : 0.96 ; RMSE = 0.9409343
Weights - 0.05 : 0.95 ; RMSE = 0.9402498
Weights - 0.06 : 0.94 ; RMSE = 0.9395724
Weights - 0.07 : 0.93 ; RMSE = 0.9389022
Weights - 0.08 : 0.92 ; RMSE = 0.9382391
Weights - 0.09 : 0.91 ; RMSE = 0.9375833
Weights - 0.10 : 0.90 ; RMSE = 0.9369346
Weights - 0.11 : 0.89 ; RMSE = 0.9362932
Weights - 0.12 : 0.88 ; RMSE = 0.9356590
Weights - 0.13 : 0.87 ; RMSE = 0.9350321
Weights - 0.14 : 0.86 ; RMSE = 0.9344124
Weights - 0.15 : 0.85 ; RMSE = 0.9338001
Weights - 0.16 : 0.84 ; RMSE = 0.9331950
Weights - 0.17 : 0.83 ; RMSE = 0.9325972
Weights - 0.18 : 0.82 ; RMSE = 0.9320067
Weights - 0.19 : 0.81 ; RMSE = 0.9314236
Weights - 0.20 : 0.80 ; RMSE = 0.9308479
Weights - 0.21 : 0.79 ; RMSE = 0.9302795
Weights - 0.22 : 0.78 ; RMSE = 0.9297185
