In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from google.colab import drive
# Mount Google Drive at /content/drive; the ratings CSV is later read from a path under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Using a sparse matrix
# Earlier recommender code in this series worked on a full (dense) matrix, but
# with large data a full matrix means processing a huge number of zero entries.
# --> use csr_matrix (one of the sparse formats) to keep the computation fast.
# For small data, a plain full matrix is the better choice.

# Basic example
ratings = pd.DataFrame({
    'user_id': [1, 2, 4],
    'movie_id': [2, 3, 7],
    'rating': [4, 3, 1],
})

# Dense route: pivot to user x movie; only ids that actually occur become rows/columns.
rating_matrix = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
full_matrix = rating_matrix.to_numpy()  # full matrix
print(full_matrix)

# Sparse route: the raw ids are used directly as row/column positions, so the
# resulting shape is (max user_id + 1, max movie_id + 1).
data = ratings['rating'].to_numpy()
row_indices = ratings['user_id'].to_numpy()
col_indices = ratings['movie_id'].to_numpy()
rating_matrix = csr_matrix((data, (row_indices, col_indices)), dtype=int)
print(rating_matrix)    # csr_matrix
full_matrix_2 = rating_matrix.toarray()
print(full_matrix_2)  # csr --> full

[[4. 0. 0.]
 [0. 3. 0.]
 [0. 0. 1.]]
  (1, 2)	4
  (2, 3)	3
  (4, 7)	1
[[0 0 0 0 0 0 0 0]
 [0 0 4 0 0 0 0 0]
 [0 0 0 3 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1]]


In [3]:
# Arithmetic on the csr_matrix built above; each result is printed in sparse (row, col) value form.
print(rating_matrix * 2)  # scalar multiply: only the stored rating values are doubled
print('\n')
print(rating_matrix.T)   # transpose: swaps the (user, item) index order
print('\n')
# R . R^T: entry (u, v) is the dot product of the rating rows of users u and v
print(rating_matrix.dot(rating_matrix.T))  

  (1, 2)	8
  (2, 3)	6
  (4, 7)	2


  (2, 1)	4
  (3, 2)	3
  (7, 4)	1


  (1, 1)	16
  (2, 2)	9
  (4, 4)	1


In [4]:
# Load the ratings file; column names are supplied explicitly via `names`
# (presumably the CSV has no header row -- verify against the file).
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '/content/drive/MyDrive/recomm_sample/ratings-20m.csv',
    sep=',',
    names=r_cols,
    encoding='latin-1',
)  # 20M data

# Keep the first three columns (drops timestamp) and cast everything to int.
# NOTE(review): astype(int) truncates fractional ratings (e.g. 3.5 -> 3, 0.5 -> 0).
# Confirm the file holds whole-star ratings only; a rating cast to 0 cannot be
# told apart from "no rating" once the data is put into the sparse matrix.
ratings = ratings[r_cols[:3]].astype(int)
print(ratings.shape)
print(ratings)

(20000263, 3)
          user_id  movie_id  rating
0               1         2       3
1               1        29       3
2               1        32       3
3               1        47       3
4               1        50       3
...           ...       ...     ...
20000258   138493     68954       4
20000259   138493     69526       4
20000260   138493     69644       3
20000261   138493     70286       5
20000262   138493     71619       2

[20000263 rows x 3 columns]


In [5]:
from sklearn.utils import shuffle
# Shuffle once with a fixed seed, then split 75% / 25% by position.
# Note: `ratings` is rebound to the shuffled frame, so re-running this cell
# shuffles the already-shuffled data again.
train_size = 0.75
ratings = shuffle(ratings, random_state=1)
cutoff = int(train_size * len(ratings))
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

In [6]:
# sparse matrix with MF
# The matrix is built from ALL ratings (train and test rows alike), using the raw
# user/movie ids as row/column positions. The held-out entries are zeroed later
# via mf.set_test(ratings_test).
# Note: `ratings` is rebound here from the DataFrame to the csr_matrix.
data = ratings['rating'].to_numpy()
row_indices = ratings['user_id'].to_numpy()
col_indices = ratings['movie_id'].to_numpy()
ratings = csr_matrix((data, (row_indices, col_indices)), dtype=int)

class mf():
    """Biased matrix factorization trained with SGD on a user x item rating matrix.

    The rating matrix is indexed directly by the raw ids (row = user id,
    column = item id), so no id-to-index mapping is kept. A stored value of 0
    is treated as "no rating".

    Prediction for (user i, item j):
        b + b_u[i] + b_d[j] + P[i] . Q[j]

    Typical use: construct, call ``set_test`` to hold out ratings, then call
    ``test`` to train while tracking train/test RMSE.
    """

    # Number of (user, item) pairs scored per vectorized batch. Batching bounds
    # the temporary P[rows] / Q[cols] slices to batch_size x k floats each.
    _BATCH_SIZE = 32768

    def __init__(self, ratings, k, alpha, beta, iterations, verbose=True):
        """Store the rating matrix and hyper-parameters.

        Args:
            ratings: user x item rating matrix (e.g. a scipy ``csr_matrix``).
                It is kept by reference and modified in place by ``set_test``;
                pass a copy if the original must stay intact. It is not
                converted with ``np.array``: the sparse matrix keeps the raw
                ids as indices, so no user/item index mapping is needed.
            k: number of latent factors.
            alpha: SGD learning rate.
            beta: L2 regularization strength.
            iterations: number of passes over the training samples.
            verbose: when True, print RMSE every 10 iterations.
        """
        self.R = ratings
        self.num_users, self.num_items = np.shape(self.R)
        self.k = k
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
        # No held-out ratings until set_test() is called.
        self.test_set = []
        self._test_arrays = None

    def _observed(self):
        """Return (rows, cols, values) of all non-zero entries of ``self.R``.

        One bulk lookup replaces per-element ``self.R[i, j]`` indexing, which is
        very slow on a sparse matrix.
        """
        rows, cols = self.R.nonzero()
        # Fancy indexing on a scipy sparse matrix returns a 1 x n matrix; flatten it.
        values = np.asarray(self.R[rows, cols]).ravel()
        return rows, cols, values

    def _predict_many(self, rows, cols):
        """Vectorized ``get_prediction`` for parallel index arrays, done in batches."""
        rows = np.asarray(rows)
        cols = np.asarray(cols)
        total = len(rows)
        out = np.empty(total, dtype=float)
        for start in range(0, total, self._BATCH_SIZE):
            stop = start + self._BATCH_SIZE
            r = rows[start:stop]
            c = cols[start:stop]
            # Row-wise dot product P[r] . Q[c] without materializing the product matrix.
            latent = np.einsum('ij,ij->i', self.P[r], self.Q[c])
            out[start:stop] = self.b + self.b_u[r] + self.b_d[c] + latent
        return out

    def rmse(self):
        """Return the RMSE over all non-zero (training) entries of ``self.R``.

        Side effect: stores the per-entry predictions and errors (actual minus
        predicted) as numpy arrays in ``self.predictions`` / ``self.errors``.
        """
        rows, cols, values = self._observed()
        self.predictions = self._predict_many(rows, cols)
        self.errors = values - self.predictions
        return np.sqrt(np.mean(self.errors**2))

    def get_prediction(self, i, j):
        """Return the predicted rating of user ``i`` for item ``j``."""
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and factors in place."""
        alpha, beta = self.alpha, self.beta
        for i, j, r in self.samples:
            e = r - self.get_prediction(i, j)

            self.b_u[i] += alpha * (e - beta * self.b_u[i])
            self.b_d[j] += alpha * (e - beta * self.b_d[j])

            # Sequential update: Q[j] is updated using the already-updated P[i].
            self.P[i, :] += alpha * (e * self.Q[j, :] - beta * self.P[i, :])
            self.Q[j, :] += alpha * (e * self.P[i, :] - beta * self.Q[j, :])

    def set_test(self, ratings_test):
        """Register held-out ratings and remove them from the training matrix.

        Args:
            ratings_test: DataFrame whose first three columns are, in order,
                user id, item id and rating.

        Returns:
            The test set as a list of ``[user, item, rating]`` lists (also
            stored in ``self.test_set``).

        The matching entries of ``self.R`` are set to 0 in a single bulk
        assignment instead of one sparse write per row; for sparse matrices the
        resulting explicit zeros are then dropped from storage.
        """
        users = np.asarray(ratings_test.iloc[:, 0], dtype=int)
        items = np.asarray(ratings_test.iloc[:, 1], dtype=int)
        values = np.asarray(ratings_test.iloc[:, 2])

        if len(users):
            self.R[users, items] = 0
            if hasattr(self.R, 'eliminate_zeros'):
                self.R.eliminate_zeros()

        self._test_arrays = (users, items, values)
        self.test_set = [
            [u, i, r] for u, i, r in zip(users.tolist(), items.tolist(), values.tolist())
        ]
        return self.test_set

    def test_rmse(self):
        """Return the RMSE over the held-out test set, or NaN if it is empty."""
        if not self.test_set:
            return np.nan
        cached = self._test_arrays
        # Rebuild the arrays if test_set was replaced directly rather than via set_test().
        if cached is None or len(cached[0]) != len(self.test_set):
            cached = (
                np.array([one_set[0] for one_set in self.test_set], dtype=int),
                np.array([one_set[1] for one_set in self.test_set], dtype=int),
                np.array([one_set[2] for one_set in self.test_set]),
            )
            self._test_arrays = cached
        users, items, values = cached
        errors = values - self._predict_many(users, items)
        return np.sqrt(np.mean(errors**2))

    def test(self):
        """Initialize the model, train for ``self.iterations`` passes, and track RMSE.

        Returns:
            A list of ``(iteration, train_rmse, test_rmse)`` tuples, one per
            iteration (iteration numbers start at 1).
        """
        # Random draws happen in the same order as before (P, Q, then one
        # shuffle per iteration), so a fixed numpy seed reproduces earlier runs.
        self.P = np.random.normal(scale=1./self.k, size=(self.num_users, self.k))
        self.Q = np.random.normal(scale=1./self.k, size=(self.num_items, self.k))

        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)

        rows, columns, values = self._observed()
        self.b = np.mean(values)  # global mean of the observed ratings

        # Plain Python ints index numpy arrays faster than numpy scalars in the SGD loop.
        self.samples = list(zip(rows.tolist(), columns.tolist(), values.tolist()))

        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            rmse1 = self.rmse()
            rmse2 = self.test_rmse()
            training_process.append((i+1, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, rmse1, rmse2))
        return training_process

    def get_one_prediction(self, user_id, item_id):
        """Return the single predicted rating for the given raw user id and item id."""
        return self.get_prediction(user_id, item_id)
  
# Train on a private copy: set_test() zeroes the held-out entries of the matrix it is given.
R_temp = ratings.copy()
# Note: this rebinds the name `mf` from the class to the instance, so the class
# definition must be re-run before another model can be constructed.
mf = mf(
    R_temp,
    k=200,
    alpha=0.001,
    beta=0.02,
    iterations=190,
    verbose=True,
)
test_set = mf.set_test(ratings_test)
result = mf.test()

KeyboardInterrupt: ignored