<a href="https://colab.research.google.com/github/donghwi2022/ds-sa-cp2/blob/main/model_MF_SGD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 필요 라이브러리 import

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. 사용 데이터 확인

In [4]:
df = pd.read_csv("/content/drive/MyDrive/total_movie_rating.csv", index_col = 0)
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,movie_title,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,Kolya (1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,305,242,5,886307828,23,M,programmer,Kolya (1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,242,4,883268170,42,M,executive,Kolya (1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,234,242,4,891033261,60,M,retired,Kolya (1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,63,242,3,875747190,31,M,marketing,Kolya (1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99991 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      99991 non-null  int64 
 1   movie_id     99991 non-null  int64 
 2   rating       99991 non-null  int64 
 3   timestamp    99991 non-null  int64 
 4   age          99991 non-null  int64 
 5   gender       99991 non-null  object
 6   occupation   99991 non-null  object
 7   movie_title  99991 non-null  object
 8   unknown      99991 non-null  int64 
 9   Action       99991 non-null  int64 
 10  Adventure    99991 non-null  int64 
 11  Animation    99991 non-null  int64 
 12  Children's   99991 non-null  int64 
 13  Comedy       99991 non-null  int64 
 14  Crime        99991 non-null  int64 
 15  Documentary  99991 non-null  int64 
 16  Drama        99991 non-null  int64 
 17  Fantasy      99991 non-null  int64 
 18  Film-Noir    99991 non-null  int64 
 19  Horror       99991 non-nu

# 3. MF-SGD 모델링
1. 학습 데이터와 테스트 데이터 분리
2. 기준모델 생성
3. 모델 생성

## 1. 학습 데이터와 테스트 데이터 분리

In [20]:
# 사용할 데이터셋
mf_sgd_df = df.loc[:, ['user_id', 'movie_title', 'rating']]
mf_sgd_df

Unnamed: 0,user_id,movie_title,rating
0,196,Kolya (1996),3
1,305,Kolya (1996),5
2,6,Kolya (1996),4
3,234,Kolya (1996),4
4,63,Kolya (1996),3
...,...,...,...
99995,863,B. Monkey (1998),3
99996,863,Mat' i syn (1997),1
99997,863,Sliding Doors (1998),2
99998,896,You So Crazy (1994),3


In [24]:
# 학습데이터와 테스트 데이터 분리(유저가 균등하게 들어가기 위해 별도 함수 작성)
def split_df(df) :
    user_list = df['user_id'].unique()
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for user in user_list :
        sub_df = df[df['user_id'] == user]
        train_set, test_set = train_test_split(sub_df, test_size=0.2, shuffle=True, random_state=3)
        train_data = pd.concat([train_data,train_set])
        test_data = pd.concat([test_data,test_set])
    return train_data, test_data

In [25]:
train_data, test_data = split_df(mf_sgd_df)

In [28]:
# 양 데이터의 유저 숫자 확인
print(f"학습데이터의 user 숫자 : {train_data['user_id'].nunique()}")
print(f"테스트데이터의 user 숫자 : {test_data['user_id'].nunique()}")

학습데이터의 user 숫자 : 943
테스트데이터의 user 숫자 : 943


## 2. 기준 모델 생성(최빈값)

In [10]:
# 최빈값으로 기준모델 생성
y_test = test_data['rating']
base = y_test.mode()[0] 
baseline = len(y_test) * [base]

In [11]:
# 기준모델 평가
baseline_mae = mean_absolute_error(y_test, baseline)
baseline_mse = mean_squared_error(y_test, baseline)
baseline_rmse = np.sqrt(baseline_mse)

print('기준모델의 MAE :', baseline_mae)
print('기준모델의 RMSE :', baseline_rmse)

기준모델의 MAE : 0.9055492860997988
기준모델의 RMSE : 1.2294331271926715


## 3. 모델 생성

In [31]:
# 학습과 테스테 Matrix 생성(null값은 0으로 채움)
train_matrix = train_data.pivot_table('rating', index='user_id', columns='movie_title').fillna(0) 
test_matrix = test_data.pivot_table('rating', index='user_id', columns='movie_title').fillna(0)

# 계산 효율을 높이기 위해 csr_matrix 사용
train_matrix_csr = csr_matrix(train_matrix.values).toarray()
test_matrix_csr = csr_matrix(test_matrix.values).toarray()

In [39]:
class MatrixFactorization():

    # MF-SGD 생성자
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        self._R = R                                 # 평점 matrix
        self._num_users, self._num_movies = R.shape # 유저, 영화 수
        self._k = k                                 # 잠재요인 수
        self._learning_rate = learning_rate         # 학습률
        self._reg_param = reg_param                 # 정규화 계수
        self._epochs = epochs                       # 반복횟수
        self._verbose = verbose                     # 중간과정 표시 여부


    # 학습 진행 
    def fit(self):

        # 잠재 행렬 생성(이상치 제거를 위해 정규분포 형태로 초기화)
        self._P = np.random.normal(size=(self._num_users, self._k)) # size = (유저 수, 잠재요인 수)
        self._Q = np.random.normal(size=(self._num_movies, self._k)) # size = (아이템 수, 잠재요인 수)

        # 잠재 편향 생성
        self._b_P = np.zeros(self._num_users) 
        self._b_Q = np.zeros(self._num_movies)

        # 전체 평점의 평균을 계산
        self._b = np.mean(self._R[np.where(self._R != 0)]) 

        # 입력받은 값 만큼 반복 진행
        self._training_process = []
        for epoch in range(self._epochs):  
            # rating이 존재하는 index를 기준으로 training
            xi, yi = self._R.nonzero() 
            for i, j in zip(xi, yi):
                self.gradient_descent(i, j, self._R[i, j])
            
            rmse, mae = self.cost() 
            self._training_process.append((epoch, rmse, mae))

            # 10단위로 학습상태 출력
            if self._verbose == True :
                print(f"Epoch[{epoch + 1}/{self._epochs}] rmse = {rmse.round(4)}, mae = {mae.round(4)}")


    # SGD방식
    def gradient_descent(self, i, j, rating):
        """
        :param i: user index of matrix
        :param j: movie index of matrix
        :param rating: rating of (i,j)
        """

        # 손실값 계산
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # 편향 update
        self._b_P[i] += self._learning_rate * (error - self._reg_param * self._b_P[i])
        self._b_Q[j] += self._learning_rate * (error - self._reg_param * self._b_Q[j])

        # 잠재요인 update
        dp, dq = self.gradient(error, i, j)
        self._P[i, :] += self._learning_rate * dp
        self._Q[j, :] += self._learning_rate * dq


    # Gradient 계산
    def gradient(self, error, i, j):
        """
        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        """
        dp = (error * self._Q[j, :]) - (self._reg_param * self._P[i, :])
        dq = (error * self._P[i, :]) - (self._reg_param * self._Q[j, :])
        return dp, dq


    # 예측 평점 생성
    def get_prediction(self, i, j):
        # 예측 평점 = 전체 평점의 평균(_b) + 유저가 가진 편향(_b_P[i]) + 영화가 가진 편향(_b_Q[j]) + 두 잠재벡터의 내적(_P[i, :].dot(self._Q[j, :].T))
        return self._b + self._b_P[i] + self._b_Q[j] + self._P[i, :].dot(self._Q[j, :].T)
    

    # rmse, mae 계산
    def cost(self):
        xi, yi = self._R.nonzero()
        error_squares = 0
        errors = 0
        for x, y in zip(xi, yi):
            error_squares += pow(self._R[x, y] - self.get_prediction(x, y), 2)
            errors += self._R[x, y] - self.get_prediction(x, y)
        return np.sqrt(error_squares/len(xi)), errors/len(xi)


    # 완성된 평점행렬 추출
    def get_complete_matrix(self):
        """
        - PXQ 행렬에 b_P[:, np.newaxis]를 더하는 것은 각 열마다 bias를 더해주는 것
        - b_Q[np.newaxis:, ]를 더하는 것은 각 행마다 bias를 더해주는 것
        - b를 더하는 것은 각 element마다 bias를 더해주는 것
        - newaxis: 차원을 추가해줌. 1차원인 Latent들로 2차원의 R에 행/열 단위 연산을 해주기위해 차원을 추가하는 것.
        """
        return self._b + self._b_P[:, np.newaxis] + self._b_Q[np.newaxis:, ] + self._P.dot(self._Q.T)

In [42]:
# 훈련 데이터로 학습
%%time
mf_sgd = MatrixFactorization(train_matrix_csr, k=50, learning_rate=0.01, reg_param=0.01, epochs=10, verbose=True)
mf_sgd.fit()

Epoch[1/10] rmse = 1.3944, mae = -0.0059
Epoch[2/10] rmse = 0.9794, mae = -0.003
Epoch[3/10] rmse = 0.8476, mae = -0.0017
Epoch[4/10] rmse = 0.7829, mae = -0.0014
Epoch[5/10] rmse = 0.7443, mae = -0.0013
Epoch[6/10] rmse = 0.7181, mae = -0.0012
Epoch[7/10] rmse = 0.6988, mae = -0.0012
Epoch[8/10] rmse = 0.6835, mae = -0.0011
Epoch[9/10] rmse = 0.6708, mae = -0.0011
Epoch[10/10] rmse = 0.6599, mae = -0.001
CPU times: user 20 s, sys: 535 ms, total: 20.5 s
Wall time: 20 s


In [43]:
# 완성된 평점행렬 출력
mf_sgd.get_complete_matrix()

array([[ 4.89578766,  3.08236831,  2.68301531, ...,  3.28669665,
         4.88711244,  4.30892295],
       [ 3.01529103, -2.83800905,  3.44656606, ..., -7.69286829,
         4.24781157,  5.55038065],
       [-9.17447926,  1.86268454,  6.17454626, ...,  1.121784  ,
         7.93503482,  0.41938126],
       ...,
       [ 2.92125621,  4.88607594,  4.98533143, ...,  1.36213297,
         3.52849238,  4.57851229],
       [13.73331318,  8.32247368,  3.8742399 , ..., 11.47972585,
        -6.00282826, 14.95706593],
       [-3.09472126,  4.75837427,  4.90881252, ...,  3.45564448,
         4.21015691, -0.06048943]])

# 4. 추천 로직 및 성능 평가
1. 추천 로직
2. 성능 평가

## 1. 추천 로직

## 2. 성능 평가