<a href="https://colab.research.google.com/github/estry/recommend-system/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 12.4MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617621 sha256=e416140c4af21ab6b151979a72dec77f311cca1d0811f0d302ae988c2c4f9adc
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [1]:
from surprise import SVD, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split
import pandas as pd

In [3]:
# Load the ratings CSV from the Drive path used by this notebook.
ratings  = pd.read_csv('/content/drive/MyDrive/data/new_rating.csv')
# Surprise Reader declaring the rating scale as 0 to 5.0.
reader = Reader(rating_scale=(0,5.0))

In [5]:
# Build a Surprise dataset from the userId / id / rating columns using `reader`.
data = Dataset.load_from_df(ratings[['userId', 'id', 'rating']], reader)
# Hold out 25% of the ratings as the test set; the fixed seed makes the split repeatable.
train, test = train_test_split(data, test_size=0.25, random_state=42)

In [6]:
# Fit a seeded SVD model with 50 latent factors on the training split,
# predict the held-out ratings, and report RMSE on those predictions.
model = SVD(n_factors=50, random_state=42)
model.fit(train)
predictions = model.test(test)
accuracy.rmse(predictions)

RMSE: 0.8496


0.8496295526865679

In [7]:
# Report MAE for the `predictions` produced by the fitted model.
accuracy.mae(predictions)

MAE:  0.6498


0.6498278633325706

factor = 75

RMSE: 0.8504
0.8503766933040544

factor = 50

RMSE: 0.8496
0.8496295526865679



In [9]:
from surprise.model_selection import cross_validate

# Reload the ratings and rebuild the full Surprise dataset (no train/test split here).
ratings  = pd.read_csv('/content/drive/MyDrive/data/new_rating.csv')
reader = Reader(rating_scale=(0, 5.0))
data = Dataset.load_from_df(ratings[['userId','id','rating']], reader)

# 5-fold cross-validation of a seeded SVD with otherwise default arguments,
# reporting RMSE and MAE per fold (verbose prints the summary table).
model_cv = SVD(random_state=42)
cross_validate(model_cv, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8472  0.8472  0.8465  0.8486  0.8488  0.8477  0.0009  
MAE (testset)     0.6473  0.6480  0.6471  0.6488  0.6486  0.6480  0.0007  
Fit time          375.07  378.38  383.24  376.24  380.85  378.76  2.98    
Test time         24.65   24.31   24.59   23.31   24.05   24.18   0.49    


{'fit_time': (375.07488560676575,
  378.3756320476532,
  383.236212015152,
  376.2419202327728,
  380.8467094898224),
 'test_mae': array([0.64731056, 0.64801745, 0.64711371, 0.64878839, 0.64859876]),
 'test_rmse': array([0.84716398, 0.84724684, 0.84647018, 0.84855664, 0.84884187]),
 'test_time': (24.650150537490845,
  24.31017756462097,
  24.585002660751343,
  23.306503534317017,
  24.04714035987854)}

In [5]:
# Reload the ratings and rebuild the Surprise dataset.
# NOTE(review): this repeats the loading done in earlier cells — presumably so the
# grid-search cell can run on its own; confirm before deduplicating.
ratings  = pd.read_csv('/content/drive/MyDrive/data/new_rating.csv')
reader = Reader(rating_scale=(0, 5.0))
data = Dataset.load_from_df(ratings[['userId','id','rating']], reader)

In [7]:
from surprise.model_selection import GridSearchCV

# Hyperparameter grid: two epoch counts x two factor counts (four combinations).
param = {'n_epochs': [20,40], 'n_factors':[50,100]}

# 2-fold grid search over SVD, scoring each combination by RMSE and MAE.
model_gs = GridSearchCV(SVD, param, measures=['rmse', 'mae'], cv=2)
model_gs.fit(data)

# Best score and the parameter combination that achieved it, per measure.
print(model_gs.best_score['rmse'])
print(model_gs.best_params['rmse'])
print(model_gs.best_score['mae'])
print(model_gs.best_params['mae'])

0.871874082083368
{'n_epochs': 20, 'n_factors': 50}
0.6696676856503423
{'n_epochs': 20, 'n_factors': 50}


In [1]:
import numpy as np
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between R and the factorized prediction P @ Q.T.

    The error is measured only at the positions listed in ``non_zeros``.

    Parameters
    ----------
    R : numpy.ndarray
        2-D matrix of actual values.
    P : array-like, shape (R.shape[0], K)
        Row-side factor matrix.
    Q : array-like, shape (R.shape[1], K)
        Column-side factor matrix.
    non_zeros : iterable of tuples
        Each element starts with ``(row_index, column_index, ...)``; any
        further tuple elements are ignored.

    Returns
    -------
    numpy.float64
        Root mean squared error over the listed positions.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty, or if an actual or predicted value at a
        listed position is NaN or infinite.
    """
    entries = list(non_zeros)
    if not entries:
        raise ValueError("non_zeros is empty; RMSE needs at least one position")

    row_idx = [entry[0] for entry in entries]
    col_idx = [entry[1] for entry in entries]

    # Actual values at the listed positions.
    actual = R[row_idx, col_idx]

    # Predict only the listed positions (row-wise dot products of the matching
    # factor rows) instead of materializing the full dense P @ Q.T matrix.
    predicted = np.sum(np.asarray(P)[row_idx] * np.asarray(Q)[col_idx], axis=1)

    errors = actual - predicted
    if not np.all(np.isfinite(errors)):
        raise ValueError("actual or predicted values contain NaN or infinity")

    return np.sqrt(np.mean(np.square(errors)))

In [2]:
def matrix_factorization(R, K, steps=200, learning_rate=0.1, r_lambda = 0.01):
    """Factorize R into P (rows x K) and Q (columns x K) by stochastic gradient descent.

    Only entries of R that are greater than zero are treated as observed
    ratings. Zeros and NaNs are skipped, so unrated cells are neither fitted
    nor allowed to turn the factors into NaN.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix.
    K : int
        Number of latent factors.
    steps : int
        Number of passes over the observed ratings.
    learning_rate : float
        SGD step size.
    r_lambda : float
        L2 regularization strength.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        Factor matrices of shape (R.shape[0], K) and (R.shape[1], K);
        ``np.dot(P, Q.T)`` is the predicted rating matrix.

    Raises
    ------
    ValueError
        If R contains no entry greater than zero.
    """
    num_x, num_y = R.shape
    # Seed NumPy's global generator so the initial factors are reproducible.
    np.random.seed(42)
    P = np.random.normal(scale=1./K, size=(num_x, K))
    Q = np.random.normal(scale=1./K, size=(num_y, K))

    # Collect (row, column, rating) for observed ratings only. NaN > 0 is
    # False, so missing cells are excluded; errstate silences the comparison
    # warning some NumPy versions emit for NaN.
    with np.errstate(invalid='ignore'):
        observed_rows, observed_cols = np.nonzero(R > 0)
    non_zeros = [(i, j, R[i, j])
                 for i, j in zip(observed_rows.tolist(), observed_cols.tolist())]
    if not non_zeros:
        raise ValueError("R has no observed ratings (no entry greater than zero)")

    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this single observed rating.
            eij = r - np.dot(P[i, : ], Q[j, :].T)
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j, :] - r_lambda*P[i,:])
            # Q's step intentionally keeps the original sequential form: it
            # uses the P[i, :] that was just updated on the previous line.
            Q[j, :] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])

        # Report training RMSE every 10th step, inside the step loop so that
        # progress is visible during training and `step` is always defined.
        if (step % 10) == 0:
            rmse = get_rmse(R,P,Q, non_zeros)
            print("### iteration step : ", step, " rmse : ", rmse)

    return P, Q

In [None]:
# import pandas as pd
# import numpy as np

# movies = pd.read_csv('/content/drive/MyDrive/data/movieId&title.csv')
# ratings = pd.read_csv('/content/drive/MyDrive/data/new_rating.csv')
# ratings = ratings[['userId', 'id', 'rating']]
# ratings_matrix = ratings.pivot_table('rating', index='userId', columns='id')

# # Join with movies to obtain the title column
# rating_movies = pd.merge(ratings, movies, on='id')

# # columns='title' 로 title 컬럼으로 pivot 수행. 
# ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')

In [None]:
# ratings_matrix

In [3]:

# P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)
# pred_matrix = np.dot(P, Q.T)

NameError: ignored

In [None]:
# ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= ratings_matrix.index,
#                                    columns = ratings_matrix.columns)

# ratings_pred_matrix.head(3)

In [None]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the column labels of ``ratings_matrix`` that ``userId`` has not rated.

    A column counts as already seen when the user's value in it is greater
    than zero; every other column (zero or NaN) is returned.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        Ratings with one row per user (selected via ``.loc[userId, :]``) and
        one column per movie.
    userId : hashable
        Row label of the user.

    Returns
    -------
    list
        Unseen column labels, in the column order of ``ratings_matrix``.
    """
    # The user's row as a Series indexed by the column (movie) labels.
    user_rating = ratings_matrix.loc[userId, :]

    # A set gives constant-time membership checks in the filter below.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Keep every movie label the user has no positive rating for.
    return [movie for movie in ratings_matrix.columns.tolist()
            if movie not in already_seen]

In [None]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings for ``userId`` among ``unseen_list``.

    Selects the ``userId`` row of ``pred_df`` restricted to the columns in
    ``unseen_list``, sorts it in descending order, and keeps the first
    ``top_n`` entries.
    """
    # Predicted scores for the candidate (unseen) movies only.
    candidate_scores = pred_df.loc[userId, unseen_list]
    # Highest predicted rating first.
    ranked_scores = candidate_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

In [None]:
# # 사용자가 관람하지 않는 영화명 추출   
# unseen_list = get_unseen_movies(ratings_matrix, 9)

# # Recommend movies using the ratings predicted by matrix factorization
# recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# # 평점 데이타를 DataFrame으로 생성. 
# recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
# recomm_movies