In [2]:
import pandas as pd
import numpy as np

##### 사용자 기반 추천시스템

In [9]:
df_corredspond = pd.read_csv('../df_correspond.csv', encoding= 'cp949') #26421

df_total = pd.read_csv('../df_total.csv', encoding= 'cp949') #26421
print(df_total)

       user_id  movie_id  scaled_rating
0            1       100       4.111111
1            2       100       2.777778
2            3       100       5.000000
3            4       100       3.666667
4            5       100       5.000000
...        ...       ...            ...
26416     2201        99       3.222222
26417     8454        99       3.222222
26418    13022        99       1.000000
26419     2419        99       5.000000
26420       88        99       1.000000

[26421 rows x 3 columns]


In [1]:
def get_top_n_recommendations(algo, user_movie_rating_df, user_id, n=20):
    # 모든 영화 목록
    all_movie_ids = user_movie_rating_df['movie_id'].unique()
    
    # 사용자가 이미 본 영화 목록
    seen_movie_ids = user_movie_rating_df[user_movie_rating_df['user_id'] == user_id]['movie_id'].unique()
    
    # 사용자가 보지 않은 영화 목록
    unseen_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in seen_movie_ids]
    
    # 사용자가 보지 않은 영화들에 대한 예측 평점 계산
    predictions = [algo.predict(user_id, movie_id) for movie_id in unseen_movie_ids]
    
    # 예측 평점이 높은 순으로 정렬하여 상위 n개 영화 추천
    top_n_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:n]
    
    # 추천 영화 ID 리스트 추출
    top_n_movie_ids = [pred.iid for pred in top_n_predictions]
    
    return top_n_movie_ids

# # 사용자에게 영화 추천하기
# user_id = 1  # 추천을 받을 사용자의 ID
# num_recommendations = 20  # 추천할 영화의 수
# recommended_movie_ids = get_top_n_recommendations(algo, user_id, num_recommendations)
# print("사용자에게 추천할 영화 ID 리스트:", recommended_movie_ids)

SVD

In [4]:
from surprise import SVD, Dataset, Reader, accuracy
from collections import defaultdict

##### 일치하는 데이터셋

In [152]:
user_movie_rating_df = df_corredspond[['user_id', 'movie_id', 'rating']]

r_min = user_movie_rating_df['rating'].min()
r_max = user_movie_rating_df['rating'].max()
reader = Reader(rating_scale = (r_min, r_max))
data = Dataset.load_from_df(user_movie_rating_df[['user_id', 'movie_id', 'rating']], reader)

trainset = data.build_full_trainset()
testset = trainset.build_testset()

algo = SVD()

##trainset으로 SVD 학습
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2d96f4b9550>

In [153]:
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 2.1230


2.12296349734107

###### 그리드 탐색을 통한 최적의 파라미터 출력

In [154]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50, 75], 'lr_all': [0.5, 0.05], 'reg_all': [0.06, 0.04]} 
gs = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid) 
gs.fit(data) 
print('\n###################') 
print('Best Score :', gs.best_score['rmse']) 
print('Best Parameters :', gs.best_params['rmse']) 
print('#####################')


###################
Best Score : 3.3691135952181432
Best Parameters : {'n_factors': 75, 'lr_all': 0.05, 'reg_all': 0.04}
#####################


###### 최적의 파라미터로 최종 모델 생성

In [164]:
best_params = gs.best_params['rmse']

final_algo = SVD(n_factors = best_params['n_factors'], lr_all = best_params['lr_all'], reg_all = best_params['reg_all'])

# svd 학습
final_algo.fit(trainset)

# 최종 모델로 RMSE 측정
predictions = final_algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.0651


0.06514018212962794

In [181]:
predictions_df = pd.DataFrame(predictions)
a = predictions_df.r_ui.unique().tolist()
print(a)

[9.0, 7.0, 6.0, 8.0, 5.0, 10.0, 3.0, 4.0, 1.0, 2.0]


In [None]:
# 사용자가 평가하지 않은 영화 중에서 예측 평점이 가장 높은 영화를 추천하는 함수
def get_recommendations(predictions_df, user_id, top_n=20):
    # 해당 사용자의 예측 평점 데이터 추출
    user_predictions = predictions_df[predictions_df['uid'] == user_id]
    
    # 이미 평가한 영화를 제외한 예측 평점 데이터
    user_predictions = user_predictions[user_predictions['r_ui'] == 0]
    
    # 예측 평점이 높은 순서대로 정렬
    user_predictions = user_predictions.sort_values(by='est', ascending=False)
    
    # 상위 N개의 추천 영화 반환
    return user_predictions.head(top_n)

# 예시: 사용자 '1'에게 추천할 상위 5개의 영화 출력
user_id = '1'  # 사용자 ID
recommended_movies = get_recommendations(predictions_df, user_id, top_n=5)

print(f"Recommended movies for user {user_id}:")
print(recommended_movies)

In [157]:
user_movie_rating_match = df_match[['user_id', 'movie_id', 'rating']]

r_min_match = user_movie_rating_match['rating'].min()
r_max_match = user_movie_rating_match['rating'].max()
reader = Reader(rating_scale = (r_min_match, r_max_match))
data_match = Dataset.load_from_df(user_movie_rating_match[['user_id', 'movie_id', 'rating']], reader)

trainset_match = data.build_full_trainset()
testset_match = trainset.build_testset()

algo = SVD()

##trainset으로 SVD 학습
algo.fit(trainset_match)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2d969b0b710>

In [158]:
predictions_match = algo.test(testset_match)
accuracy.rmse(predictions_match)

RMSE: 2.1258


2.125821213550905

In [159]:
from surprise.model_selection import GridSearchCV, cross_validate

param_grid = {'n_factors': [50, 75], 'lr_all': [0.5, 0.05], 'reg_all': [0.06, 0.04]} 
gs = GridSearchCV(algo_class=SVD, measures=['RMSE'], param_grid=param_grid) 
gs.fit(data) 
print('\n###################') 
print('Best Score :', gs.best_score['rmse']) 
print('Best Parameters :', gs.best_params['rmse']) 
print('#####################')


###################
Best Score : 3.3719792179671644
Best Parameters : {'n_factors': 50, 'lr_all': 0.05, 'reg_all': 0.04}
#####################


In [161]:
best_params = gs.best_params['rmse']

final_algo = SVD(n_factors = best_params['n_factors'], lr_all = best_params['lr_all'], reg_all = best_params['reg_all'])

# svd 학습
final_algo.fit(trainset_match)

# 최종 모델로 RMSE 측정
predictions_match = final_algo.test(testset_match)
accuracy.rmse(predictions_match)

RMSE: 0.0650


0.06502611469043694

In [162]:
pd.DataFrame(predictions_match)

Unnamed: 0,uid,iid,r_ui,est,details
0,4619,35839,9.0,8.937129,{'was_impossible': False}
1,1576,10874,7.0,6.936087,{'was_impossible': False}
2,1576,27051,7.0,6.943775,{'was_impossible': False}
3,1576,66046,6.0,6.006797,{'was_impossible': False}
4,1576,17806,8.0,7.891670,{'was_impossible': False}
...,...,...,...,...,...
14995,30187,46415,1.0,1.058096,{'was_impossible': False}
14996,18798,23093,10.0,9.913468,{'was_impossible': False}
14997,3077,10201,10.0,9.936756,{'was_impossible': False}
14998,10632,101960,1.0,1.066922,{'was_impossible': False}


KNNWithMeans

In [None]:
# from surprise import KNNWithMeans

# user_movie_rating = df[['user_id', 'movie_id', 'rating']]
# user_movie_rating_match = df_match[['user_id', 'movie_id', 'rating']]

# from sklearn.model_selection import train_test_split

# train_df, test_df = train_test_split(user_movie_rating, test_size=0.2)
# train_df_match, test_df_match = train_test_split(user_movie_rating_match, test_size=0.2)

# sim_options = {
#   "name" : "pearson" ,
#   "user_based" : True 
# }
# algo = KNNWithMeans(sim_options= sim_options)

# from surprise import Dataset, Reader
# from surprise import accuracy
# from surprise.model_selection import train_test_split

# # Rating의 범위를 지정하여 데이터 리더를 생성합니다.
# reader = Reader(rating_scale=(1, 10))

# trainset = Dataset.load_from_df(train_df[['user_id', 'movie_id', 'rating']], reader).build_full_trainset()
# testset = Dataset.load_from_df(test_df[['user_id', 'movie_id', 'rating']], reader).build_full_trainset().build_testset()

# trainset_match = Dataset.load_from_df(train_df_match[['user_id', 'movie_id', 'rating']], reader).build_full_trainset()
# testset_match = Dataset.load_from_df(test_df_match[['user_id', 'movie_id', 'rating']], reader).build_full_trainset().build_testset()

# # 전체 데이터에 대한 예측 및 평가
# algo.fit(trainset)
# predictions_full = algo.test(testset)
# accuracy.rmse(predictions_full) #3.8395

# # 일치 데이터에 대한 예측 및 평가
# algo.fit(trainset_match)
# predictions_match = algo.test(testset_match)
# accuracy.rmse(predictions_match) #3.9464
