In [21]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from math import sqrt
from surprise import Dataset, Reader 
from surprise.model_selection import train_test_split


In [45]:
# 데이터 로드 및 전처리
df = pd.read_csv('../df.csv')
df.reset_index(inplace=True, drop=True)
df = df[['user_id', 'movie_id', 'rating']]
df = df[:10000]

# 평점 범위를 1~10으로 설정
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df, reader)

# 학습 및 테스트 데이터셋 분리
trainset, testset = train_test_split(data, test_size=0.2, random_state=10)

SVD

In [44]:
from surprise import SVD, Dataset,  accuracy, Reader
from surprise.model_selection import cross_validate, GridSearchCV

In [65]:
# #연습
# algo = SVD( n_factors= 10, random_state= 42)
# algo.fit(trainset)

# prediction = algo.test(testset)
# accuracy.rmse(prediction)
# print('prediction type :', type(prediction),
#       'size :', len(prediction))
# print()
# print('predictino 결과값 5개 미리보기')
# print(prediction[:5])

# result = [(pred.uid, pred.iid, pred.est) for pred in prediction[ :5]]
# print(result)

# cross_validate(algo, data, measures= ['RMSE', 'MAE'], cv=5, verbose= True)

In [34]:
param_grid = {'n_epochs' : [20, 30, 40], 'n_factors' : [10, 20, 30, 40, 50, 100, 200]}

grid = GridSearchCV(SVD, param_grid = param_grid, measures= ['rmse', 'mse'], cv= 3)

grid.fit(data)

print(grid.best_score['rmse'])
print(grid.best_params['rmse'])


2.2658101884260673
{'n_epochs': 30, 'n_factors': 10}


In [35]:
param_grid = {'n_epochs' : [20, 30, 40], 'n_factors' : [1, 2, 3, 4, 5, 10]}

grid = GridSearchCV(SVD, param_grid = param_grid, measures= ['rmse', 'mse'], cv= 3)

grid.fit(data)

print(grid.best_score['rmse'])
print(grid.best_params['rmse'])

2.250357091930385
{'n_epochs': 30, 'n_factors': 1}


In [62]:
algo = SVD(n_factors = 1, n_epochs = 30, random_state= 42)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19ea93baf10>

In [70]:
user_id = 9

# 특정 사용자가 평가한 영화 정보
user_movies = trainset.ur[user_id]
print(user_movies)

[(8, 10.0), (115, 10.0), (68, 10.0)]


In [72]:
for id, rating in user_movies :
  print(id)
  print(rating)

8
10.0
115
10.0
68
10.0


SVD

In [55]:
# 사용자가 본 영화 ID 목록
movie_ids = [movie_id for (movie_id, _) in user_movies]

# 사용자가 본 영화 ID 목록 출력
print(movie_ids)

if 115 not in movie_ids :
    print('user_id 가 9인 사람은 movie_id = 42 에 대한 평점이 없음')

[8, 115, 68]


In [58]:
def get_unseen_surprise(user_id) :
  #특정 유저가 본 movie_id들을 리스트로 할당
  user_movies = trainset.ur[user_id]
  seen_movie_ids = [movie_id for (movie_id, _) in user_movies]
  all_movie_ids = set()
  for user_id, movie_rating_list in trainset.ur.items() :
    movie_ids = [movie_id for (movie_id, _) in movie_rating_list]
    all_movie_ids.update(movie_ids)
    
    #모든 영화들 중 유저가 본 movie_id를 제외한 나머지
    unseen_movie_ids = [movie for movie in all_movie_ids if movie not in seen_movie_ids]
    return unseen_movie_ids

In [None]:
def recommend_movie_by_surprise(algo, user_id, unseen_movie_ids, top_n = 20) :
  prediction = [algo.predict(str(user_id), str(movie_id)) for movie_id in unseen_movie_ids]

  def sortkey_est(pred) :
    return pred.est
  
  prediction.sort(key = sortkey_est, reverse = True)
  top_predictions = prediction[:top_n]

  top_movie_ids = [int(pred.iid) for pred in top_predictions]
  top_movie_ratings = [pred.est for pred in top_predictions]

  top_movie_preds = [(ids, rating) for ids, rating in zip(top_movie_ids, top_movie_ratings)]

  return top_movie_preds


In [76]:
unseen_list_4 = get_unseen_surprise(4)
unseen_list_4

top_movies_preds = recommend_movie_by_surprise(algo, 4, unseen_list_4, top_n = 20)
top_movies_preds

[(43, 8.1615)]

In [77]:
for top_movie in top_movies_preds :
  print('* 추천 영화 이름 :', top_movie[0])
  print('* 해당 영화의 예측 평점 : ', top_movie[1])

* 추천 영화 이름 : 43
* 해당 영화의 예측 평점 :  8.1615


클러스터링

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

# 데이터 로드 및 전처리
df = pd.read_csv('../df.csv')
df = df[['user_id', 'movie_id', 'rating']]
df = df[:20000]

# 사용자-아이템 평점 행렬 생성
ratings_matrix = df.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# 데이터 스케일링
scaler = StandardScaler()
scaled_ratings_matrix = scaler.fit_transform(ratings_matrix.values)

# K-Means 클러스터링
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
user_clusters = kmeans.fit_predict(scaled_ratings_matrix)

# RMSE 계산 함수
def calculate_rmse(original_matrix, predicted_matrix):
    mask = original_matrix != 0
    return sqrt(mean_squared_error(original_matrix[mask], predicted_matrix[mask]))

# 클러스터 평균을 사용한 예상 평점 행렬 계산
cluster_means = np.zeros_like(ratings_matrix.values)
for cluster in range(num_clusters):
    cluster_indices = np.where(user_clusters == cluster)[0]
    cluster_ratings = ratings_matrix.values[cluster_indices]
    cluster_mean = cluster_ratings.mean(axis=0)
    for i in cluster_indices:
        cluster_means[i] = cluster_mean

# RMSE 계산
rmse_kmeans = calculate_rmse(ratings_matrix.values, cluster_means)
print(f"K-Means 기반 클러스터링 RMSE: {rmse_kmeans}")

# 추천 함수
def recommend_movies_kmeans(user_id, user_clusters, ratings_matrix, n=10):
    user_index = ratings_matrix.index.get_loc(user_id)
    user_cluster = user_clusters[user_index]
    
    cluster_indices = np.where(user_clusters == user_cluster)[0]
    cluster_ratings = ratings_matrix.values[cluster_indices]
    mean_ratings = cluster_ratings.mean(axis=0)
    
    user_rated_movies = ratings_matrix.loc[user_id][ratings_matrix.loc[user_id] > 0].index
    movie_scores = [(ratings_matrix.columns[i], score) for i, score in enumerate(mean_ratings) if ratings_matrix.columns[i] not in user_rated_movies]
    movie_scores.sort(key=lambda x: x[1], reverse=True)
    
    return movie_scores[:n]

# 특정 사용자에 대한 추천 영화 출력
user_id = 1  # 예시로 사용자 ID 1
recommended_movies_kmeans = recommend_movies_kmeans(user_id, user_clusters, ratings_matrix, n=10)
print("Clustering (K-Means) 추천된 영화 목록:")
for movie_id, score in recommended_movies_kmeans:
    print(f"영화 ID: {movie_id}, 점수: {score}")

하이브리드 light-FM

In [None]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Movielens 데이터셋 로드
data = fetch_movielens(min_rating=4.0)

# LightFM 모델 생성 (WARP 손실 함수 사용)
model = LightFM(loss='warp')

# 모델 학습
model.fit(data['train'], epochs=30, num_threads=2)

# 모델 평가
train_precision = precision_at_k(model, data['train'], k=5).mean()
test_precision = precision_at_k(model, data['test'], k=5).mean()

print('Train precision: %.2f' % train_precision)
print('Test precision: %.2f' % test_precision)