전처리 dataset_la

In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, KNNBasic, SVD, accuracy
from surprise.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# -------------------- 1. 평가 지표 함수 정의 --------------------
def calculate_metrics(true_ratings, predicted_ratings):
    """Return the (RMSE, MAE) pair for a set of rating predictions.

    Args:
        true_ratings: Sequence of observed ratings.
        predicted_ratings: Sequence of predicted ratings, aligned
            element-wise with ``true_ratings``.

    Returns:
        A ``(rmse, mae)`` tuple.
    """
    actual = np.asarray(true_ratings)
    estimated = np.asarray(predicted_ratings)

    # RMSE is obtained as the square root of the mean squared error.
    mse = mean_squared_error(actual, estimated)
    return np.sqrt(mse), mean_absolute_error(actual, estimated)

# -------------------- 2. 데이터 로딩 및 Surprise용 데이터셋 준비 --------------------
# Read the review table from the parquet file.
df = pd.read_parquet('review_data_optimized_fl.parquet')

# Keep only the (user, item, rating) columns, in the column order that
# Dataset.load_from_df is given below.
rating_columns = ['user_id', 'business_id', 'review_stars']
df_processed = df.loc[:, rating_columns].copy()

n_reviews = len(df_processed)
print(f"✅ 데이터 로딩 완료. 전체 데이터 수: {n_reviews}")

# Wrap the frame in a Surprise Dataset; ratings are declared on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_processed, reader)

print("✅ Surprise Dataset 객체 생성 완료.")

✅ 데이터 로딩 완료. 전체 데이터 수: 500892
✅ Surprise Dataset 객체 생성 완료.


UBCF

In [2]:
# --- 2. UBCF (User-Based Collaborative Filtering) model ---
#
# Repeats a random 70/10/20 train/validation/test split several times, fits a
# user-based KNNBasic model (cosine similarity) on the training part and
# reports RMSE / MAE on the test part, then the mean and standard deviation
# over all repetitions.

# The neighbourhood size k is fixed at 100 (no tuning happens in this cell).
best_k_ubcf = 100
print(f"✅ k 값을 {best_k_ubcf}로 고정하여 UBCF 평가를 진행합니다.")

# Number of repetitions and the seed used for the first repetition.
n_iterations = 5
start_random_state = 42

# Per-repetition test scores.
test_rmse_scores_ubcf = []
test_mae_scores_ubcf = []

# Loop invariants: the column order handed to Surprise and the rating scale.
rating_columns = ['user_id', 'business_id', 'review_stars']
reader = Reader(rating_scale=(1, 5))

print(f"\n--- UBCF 모델 {n_iterations}회 반복 학습 및 평가 시작 ---")

for i in range(n_iterations):
    current_random_state = start_random_state + i
    print(f"\n[{i+1}/{n_iterations}] Iteration with random_state = {current_random_state}")

    # Fresh split for every repetition: 20% test, then 1/8 of the remaining
    # 80% (i.e. 10% of the total) as validation.
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=current_random_state)
    val_size_ratio = 1/8
    # val_df is not used in this cell; the split is kept so that the training
    # set stays at 70% of the data.
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=current_random_state)

    # Convert the training frame into a Surprise trainset.
    train_set_for_surprise = train_df[rating_columns]
    data_for_train = Dataset.load_from_df(train_set_for_surprise, reader)
    train_set = data_for_train.build_full_trainset()
    # Plain (user, item, rating) tuples for Surprise's test(); itertuples
    # avoids building one Series per row as iterrows does.
    test_set = list(test_df[rating_columns].itertuples(index=False, name=None))

    # Fit and evaluate the user-based model with the fixed k.
    ubcf_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True}, k=best_k_ubcf)
    ubcf_model.fit(train_set)
    predictions_ubcf = ubcf_model.test(test_set)

    # Compute the metrics from the true (r_ui) and estimated (est) ratings.
    true_ratings_ubcf = [pred.r_ui for pred in predictions_ubcf]
    predicted_ratings_ubcf = [pred.est for pred in predictions_ubcf]
    rmse, mae = calculate_metrics(true_ratings_ubcf, predicted_ratings_ubcf)

    test_rmse_scores_ubcf.append(rmse)
    test_mae_scores_ubcf.append(mae)

    print(f"  > Test Set Performance (k={best_k_ubcf}): RMSE = {rmse:.4f}, MAE = {mae:.4f}")

# Mean and (population, ddof=0) standard deviation over the repetitions.
mean_rmse_ubcf = np.mean(test_rmse_scores_ubcf)
std_rmse_ubcf = np.std(test_rmse_scores_ubcf)
mean_mae_ubcf = np.mean(test_mae_scores_ubcf)
std_mae_ubcf = np.std(test_mae_scores_ubcf)

# Final summary.
print("\n" + "="*50)
print(f"--- 최종 UBCF 모델 성능 ({n_iterations}회 반복) ---")
print(f"평균 RMSE: {mean_rmse_ubcf:.4f} (표준편차: {std_rmse_ubcf:.4f})")
print(f"평균 MAE: {mean_mae_ubcf:.4f} (표준편차: {std_mae_ubcf:.4f})")
print("="*50)

✅ k 값을 100로 고정하여 UBCF 평가를 진행합니다.

--- UBCF 모델 5회 반복 학습 및 평가 시작 ---

[1/5] Iteration with random_state = 42
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2693, MAE = 0.9929

[2/5] Iteration with random_state = 43
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2640, MAE = 0.9907

[3/5] Iteration with random_state = 44
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2658, MAE = 0.9895

[4/5] Iteration with random_state = 45
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2666, MAE = 0.9908

[5/5] Iteration with random_state = 46
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2692, MAE = 0.9923

--- 최종 UBCF 모델 성능 (5회 반복) ---
평균 RMSE: 1.2

IBCF

In [3]:
# --- 3. IBCF (Item-Based Collaborative Filtering) model ---
#
# Repeats a random 70/10/20 train/validation/test split several times, fits an
# item-based KNNBasic model (cosine similarity) on the training part and
# reports RMSE / MAE on the test part, then the mean and standard deviation
# over all repetitions.

# The neighbourhood size k is fixed at 100 (no tuning happens in this cell).
best_k_ibcf = 100
print(f"✅ k 값을 {best_k_ibcf}로 고정하여 IBCF 평가를 진행합니다.")

# Number of repetitions and the seed used for the first repetition.
n_iterations = 5
start_random_state = 42

# Per-repetition test scores.
test_rmse_scores_ibcf = []
test_mae_scores_ibcf = []

# Loop invariants: the column order handed to Surprise and the rating scale.
rating_columns = ['user_id', 'business_id', 'review_stars']
reader = Reader(rating_scale=(1, 5))

print(f"\n--- IBCF 모델 {n_iterations}회 반복 학습 및 평가 시작 ---")

for i in range(n_iterations):
    current_random_state = start_random_state + i
    print(f"\n[{i+1}/{n_iterations}] Iteration with random_state = {current_random_state}")

    # Fresh split for every repetition: 20% test, then 1/8 of the remaining
    # 80% (i.e. 10% of the total) as validation.
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=current_random_state)
    val_size_ratio = 1/8
    # val_df is not used in this cell; the split is kept so that the training
    # set stays at 70% of the data.
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=current_random_state)

    # Convert the training frame into a Surprise trainset.
    train_set_for_surprise = train_df[rating_columns]
    data_for_train = Dataset.load_from_df(train_set_for_surprise, reader)
    train_set = data_for_train.build_full_trainset()
    # Plain (user, item, rating) tuples for Surprise's test(); itertuples
    # avoids building one Series per row as iterrows does.
    test_set = list(test_df[rating_columns].itertuples(index=False, name=None))

    # Fit and evaluate the item-based model with the fixed k.
    ibcf_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}, k=best_k_ibcf)
    ibcf_model.fit(train_set)
    predictions_ibcf = ibcf_model.test(test_set)

    # Compute the metrics from the true (r_ui) and estimated (est) ratings.
    true_ratings_ibcf = [pred.r_ui for pred in predictions_ibcf]
    predicted_ratings_ibcf = [pred.est for pred in predictions_ibcf]
    rmse, mae = calculate_metrics(true_ratings_ibcf, predicted_ratings_ibcf)

    test_rmse_scores_ibcf.append(rmse)
    test_mae_scores_ibcf.append(mae)

    print(f"  > Test Set Performance (k={best_k_ibcf}): RMSE = {rmse:.4f}, MAE = {mae:.4f}")

# Mean and (population, ddof=0) standard deviation over the repetitions.
mean_rmse_ibcf = np.mean(test_rmse_scores_ibcf)
std_rmse_ibcf = np.std(test_rmse_scores_ibcf)
mean_mae_ibcf = np.mean(test_mae_scores_ibcf)
std_mae_ibcf = np.std(test_mae_scores_ibcf)

# Final summary.
print("\n" + "="*50)
print(f"--- 최종 IBCF 모델 성능 ({n_iterations}회 반복) ---")
print(f"평균 RMSE: {mean_rmse_ibcf:.4f} (표준편차: {std_rmse_ibcf:.4f})")
print(f"평균 MAE: {mean_mae_ibcf:.4f} (표준편차: {std_mae_ibcf:.4f})")
print("="*50)

✅ k 값을 100로 고정하여 IBCF 평가를 진행합니다.

--- IBCF 모델 5회 반복 학습 및 평가 시작 ---

[1/5] Iteration with random_state = 42
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2936, MAE = 0.9651

[2/5] Iteration with random_state = 43
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2957, MAE = 0.9671

[3/5] Iteration with random_state = 44
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2930, MAE = 0.9631

[4/5] Iteration with random_state = 45
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2954, MAE = 0.9660

[5/5] Iteration with random_state = 46
Computing the cosine similarity matrix...
Done computing similarity matrix.
  > Test Set Performance (k=100): RMSE = 1.2979, MAE = 0.9685

--- 최종 IBCF 모델 성능 (5회 반복) ---
평균 RMSE: 1.2

SVD

In [4]:
# --- 4. SVD (Singular Value Decomposition) model ---
#
# Stage 1 picks SVD hyper-parameters with a 3-fold grid search; stage 2 fits
# the selected configuration on repeated random 70/10/20 splits and reports
# test RMSE / MAE plus their mean and standard deviation.

# Number of repetitions and base seed. They are defined up front because the
# seed is also used to make the grid search reproducible.
n_iterations = 5
start_random_state = 42

# 1. Find the best hyper-parameters with GridSearchCV.
print("--- [1단계] SVD 하이퍼파라미터 튜닝 시작 ---")
param_grid_svd = {
    'n_factors': [1, 30, 50, 100],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.007],
    'reg_all': [0.02, 0.05],
    # A single fixed seed: SVD initialises its factors randomly, so without
    # this the ranking of parameter combinations can change from run to run.
    'random_state': [start_random_state],
}

reader = Reader(rating_scale=(1, 5))
full_data_for_tuning = Dataset.load_from_df(df_processed, reader)
# NOTE(review): tuning runs on the full dataset, which includes the rows that
# stage 2 later holds out as test data, so the reported test scores are
# probably slightly optimistic. Consider tuning on the train/validation part
# only (val_df below is currently unused).
# Seeded folds so the cross-validation split is the same on every run.
cv_folds = KFold(n_splits=3, random_state=start_random_state, shuffle=True)
gs = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=cv_folds, n_jobs=-1)
gs.fit(full_data_for_tuning)

# Drop the seed entry so best_params_svd holds only the tuned parameters.
best_params_svd = {
    name: value
    for name, value in gs.best_params['rmse'].items()
    if name != 'random_state'
}
print(f"\n✅ 최적 SVD 파라미터: {best_params_svd}")
print(f"최적 파라미터의 교차 검증 RMSE: {gs.best_score['rmse']:.4f}")

# 2. Final evaluation: repeat the split/fit/test cycle with the best parameters.
print(f"\n--- [2단계] 최적 SVD 모델 {n_iterations}회 반복 학습 및 평가 시작 ---")

# Per-repetition test scores.
test_rmse_scores_svd = []
test_mae_scores_svd = []

# Column order handed to Surprise: user, item, rating.
rating_columns = ['user_id', 'business_id', 'review_stars']

for i in range(n_iterations):
    current_random_state = start_random_state + i
    print(f"\n[{i+1}/{n_iterations}] Iteration with random_state = {current_random_state}")

    # Fresh split for every repetition: 20% test, then 1/8 of the remaining
    # 80% (i.e. 10% of the total) as validation.
    train_val_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=current_random_state)
    val_size_ratio = 1/8
    # val_df is not used in this cell; the split is kept so that the training
    # set stays at 70% of the data.
    train_df, val_df = train_test_split(train_val_df, test_size=val_size_ratio, random_state=current_random_state)

    # Convert the training frame into a Surprise trainset.
    train_set_for_surprise = train_df[rating_columns]
    data_for_train = Dataset.load_from_df(train_set_for_surprise, reader)
    train_set = data_for_train.build_full_trainset()
    # Plain (user, item, rating) tuples for Surprise's test(); itertuples
    # avoids building one Series per row as iterrows does.
    test_set = list(test_df[rating_columns].itertuples(index=False, name=None))

    # Seed the model with the repetition's seed so that each repetition is
    # reproducible, matching the seeded data split.
    svd_model = SVD(n_factors=best_params_svd['n_factors'],
                    n_epochs=best_params_svd['n_epochs'],
                    lr_all=best_params_svd['lr_all'],
                    reg_all=best_params_svd['reg_all'],
                    random_state=current_random_state)
    svd_model.fit(train_set)
    predictions_svd = svd_model.test(test_set)

    # Compute the metrics from the true (r_ui) and estimated (est) ratings.
    true_ratings_svd = [pred.r_ui for pred in predictions_svd]
    predicted_ratings_svd = [pred.est for pred in predictions_svd]
    rmse, mae = calculate_metrics(true_ratings_svd, predicted_ratings_svd)

    test_rmse_scores_svd.append(rmse)
    test_mae_scores_svd.append(mae)

    print(f"  > Test Set Performance: RMSE = {rmse:.4f}, MAE = {mae:.4f}")

# Mean and (population, ddof=0) standard deviation over the repetitions.
mean_rmse_svd = np.mean(test_rmse_scores_svd)
std_rmse_svd = np.std(test_rmse_scores_svd)
mean_mae_svd = np.mean(test_mae_scores_svd)
std_mae_svd = np.std(test_mae_scores_svd)

# Final summary.
print("\n" + "="*50)
print(f"--- 최종 SVD 모델 성능 ({n_iterations}회 반복) ---")
print(f"평균 RMSE: {mean_rmse_svd:.4f} (표준편차: {std_rmse_svd:.4f})")
print(f"평균 MAE: {mean_mae_svd:.4f} (표준편차: {std_mae_svd:.4f})")
print("="*50)

--- [1단계] SVD 하이퍼파라미터 튜닝 시작 ---

✅ 최적 SVD 파라미터: {'n_factors': 1, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.05}
최적 파라미터의 교차 검증 RMSE: 1.1414

--- [2단계] 최적 SVD 모델 5회 반복 학습 및 평가 시작 ---

[1/5] Iteration with random_state = 42
  > Test Set Performance: RMSE = 1.1374, MAE = 0.8889

[2/5] Iteration with random_state = 43
  > Test Set Performance: RMSE = 1.1356, MAE = 0.8884

[3/5] Iteration with random_state = 44
  > Test Set Performance: RMSE = 1.1389, MAE = 0.8892

[4/5] Iteration with random_state = 45
  > Test Set Performance: RMSE = 1.1367, MAE = 0.8883

[5/5] Iteration with random_state = 46
  > Test Set Performance: RMSE = 1.1401, MAE = 0.8911

--- 최종 SVD 모델 성능 (5회 반복) ---
평균 RMSE: 1.1377 (표준편차: 0.0016)
평균 MAE: 0.8892 (표준편차: 0.0010)
