In [1]:
import gc
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

2025-08-30 20:37:59.323508: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-30 20:37:59.683772: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-30 20:38:01.239659: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# Dataset variants to process. Each entry is used as a file-name suffix: it
# selects the input "fl_split5_<entry>.jsonl" and tags the saved model
# checkpoints and the performance CSV written by the training loop.
TASK = ["RETRIEVAL_QUERY", "CLASSIFICATION"]

In [None]:
# The single-task training code is wrapped in `for task in TASK` so that the
# same pipeline (load -> split -> train -> evaluate -> repeat -> summarize) is
# executed once per dataset variant.

# ---------------------------------------------------------------------------
# Hyperparameters and paths (identical for every task, so defined only once)
# ---------------------------------------------------------------------------

# Size of the learned embedding vector for each user_id / business_id.
user_business_embedding_dim = 64

# Hidden-layer widths of the MLP applied to the concatenated user/business vectors.
user_biz_mlp_dims = [128, 64]

# Dimensionality of the precomputed review-text embedding stored in the dataset.
gemini_embedding_dim = 3072

# Hidden-layer widths of the MLP applied to the review-text embedding.
review_mlp_dims = [1536, 768, 512]

# Hidden-layer widths of the final MLP that predicts the rating.
final_mlp_dims = [32, 16]

learning_rate = 0.001
batch_size = 128
epochs = 50

# Validation share of the train+validation part: 1/8 of 80% = 10% of all rows,
# which yields the overall 7:1:2 train/validation/test split.
val_size_ratio = 1 / 8

# Number of repeated trainings used to measure run-to-run variance.
num_runs = 5

# Base seed for model initialisation / batch shuffling. The main training uses
# base_seed and repeated run i uses base_seed + i, so the runs differ from each
# other but the whole experiment can be repeated.
base_seed = 42

# Directory that receives one performance CSV per task.
performance_dir = "../performance"


def build_rating_model(
    num_users,
    num_businesses,
    embedding_dim=user_business_embedding_dim,
    interaction_dims=tuple(user_biz_mlp_dims),
    text_dim=gemini_embedding_dim,
    review_dims=tuple(review_mlp_dims),
    head_dims=tuple(final_mlp_dims),
    lr=learning_rate,
):
    """Build and compile the three-input rating regression model.

    The model has two branches: (1) user and business IDs are embedded,
    concatenated and passed through an MLP; (2) the review-text embedding is
    passed through its own MLP. Both branch outputs are concatenated and fed to
    a final MLP ending in a single linear unit.

    Args:
        num_users: Number of distinct encoded user IDs (embedding table rows).
        num_businesses: Number of distinct encoded business IDs.
        embedding_dim: Width of the user / business embedding vectors.
        interaction_dims: Hidden widths of the user-business MLP.
        text_dim: Width of the review-text embedding input.
        review_dims: Hidden widths of the review-text MLP.
        head_dims: Hidden widths of the final prediction MLP.
        lr: Adam learning rate.

    Returns:
        A compiled Keras model with inputs named "user_id_input",
        "business_id_input" and "gemini_embedding_input", MSE loss, and the
        metrics "rmse" and "mae".
    """
    user_input = keras.Input(shape=(1,), name="user_id_input")
    business_input = keras.Input(shape=(1,), name="business_id_input")
    gemini_input = keras.Input(shape=(text_dim,), name="gemini_embedding_input")

    # Branch 1: ID embeddings -> concatenate -> MLP.
    user_vec = layers.Flatten()(
        layers.Embedding(num_users, embedding_dim, name="user_embedding")(user_input)
    )
    business_vec = layers.Flatten()(
        layers.Embedding(num_businesses, embedding_dim, name="business_embedding")(
            business_input
        )
    )
    interaction_features = layers.concatenate([user_vec, business_vec])
    for dim in interaction_dims:
        interaction_features = layers.Dense(dim, activation="relu")(
            interaction_features
        )

    # Branch 2: review-text embedding -> MLP.
    review_features = gemini_input
    for dim in review_dims:
        review_features = layers.Dense(dim, activation="relu")(review_features)

    # Merge both branches and regress a single rating value.
    predicted_rating = layers.concatenate([interaction_features, review_features])
    for dim in head_dims:
        predicted_rating = layers.Dense(dim, activation="relu")(predicted_rating)
    output_rating = layers.Dense(1, activation="linear", name="output_rating")(
        predicted_rating
    )

    model = models.Model(
        inputs=[user_input, business_input, gemini_input], outputs=output_rating
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mse",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"],
    )
    return model


def make_training_callbacks(checkpoint_path, verbose=0):
    """Return the early-stopping and best-checkpoint callbacks for one training.

    Both callbacks monitor "val_rmse" (lower is better). Early stopping waits 10
    epochs for an improvement of at least 0.0005; the checkpoint callback writes
    only the best model seen so far to `checkpoint_path`.
    """
    return [
        callbacks.EarlyStopping(
            monitor="val_rmse",
            patience=10,
            min_delta=0.0005,
            mode="min",
            restore_best_weights=True,
        ),
        callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor="val_rmse",
            save_best_only=True,
            mode="min",
            verbose=verbose,
        ),
    ]


def make_model_inputs(frame, embeddings):
    """Map one data split onto the model's named inputs.

    Args:
        frame: DataFrame with "user_encoded" and "business_encoded" columns.
        embeddings: Array of review-text embeddings, row-aligned with `frame`.
    """
    return {
        "user_id_input": frame["user_encoded"],
        "business_id_input": frame["business_encoded"],
        "gemini_embedding_input": embeddings,
    }


def train_best_model(
    num_users,
    num_businesses,
    train_data,
    val_data,
    checkpoint_path,
    seed,
    verbose=0,
):
    """Train a fresh model and return the best checkpoint reloaded from disk.

    Args:
        num_users: Number of distinct encoded user IDs.
        num_businesses: Number of distinct encoded business IDs.
        train_data: Tuple `(inputs, targets)` used for fitting.
        val_data: Tuple `(inputs, targets)` used for validation.
        checkpoint_path: File that receives the best model (by val_rmse).
        seed: Seed passed to `keras.utils.set_random_seed` before the model is
            built, so initial weights and shuffling are repeatable for this run.
        verbose: Verbosity for both `fit` and the checkpoint callback.

    Returns:
        Tuple `(best_model, history)` where `best_model` is loaded from
        `checkpoint_path` and `history` is the object returned by `fit`.
    """
    keras.utils.set_random_seed(seed)
    model = build_rating_model(num_users, num_businesses)
    train_inputs, train_targets = train_data
    history = model.fit(
        train_inputs,
        train_targets,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=val_data,
        callbacks=make_training_callbacks(checkpoint_path, verbose=verbose),
        verbose=verbose,
    )
    return keras.models.load_model(checkpoint_path), history


def regression_metrics(y_true, y_pred):
    """Return `(mse, rmse, mae, mape)` for the given targets and predictions.

    MAPE is returned in percent (the sklearn fraction multiplied by 100).
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return mse, rmse, mae, mape


# Create the output directory up front so a missing folder cannot discard the
# results after all trainings have already finished.
os.makedirs(performance_dir, exist_ok=True)

for task in TASK:
    # ------------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------------
    input_file = "../Dataset/states/fl_split5_" + task + ".jsonl"
    print(input_file)

    df_processed = pd.read_json(
        input_file,
        lines=True,
    )

    df_processed.info()

    print(f"전체 데이터셋 크기: {len(df_processed)}")

    # ------------------------------------------------------------------
    # Encode user / business IDs as contiguous integers for the embeddings
    # ------------------------------------------------------------------
    user_encoder = LabelEncoder()
    business_encoder = LabelEncoder()

    df_processed["user_encoded"] = user_encoder.fit_transform(
        df_processed["user_id"]
    )
    df_processed["business_encoded"] = business_encoder.fit_transform(
        df_processed["business_id"]
    )

    # Number of distinct users / businesses = number of embedding table rows.
    num_users = len(user_encoder.classes_)
    num_businesses = len(business_encoder.classes_)

    print(num_users)
    print(num_businesses)

    # ------------------------------------------------------------------
    # 7:1:2 split: first (train+val)/test, then train/val
    # ------------------------------------------------------------------
    train_val_df, test_df = train_test_split(
        df_processed, test_size=0.2, random_state=42
    )
    train_df, val_df = train_test_split(
        train_val_df, test_size=val_size_ratio, random_state=42
    )

    print(f"전체 데이터 수: {len(df_processed)}")
    print(
        f"학습 데이터 수: {len(train_df)} ({len(train_df)/len(df_processed)*100:.2f}%)"
    )
    print(f"검증 데이터 수: {len(val_df)} ({len(val_df)/len(df_processed)*100:.2f}%)")
    print(
        f"테스트 데이터 수: {len(test_df)} ({len(test_df)/len(df_processed)*100:.2f}%)"
    )

    # Stack the per-row embedding lists into dense float32 matrices.
    train_embeddings = np.array(train_df["embedding"].tolist(), dtype=np.float32)
    val_embeddings = np.array(val_df["embedding"].tolist(), dtype=np.float32)
    test_embeddings = np.array(test_df["embedding"].tolist(), dtype=np.float32)

    print(f"학습 임베딩 데이터 형태: {train_embeddings.shape}")
    print(f"검증 임베딩 데이터 형태: {val_embeddings.shape}")
    print(f"테스트 임베딩 데이터 형태: {test_embeddings.shape}")

    print(f"데이터 type: {train_embeddings.dtype}")

    # Fail early with a clear message instead of a shape error inside Keras.
    if train_embeddings.ndim != 2 or train_embeddings.shape[1] != gemini_embedding_dim:
        raise ValueError(
            f"Expected embeddings of shape (n, {gemini_embedding_dim}) in "
            f"{input_file}, got {train_embeddings.shape}"
        )

    train_data = (make_model_inputs(train_df, train_embeddings), train_df["review_stars"])
    val_data = (make_model_inputs(val_df, val_embeddings), val_df["review_stars"])
    test_inputs = make_model_inputs(test_df, test_embeddings)

    # Ground-truth test ratings as a NumPy array.
    true_ratings = test_df["review_stars"].values

    # ------------------------------------------------------------------
    # Main training run
    # ------------------------------------------------------------------
    final_model_base_path = f"final_best_gemini_model_{task}"
    final_model_path = f"{final_model_base_path}_main.keras"

    print(f"\n==== [{task}] 버전 학습 시작")

    final_model, history = train_best_model(
        num_users,
        num_businesses,
        train_data,
        val_data,
        checkpoint_path=final_model_path,
        seed=base_seed,
        verbose=1,
    )

    test_predictions = final_model.predict(test_inputs).flatten()

    mse, rmse, mae, mape = regression_metrics(true_ratings, test_predictions)

    print(f"{task}버전 모델 성능 평가")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MAPE: {mape:.2f}%")

    # ------------------------------------------------------------------
    # Repeat the training num_runs times to check that the standard
    # deviation of every metric stays below 0.005
    # ------------------------------------------------------------------
    mse_scores = []
    rmse_scores = []
    mae_scores = []
    mape_scores = []

    for i in range(num_runs):
        print("\n" + "=" * 60)
        print(f"                   실험 {i+1}/{num_runs} 시작")
        print("=" * 60)

        run_ckpt_path = f"{final_model_base_path}_run{i+1}.keras"

        best_model, run_history = train_best_model(
            num_users,
            num_businesses,
            train_data,
            val_data,
            checkpoint_path=run_ckpt_path,
            seed=base_seed + i + 1,
            verbose=0,
        )
        print(f"실험 {i+1}: 모델 학습 완료.")

        predictions = best_model.predict(test_inputs).flatten()

        mse, rmse, mae, mape = regression_metrics(true_ratings, predictions)

        mse_scores.append(mse)
        rmse_scores.append(rmse)
        mae_scores.append(mae)
        mape_scores.append(mape)

        print(f"실험 {i+1} 결과 - RMSE: {rmse:.4f}, MAE: {mae:.4f}, MAPE: {mape:.2f}%")

        # Release this run's models before the next one is built; run_history
        # is dropped too because it keeps a reference to the trained model.
        del best_model
        del run_history
        del predictions
        gc.collect()
        tf.keras.backend.clear_session()

    # ------------------------------------------------------------------
    # Aggregate statistics over the repeated runs
    # ------------------------------------------------------------------
    avg_mse = np.mean(mse_scores)
    std_mse = np.std(mse_scores)

    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)

    avg_mae = np.mean(mae_scores)
    std_mae = np.std(mae_scores)

    avg_mape = np.mean(mape_scores)
    std_mape = np.std(mape_scores)

    print("\n" + "=" * 60)
    print(f"{task}버전 모델 성능 통계 ({num_runs}회 실행 평균)")
    print("=" * 60)
    print(f"평균 MSE: {avg_mse:.4f} (표준편차: {std_mse:.4f})")
    print(f"평균 RMSE: {avg_rmse:.4f} (표준편차: {std_rmse:.4f})")
    print(f"평균 MAE : {avg_mae:.4f} (표준편차: {std_mae:.4f})")
    print(f"평균 MAPE: {avg_mape:.2f}% (표준편차: {std_mape:.2f})")
    print("=" * 60)

    # One column per run (derived from num_runs), followed by mean and std.
    summary_data = {"Metric": ["MSE", "RMSE", "MAE", "MAPE (%)"]}
    for run_idx in range(num_runs):
        summary_data[f"Run {run_idx + 1}"] = [
            mse_scores[run_idx],
            rmse_scores[run_idx],
            mae_scores[run_idx],
            mape_scores[run_idx],
        ]
    summary_data["Average"] = [avg_mse, avg_rmse, avg_mae, avg_mape]
    summary_data["Std. Deviation"] = [std_mse, std_rmse, std_mae, std_mape]

    results_df = pd.DataFrame(summary_data)

    # Round every numeric column to 4 decimals for readability ...
    numeric_cols = [col for col in results_df.columns if col != "Metric"]
    results_df = results_df.round({col: 4 for col in numeric_cols})

    # ... except the MAPE row, which is a percentage and gets 2 decimals.
    is_mape = results_df["Metric"] == "MAPE (%)"
    results_df.loc[is_mape, numeric_cols] = results_df.loc[
        is_mape, numeric_cols
    ].round(2)

    print(f"--- [{task}] 버전 최종 성능 요약 테이블 ---")
    print(results_df.to_string(index=False))

    results_df.to_csv(
        f"{performance_dir}/model_performance_" + task + ".csv",
        index=False,
        encoding="utf-8-sig",
    )
    print("csv로 저장 완료")