In [None]:
import os, gc, json, ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

# ---------------------- Configuration ----------------------
# Parquet file holding the full 3072-dimensional embeddings.
INPUT_FILE_3072 = "review_data_optimized_fl.parquet"
EXPERIMENT_TAGS = ["BASE"]
# Embedding widths to evaluate, widest first.
DIM_LIST = [
    3072, 2048, 1536, 1024, 768,
    512, 384, 256, 192, 128,
]

print(DIM_LIST)

RANDOM_STATE = 42
# 10% of the whole dataset: 12.5% of the 80% train+validation portion.
VAL_SIZE_RATIO = 0.125

# Training hyper-parameters.
EPOCHS, BATCH_SIZE = 50, 256
LEARNING_RATE = 1e-4

# Model sizes: ID embedding widths and the hidden widths of the two MLP stacks.
USER_EMB_DIM, BIZ_EMB_DIM = 128, 32
USER_BIZ_MLP_DIMS = [128, 64]
FINAL_MLP_DIMS = [64, 32]

# Number of repeated runs per dimension.
NUM_RUNS = 5


# ---------------------- 유틸 ----------------------
def set_seeds(seed=RANDOM_STATE):
    """Seed every random number generator this script can touch.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    TensorFlow's global generator with the same value.

    Args:
        seed: Integer seed applied to all three generators. Defaults to
            ``RANDOM_STATE``.
    """
    # Local import so the file's top-level import block does not need to change.
    import random

    # Seeding Python's own generator as well closes a reproducibility gap:
    # any library code that draws from `random` would otherwise vary per run.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


def l2_norm(a: np.ndarray) -> np.ndarray:
    """Return ``a`` with every row scaled to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they come
    back unchanged rather than producing a division by zero.

    Args:
        a: 2-D array; norms are taken along axis 1.

    Returns:
        A new array of the same shape as ``a``.
    """
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)
    # Swap zero norms for ones so all-zero rows pass through untouched.
    safe_divisor = np.where(row_norms == 0, np.ones_like(row_norms), row_norms)
    return a / safe_divisor


def build_model(
    gemini_embedding_dim: int, num_users: int, num_businesses: int
) -> keras.Model:
    """Build and compile the rating-regression network.

    The model takes three inputs: an encoded user index, an encoded business
    index, and a precomputed embedding vector (the "gemini" input). The two
    indices are looked up in trainable embedding tables, concatenated and
    passed through a ReLU MLP (``USER_BIZ_MLP_DIMS``). That interaction vector
    is concatenated with the raw embedding input and fed to a second ReLU MLP
    (``FINAL_MLP_DIMS``) ending in a single linear output unit.

    Args:
        gemini_embedding_dim: Width of the precomputed embedding input.
        num_users: Number of rows in the user embedding table; every user
            index fed to the model must lie in ``[0, num_users)``.
        num_businesses: Number of rows in the business embedding table; every
            business index must lie in ``[0, num_businesses)``.

    Returns:
        A compiled ``keras.Model`` (Adam with ``LEARNING_RATE``, MSE loss,
        ``rmse`` and ``mae`` metrics).
    """
    # Inputs. The ID inputs are declared as integers: the default float32
    # dtype cannot represent indices above 2**24 exactly, which would make
    # distinct IDs collide for very large vocabularies.
    user_input = keras.Input(shape=(1,), dtype="int32", name="user_id_input")
    business_input = keras.Input(
        shape=(1,), dtype="int32", name="business_id_input"
    )
    gemini_input = keras.Input(
        shape=(gemini_embedding_dim,), name="gemini_embedding_input"
    )

    # User / business embedding tables.
    user_emb_layer = layers.Embedding(
        num_users, USER_EMB_DIM, name="user_embedding"
    )
    biz_emb_layer = layers.Embedding(
        num_businesses, BIZ_EMB_DIM, name="business_embedding"
    )
    # Each lookup yields shape (batch, 1, dim); Flatten drops the length-1 axis.
    user_vec = layers.Flatten()(user_emb_layer(user_input))
    biz_vec = layers.Flatten()(biz_emb_layer(business_input))

    # User-business interaction MLP.
    interaction = layers.concatenate([user_vec, biz_vec])
    for dim in USER_BIZ_MLP_DIMS:
        interaction = layers.Dense(dim, activation="relu")(interaction)

    # The precomputed embedding is joined directly, without its own MLP.
    x = gemini_input

    # Final regression head on the combined features.
    feat = layers.concatenate([interaction, x])
    for dim in FINAL_MLP_DIMS:
        feat = layers.Dense(dim, activation="relu")(feat)
    out = layers.Dense(1, activation="linear", name="output_rating")(feat)

    model = keras.Model(inputs=[user_input, business_input, gemini_input], outputs=out)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="mse",
        metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae"],
    )
    return model


# ---------------------- 1) Load the data once ----------------------
print(f"[LOAD] {INPUT_FILE_3072}")
df = pd.read_parquet(INPUT_FILE_3072)
# Validate with explicit raises: `assert` statements are stripped under `python -O`.
if "embedding" not in df.columns:
    raise ValueError("'embedding' 컬럼이 필요합니다.")
if not {"user_id", "business_id", "review_stars"}.issubset(df.columns):
    raise ValueError("user_id, business_id, review_stars 컬럼이 필요합니다.")

print(f"전체 데이터셋 크기: {len(df)}")
df_processed = df.copy()  # work on a copy so the loaded frame is preserved


def _encode_ids(ids, mapping, oov_index):
    """Map raw IDs to training indices; IDs absent from ``mapping`` get ``oov_index``."""
    return ids.map(mapping).fillna(oov_index).astype(int).to_numpy()


def _sliced_unit_embeddings(frame, dim):
    """Stack the frame's embeddings, keep the first ``dim`` columns, L2-normalise each row."""
    stacked = np.vstack(frame["embedding"].values)
    return l2_norm(stacked[:, :dim].astype(np.float32))


def _model_inputs(user_idx, biz_idx, embeddings):
    """Bundle the three arrays under the input names the model expects."""
    return {
        "user_id_input": user_idx,
        "business_id_input": biz_idx,
        "gemini_embedding_input": embeddings,
    }


# ---------------------- 2) Tag / dimension loop ----------------------
all_results = []
for task in EXPERIMENT_TAGS:
    print("\n" + "=" * 80)
    print(
        f"=========================== EXPERIMENT TAG: {task} ==========================="
    )
    print("=" * 80)

    for D in DIM_LIST:
        print("\n" + "-" * 60)
        print(f"[D={D}] 슬라이스→L2 후 학습/평가 시작")

        rmse_scores, mae_scores = [], []

        # ---- repeated runs ----
        for i in range(NUM_RUNS):
            keras.backend.clear_session()
            print(f"[D={D}] 실험 {i+1}/{NUM_RUNS} 시작")

            # Fresh split for every run, driven by a per-run seed.
            run_seed = RANDOM_STATE + i
            set_seeds(run_seed)
            train_val_df, test_df = train_test_split(
                df_processed, test_size=0.2, random_state=run_seed
            )
            train_df, val_df = train_test_split(
                train_val_df, test_size=VAL_SIZE_RATIO, random_state=run_seed
            )

            # Fresh label encoding for every run, fitted on the training split only.
            user_encoder = LabelEncoder()
            biz_encoder = LabelEncoder()
            # `assign` returns a new frame, which avoids the SettingWithCopy
            # warning raised when writing columns onto a split result.
            train_df = train_df.assign(
                user_encoded=user_encoder.fit_transform(train_df["user_id"]),
                business_encoded=biz_encoder.fit_transform(train_df["business_id"]),
            )

            user_mapping = {
                label: idx for idx, label in enumerate(user_encoder.classes_)
            }
            biz_mapping = {
                label: idx for idx, label in enumerate(biz_encoder.classes_)
            }

            # IDs that never appear in training get one reserved extra index
            # (the last row of each embedding table). Encoding them as -1 is
            # outside the Embedding layer's valid range [0, input_dim), and
            # the result of such a lookup is backend-dependent (an error on
            # CPU, silently wrong values elsewhere). The reserved row receives
            # no gradient updates, so it stays at its initial value.
            oov_user = len(user_encoder.classes_)
            oov_biz = len(biz_encoder.classes_)
            val_df = val_df.assign(
                user_encoded=_encode_ids(val_df["user_id"], user_mapping, oov_user),
                business_encoded=_encode_ids(val_df["business_id"], biz_mapping, oov_biz),
            )
            test_df = test_df.assign(
                user_encoded=_encode_ids(test_df["user_id"], user_mapping, oov_user),
                business_encoded=_encode_ids(test_df["business_id"], biz_mapping, oov_biz),
            )

            # +1 makes room for the reserved out-of-vocabulary row.
            num_users = oov_user + 1
            num_businesses = oov_biz + 1

            # Per-dimension embeddings (slice, then L2-normalise).
            X_tr = _sliced_unit_embeddings(train_df, D)
            X_va = _sliced_unit_embeddings(val_df, D)
            X_te = _sliced_unit_embeddings(test_df, D)

            y_tr, y_va, y_te = train_df['review_stars'].values, val_df['review_stars'].values, test_df['review_stars'].values
            u_tr, u_va, u_te = train_df['user_encoded'].values, val_df['user_encoded'].values, test_df['user_encoded'].values
            b_tr, b_va, b_te = train_df['business_encoded'].values, val_df['business_encoded'].values, test_df['business_encoded'].values

            # Build and compile the model.
            run_model = build_model(D, num_users, num_businesses)

            # Callbacks; the checkpoint path is reused below to reload the best model.
            ckpt_path = f"final_best_gemini_model_{task}_D{D}_run{i+1}.keras"
            run_es = callbacks.EarlyStopping(
                monitor="val_rmse",
                patience=10,
                min_delta=5e-4,
                mode="min",
                restore_best_weights=True,
            )
            run_ckpt = callbacks.ModelCheckpoint(
                filepath=ckpt_path,
                monitor="val_rmse",
                save_best_only=True,
                mode="min",
                verbose=0,
            )

            # Train.
            run_model.fit(
                _model_inputs(u_tr, b_tr, X_tr),
                y_tr,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                validation_data=(_model_inputs(u_va, b_va, X_va), y_va),
                callbacks=[run_es, run_ckpt],
                verbose=0,
            )

            # Evaluate the best checkpoint on the held-out test split.
            best_run = keras.models.load_model(ckpt_path)
            run_preds = best_run.predict(
                _model_inputs(u_te, b_te, X_te),
                verbose=0,
            ).flatten()

            _rmse = np.sqrt(mean_squared_error(y_te, run_preds))
            _mae = mean_absolute_error(y_te, run_preds)

            rmse_scores.append(_rmse)
            mae_scores.append(_mae)
            print(f"[D={D}] 실험 {i+1}/{NUM_RUNS} → RMSE={_rmse:.4f}, MAE={_mae:.4f}")

            # Clean up before the next run.
            del run_model, best_run, run_preds
            gc.collect()
            tf.keras.backend.clear_session()

        # Statistics over the runs (np.std is the population standard deviation).
        avg_rmse, std_rmse = float(np.mean(rmse_scores)), float(np.std(rmse_scores))
        avg_mae, std_mae = float(np.mean(mae_scores)), float(np.std(mae_scores))

        print("\n" + "=" * 60)
        # Report the actual run count rather than a hard-coded 5.
        print(f"[{task}] D={D} 성능 통계 ({NUM_RUNS}회 평균)")
        print("=" * 60)
        print(f"평균 RMSE: {avg_rmse:.4f} (±{std_rmse:.4f})")
        print(f"평균 MAE : {avg_mae:.4f} (±{std_mae:.4f})")

        # Build the per-dimension summary frame and collect it.
        summary_df = pd.DataFrame({
            "Dimension": [D, D],
            "Metric": ["RMSE", "MAE"],
            "Average": [avg_rmse, avg_mae],
            "Std. Deviation": [std_rmse, std_mae],
        })
        summary_df["Task"] = task
        all_results.append(summary_df)

# Concatenate every summary and print the final table.
final_results_df = pd.concat(all_results, ignore_index=True)
final_results_df = final_results_df.round(4)
print("\n" + "=" * 80)
print("============================ 최종 실험 결과 요약 ============================")
print("=" * 80)
print(final_results_df.to_string())

[3072, 2048, 1536, 1024, 768, 512, 384, 256, 192, 128]
[LOAD] review_data_optimized_fl.parquet
전체 데이터셋 크기: 500892


------------------------------------------------------------
[D=3072] 슬라이스→L2 후 학습/평가 시작

[D=3072] 실험 1/5 시작
