### 분할

In [2]:
import json
import random
from collections import defaultdict

# Configuration
INPUT_FILE = "absa_ate_results.json"
TRAIN_FILE = "absa_train.json"
TEST_FILE = "absa_test.json"
TEST_RATIO = 0.2
SEED = 42
random.seed(SEED)

# --------------------------
# 1. Load and group by user
# --------------------------
# The input is read as JSON Lines: one JSON object per line.
user_reviews = defaultdict(list)

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at EOF) instead of
            # failing inside json.loads.
            continue
        obj = json.loads(line)
        if obj.get("aspects"):  # skip reviews that have no aspects
            user_reviews[obj["user_id"]].append(obj)

print(f"✅ 유저 수: {len(user_reviews)}")

# --------------------------
# 2. Split by user
# --------------------------
# Whole users are assigned to either side, so train and test share no user_id.
# NOTE(review): this means nothing in TRAIN_FILE describes a test user —
# confirm that this cold-start style split is what downstream evaluation expects.
user_ids = list(user_reviews.keys())
random.shuffle(user_ids)

n_test_users = int(len(user_ids) * TEST_RATIO)
test_user_order = user_ids[:n_test_users]
train_user_order = user_ids[n_test_users:]

# Sets are kept for fast membership checks by later cells.
test_users = set(test_user_order)
train_users = set(train_user_order)

# Build the outputs from the (seeded) shuffled lists rather than from the sets:
# set iteration order of strings depends on per-process hash randomisation, so
# iterating the sets would make the written row order differ between kernel
# restarts even with a fixed SEED.
train_data = [review for uid in train_user_order for review in user_reviews[uid]]
test_data = [review for uid in test_user_order for review in user_reviews[uid]]

# --------------------------
# 3. Save to files
# --------------------------
# Output is JSON Lines again; ensure_ascii=False keeps non-ASCII text readable.
with open(TRAIN_FILE, "w", encoding="utf-8") as f:
    for d in train_data:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

with open(TEST_FILE, "w", encoding="utf-8") as f:
    for d in test_data:
        f.write(json.dumps(d, ensure_ascii=False) + "\n")

print(f"📦 훈련 유저 수: {len(train_users)}, 리뷰 수: {len(train_data)}")
print(f"🧪 테스트 유저 수: {len(test_users)}, 리뷰 수: {len(test_data)}")


✅ 유저 수: 28506
📦 훈련 유저 수: 22805, 리뷰 수: 327083
🧪 테스트 유저 수: 5701, 리뷰 수: 79594


#### word2vec

In [6]:
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import normalize

# File settings (uses the "filtered" vectors)
USER_VEC_FILE = "user_vector_filtered.json"
BIZ_VEC_FILE = "business_vector_filtered.json"
TRAIN_FILE = "absa_train.json"
RECOMMEND_FILE = "recommendations_filtered.json"
TOP_N = 10

# Load the embeddings. Each file is used below as a mapping id -> vector
# (presumably equal-length lists of floats — TODO confirm against the producer).
with open(USER_VEC_FILE, encoding="utf-8") as f:
    user_embed = json.load(f)
with open(BIZ_VEC_FILE, encoding="utf-8") as f:
    biz_embed = json.load(f)

# Businesses each user already reviewed in the training data (JSON Lines input)
user_seen = defaultdict(set)
with open(TRAIN_FILE, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        obj = json.loads(line)
        user_seen[obj["user_id"]].add(obj["business_id"])

# Row-normalise and stack the vectors into matrices
uids = list(user_embed.keys())
bids = list(biz_embed.keys())
u_matrix = normalize(np.array([user_embed[uid] for uid in uids]), axis=1)
b_matrix = normalize(np.array([biz_embed[bid] for bid in bids]), axis=1)

# Cosine similarity == dot product, because the rows were normalised above
sim = np.dot(u_matrix, b_matrix.T)

# Build the top-N list per user
recommendations = {}
no_history = frozenset()
for i, uid in enumerate(tqdm(uids, desc="📡 추천 계산 중")):
    # .get avoids inserting an empty set into user_seen for every user that
    # has no training reviews.
    seen = user_seen.get(uid, no_history)
    ranked_idx = np.argsort(sim[i])[::-1]  # business indices, most similar first

    # Walk the ranking only until TOP_N unseen businesses are collected,
    # instead of filtering the entire business list and slicing afterwards.
    top_biz = []
    for j in ranked_idx:
        if len(top_biz) >= TOP_N:
            break
        bid = bids[j]
        if bid not in seen:
            top_biz.append(bid)
    recommendations[uid] = top_biz

# Save the results
with open(RECOMMEND_FILE, "w", encoding="utf-8") as f:
    json.dump(recommendations, f, indent=2)

print("✅ 추천 저장 완료:", RECOMMEND_FILE)


📡 추천 계산 중: 100%|██████████| 21211/21211 [00:32<00:00, 649.80it/s]


✅ 추천 저장 완료: recommendations_filtered.json


In [7]:
import json
from collections import defaultdict
from tqdm import tqdm

# File settings
RECOMMEND_FILE = "recommendations_filtered.json"
EVAL_FILE = "absa_test.json"
TOP_K = [1, 5, 10]  # cutoffs to evaluate

# Load the recommendation lists (user_id -> ranked list of business_ids)
with open(RECOMMEND_FILE, encoding="utf-8") as f:
    recommendations = json.load(f)

# Test set: the businesses each user actually reviewed (ground truth), JSON Lines
ground_truth = defaultdict(set)
with open(EVAL_FILE, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        obj = json.loads(line)
        ground_truth[obj["user_id"]].add(obj["business_id"])

# Evaluate only users that have both recommendations and ground truth
common_users = set(recommendations.keys()) & set(ground_truth.keys())
print(f"📌 평가 대상 유저 수: {len(common_users)}")

# Per-cutoff running sums of the per-user metrics
metrics = {k: {"precision": 0, "recall": 0, "f1": 0} for k in TOP_K}

for uid in tqdm(common_users, desc="🎯 Precision@K 계산 중"):
    pred = recommendations[uid]
    true = ground_truth[uid]

    for k in TOP_K:
        topk_pred = set(pred[:k])
        hits = topk_pred & true

        # Precision divides by k even if fewer than k items were recommended.
        precision = len(hits) / k
        recall = len(hits) / len(true) if true else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

        metrics[k]["precision"] += precision
        metrics[k]["recall"] += recall
        metrics[k]["f1"] += f1

# Average over the evaluated users
n = len(common_users)
print("\n📊 Precision@K 결과")
if n == 0:
    # No overlap between recommended users and test users: report it instead of
    # failing with ZeroDivisionError when averaging.
    print("⚠️ 평가 대상 유저가 없습니다.")
else:
    for k in TOP_K:
        p = metrics[k]["precision"] / n
        r = metrics[k]["recall"] / n
        # The F1 sums stay available in metrics[k]["f1"]; only precision and
        # recall are reported here.
        print(f"🔹 Top-{k} → Precision: {p:.4f}, Recall: {r:.4f}")


📌 평가 대상 유저 수: 4214


🎯 Precision@K 계산 중: 100%|██████████| 4214/4214 [00:00<00:00, 63903.67it/s]


📊 Precision@K 결과
🔹 Top-1 → Precision: 0.0994, Recall: 0.0111
🔹 Top-5 → Precision: 0.0570, Recall: 0.0299
🔹 Top-10 → Precision: 0.0444, Recall: 0.0438



