In [2]:
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from sentence_transformers import SentenceTransformer

# 설정
ABSA_FILE = "absa_ate_results.json"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
SENT2WEIGHT = {"pos": 1.0, "neg": -1.0}
EMBED_DIM = 384  # 임베딩 차원

# 모델 로드
model = SentenceTransformer(MODEL_NAME)

# 유저별 누적 벡터
user_vecs = defaultdict(list)

with open(ABSA_FILE, encoding="utf-8") as f:
    for line in tqdm(f, desc="🔄 벡터 생성 중"):
        review = json.loads(line)
        user_id = review["user_id"]
        aspects = review.get("aspects", [])

        # 🔸 aspects가 없으면 스킵
        if not aspects:
            continue

        for asp in aspects:
            term = asp["term"].strip()
            sentiment = asp["sentiment"]
            confidence = asp["confidence"]

            if sentiment not in SENT2WEIGHT:
                continue

            weight = SENT2WEIGHT[sentiment] * confidence
            vec = model.encode(term)
            user_vecs[user_id].append(weight * vec)

# 평균 벡터 계산
user_embed = {
    uid: (np.mean(vecs, axis=0) if vecs else np.zeros(EMBED_DIM)).tolist()
    for uid, vecs in tqdm(user_vecs.items(), desc="👤 유저 벡터 평균")
}

# 저장
with open("user_vector.json", "w", encoding="utf-8") as f:
    json.dump(user_embed, f, indent=2)

print("✅ user_vector.json 저장 완료")


🔄 벡터 생성 중: 452505it [2:17:08, 54.99it/s]
👤 유저 벡터 평균: 100%|██████████| 28483/28483 [00:13<00:00, 2082.81it/s] 


✅ user_vector.json 저장 완료


#### word2vec

In [None]:
import json
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# 설정
ABSA_FILE = "absa_ate_results.json"
W2V_PATH = "GoogleNews-vectors-negative300.bin.gz"  # Word2Vec 모델
EMBED_DIM = 300
SENT2WEIGHT = {"pos": 1.0, "neg": -1.0}
MIN_LEN = 2        # term 최소 단어 수
MIN_CONF = 0.7     # confidence threshold

def is_valid_term(term, confidence, stop_words, min_len=2, min_conf=0.7):
    tokens = term.strip().split()
    if len(tokens) < min_len:
        return False
    if confidence < min_conf:
        return False
    if all(tok.lower() in stop_words for tok in tokens):
        return False
    return True

print("🔁 Word2Vec 로딩 중...")
w2v_model = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
print("✅ Word2Vec 로딩 완료!")

user_vecs = defaultdict(list)

with open(ABSA_FILE, encoding="utf-8") as f:
    for line in tqdm(f, desc="👤 유저 벡터 생성 중"):
        obj = json.loads(line)
        uid = obj["user_id"]
        for asp in obj.get("aspects", []):
            term = asp["term"].strip()
            sent = asp["sentiment"]
            conf = asp["confidence"]

            if sent not in SENT2WEIGHT:
                continue
            if not is_valid_term(term, conf, stop_words, MIN_LEN, MIN_CONF):
                continue

            tokens = term.split()
            vecs = [w2v_model[tok] for tok in tokens if tok in w2v_model]
            if not vecs:
                continue

            vec = np.mean(vecs, axis=0)
            weight = SENT2WEIGHT[sent] * conf
            user_vecs[uid].append(weight * vec)

user_embed = {
    uid: (np.mean(vecs, axis=0) if vecs else np.zeros(EMBED_DIM)).tolist()
    for uid, vecs in tqdm(user_vecs.items(), desc="👤 유저 벡터 평균")
}

with open("user_vector_filtered.json", "w", encoding="utf-8") as f:
    json.dump(user_embed, f, indent=2)

biz_vecs = defaultdict(list)

with open(ABSA_FILE, encoding="utf-8") as f:
    for line in tqdm(f, desc="🏠 식당 벡터 생성 중"):
        obj = json.loads(line)
        bid = obj["business_id"]
        for asp in obj.get("aspects", []):
            term = asp["term"].strip()
            sent = asp["sentiment"]
            conf = asp["confidence"]

            if sent not in SENT2WEIGHT:
                continue
            if not is_valid_term(term, conf, stop_words, MIN_LEN, MIN_CONF):
                continue

            tokens = term.split()
            vecs = [w2v_model[tok] for tok in tokens if tok in w2v_model]
            if not vecs:
                continue

            vec = np.mean(vecs, axis=0)
            weight = SENT2WEIGHT[sent] * conf
            biz_vecs[bid].append(weight * vec)

biz_embed = {
    bid: (np.mean(vecs, axis=0) if vecs else np.zeros(EMBED_DIM)).tolist()
    for bid, vecs in tqdm(biz_vecs.items(), desc="🏠 식당 벡터 평균")
}

with open("business_vector_filtered.json", "w", encoding="utf-8") as f:
    json.dump(biz_embed, f, indent=2)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\82104\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🔁 Word2Vec 로딩 중...
✅ Word2Vec 로딩 완료!


👤 유저 벡터 생성 중: 452505it [00:07, 57646.92it/s]
👤 유저 벡터 평균: 100%|██████████| 8933/8933 [00:10<00:00, 879.80it/s] 
🏠 식당 벡터 생성 중: 452505it [00:06, 66231.52it/s]
🏠 식당 벡터 평균: 100%|██████████| 3325/3325 [00:00<00:00, 34719.94it/s]
