In [None]:
!pip install gensim torch pandas numpy scikit-learn tqdm

In [None]:
# -*- coding: utf-8 -*-

import os, re, json, math, random
from collections import Counter
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import KeyedVectors

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# ==================== 0) Reproducibility & device ====================
SEED = 42

# Seed every random number generator the script draws from:
# Python's `random`, NumPy, and PyTorch (default generator + all CUDA devices).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Ask cuDNN for deterministic algorithms and turn off its benchmark autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"[INFO] Device: {device}")

# ==================== 1) Paths / hyperparameters ====================
# NOTE: WORK_DIR.mkdir below runs at import/cell-execution time, so this block
# has a filesystem side effect in addition to defining constants.
DATA_PATH = "/content/drive/MyDrive/review_business_5up_with_text.json"       # review file (JSON array or JSONL); edit this path
GOOGLE_NEWS_BIN = "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin.gz" # pretrained vectors; download from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
WORK_DIR = Path("/content/drive/MyDrive/")
WORK_DIR.mkdir(parents=True, exist_ok=True)
BEST_MODEL_PATH = str(WORK_DIR / "best_deepconn.pt")  # where the best state_dict is saved during training

# Preprocessing
MIN_COUNT = 5        # minimum word count for a token to enter the vocabulary (see build_data)
MAX_USER_LEN = 1000  # user token sequences are truncated / padded to this length
MAX_ITEM_LEN = 1000  # item token sequences are truncated / padded to this length
EMB_DIM = 300  # GoogleNews vector size; build_data checks the loaded vectors against it

# Training
BATCH_SIZE = 256
EPOCHS = 50          # upper bound on epochs; early stopping may end training sooner
LR = 2e-3            # learning rate passed to the RMSprop optimizer
DROPOUT = 0.5        # dropout probability inside each TextCNN tower
OUT_DIM = 50         # output size of each TextCNN tower
CONV_KERNEL = 3      # Conv1d kernel size
CONV_FILTERS = 100   # number of Conv1d output channels
FREEZE_EMB = False   # when True the embedding weights are not trained

# Early stopping
PATIENCE = 5         # stop after this many consecutive epochs without improvement
MIN_DELTA = 1e-3     # validation RMSE must drop by more than this to count as improvement

# ==================== 2) 유틸 함수 ====================
def simple_tokenize(text: str) -> List[str]:
    """Split ``text`` into runs of ASCII letters, digits and apostrophes.

    The argument is converted with ``str()`` first, so non-string values are
    tokenized by their string form. Case is preserved; every other character
    acts as a separator.
    """
    token_pattern = re.compile(r"[A-Za-z0-9']+")
    return token_pattern.findall(str(text))

def load_reviews(path: str) -> pd.DataFrame:
    """Load review records from a JSON-array or JSON-Lines file.

    The format is sniffed from the first 2048 characters: when the first
    non-whitespace character is ``[`` the whole file is parsed as one JSON
    array, otherwise every non-empty line is parsed as its own JSON document.
    Malformed JSONL lines are skipped (best effort), but the number of skipped
    lines is reported so silent data loss is visible.

    A ``stars`` column is renamed to ``rating`` when no ``rating`` column exists.

    Args:
        path: path of the UTF-8 encoded review file.

    Returns:
        DataFrame with exactly the columns ``user_id``, ``business_id``,
        ``rating`` and ``text``; rows with any missing value are dropped and
        the index is reset.

    Raises:
        ValueError: if one of the required columns is absent.
    """
    rows = []
    skipped = 0
    with open(path, "r", encoding="utf-8") as f:
        head = f.read(2048)
        f.seek(0)  # rewind so the parser below sees the whole file
        if head.strip().startswith("["):
            rows = json.load(f)  # JSON array
        else:
            for line in f:       # JSONL
                line = line.strip()
                if not line:
                    continue
                try:
                    rows.append(json.loads(line))
                except json.JSONDecodeError:
                    # Keep going, but remember how many lines were dropped.
                    skipped += 1
    if skipped:
        print(f"[WARN] Skipped {skipped:,} malformed JSONL line(s) in {path}")

    df = pd.DataFrame(rows)
    # Normalize the rating column name.
    if "stars" in df.columns and "rating" not in df.columns:
        df = df.rename(columns={"stars": "rating"})
    required = ["user_id", "business_id", "rating", "text"]
    miss = [c for c in required if c not in df.columns]
    if miss:
        raise ValueError(f"필수 컬럼 누락: {miss} | 현재: {list(df.columns)[:12]}")
    return df[required].dropna().reset_index(drop=True)

def mean_absolute_percentage_error(y_true, y_pred, eps=1e-10):
    """Return the mean absolute percentage error, in percent, as a Python float.

    Both inputs are converted to float32 arrays. The denominator ``|y_true|``
    is floored at ``eps`` so that zero targets do not cause a division by zero.
    """
    actual = np.asarray(y_true, dtype=np.float32)
    predicted = np.asarray(y_pred, dtype=np.float32)
    # Guard against division by zero.
    safe_denominator = np.clip(np.abs(actual), eps, None)
    relative_error = np.abs((actual - predicted) / safe_denominator)
    return float(np.mean(relative_error) * 100.0)

# ==================== 3) Stage-1: 전처리/임베딩 ====================
def build_data(df: pd.DataFrame):
    """Build the vocabulary, embedding matrix and per-user / per-item id sequences.

    Steps:
      1. Concatenate all review texts per ``user_id`` and per ``business_id``.
      2. Tokenize the documents and count words.
      3. Keep words seen at least ``MIN_COUNT`` times (plus ``<PAD>``=0, ``<UNK>``=1).
      4. Initialise an embedding matrix from the pretrained word2vec file;
         words missing there keep a small random-normal vector, PAD is all zeros.
      5. Map every document to a list of vocabulary ids, truncated to
         ``MAX_USER_LEN`` / ``MAX_ITEM_LEN`` (padding is left to the dataset).

    NOTE(review): documents are built from every row of ``df``. The script's
    entry point passes the full dataset, so the text of a validation/test
    review is part of the user/item document used to predict that same review.

    Args:
        df: frame with ``user_id``, ``business_id`` and ``text`` columns.

    Returns:
        ``(vocab, emb, user_seqs, item_seqs)`` where ``vocab`` maps word -> id,
        ``emb`` is a float32 array of shape ``(len(vocab), EMB_DIM)`` and the
        two ``*_seqs`` dicts map an id to its list of token ids.

    Raises:
        FileNotFoundError: if the pretrained vector file does not exist.
        ValueError: if the pretrained vectors do not have ``EMB_DIM`` dimensions.
    """
    # Build user / item documents.
    print("[*] 사용자/아이템 문서 생성…")
    user_docs = df.groupby("user_id")["text"].apply(lambda s: " ".join(map(str, s))).to_dict()
    item_docs = df.groupby("business_id")["text"].apply(lambda s: " ".join(map(str, s))).to_dict()

    # Tokenize & count words.
    print("[*] 토큰화/단어 카운트…")
    word_counter = Counter()

    def tokenize_docs(docs: Dict[str, str], count: bool):
        tokenized = {}
        for k, txt in tqdm(docs.items(), desc="tokenize"):
            toks = simple_tokenize(txt)
            tokenized[k] = toks
            if count:
                word_counter.update(toks)
        return tokenized

    # Every review appears once in a user document and once in an item
    # document, so counting both passes would double every frequency and
    # effectively halve MIN_COUNT. Count on the user pass only.
    user_tokens = tokenize_docs(user_docs, count=True)
    item_tokens = tokenize_docs(item_docs, count=False)

    PAD, UNK = "<PAD>", "<UNK>"
    vocab = {PAD: 0, UNK: 1}
    for w, c in word_counter.items():
        if c >= MIN_COUNT and w not in vocab:
            vocab[w] = len(vocab)
    print(f"[+] Vocab size: {len(vocab):,}")

    # Load word2vec and build the embedding matrix.
    if not os.path.exists(GOOGLE_NEWS_BIN):
        raise FileNotFoundError(f"사전학습 벡터 필요: {GOOGLE_NEWS_BIN}")
    print("[*] Word2Vec 로딩…")
    kv = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_BIN, binary=True)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if kv.vector_size != EMB_DIM:
        raise ValueError(
            f"Pretrained vector size {kv.vector_size} does not match EMB_DIM={EMB_DIM}"
        )

    emb = np.random.normal(0, 0.01, size=(len(vocab), EMB_DIM)).astype(np.float32)
    emb[vocab[PAD]] = 0.0
    hit = 0
    for w, idx in vocab.items():
        if w in (PAD, UNK):
            continue
        if w in kv:
            emb[idx] = kv[w]
            hit += 1
    print(f"[+] Embedding matrix: {emb.shape} | hit={hit:,} / oov={len(vocab)-hit-2:,}")

    # Convert token lists to id sequences.
    unk_idx = vocab[UNK]

    def to_ids(tokens: List[str], max_len: int):
        # Truncate first so tokens beyond max_len are never looked up.
        return [vocab.get(w, unk_idx) for w in tokens[:max_len]]

    user_seqs = {uid: to_ids(toks, MAX_USER_LEN) for uid, toks in tqdm(user_tokens.items(), desc="make user seqs")}
    item_seqs = {bid: to_ids(toks, MAX_ITEM_LEN) for bid, toks in tqdm(item_tokens.items(), desc="make item seqs")}
    return vocab, emb, user_seqs, item_seqs

# ==================== 4) Dataset / Model 정의 ====================
class DeepConnDataset(Dataset):
    """Dataset that yields ``(user_ids, item_ids, rating)`` for each review row.

    ``user_seqs`` / ``item_seqs`` map an id to a list of token ids; an id that
    is missing from the mapping is treated as an empty sequence. Sequences are
    cut or right-padded with ``pad_idx`` to ``MAX_USER_LEN`` / ``MAX_ITEM_LEN``.
    """

    def __init__(self, frame: pd.DataFrame, user_seqs, item_seqs, pad_idx=0):
        # Re-index from 0 so positional access lines up with dataset indices.
        self.df = frame.reset_index(drop=True)
        self.user_seqs = user_seqs
        self.item_seqs = item_seqs
        self.pad = pad_idx

    def __len__(self):
        return len(self.df)

    def _pad(self, seq, max_len):
        """Return ``seq`` as a long tensor of exactly ``max_len`` entries."""
        clipped = seq[:max_len]
        missing = max_len - len(clipped)
        if missing > 0:
            clipped = clipped + [self.pad] * missing
        return torch.tensor(clipped, dtype=torch.long)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        user_ids = self._pad(self.user_seqs.get(row["user_id"], []), MAX_USER_LEN)
        item_ids = self._pad(self.item_seqs.get(row["business_id"], []), MAX_ITEM_LEN)
        rating = torch.tensor(float(row["rating"]), dtype=torch.float32)
        return user_ids, item_ids, rating

class TextCNN(nn.Module):
    """Text encoder: embedding -> Conv1d -> ReLU -> global max pool -> dropout -> linear.

    The embedding table is initialised from ``emb_weights`` (index 0 is the
    padding index) and is trainable unless ``freeze`` is true.
    """

    def __init__(self, emb_weights: np.ndarray, out_dim=50, conv_filters=100, kernel_size=3, dropout=0.5, freeze=False):
        super().__init__()
        num_words, dim = emb_weights.shape

        self.embedding = nn.Embedding(num_words, dim, padding_idx=0)
        with torch.no_grad():
            # Overwrite the random initialisation with the supplied vectors.
            self.embedding.weight.copy_(torch.tensor(emb_weights))
        self.embedding.weight.requires_grad_(not freeze)

        self.conv = nn.Conv1d(
            in_channels=dim,
            out_channels=conv_filters,
            kernel_size=kernel_size,
            stride=1,
            padding=0,
        )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(conv_filters, out_dim)

    def forward(self, x):
        # (B, L) ids -> (B, L, E) vectors -> (B, E, L), channels first for Conv1d.
        embedded = self.embedding(x).transpose(1, 2)
        # (B, F, L') feature maps.
        feature_maps = self.relu(self.conv(embedded))
        # Global max pool over the sequence axis -> (B, F).
        pooled, _ = feature_maps.max(dim=2)
        # (B, out_dim)
        return self.fc(self.dropout(pooled))

class DeepCoNN_TextCNN_FM(nn.Module):
    """Two TextCNN towers (user text, item text) joined by a factorization machine.

    Each tower owns its own embedding table initialised from ``emb_weights``.
    The tower outputs are concatenated and scored by an FM with ``fm_k``
    latent factors, producing one scalar prediction per example.
    """

    def __init__(self, emb_weights, out_dim=50, conv_filters=100, kernel_size=3, dropout=0.5, fm_k=40, freeze=False):
        super().__init__()
        tower_args = (emb_weights, out_dim, conv_filters, kernel_size, dropout, freeze)
        self.user_net = TextCNN(*tower_args)
        self.item_net = TextCNN(*tower_args)

        fm_in = 2 * out_dim  # width of the concatenated user + item features
        self.fm_bias = nn.Parameter(torch.zeros(1))
        self.fm_w = nn.Linear(fm_in, 1, bias=False)
        self.fm_V = nn.Parameter(torch.randn(fm_in, fm_k) * 0.01)

    def fm_forward(self, z):
        """Score features ``z`` of shape (B, 2*out_dim); returns shape (B, 1)."""
        first_order = self.fm_w(z) + self.fm_bias
        # Pairwise term: 0.5 * sum_k [ (z V)_k^2 - (z^2 V^2)_k ].
        projected = torch.matmul(z, self.fm_V)                  # (B, k)
        square_of_sum = projected * projected
        sum_of_squares = torch.matmul(z * z, self.fm_V * self.fm_V)
        second_order = 0.5 * torch.sum(square_of_sum - sum_of_squares, dim=1, keepdim=True)
        return first_order + second_order

    def forward(self, u_ids, v_ids):
        user_vec = self.user_net(u_ids)
        item_vec = self.item_net(v_ids)
        latent = torch.cat((user_vec, item_vec), dim=1)         # (B, 2*out_dim)
        return self.fm_forward(latent).squeeze(1)

# ==================== 5) 학습/검증/테스트 루틴 ====================
def eval_on_loader(model, loader, device, clamp_to_star=False):
    """Evaluate ``model`` on every batch of ``loader`` and return regression metrics.

    The model is switched to eval mode and run without gradient tracking.
    When ``clamp_to_star`` is true, predictions are clamped to [1.0, 5.0]
    before the metrics are computed.

    Returns:
        ``(mse, rmse, mae, mape)`` as Python floats.
    """
    model.eval()
    targets, predictions = [], []
    with torch.no_grad():
        for user_ids, item_ids, rating in loader:
            output = model(user_ids.to(device), item_ids.to(device))
            if clamp_to_star:
                # Restrict predictions to the star-rating range.
                output = output.clamp(1.0, 5.0)
            predictions.append(output.cpu().numpy())
            targets.append(rating.numpy())

    y_true = np.concatenate(targets)
    y_pred = np.concatenate(predictions)

    mse = float(mean_squared_error(y_true, y_pred))
    rmse = float(math.sqrt(mse))
    mae = float(mean_absolute_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    return mse, rmse, mae, mape

def train_with_early_stop(df, emb_matrix, user_seqs=None, item_seqs=None):
    """Train the DeepCoNN model with early stopping and report test metrics.

    The data is split 70% train / 20% validation / 10% test. After each epoch
    the model is evaluated on the validation split; the weights are saved to
    ``BEST_MODEL_PATH`` whenever validation RMSE improves by more than
    ``MIN_DELTA``, and training stops after ``PATIENCE`` epochs without
    improvement. The best weights saved during this run are then evaluated on
    the test split with predictions clamped to [1, 5].

    Args:
        df: review frame with ``user_id``, ``business_id`` and ``rating`` columns.
        emb_matrix: initial embedding matrix handed to the model.
        user_seqs: mapping user_id -> list of token ids. When omitted, the
            module-level ``user_seqs`` is used (the original behaviour).
        item_seqs: mapping business_id -> list of token ids. When omitted, the
            module-level ``item_seqs`` is used.

    Returns:
        dict with the test metrics under ``"mse"``, ``"rmse"``, ``"mae"``
        and ``"mape"``.

    Raises:
        ValueError: if the sequences are neither passed in nor defined at
            module level.
    """
    # Fall back to the module globals so existing two-argument calls keep working.
    module_globals = globals()
    if user_seqs is None:
        user_seqs = module_globals.get("user_seqs")
    if item_seqs is None:
        item_seqs = module_globals.get("item_seqs")
    if user_seqs is None or item_seqs is None:
        raise ValueError(
            "user_seqs / item_seqs must be passed in or defined at module level "
            "(both are returned by build_data)."
        )

    # Split 70/20/10 (train/val/test): hold out 10% for test first, then
    # 0.2222 of the remaining 90% (~20% of the total) for validation.
    train_df, test_df = train_test_split(df, test_size=0.1, random_state=SEED)
    train_df, val_df  = train_test_split(train_df, test_size=0.2222, random_state=SEED)

    train_ds = DeepConnDataset(train_df, user_seqs, item_seqs, pad_idx=0)
    val_ds   = DeepConnDataset(val_df,   user_seqs, item_seqs, pad_idx=0)
    test_ds  = DeepConnDataset(test_df,  user_seqs, item_seqs, pad_idx=0)

    use_cuda = torch.cuda.is_available()
    num_workers = 2 if use_cuda else 0
    pin_memory = use_cuda

    def make_loader(dataset, shuffle):
        # persistent_workers is only valid when worker processes are used.
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=pin_memory,
                          persistent_workers=bool(num_workers))

    train_dl = make_loader(train_ds, shuffle=True)
    val_dl   = make_loader(val_ds, shuffle=False)
    test_dl  = make_loader(test_ds, shuffle=False)

    model = DeepCoNN_TextCNN_FM(
        emb_matrix, out_dim=OUT_DIM, conv_filters=CONV_FILTERS,
        kernel_size=CONV_KERNEL, dropout=DROPOUT, fm_k=40, freeze=FREEZE_EMB
    ).to(device)

    optimizer = torch.optim.RMSprop(model.parameters(), lr=LR)
    criterion = nn.MSELoss()

    best_val_rmse = float("inf")
    epochs_no_improve = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale file left by an
    # earlier run is never loaded for the final evaluation.
    checkpoint_saved = False

    print("[INFO] Start training...")
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss, n = 0.0, 0
        for u, v, y in train_dl:
            u, v, y = u.to(device), v.to(device), y.to(device)
            optimizer.zero_grad()
            pred = model(u, v)
            loss = criterion(pred, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Weight the batch-mean loss by batch size for a per-sample epoch mean.
            total_loss += loss.item() * len(y)
            n += len(y)

        # ----- Validation -----
        val_mse, val_rmse, val_mae, val_mape = eval_on_loader(model, val_dl, device, clamp_to_star=False)
        train_mse = total_loss / max(n, 1)

        print(f"Epoch {epoch:02d} | Train MSE {train_mse:.4f} | "
              f"Val MSE {val_mse:.4f} | RMSE {val_rmse:.4f} | MAE {val_mae:.4f} | MAPE {val_mape:.2f}%")

        # Early Stopping
        if val_rmse < best_val_rmse - MIN_DELTA:
            best_val_rmse = val_rmse
            epochs_no_improve = 0
            torch.save(model.state_dict(), BEST_MODEL_PATH)
            checkpoint_saved = True
            print(f"  --> Improved. Save model (Val RMSE: {best_val_rmse:.4f})")
        else:
            epochs_no_improve += 1
            print(f"  --> No improvement ({epochs_no_improve}/{PATIENCE})")
            if epochs_no_improve >= PATIENCE:
                print("[INFO] Early stopping.")
                break

    # ----- Test -----
    if checkpoint_saved:
        model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
        print(f"[INFO] Loaded best model: {BEST_MODEL_PATH}")
    else:
        print("[WARN] No checkpoint was saved in this run; evaluating the current weights.")
    test_mse, test_rmse, test_mae, test_mape = eval_on_loader(model, test_dl, device, clamp_to_star=True)

    print("\n✅ [DeepCoNN: TextCNN+FM] 최종 테스트 지표 (클램프 1~5)")
    print(f"   - MSE  : {test_mse:.4f}")
    print(f"   - RMSE : {test_rmse:.4f}")
    print(f"   - MAE  : {test_mae:.4f}")
    print(f"   - MAPE : {test_mape:.2f}%")

    return {"mse": test_mse, "rmse": test_rmse, "mae": test_mae, "mape": test_mape}

# ==================== 6) 메인 플로우 ====================
# Script entry point: load reviews -> build vocab/embeddings/sequences -> train and test.
# NOTE: the names bound here are module-level globals. train_with_early_stop reads
# `user_seqs` and `item_seqs` from module scope, so they must stay assigned at this
# level (do not move this block into a function without passing them explicitly).
if __name__ == "__main__":
    print("[*] 데이터 로드…")
    df = load_reviews(DATA_PATH)
    print(df.head(2))
    print(f"총 리뷰 수: {len(df):,}")

    # Builds the vocabulary, the embedding matrix and the per-user / per-item id sequences.
    print("[*] 전처리/임베딩…")
    vocab, emb_matrix, user_seqs, item_seqs = build_data(df)

    # Trains with early stopping, then prints the metrics on the held-out test split.
    print("[*] 학습/검증/조기종료/테스트…")
    train_with_early_stop(df, emb_matrix)


[INFO] Device: cuda
[*] 데이터 로드…
                  user_id             business_id  rating  \
0  smOvOajNG0lS4Pq7d8g4JQ  RZtGWDLCAtuipwaZ-UfjmQ       4   
1  IQsF3Rc6IgCzjVV9DE8KXg  eFvzHawVJofxSnD7TgbZtg       5   

                                                text  
0  Good food--loved the gnocchi with marinara\nth...  
1  My absolute favorite cafe in the city. Their b...  
총 리뷰 수: 447,796
[*] 전처리/임베딩…
[*] 사용자/아이템 문서 생성…
[*] 토큰화/단어 카운트…


tokenize: 100%|██████████| 27807/27807 [00:19<00:00, 1424.16it/s]
tokenize: 100%|██████████| 6831/6831 [00:19<00:00, 341.60it/s]


[+] Vocab size: 80,562
[*] Word2Vec 로딩…
[+] Embedding matrix: (80562, 300) | hit=63,204 / oov=17,356


make user seqs: 100%|██████████| 27807/27807 [00:05<00:00, 5371.04it/s]
make item seqs: 100%|██████████| 6831/6831 [00:04<00:00, 1588.40it/s]


[*] 학습/검증/조기종료/테스트…
[INFO] Start training...
Epoch 01 | Train MSE 69.0589 | Val MSE 1.1418 | RMSE 1.0686 | MAE 0.8622 | MAPE 33.62%
  --> Improved. Save model (Val RMSE: 1.0686)
Epoch 02 | Train MSE 1.1953 | Val MSE 1.1106 | RMSE 1.0538 | MAE 0.8513 | MAPE 33.11%
  --> Improved. Save model (Val RMSE: 1.0538)
Epoch 03 | Train MSE 1.1397 | Val MSE 1.1736 | RMSE 1.0833 | MAE 0.8204 | MAPE 35.52%
  --> No improvement (1/5)
Epoch 04 | Train MSE 1.1132 | Val MSE 1.0913 | RMSE 1.0447 | MAE 0.8065 | MAPE 33.65%
  --> Improved. Save model (Val RMSE: 1.0447)
Epoch 05 | Train MSE 1.0956 | Val MSE 1.0937 | RMSE 1.0458 | MAE 0.8097 | MAPE 33.37%
  --> No improvement (1/5)
Epoch 06 | Train MSE 1.0798 | Val MSE 1.0849 | RMSE 1.0416 | MAE 0.8157 | MAPE 33.18%
  --> Improved. Save model (Val RMSE: 1.0416)
Epoch 07 | Train MSE 1.0693 | Val MSE 1.2218 | RMSE 1.1054 | MAE 0.8252 | MAPE 35.75%
  --> No improvement (1/5)
Epoch 08 | Train MSE 1.0586 | Val MSE 1.1905 | RMSE 1.0911 | MAE 0.8180 | MAPE 35.02%
 