In [12]:
import csv
import json
import math
import os
import pickle
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

# Keep embedding dimensions consistent across the pipeline.
# The batch-replay cell copies this value into EXPECTED_DIM and rejects POI
# embedding matrices whose width differs from it.
FINAL_EMBEDDING_DIM = 64


In [13]:
# NOTE: The code is not split into separate module files; only the definitions from the cells below are used.

In [14]:
# ----------------------------
# Event schema
# ----------------------------


@dataclass(frozen=True)
class InteractionEvent:
    """One immutable user-to-POI interaction record.

    In this file ``interaction_type`` and ``value`` are converted to a scalar
    weight by ``weight_for``, and ``timestamp`` is parsed on demand by
    ``parse_ts``.
    """

    user_id: str
    poi_id: str
    interaction_type: str  # e.g., visit | rating | search | click | view
    value: float = 1.0  # only read by weight_for for "rating" and "dwell" events
    timestamp: Optional[str] = None  # keep as string


# ----------------------------
# Interaction weighting
# ----------------------------


@dataclass(frozen=True)
class InteractionWeights:
    """Base weight per interaction type, consumed by ``weight_for``.

    Every field except ``rating`` and ``dwell`` is returned unchanged for the
    matching interaction type; ``other`` is the fallback for unrecognised
    types.
    """

    view: float = 0.2
    click: float = 0.6
    visit: float = 1.0
    like: float = 1.2
    search: float = 0.3
    # weight_for clamps the event value to [1, 5] and returns (value / 5) * rating.
    rating: float = 1.0  # scaled by value/5
    bookmark: float = 0.7
    wishlist: float = 0.8
    share: float = 0.9
    review: float = 1.1
    purchase: float = 1.3
    revisit: float = 1.1
    # weight_for clamps the event value to [0, 3600] and returns (value / 60) * dwell,
    # so a one-hour dwell yields 60 * dwell.
    # NOTE(review): that is far larger than any other weight - confirm intended.
    dwell: float = 0.9  # scaled by value (seconds)
    other: float = 0.5


def weight_for(interaction_type: str, value: float, w: InteractionWeights) -> float:
    """Translate an interaction into the scalar weight applied to the POI vector.

    Args:
        interaction_type: Interaction name; matched case-insensitively after
            trimming whitespace. ``None``/empty falls through to ``w.other``.
        value: Only consulted for ``"rating"`` (clamped to [1, 5], divided by 5)
            and ``"dwell"`` (clamped to [0, 3600], divided by 60).
        w: Table of base weights.

    Returns:
        The weight for the interaction; ``w.other`` for unrecognised types.
    """
    kind = (interaction_type or "").lower().strip()

    # Value-dependent types first.
    if kind == "rating":
        capped = min(5.0, float(value))
        stars = max(1.0, capped)
        return (stars / 5.0) * w.rating

    if kind == "dwell":
        seconds = max(0.0, float(value))
        seconds = min(seconds, 3600.0)
        return (seconds / 60.0) * w.dwell

    # Types whose weight is simply the same-named field of ``w``.
    fixed_weight_types = (
        "view",
        "click",
        "visit",
        "like",
        "search",
        "bookmark",
        "wishlist",
        "share",
        "review",
        "purchase",
        "revisit",
    )
    if kind in fixed_weight_types:
        return getattr(w, kind)

    return w.other


In [15]:
# ----------------------------
# Timestamp parsing (lightweight)
# ----------------------------


# Explicit formats tried (in order) before falling back to ISO-8601 parsing.
_TS_FORMATS = (
    "%m/%d/%Y %H:%M",
    "%m/%d/%Y %H:%M:%S",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M",
)


def parse_ts(ts: Optional[str]) -> Optional[datetime]:
    """Parse a timestamp string into a naive ``datetime``.

    The formats in ``_TS_FORMATS`` are tried first, then
    ``datetime.fromisoformat``. ISO strings that carry a UTC offset are
    converted to UTC and returned *naive*, so every value this function returns
    can be subtracted from any other (mixing offset-aware and naive datetimes
    raises ``TypeError``).

    Args:
        ts: Raw timestamp; ``None``, empty and whitespace-only values are
            treated as missing. Non-string values are stringified first.

    Returns:
        The parsed naive ``datetime``, or ``None`` when the value is missing or
        matches no supported format.
    """
    if not ts:
        return None
    s = str(ts).strip()
    if not s:
        return None

    parsed: Optional[datetime] = None
    for fmt in _TS_FORMATS:
        try:
            parsed = datetime.strptime(s, fmt)
            break
        except ValueError:
            continue

    if parsed is None:
        try:
            parsed = datetime.fromisoformat(s)
        except ValueError:
            # Unparseable timestamps are treated as missing.
            return None

    # Normalise offset-aware results to naive UTC so callers can safely compute
    # differences against timestamps parsed from the offset-less formats above.
    offset = parsed.utcoffset()
    if offset is not None:
        parsed = (parsed - offset).replace(tzinfo=None)
    return parsed


In [16]:
class POITreeIndex:
    """POI tree index.

    Originally loaded from Sources/Files/poi_tree_with_uuids.json,
    but also accepts poi_tree_data (dict) from Sources/metadata.pkl.

    The tree is a dict keyed by ``"level_<n>"``; each level maps a node id to a
    node dict whose optional ``"parent"`` entry names the node one level up.
    """

    def __init__(self, poi_tree: object):
        """Build the index.

        Args:
            poi_tree: Either an already-loaded tree dict, or a ``str``/``Path``
                pointing at a JSON file containing one.

        Raises:
            TypeError: ``poi_tree`` is neither a dict nor a path.
            ValueError: The JSON root or ``tree['level_0']`` is not a dict.
        """
        # poi_tree: dict (already loaded tree) or (str/Path) JSON path
        self.path: str = ""

        if isinstance(poi_tree, (str, Path)):
            self.path = str(poi_tree)
            with open(self.path, "r", encoding="utf-8") as f:
                self.tree = json.load(f)
            if not isinstance(self.tree, dict):
                raise ValueError("poi_tree: expected the JSON root to be an object")
        elif isinstance(poi_tree, dict):
            self.tree = poi_tree
        else:
            raise TypeError("poi_tree must be dict or path(str/Path)")

        level0 = self.tree.get("level_0", {})
        if not isinstance(level0, dict):
            raise ValueError("poi_tree: expected tree['level_0'] to be a dict")
        self.level0_ids = set(level0.keys())

        # Parent cache plus the max_level it was built for; the cache is only
        # reused when the same depth is requested again.
        self._parent_cache: Optional[Dict[str, Dict[int, str]]] = None
        self._parent_cache_max_level: int = -1

    @classmethod
    def from_metadata_pkl(cls, metadata_pkl_path: object) -> "POITreeIndex":
        """Build an index from the ``poi_tree_data`` entry of a metadata pickle.

        Raises:
            ValueError: The pickle is not a dict or has no ``poi_tree_data`` dict.
        """
        p = Path(metadata_pkl_path)
        # NOTE(security): pickle.load can execute arbitrary code; only point this
        # at metadata files produced by a trusted pipeline.
        with p.open("rb") as f:
            meta = pickle.load(f)
        tree = meta.get("poi_tree_data") if isinstance(meta, dict) else None
        if not isinstance(tree, dict):
            raise ValueError("metadata.pkl missing 'poi_tree_data' dict")
        return cls(tree)

    def is_valid_level0(self, poi_id: str) -> bool:
        """Return True when ``poi_id`` is a key of ``tree['level_0']``."""
        return poi_id in self.level0_ids

    def build_parent_cache(self, max_level: int = 3) -> Dict[str, Dict[int, str]]:
        """Return ``{level0_id: {level: ancestor_id}}`` for levels ``1..max_level``.

        The result is cached per ``max_level``; asking for a different depth
        rebuilds the cache instead of returning a mapping built for another
        depth.
        """
        max_level = int(max_level)
        if self._parent_cache is not None and self._parent_cache_max_level == max_level:
            return self._parent_cache

        cache: Dict[str, Dict[int, str]] = {}
        for poi_id in self.level0_ids:
            ancestors: Dict[int, str] = {}
            current = poi_id
            for lvl in range(max_level):
                level_nodes = self.tree.get(f"level_{lvl}", {})
                if not isinstance(level_nodes, dict):
                    break
                node = level_nodes.get(current)
                if not isinstance(node, dict):
                    break
                parent = node.get("parent")
                if not parent:
                    # Chain ends here (root reached or parent not recorded).
                    break
                ancestors[lvl + 1] = parent
                current = parent
            cache[poi_id] = ancestors

        self._parent_cache = cache
        self._parent_cache_max_level = max_level
        return cache

    def get_hierarchy(self, level0_poi_id: str, max_level: int = 3) -> List[Tuple[int, str]]:
        """Return ``[(0, poi_id), (1, parent), ...]`` up to ``max_level``.

        Levels with no recorded ancestor are omitted; an id that is not in
        ``level_0`` yields just ``[(0, level0_poi_id)]``.
        """
        out: List[Tuple[int, str]] = [(0, level0_poi_id)]
        parents = self.build_parent_cache(max_level=max_level).get(level0_poi_id, {})
        for lvl in range(1, max_level + 1):
            pid = parents.get(lvl)
            if pid:
                out.append((lvl, pid))
        return out


In [17]:
# ----------------------------
# POI embedding sources (read-only)
# ----------------------------


class POIEmbeddingSource:
    """Read-only interface: poi_id -> embedding vector.

    Every method raises ``NotImplementedError`` here; the concrete sources in
    this file override all of them.
    """

    def dim(self) -> int:
        """Return the dimensionality of the vectors this source serves."""
        raise NotImplementedError

    def available(self) -> bool:
        """Return True when the source currently has embeddings loaded."""
        raise NotImplementedError

    def reload(self) -> bool:
        """Re-read the backing data; return True when embeddings are usable afterwards."""
        raise NotImplementedError

    def has(self, poi_id: str) -> bool:
        """Return True when an embedding for ``poi_id`` can be served."""
        raise NotImplementedError

    def get(self, poi_id: str) -> Optional[np.ndarray]:
        """Return embedding vector for poi_id, or None if unavailable/missing."""
        raise NotImplementedError


class HashPOIVectorSource(POIEmbeddingSource):
    """Always-available source that derives a unit vector from the POI id itself.

    The id's MD5 digest seeds a NumPy generator, so the same id always maps to
    the same float32 vector of length ``dim``.
    """

    def __init__(self, dim: int):
        self._dim = int(dim)

    def dim(self) -> int:
        """Length of the generated vectors."""
        return self._dim

    def available(self) -> bool:
        """Nothing has to be loaded, so this source is always ready."""
        return True

    def reload(self) -> bool:
        """No backing data to refresh; always reports success."""
        return True

    def has(self, poi_id: str) -> bool:
        """Every id can be hashed, so every id is considered present."""
        return True

    def get(self, poi_id: str) -> np.ndarray:
        """Return the deterministic unit-norm pseudo-embedding for ``poi_id``."""
        import hashlib

        # First four digest bytes (big-endian) == first eight hex characters.
        digest = hashlib.md5(poi_id.encode("utf-8")).digest()
        seed = int.from_bytes(digest[:4], "big")
        vec = np.random.default_rng(seed).standard_normal(self._dim).astype(np.float32)
        vec /= (np.linalg.norm(vec) + 1e-12)
        return vec


class NpyPOIEmbeddingSource(POIEmbeddingSource):
    """Embedding source backed by a matrix file plus a parallel id list.

    ``npy_path`` holds a 2-D array and ``ids_path`` one POI id per non-blank
    line; row ``i`` of the matrix belongs to the ``i``-th id.
    """

    def __init__(self, npy_path: str, ids_path: str, expected_dim: int):
        self.npy_path = Path(npy_path)
        self.ids_path = Path(ids_path)
        self.expected_dim = int(expected_dim)

        self._poi_index: Dict[str, int] = {}
        self._mat: Optional[np.ndarray] = None
        self._loaded = False
        # Load eagerly; missing files simply leave the source unavailable.
        self.reload()

    def dim(self) -> int:
        """Dimensionality the matrix is required to have."""
        return self.expected_dim

    def available(self) -> bool:
        """True once a matrix and id list have been loaded successfully."""
        return self._loaded

    def reload(self) -> bool:
        """(Re)load both files.

        Returns:
            False (and clears any loaded state) when either file is missing,
            True after a successful load.

        Raises:
            ValueError: The matrix is not 2-D, its row count differs from the
                number of ids, or its width differs from ``expected_dim``.
        """
        files_present = self.npy_path.exists() and self.ids_path.exists()
        if not files_present:
            self._loaded = False
            self._poi_index = {}
            self._mat = None
            return False

        matrix = np.load(self.npy_path)
        with self.ids_path.open("r", encoding="utf-8") as handle:
            poi_ids = []
            for raw_line in handle:
                cleaned = raw_line.strip()
                if cleaned:
                    poi_ids.append(cleaned)

        if matrix.ndim != 2:
            raise ValueError(f"Expected 2D npy matrix, got shape {matrix.shape}")
        n_rows, n_cols = matrix.shape
        if len(poi_ids) != n_rows:
            raise ValueError(f"IDs count {len(poi_ids)} != npy rows {n_rows}")
        if n_cols != self.expected_dim:
            raise ValueError(f"POI dim mismatch: expected {self.expected_dim}, got {n_cols}")

        self._mat = matrix.astype(np.float32, copy=False)
        # For duplicated ids the last occurrence wins.
        self._poi_index = dict(zip(poi_ids, range(len(poi_ids))))
        self._loaded = True
        return True

    def has(self, poi_id: str) -> bool:
        """True when data is loaded and ``poi_id`` appears in the id list."""
        if not self._loaded:
            return False
        return poi_id in self._poi_index

    def get(self, poi_id: str) -> Optional[np.ndarray]:
        """Return the matrix row for ``poi_id``, or None when not loaded / unknown."""
        if not self._loaded:
            return None
        row = self._poi_index.get(poi_id)
        return None if row is None else self._mat[row]


class PklPOIEmbeddingSource(POIEmbeddingSource):
    """Loads POI embeddings from a dict pickle: {poi_id: np.ndarray(dim,)}."""

    def __init__(self, pkl_path: str, expected_dim: int):
        self.pkl_path = Path(pkl_path)
        self.expected_dim = int(expected_dim)

        self._d: Dict[str, np.ndarray] = {}
        self._loaded = False
        # Load eagerly; a missing file simply leaves the source unavailable.
        self.reload()

    def dim(self) -> int:
        """Dimensionality every stored vector is required to have."""
        return self.expected_dim

    def available(self) -> bool:
        """True once a pickle has been loaded successfully."""
        return self._loaded

    def reload(self) -> bool:
        """(Re)load the pickle.

        Every vector is validated against ``expected_dim`` (not just the first
        one), so a malformed entry is reported at load time rather than
        surfacing later as a shape error during an update.

        Returns:
            False (and clears any loaded state) when the file is missing,
            True after a successful load.

        Raises:
            ValueError: The pickle is not a dict, or any vector is not 1-D of
                length ``expected_dim``.
        """
        if not self.pkl_path.exists():
            self._d = {}
            self._loaded = False
            return False

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # embedding files produced by a trusted pipeline.
        with self.pkl_path.open("rb") as f:
            d = pickle.load(f)
        if not isinstance(d, dict):
            raise ValueError("POI pkl must be a dict {poi_id: vec}")

        loaded: Dict[str, np.ndarray] = {}
        for k, raw in d.items():
            v = np.asarray(raw, dtype=np.float32)
            if v.ndim != 1 or v.shape[0] != self.expected_dim:
                raise ValueError(
                    f"POI pkl dim mismatch: expected ({self.expected_dim},), got {v.shape}"
                )
            loaded[str(k)] = v

        # Commit only after the whole file validated, so a failed reload keeps
        # the previously loaded embeddings intact.
        self._d = loaded
        self._loaded = True
        return True

    def has(self, poi_id: str) -> bool:
        """True when data is loaded and ``poi_id`` is one of its keys."""
        return self._loaded and (poi_id in self._d)

    def get(self, poi_id: str) -> Optional[np.ndarray]:
        """Return the stored vector for ``poi_id``, or None when not loaded / unknown."""
        if not self._loaded:
            return None
        return self._d.get(poi_id)


class LazyPOIEmbeddingSource(POIEmbeddingSource):
    """POI embedding source that may be unavailable initially.

    Wraps a pickle-backed and/or npy-backed source. Each ``reload()`` re-checks
    which files exist and delegates to the first usable backend, so embeddings
    that appear on disk later can be picked up without rebuilding the object.
    """

    def __init__(
        self,
        expected_dim: int,
        *,
        npy_path: str = "",
        ids_path: str = "",
        pkl_path: str = "",
        prefer: str = "pkl",
    ):
        """
        Args:
            expected_dim: Required embedding dimensionality.
            npy_path: Matrix file for the npy backend (needs ``ids_path`` too).
            ids_path: Id list file for the npy backend.
            pkl_path: Dict pickle for the pkl backend.
            prefer: ``"pkl"`` or ``"npy"`` - which backend to try first when
                both are present. Any other value is treated as ``"pkl"``.
        """
        self.expected_dim = int(expected_dim)
        self.npy_path = Path(npy_path) if npy_path else None
        self.ids_path = Path(ids_path) if ids_path else None
        self.pkl_path = Path(pkl_path) if pkl_path else None
        self.prefer = prefer

        self._impl: Optional[POIEmbeddingSource] = None
        self.reload()

    def dim(self) -> int:
        """Required embedding dimensionality."""
        return self.expected_dim

    def available(self) -> bool:
        """True when a backend was selected and reports loaded data."""
        return self._impl is not None and self._impl.available()

    def reload(self) -> bool:
        """Select a backend from the files currently on disk and load it.

        Returns:
            True when a backend was loaded, False when no backend's files exist
            (the source is then unavailable).

        Raises:
            ValueError: Propagated from the selected backend when its file
                contents are malformed.
        """
        has_pkl = bool(self.pkl_path and self.pkl_path.exists())
        has_npy = bool(
            self.npy_path and self.ids_path and self.npy_path.exists() and self.ids_path.exists()
        )
        usable = {"pkl": has_pkl, "npy": has_npy}

        # Both backends are always in the fallback order; an unrecognised
        # `prefer` value no longer silently removes the npy backend.
        prefers_npy = str(self.prefer).strip().lower() == "npy"
        order = ("npy", "pkl") if prefers_npy else ("pkl", "npy")
        chosen = next((kind for kind in order if usable[kind]), None)

        if chosen is None:
            self._impl = None
            return False

        if chosen == "pkl":
            self._impl = PklPOIEmbeddingSource(str(self.pkl_path), expected_dim=self.expected_dim)
        else:
            self._impl = NpyPOIEmbeddingSource(
                str(self.npy_path), str(self.ids_path), expected_dim=self.expected_dim
            )
        # Report the backend's real state in case the file vanished between the
        # existence check above and the load.
        return self._impl.available()

    def has(self, poi_id: str) -> bool:
        """True when a backend is loaded and knows ``poi_id``."""
        return self._impl is not None and self._impl.has(poi_id)

    def get(self, poi_id: str) -> Optional[np.ndarray]:
        """Return the backend's vector for ``poi_id``, or None when unavailable."""
        if self._impl is None:
            return None
        return self._impl.get(poi_id)


In [18]:
# ----------------------------
# Online update configs
# ----------------------------


@dataclass(frozen=True)
class UpdateRuleConfig:
    """Step-size settings for the user-vector update.

    As used by ``RealTimeInteractionEmbedder._compute_alpha``: with
    ``use_adaptive_alpha`` the base ``alpha`` is multiplied by
    ``1 + log1p(n) / alpha_c`` (``n`` = events already applied for the user) and
    capped at ``alpha_max``.
    """

    alpha: float = 0.15  # base blend factor between old user vector and new signal
    use_adaptive_alpha: bool = False  # enable the per-user event-count scaling
    alpha_max: float = 0.35  # upper bound applied when adaptive scaling is on
    alpha_c: float = 5.0  # divisor of log1p(n); larger values mean slower growth


@dataclass(frozen=True)
class TimeDecayConfig:
    """Time-decay settings for the update step size.

    As used by ``RealTimeInteractionEmbedder._compute_alpha``: when enabled and
    the event has a parseable timestamp, alpha is multiplied by
    ``exp(-dt / tau_seconds)`` where ``dt`` is the number of seconds since the
    user's previous timestamped event.
    NOTE(review): this shrinks the step for events arriving after a long gap -
    confirm that is the intended direction.
    """

    enabled: bool = True
    tau_seconds: float = 7.0 * 24.0 * 3600.0  # decay time constant: 7 days in seconds


@dataclass(frozen=True)
class NegativeSamplingConfig:
    """Negative-sampling settings used by ``RealTimeInteractionEmbedder.handle_event``.

    After each positive update, ``k`` level-0 POIs are drawn without
    replacement and the user vector is pushed away from each one, scaled by
    ``weight``.
    """

    enabled: bool = True
    k: int = 2  # number of POIs drawn per event
    weight: float = 0.1  # multiplier on the negative POI vector
    seed: int = 42  # seed for the sampler's numpy Generator


@dataclass(frozen=True)
class ColdStartInitConfig:
    """Cold-start settings.

    ``RealTimeInteractionEmbedder`` builds a ``PreferenceInitializer`` from
    ``user_preferences_csv`` only when ``enabled`` is True and the path is
    non-empty.
    """

    enabled: bool = False
    user_preferences_csv: str = ""  # expects at least user_id column


In [19]:
# ----------------------------
# User vector store (persistent)
# ----------------------------


class UserVectorStore:
    """Persistent user vector matrix.

    Stores:
      - <prefix>.json : list of user_ids in row order
      - <prefix>.npy  : float32 matrix [num_users, dim]

    Changes are kept in memory and written only by ``flush()``. The two files
    are replaced one after the other, so a crash between the replacements can
    leave them out of step; the loader detects that and refuses to start from
    an inconsistent pair.
    """

    def __init__(self, dim: int, prefix: str):
        """
        Args:
            dim: Length of every user vector.
            prefix: Path prefix for the two backing files.

        Raises:
            ValueError: Existing files on disk are inconsistent with ``dim`` or
                with each other.
        """
        self.dim = int(dim)
        self.prefix = str(prefix)

        self.user_ids: List[str] = []
        self.id_to_idx: Dict[str, int] = {}
        self.mat: np.ndarray = np.zeros((0, self.dim), dtype=np.float32)

        self.dirty = False
        self._load_if_exists()

    @property
    def ids_path(self) -> Path:
        """Location of the JSON id list."""
        return Path(self.prefix + ".json")

    @property
    def mat_path(self) -> Path:
        """Location of the vector matrix."""
        return Path(self.prefix + ".npy")

    def _load_if_exists(self) -> None:
        """Load ids and matrix when both files exist; otherwise keep the store empty."""
        if not (self.ids_path.exists() and self.mat_path.exists()):
            return

        with self.ids_path.open("r", encoding="utf-8") as f:
            user_ids = json.load(f)
        mat = np.load(self.mat_path).astype(np.float32, copy=False)

        # Validate everything before touching instance state.
        if not isinstance(user_ids, list):
            raise ValueError(f"UserVectorStore ids file must hold a JSON list: {self.ids_path}")
        if mat.ndim != 2 or mat.shape[1] != self.dim:
            raise ValueError(f"UserVectorStore dim mismatch: expected (*,{self.dim}), got {mat.shape}")
        if len(user_ids) != mat.shape[0]:
            # Without this check a stale id list would index past the matrix
            # (or silently leave rows unreachable).
            raise ValueError(
                f"UserVectorStore row mismatch: {len(user_ids)} user ids vs {mat.shape[0]} matrix rows"
            )

        self.user_ids = user_ids
        self.id_to_idx = {u: i for i, u in enumerate(user_ids)}
        self.mat = mat

    def has(self, user_id: str) -> bool:
        """Return True when ``user_id`` already has a row."""
        return user_id in self.id_to_idx

    def get(self, user_id: str) -> np.ndarray:
        """Return the user's row, creating an all-zero row for unknown users.

        The returned array is a view into the current matrix; adding another
        user afterwards replaces the matrix, so do not hold on to it.
        """
        idx = self.id_to_idx.get(user_id)
        if idx is None:
            idx = self._add_user(user_id)
        return self.mat[idx]

    def set(self, user_id: str, vec: np.ndarray) -> None:
        """Overwrite (or create) the user's row with ``vec`` and mark the store dirty."""
        idx = self.id_to_idx.get(user_id)
        if idx is None:
            idx = self._add_user(user_id)
        # np.asarray also accepts lists/tuples; float32 arrays are not copied.
        self.mat[idx] = np.asarray(vec, dtype=np.float32)
        self.dirty = True

    def _add_user(self, user_id: str) -> int:
        """Append a zero row for ``user_id`` and return its row index."""
        idx = len(self.user_ids)
        self.user_ids.append(user_id)
        self.id_to_idx[user_id] = idx
        if self.mat.size == 0:
            self.mat = np.zeros((1, self.dim), dtype=np.float32)
        else:
            self.mat = np.vstack([self.mat, np.zeros((1, self.dim), dtype=np.float32)])
        self.dirty = True
        return idx

    def flush(self) -> None:
        """Write ids and matrix to disk when there are unsaved changes.

        Each file is written to a ``.tmp`` sibling first and then moved into
        place with ``os.replace``.
        """
        if not self.dirty:
            return
        self.ids_path.parent.mkdir(parents=True, exist_ok=True)

        tmp_ids = self.ids_path.with_suffix(".json.tmp")
        tmp_mat = self.mat_path.with_suffix(".npy.tmp")

        with tmp_ids.open("w", encoding="utf-8") as f:
            json.dump(self.user_ids, f)

        # Saving through a file object keeps numpy from appending ".npy" to the
        # temporary name.
        with open(tmp_mat, "wb") as f:
            np.save(f, self.mat.astype(np.float32, copy=False))

        os.replace(tmp_ids, self.ids_path)
        os.replace(tmp_mat, self.mat_path)
        self.dirty = False


In [20]:
# ----------------------------
# Cold-start init from Sources/Files/user_preferences.csv (POI-agnostic)
# ----------------------------


def _hash_to_unit_vec(key: str, dim: int) -> np.ndarray:
    import hashlib

    h = hashlib.md5(key.encode("utf-8")).hexdigest()
    seed = int(h[:8], 16)
    rng = np.random.default_rng(seed)
    v = rng.standard_normal(dim).astype(np.float32)
    v /= float(np.linalg.norm(v) + 1e-12)
    return v


class PreferenceInitializer:
    """Cold-start vectors derived from a user-preferences CSV.

    Each row's non-empty columns (except ``user_id``) are joined as
    ``col=value|col=value`` in column order and hashed to a unit vector with
    ``_hash_to_unit_vec``; rows whose other columns are all empty hash the
    literal ``"__empty__"``. Users with identical preferences therefore get
    identical start vectors.
    """

    def __init__(self, csv_path: str, dim: int):
        """
        Args:
            csv_path: Preferences CSV; a missing file leaves the table empty.
            dim: Length of the generated vectors.

        Raises:
            ValueError: The file exists but has no ``user_id`` column.
        """
        self.dim = int(dim)
        self.csv_path = Path(csv_path)
        self.user_pref_vec: Dict[str, np.ndarray] = {}
        if self.csv_path.exists():
            self._load()

    def _load(self) -> None:
        """Read the CSV and fill ``user_pref_vec`` (later rows override earlier ones)."""
        # utf-8-sig strips a leading byte-order mark (common in spreadsheet
        # exports) that would otherwise turn the first header into
        # "\ufeffuser_id"; plain UTF-8 files read unchanged. newline="" is what
        # the csv module requires for correct handling of quoted line breaks.
        with self.csv_path.open("r", encoding="utf-8-sig", newline="") as f:
            reader = csv.DictReader(f)
            cols = reader.fieldnames or []
            if "user_id" not in cols:
                raise ValueError("user_preferences.csv must contain a 'user_id' column")
            for row in reader:
                uid = (row.get("user_id") or "").strip()
                if not uid:
                    continue
                parts = []
                for k, v in row.items():
                    if k == "user_id":
                        continue
                    if v is None:
                        # Short row: DictReader fills missing columns with None.
                        continue
                    s = str(v).strip()
                    if s:
                        parts.append(f"{k}={s}")
                key = "|".join(parts) if parts else "__empty__"
                self.user_pref_vec[uid] = _hash_to_unit_vec(key, self.dim)

    def get_init(self, user_id: str) -> Optional[np.ndarray]:
        """Return the start vector for ``user_id``, or None when the CSV has no row for it."""
        return self.user_pref_vec.get(user_id)


In [21]:
# ----------------------------
# Real-time embedder
# ----------------------------


class RealTimeInteractionEmbedder:
    """Real-time interaction handler.

    Update rule (conceptually):
      user <- (1-a)*user + a*(w(action,value)*poi_vec)

    After the positive update, optional negative sampling pushes the user
    vector away from a few randomly drawn level-0 POIs. User vectors live in a
    ``UserVectorStore`` that is flushed every ``flush_every`` accepted events;
    call ``flush()`` to persist the remainder.
    """

    def __init__(
        self,
        *,
        dim: int,
        poi_index: POITreeIndex,
        poi_source: POIEmbeddingSource,
        user_store_prefix: str = "data/user_vecs",
        log_csv_path: Optional[str] = "data/interaction_log.csv",
        flush_every: int = 500,
        validate_poi: bool = True,
        use_hierarchy: bool = False,
        level_weights: Sequence[float] = (1.0, 0.4, 0.2, 0.1),
        weights: InteractionWeights = InteractionWeights(),
        update_cfg: UpdateRuleConfig = UpdateRuleConfig(),
        time_decay_cfg: TimeDecayConfig = TimeDecayConfig(),
        negative_cfg: NegativeSamplingConfig = NegativeSamplingConfig(),
        cold_start_cfg: ColdStartInitConfig = ColdStartInitConfig(),
        normalize_each_update: bool = True,
    ):
        """
        Args:
            dim: Length of user and POI vectors.
            poi_index: Tree index used for level-0 validation and hierarchy lookup.
            poi_source: Provider of POI embedding vectors.
            user_store_prefix: Path prefix handed to ``UserVectorStore``.
            log_csv_path: CSV that accepted events are appended to; falsy disables logging.
            flush_every: Flush the user store after this many accepted events.
            validate_poi: Reject events whose POI id is not a level-0 id.
            use_hierarchy: Blend the POI vector with its ancestors' vectors.
            level_weights: Blend weights for levels 0..3 (exactly four values).
            weights: Per-interaction-type weights.
            update_cfg: Step-size settings.
            time_decay_cfg: Time-decay settings.
            negative_cfg: Negative-sampling settings.
            cold_start_cfg: Optional preference-based initialisation.
            normalize_each_update: L2-normalise the user vector after every update.

        Raises:
            ValueError: ``level_weights`` does not have four entries.
        """
        self.dim = int(dim)
        self.poi_index = poi_index
        self.poi_source = poi_source

        self.user_store = UserVectorStore(self.dim, user_store_prefix)
        self.log_csv_path = log_csv_path
        self.flush_every = int(flush_every)
        self.validate_poi = bool(validate_poi)

        self.use_hierarchy = bool(use_hierarchy)
        self.level_weights = list(level_weights)
        if len(self.level_weights) != 4:
            raise ValueError("level_weights must have 4 values for level0..3")

        self.weights = weights
        self.update_cfg = update_cfg
        self.time_decay_cfg = time_decay_cfg
        self.neg_cfg = negative_cfg
        self.normalize_each_update = bool(normalize_each_update)

        self._event_count = 0
        self._user_event_counts: Dict[str, int] = {}
        self._user_last_ts: Dict[str, datetime] = {}

        # level0_ids is a set, whose iteration order changes between processes
        # (string hash randomisation). Sorting makes the seeded sampler below
        # reproducible across runs.
        self._neg_candidates = sorted(self.poi_index.level0_ids, key=str)
        self._neg_rng = np.random.default_rng(self.neg_cfg.seed)

        self._stats = {
            "events_total": 0,
            "events_ok": 0,
            "events_skipped_unknown_poi": 0,
            "events_skipped_no_poi_embedding": 0,
            "events_skipped_missing_user_id": 0,
            "events_skipped_missing_poi_id": 0,
        }

        # Set once the log file's parent directory has been created.
        self._log_dir_ready = False

        self.pref_init: Optional[PreferenceInitializer] = None
        if cold_start_cfg.enabled and cold_start_cfg.user_preferences_csv:
            self.pref_init = PreferenceInitializer(cold_start_cfg.user_preferences_csv, self.dim)

    def refresh_poi_source(self) -> bool:
        """Ask the POI source to reload; returns whatever its ``reload()`` returns."""
        return self.poi_source.reload()

    def flush(self) -> None:
        """Persist pending user-vector changes (no-op when nothing changed)."""
        self.user_store.flush()

    def _compute_alpha(self, user_id: str, ts: Optional[datetime]) -> float:
        """Return the effective step size for this event, clamped to [0, 1].

        Side effect: when time decay is enabled and ``ts`` is not None, ``ts``
        is recorded as the user's latest timestamp (even if it is older than
        the previously recorded one).
        """
        alpha = float(self.update_cfg.alpha)
        if self.update_cfg.use_adaptive_alpha:
            n = self._user_event_counts.get(user_id, 0)
            scale = 1.0 + (math.log1p(n) / max(self.update_cfg.alpha_c, 1e-6))
            alpha = min(self.update_cfg.alpha_max, alpha * scale)

        if self.time_decay_cfg.enabled and ts is not None:
            last_ts = self._user_last_ts.get(user_id)
            if last_ts is not None:
                dt = (ts - last_ts).total_seconds()
                if dt > 0:
                    decay = math.exp(-dt / max(self.time_decay_cfg.tau_seconds, 1e-6))
                    alpha = alpha * decay
            self._user_last_ts[user_id] = ts

        return max(0.0, min(1.0, alpha))

    def _get_poi_signal_vec(self, poi_id_level0: str) -> Optional[np.ndarray]:
        """Return the POI vector used as update/scoring signal, or None when unavailable.

        Without hierarchy this is the source's vector as-is. With hierarchy it
        is the L2-normalised weighted sum of the POI and those of its ancestors
        (up to level 3) that have an embedding.
        """
        if not self.use_hierarchy:
            return self.poi_source.get(poi_id_level0)

        acc = np.zeros((self.dim,), dtype=np.float32)
        found = False
        for lvl, pid in self.poi_index.get_hierarchy(poi_id_level0, max_level=3):
            w = float(self.level_weights[lvl])
            if w == 0.0:
                continue
            v = self.poi_source.get(pid)
            if v is None:
                continue
            acc += w * v.astype(np.float32, copy=False)
            found = True
        if not found:
            return None
        n = float(np.linalg.norm(acc) + 1e-12)
        acc = acc / n
        return acc

    def _maybe_init_user(self, user_id: str) -> None:
        """Create the user's row if missing, seeded from preferences when available."""
        if self.user_store.has(user_id):
            return
        init_vec = self.pref_init.get_init(user_id) if self.pref_init is not None else None
        if init_vec is None:
            # get() creates an all-zero row for unknown users.
            _ = self.user_store.get(user_id)
            return
        self.user_store.set(user_id, init_vec)

    def _apply_negative_samples(self, uid: str, positive_poi_id: str, alpha_eff: float) -> None:
        """Push the user's vector away from randomly drawn level-0 POIs."""
        if not (self.neg_cfg.enabled and self.neg_cfg.k > 0 and self._neg_candidates):
            return

        # Sampling without replacement raises when asked for more items than
        # exist, so never request more than the candidate pool holds.
        n_draw = min(int(self.neg_cfg.k), len(self._neg_candidates))
        # Drawing indices avoids converting the whole id list to an array per event.
        picks = self._neg_rng.choice(len(self._neg_candidates), size=n_draw, replace=False)
        for pick in picks:
            neg_id = self._neg_candidates[int(pick)]
            if neg_id == positive_poi_id:
                continue
            neg_vec = self.poi_source.get(neg_id)
            if neg_vec is None:
                continue
            u = self.user_store.get(uid)
            u2 = (1.0 - alpha_eff) * u - alpha_eff * (self.neg_cfg.weight * neg_vec)
            if self.normalize_each_update:
                u2 = u2 / (np.linalg.norm(u2) + 1e-12)
            self.user_store.set(uid, u2)

    def _append_log_row(self, uid: str, pid0: str, event: InteractionEvent) -> None:
        """Append one accepted event to the CSV log (no-op when logging is disabled)."""
        if not self.log_csv_path:
            return
        if not self._log_dir_ready:
            # The default path lives under "data/", which may not exist yet.
            Path(self.log_csv_path).parent.mkdir(parents=True, exist_ok=True)
            self._log_dir_ready = True
        with open(self.log_csv_path, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(
                [
                    self._event_count,
                    uid,
                    pid0,
                    event.interaction_type,
                    float(event.value),
                    event.timestamp or "",
                    1,
                ]
            )

    def handle_event(self, event: InteractionEvent) -> Dict[str, object]:
        """Apply one interaction to the user's vector.

        Returns:
            ``{"ok": True, "user_id", "poi_id"}`` on success, otherwise
            ``{"ok": False, "reason": ...}`` with reason ``missing_user_id``,
            ``missing_poi_id``, ``unknown_poi`` or ``no_poi_embedding``.
            Rejected events leave user vectors and time-decay state untouched.
        """
        self._stats["events_total"] += 1

        uid = (event.user_id or "").strip()
        pid0 = (event.poi_id or "").strip()
        if not uid:
            self._stats["events_skipped_missing_user_id"] += 1
            return {"ok": False, "reason": "missing_user_id"}
        if not pid0:
            self._stats["events_skipped_missing_poi_id"] += 1
            return {"ok": False, "reason": "missing_poi_id"}

        if self.validate_poi and not self.poi_index.is_valid_level0(pid0):
            self._stats["events_skipped_unknown_poi"] += 1
            return {"ok": False, "reason": "unknown_poi", "poi_id": pid0}

        # Resolve the POI vector before touching any per-user state, so an
        # event that ends up skipped neither creates the user nor advances the
        # user's last-seen timestamp.
        poi_vec = self._get_poi_signal_vec(pid0)
        if poi_vec is None:
            self._stats["events_skipped_no_poi_embedding"] += 1
            return {"ok": False, "reason": "no_poi_embedding", "poi_id": pid0}

        self._maybe_init_user(uid)

        ts = parse_ts(event.timestamp)
        alpha_eff = self._compute_alpha(uid, ts)

        w = weight_for(event.interaction_type, event.value, self.weights)

        u = self.user_store.get(uid)
        u2 = (1.0 - alpha_eff) * u + alpha_eff * (w * poi_vec)
        if self.normalize_each_update:
            u2 = u2 / (np.linalg.norm(u2) + 1e-12)
        self.user_store.set(uid, u2)

        self._apply_negative_samples(uid, pid0, alpha_eff)

        self._event_count += 1
        self._user_event_counts[uid] = self._user_event_counts.get(uid, 0) + 1
        self._stats["events_ok"] += 1

        self._append_log_row(uid, pid0, event)

        if self._event_count % max(self.flush_every, 1) == 0:
            self.user_store.flush()

        return {"ok": True, "user_id": uid, "poi_id": pid0}

    def recommend_from_candidate_pois(
        self, user_id: str, candidate_poi_ids: Sequence[str], k: int = 10
    ) -> List[Tuple[str, float]]:
        """Rank candidate POIs by cosine similarity to the user's vector.

        Candidates that fail level-0 validation or have no embedding are
        dropped. An unknown user is scored with an all-zero vector and is NOT
        added to the store (a read should not create persistent state).

        Returns:
            Up to ``k`` ``(poi_id, score)`` pairs, best first.

        Raises:
            RuntimeError: The POI source reports no embeddings available.
        """
        if not self.poi_source.available():
            raise RuntimeError("POI embeddings not available: cannot compute recommendations")

        if self.user_store.has(user_id):
            u = self.user_store.get(user_id).astype(np.float32)
        else:
            u = np.zeros((self.dim,), dtype=np.float32)
        un = float(np.linalg.norm(u) + 1e-12)

        scores: List[Tuple[str, float]] = []
        for pid in candidate_poi_ids:
            if self.validate_poi and not self.poi_index.is_valid_level0(pid):
                continue
            v = self._get_poi_signal_vec(pid)
            if v is None:
                continue
            s = float(np.dot(u, v) / ((np.linalg.norm(v) + 1e-12) * un))
            scores.append((pid, s))
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[: int(k)]


In [22]:
# --------------------------------------------------
# Batch replay (offline) to rebuild user vectors
# --------------------------------------------------


def find_repo_root(start=None) -> Path:
    """Walk upwards from ``start`` until a directory containing ``.git`` is found.

    Args:
        start: Directory to begin at; falsy values mean the current working directory.

    Returns:
        The closest directory (``start`` itself or one of its parents) that has
        a ``.git`` entry.

    Raises:
        RuntimeError: No such directory exists on the way up.
    """
    origin = Path(start or Path.cwd())
    for candidate in (origin, *origin.parents):
        marker = candidate / ".git"
        if marker.exists():
            return candidate
    raise RuntimeError("Cannot find repo root (.git not found). Run inside the repo.")


# Resolve every input/output location relative to the repository root (the
# nearest directory at or above the working directory that contains .git).
ROOT = find_repo_root()
SOURCES = ROOT / "Sources"
DATA_DIR = SOURCES / "Files"

# Inputs
POI_PKL = SOURCES / "poi_embeddings.pkl"
META_PKL = SOURCES / "metadata.pkl"
POI_TREE_JSON = DATA_DIR / "poi_tree_with_uuids.json"  # fallback only
INTERACTIONS_CSV = DATA_DIR / "user_poi_interactions.csv"

# Enforce consistent embedding dimension
EXPECTED_DIM = FINAL_EMBEDDING_DIM

# Outputs
OUT_USER_VECS_DIR = SOURCES / "user_vecs"
OUT_LOG_CSV = SOURCES / "interaction_log_out.csv"
OUT_USER_IDS_JSON = OUT_USER_VECS_DIR / "user_ids.json"
OUT_USER_MAT_NPY = OUT_USER_VECS_DIR / "user_mat.npy"

# Fail early when a required input is absent.
# NOTE: assert statements are skipped when Python runs with -O.
assert POI_PKL.exists(), f"Missing: {POI_PKL}"
assert INTERACTIONS_CSV.exists(), f"Missing: {INTERACTIONS_CSV}"

# If metadata.pkl exists, load poi_tree + indices here
if META_PKL.exists():
    # NOTE(security): pickle.load can execute arbitrary code; metadata.pkl must
    # come from a trusted pipeline.
    with META_PKL.open("rb") as f:
        _meta = pickle.load(f)
    poi_tree = _meta.get("poi_tree_data", {})
    meta_poi_to_idx = _meta.get("poi_to_idx", {})
    meta_idx_to_poi = _meta.get("idx_to_poi", {})
    poi_index = POITreeIndex(poi_tree)
    print("Loaded metadata from", META_PKL)
else:
    # Fallback: read the tree from JSON; the index lookup tables stay empty.
    assert POI_TREE_JSON.exists(), f"Missing: {POI_TREE_JSON} and no metadata.pkl"
    with POI_TREE_JSON.open("r", encoding="utf-8") as f:
        poi_tree = json.load(f)
    meta_poi_to_idx = {}
    meta_idx_to_poi = {}
    poi_index = POITreeIndex(poi_tree)


# 1) Build token <-> name mapping from poi_tree
# token_to_name keeps every level_0 token that has a non-empty "name";
# name_to_token keeps only the FIRST token seen for a name, so names shared by
# several POIs resolve to that first token.
lvl0 = poi_tree.get("level_0", {})
token_to_name: Dict[str, str] = {}
name_to_token: Dict[str, str] = {}

for tok, node in (lvl0.items() if isinstance(lvl0, dict) else ()):
    if not isinstance(tok, str):
        continue
    nm = str(node.get("name", "")).strip() if isinstance(node, dict) else ""
    if not nm:
        continue
    token_to_name[tok] = nm
    name_to_token.setdefault(nm, tok)

print("uuid-name pairs:", len(token_to_name))
print("sample:", list(token_to_name.items())[:3])


# 2) Load level_0 embeddings from PKL as (poi_id -> vec)

def load_level0_poi_embeddings(pkl_path: Path) -> Tuple[Dict[str, np.ndarray], int]:
    """Read the level-0 POI embeddings from the embeddings pickle.

    The pickle is expected to contain
    ``obj["poi_embeddings"]["level_0"]`` with an ``"embeddings"`` matrix and a
    parallel ``"poi_ids"`` sequence.

    Returns:
        ``(mapping, EXPECTED_DIM)`` where ``mapping`` goes from ``str(poi_id)``
        to that POI's float32 row of the matrix.

    Raises:
        ValueError: The matrix is not 2-D, its row count differs from the
            number of ids, or its width differs from ``EXPECTED_DIM``.
    """
    with open(pkl_path, "rb") as handle:
        payload = pickle.load(handle)

    level0 = payload["poi_embeddings"]["level_0"]
    vectors = np.asarray(level0["embeddings"], dtype=np.float32)  # (N, D)
    poi_ids = level0["poi_ids"]  # len N

    if vectors.ndim != 2:
        raise ValueError(f"Expected 2D embeddings matrix, got shape={vectors.shape}")
    n_rows, width = vectors.shape
    if len(poi_ids) != n_rows:
        raise ValueError(f"poi_ids len {len(poi_ids)} != embeddings rows {n_rows}")
    if width != EXPECTED_DIM:
        raise ValueError(f"POI embedding dim mismatch: expected {EXPECTED_DIM}, got {width}")

    by_id: Dict[str, np.ndarray] = {}
    for row_no in range(n_rows):
        by_id[str(poi_ids[row_no])] = vectors[row_no]
    return by_id, EXPECTED_DIM


# poi_embeddings maps str(poi_id) -> vector; POI_DIM is the validated width
# (the loader returns EXPECTED_DIM).
poi_embeddings, POI_DIM = load_level0_poi_embeddings(POI_PKL)

print("POI embeddings loaded (level_0)")
print("   n:", len(poi_embeddings))
print("   dim:", POI_DIM)
print("   key sample:", list(poi_embeddings.keys())[:3])


# 3) Load interactions CSV (flexible columns)
# Column names are resolved afterwards with pick_col, so the file may use any
# of the accepted aliases.

df = pd.read_csv(INTERACTIONS_CSV)


def pick_col(cands):
    """Return the first column of the global ``df`` matching one of ``cands``.

    Column names are compared case-insensitively against the candidates, which
    are expected in lower case; candidates are tried in the given order. The
    column's original spelling is returned, or None when nothing matches.
    """
    by_lower = {name.lower(): name for name in df.columns}
    return next((by_lower[cand] for cand in cands if cand in by_lower), None)


# Resolve the actual column names; each list is in priority order.
USER_COL = pick_col(["user_id", "user", "uid"])
POI_COL = pick_col(["poi_id", "poi", "item_id", "entity_id", "poi_name"])
TYPE_COL = pick_col(["interaction_type", "action", "type"])
VAL_COL = pick_col(["value", "weight", "rating", "score"])
TS_COL = pick_col(["timestamp", "time", "ts", "datetime"])

# User and POI columns are mandatory; the others get synthetic default columns
# (every event becomes a "visit" with value 1.0 and no timestamp).
assert USER_COL and POI_COL, f"interactions csv needs user_id & poi_id cols. got={list(df.columns)}"
if TYPE_COL is None:
    df["_type"] = "visit"
    TYPE_COL = "_type"
if VAL_COL is None:
    df["_val"] = 1.0
    VAL_COL = "_val"
if TS_COL is None:
    df["_ts"] = ""
    TS_COL = "_ts"

# Normalise ids to trimmed strings. Note that astype(str) turns missing values
# into the literal string "nan".
df[USER_COL] = df[USER_COL].astype(str).str.strip()
df[POI_COL] = df[POI_COL].astype(str).str.strip()

print("CSV POIs:", df[POI_COL].nunique())
print("CSV Users:", df[USER_COL].nunique())


# 4) Resolve: if CSV poi_id is a token, use direct; otherwise map name -> token
# emb_keys: fast membership test for resolve_poi_key.
emb_keys = set(poi_embeddings.keys())
# emb_key_list: candidate pool for negative sampling. It is sorted because set
# iteration order for strings changes between Python processes (hash
# randomisation); with list(emb_keys) the seeded sampler would pick different
# POIs on every kernel restart.
emb_key_list = sorted(emb_keys)


def resolve_poi_key(raw: str) -> Optional[str]:
    """Map a raw CSV POI value to an embedding key.

    The trimmed value is used directly when it is already an embedding key;
    otherwise it is treated as a POI name and translated through
    ``name_to_token``. Returns None when neither route yields a key that has an
    embedding.
    """
    cleaned = str(raw).strip()
    if cleaned in emb_keys:
        return cleaned

    mapped = name_to_token.get(cleaned)
    return mapped if (mapped and mapped in emb_keys) else None


# 5) Online updates (EMA)
# Replay hyper-parameters. The values equal the defaults of UpdateRuleConfig,
# TimeDecayConfig and NegativeSamplingConfig defined earlier in this notebook.
ALPHA = 0.15  # base blend factor of the EMA update
NORMALIZE_EACH_UPDATE = True  # L2-normalise the user vector after every update

TIME_DECAY_ENABLED = True
TAU_SECONDS = 7.0 * 24.0 * 3600.0  # 7-day window

NEGATIVE_SAMPLES_PER_EVENT = 2  # POIs drawn per event to push the user away from
NEGATIVE_WEIGHT = 0.1  # multiplier on each negative POI vector
NEGATIVE_SEED = 42  # seed of the negative-sampling generator
_neg_rng = np.random.default_rng(NEGATIVE_SEED)


def parse_ts_local(ts: Optional[str]) -> Optional[datetime]:
    """Thin alias for ``parse_ts``; returns exactly what ``parse_ts(ts)`` returns."""
    return parse_ts(ts)


# In-memory replay state: current vector and latest parsed timestamp per user.
user_vecs: Dict[str, np.ndarray] = {}
user_last_ts: Dict[str, datetime] = {}


def get_user_vec(uid: str) -> np.ndarray:
    """Return the user's current vector, registering a zero vector on first sight.

    A newly created vector has length ``POI_DIM`` and dtype float32, and is
    stored in ``user_vecs`` before being returned.
    """
    existing = user_vecs.get(uid)
    if existing is None:
        existing = np.zeros((POI_DIM,), dtype=np.float32)
        user_vecs[uid] = existing
    return existing


OUT_USER_VECS_DIR.mkdir(parents=True, exist_ok=True)

# Loop invariants hoisted out of the replay loop.
_replay_weights = InteractionWeights()
# Sampling without replacement raises when asked for more items than exist, so
# never request more negatives than there are embeddings.
_neg_sample_size = min(NEGATIVE_SAMPLES_PER_EVENT, len(emb_key_list))

ok = 0
skipped = 0

# The log is opened once in "w" mode (which also discards any log left by a
# previous run) instead of being reopened for every row. Each row records the
# outcome of one event: ok=1 when the update was applied, ok=0 when skipped.
with OUT_LOG_CSV.open("w", newline="", encoding="utf-8") as f:
    log_writer = csv.writer(f)
    log_writer.writerow(["i", "user_id", "poi_raw", "poi_id", "type", "value", "timestamp", "ok"])

    for i, row in df.iterrows():
        uid = row[USER_COL]
        raw_poi = row[POI_COL]
        t = row[TYPE_COL]
        # Missing values fall back to 1.0, the default event value.
        val = float(row[VAL_COL]) if not pd.isna(row[VAL_COL]) else 1.0
        ts = row[TS_COL] if TS_COL in row else ""

        key = resolve_poi_key(raw_poi)
        vec = poi_embeddings.get(key) if key is not None else None
        if vec is None:
            # No embedding for this POI: skip before touching any per-user state.
            skipped += 1
            log_writer.writerow([i, uid, raw_poi, key if key is not None else "", t, val, ts, 0])
            continue

        # Effective step size: base alpha, shrunk by exp(-dt / tau) where dt is
        # the time since this user's previous timestamped event.
        alpha_eff = ALPHA
        if TIME_DECAY_ENABLED:
            ts_dt = parse_ts_local(ts)
            if ts_dt is not None:
                last = user_last_ts.get(uid)
                if last is not None:
                    dt = (ts_dt - last).total_seconds()
                    if dt > 0:
                        alpha_eff *= math.exp(-dt / max(TAU_SECONDS, 1e-6))
                user_last_ts[uid] = ts_dt

        # Positive update: pull the user vector towards the weighted POI vector.
        w = weight_for(str(t), val, _replay_weights)
        u = get_user_vec(uid)
        u2 = (1.0 - alpha_eff) * u + alpha_eff * (w * vec)
        if NORMALIZE_EACH_UPDATE:
            u2 = u2 / (np.linalg.norm(u2) + 1e-12)
        user_vecs[uid] = u2

        # Negative updates: push the user vector away from random other POIs.
        if _neg_sample_size > 0:
            neg_ids = _neg_rng.choice(emb_key_list, size=_neg_sample_size, replace=False)
            for neg_key in neg_ids:
                if neg_key == key:
                    continue
                neg_vec = poi_embeddings.get(neg_key)
                if neg_vec is None:
                    continue
                u = get_user_vec(uid)
                u2 = (1.0 - alpha_eff) * u - alpha_eff * (NEGATIVE_WEIGHT * neg_vec)
                if NORMALIZE_EACH_UPDATE:
                    u2 = u2 / (np.linalg.norm(u2) + 1e-12)
                user_vecs[uid] = u2

        log_writer.writerow([i, uid, raw_poi, key, t, val, ts, 1])

        ok += 1

# Save (per-user npy + full matrix)
# One <user_id>.npy file per user. The user id is used verbatim as the file name.
# NOTE(review): ids containing path separators, or equal to "user_mat", would
# write outside or over the files below - confirm ids are safe.
for uid, vec in user_vecs.items():
    np.save(OUT_USER_VECS_DIR / f"{uid}.npy", vec.astype(np.float32))

# Combined outputs: user_ids.json lists ids in the same order as the rows of
# user_mat.npy.
# NOTE(review): UserVectorStore reads "<prefix>.json" + "<prefix>.npy"; these two
# file names do not share a prefix, so that class cannot load them as-is.
user_ids = list(user_vecs.keys())
mat = np.stack([user_vecs[u] for u in user_ids], axis=0) if user_ids else np.zeros((0, POI_DIM), dtype=np.float32)

OUT_USER_IDS_JSON.write_text(json.dumps(user_ids, ensure_ascii=False), encoding="utf-8")
np.save(OUT_USER_MAT_NPY, mat.astype(np.float32))

# Run summary.
print("\nComplete!")
print(f"   POI dim: {POI_DIM}")
print(f"   Total events: {len(df)}")
print(f"   OK updates : {ok}")
print(f"   Skipped(no POI match): {skipped}")
print(f"   Users updated: {len(user_vecs)}")
print(f"   Saved user vecs -> {OUT_USER_VECS_DIR.relative_to(ROOT)}")
print(f"   Log -> {OUT_LOG_CSV.relative_to(ROOT)}")


Loaded metadata from c:\Users\syoon\SpatiaLynk_recommender\Sources\metadata.pkl
uuid-name pairs: 4696
sample: [('poi_0_Giant', 'Giant'), ('poi_1_NTUC_FairPrice', 'NTUC FairPrice'), ('poi_2_NTUC_FairPrice', 'NTUC FairPrice')]
POI embeddings loaded (level_0)
   n: 4696
   dim: 64
   key sample: ['poi_0_Giant', 'poi_1_NTUC_FairPrice', 'poi_2_NTUC_FairPrice']
CSV POIs: 235
CSV Users: 21

Complete!
   POI dim: 64
   Total events: 567
   OK updates : 567
   Skipped(no POI match): 0
   Users updated: 21
   Saved user vecs -> Sources\user_vecs
   Log -> Sources\interaction_log_out.csv
