In [1]:
import pickle
from pathlib import Path

import pandas as pd

In [2]:
def build_dataset(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Convert train/test visit tables into per-user sessions plus a POI coordinate lookup.

    Both frames must provide the columns ``UserId``, ``trajectory_id``, ``PoiId``,
    ``Longitude``, ``Latitude`` and ``UTCTimeOffset``. ``UTCTimeOffsetEpoch`` is
    optional: when present it is the chronological sort key, otherwise the
    timestamp parsed from ``UTCTimeOffset`` is used.

    Args:
        train_df: Rows whose trajectories belong to the training split.
        test_df: Rows whose trajectories belong to the test split.

    Returns:
        A ``(data, vid_lookup)`` tuple.

        ``data`` maps a dense user index (position of the user id in sorted
        order) to ``{"train": [...], "test": [...], "sessions": {...}}``.
        ``sessions`` maps a per-user session number to the chronologically
        ordered list of ``[dense_poi_index, slot48]`` pairs of one trajectory,
        where ``slot48`` is the half-hour slot of the day (0..47). Session
        numbers are consecutive from 0, assigned in order of each trajectory's
        earliest time, and only trajectories with at least two rows get one.
        ``"train"`` / ``"test"`` list the session numbers of each split.

        ``vid_lookup`` maps every dense POI index to ``[longitude, latitude]``
        taken from the first row (in sorted order) in which that POI appears.

    Raises:
        ValueError: If a kept trajectory of one user has rows in both splits.
    """
    split_col = "__split"

    # Tag each row with its split on copies (the caller's frames stay untouched), then merge.
    tagged = []
    for frame, label in ((train_df, "train"), (test_df, "test")):
        part = frame.copy()
        part[split_col] = label
        tagged.append(part)
    df = pd.concat(tagged, ignore_index=True)

    # Parse the timestamp and derive the half-hour slot of the day (0..47).
    # Unparseable timestamps become NaT and therefore a missing (<NA>) slot.
    ts = pd.to_datetime(df["UTCTimeOffset"], errors="coerce")
    df["_ts"] = ts
    df["_slot48"] = ((ts.dt.hour * 60 + ts.dt.minute) // 30).astype("Int64")

    # Sort key: prefer the epoch column; otherwise use the full parsed timestamp.
    # The slot alone only encodes time-of-day, so it would misorder visits that
    # cross midnight or span several days.
    sort_key = "UTCTimeOffsetEpoch" if "UTCTimeOffsetEpoch" in df.columns else "_ts"
    df = df.sort_values(["UserId", "trajectory_id", sort_key]).reset_index(drop=True)

    # Map PoiId onto the dense range [0, P-1] (sorted order of the distinct ids).
    pid_categories = sorted(df["PoiId"].dropna().unique().tolist())
    pid2idx = {pid: i for i, pid in enumerate(pid_categories)}
    df["PoiId_norm"] = df["PoiId"].map(pid2idx).astype("Int64")

    # Full POI -> coordinates mapping, shaped {dense_pid: [lon, lat]}.
    poi_rows = df.dropna(subset=["PoiId"]).drop_duplicates("PoiId")
    vid_lookup = {
        pid2idx[pid]: [float(lon), float(lat)]
        for pid, lon, lat in zip(
            poi_rows["PoiId"].tolist(),
            poi_rows["Longitude"].tolist(),
            poi_rows["Latitude"].tolist(),
        )
    }

    data = {}
    # groupby iterates users in sorted id order, so the enumerate position is the
    # dense user index; rows with a missing UserId are skipped by groupby.
    for uidx, (uid, g_user) in enumerate(df.groupby("UserId", sort=True)):
        by_traj = g_user.groupby("trajectory_id")

        # Order trajectories by their earliest time; the stable sort keeps ties
        # in ascending trajectory_id order so session numbers are deterministic.
        traj_order = (
            by_traj[sort_key].min().sort_values(kind="mergesort").index.tolist()
        )

        sessions = {}
        train_idx = []
        test_idx = []
        mixed = []

        for tid in traj_order:
            g_traj = by_traj.get_group(tid)
            if len(g_traj) < 2:  # drop single-visit trajectories
                continue

            # Strict check, applied to kept trajectories only: one trajectory
            # must not contain rows from both splits.
            if g_traj[split_col].nunique() > 1:
                mixed.append(tid)
                continue

            # Session numbers are consecutive over the kept trajectories.
            sess_id = len(sessions)

            # Emit the dense POI index so every session entry is a valid key
            # into vid_lookup.
            sessions[sess_id] = [
                [poi, slot]
                for poi, slot in zip(
                    g_traj["PoiId_norm"].tolist(), g_traj["_slot48"].tolist()
                )
            ]

            if g_traj[split_col].iloc[0] == "train":
                train_idx.append(sess_id)
            else:
                test_idx.append(sess_id)

        if mixed:
            raise ValueError(f"检测到同一用户 {uid} 的以下轨迹同时出现在 train 与 test：{mixed}")

        data[uidx] = {"train": train_idx, "test": test_idx, "sessions": sessions}

    return data, vid_lookup

In [2]:
# Read the train and test sample CSVs (paths are relative to the notebook's
# working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../Foursquare-NYC/train_sample.csv",
        "../../Foursquare-NYC/test_sample.csv",
    )
)

In [4]:
# Build the per-user session dictionary and the POI -> [lon, lat] lookup.
data_neural, vid_lookup = build_dataset(train_df=train_df, test_df=test_df)

In [5]:
# Number of entries in data_neural (one per dense user index).
len(data_neural)

1047

In [9]:
# Bundle everything that gets pickled into a single dictionary.
# NOTE(review): despite the "_list" suffix, both 'uid_list' and 'vid_list' hold
# counts, not lists. 4980 is hard-coded — presumably the POI vocabulary size;
# confirm it matches the data actually loaded.
data = {
    'data_neural': data_neural,
    'uid_list': len(data_neural),
    'vid_list': 4980,
    'vid_lookup': vid_lookup,
}

In [None]:
# Serialize the assembled dictionary. The output directory is created first so
# that a missing `dataset/` folder does not fail the run at the very last step.
out_path = Path("dataset/nyc_cut_one_day.pkl")
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open("wb") as file:
    pickle.dump(data, file)