In [1]:
import pandas as pd
import pickle

In [2]:
def build_dataset_tkyca(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Convert train/test check-in tables into per-user session dictionaries.

    Both frames must provide the columns ``UserId``, ``PoiId``,
    ``Longitude``, ``Latitude``, ``UTCTimeOffset`` and
    ``pseudo_session_trajectory_id``; ``UTCTimeOffsetEpoch`` is optional and,
    when present, is used as the chronological sort key.  The inputs are not
    modified.

    Processing steps:

    * ``PoiId`` is shifted by -1 and then mapped to a dense index
      ``0..n_poi-1`` (ascending order of the shifted id).  The SAME dense
      index is used in the sessions and as the key of ``vid_lookup``, so every
      POI that appears in a session can be looked up.
    * ``UTCTimeOffset`` is parsed into a half-hour slot of the day (0..47).
    * A trajectory is identified by ``(pseudo_session_trajectory_id, split)``,
      so a trajectory id that occurs in both train and test yields two
      separate sessions and no session can mix the two splits.
    * Trajectories with exactly one check-in are dropped.
    * The remaining trajectories of a user are ordered by their earliest
      check-in and numbered ``0..k-1``.

    Returns:
        tuple ``(data, vid_lookup)`` where

        * ``data[user_idx] = {"train": [...], "test": [...], "sessions": {...}}``
          with ``user_idx`` the rank of the user's ``UserId`` in ascending
          order, ``sessions[sess_idx]`` a list of ``[poi_idx, slot48]`` pairs
          in chronological order, and ``train`` / ``test`` the sorted session
          indices belonging to each split;
        * ``vid_lookup[poi_idx] = [longitude, latitude]``.
    """
    traj_col = "pseudo_session_trajectory_id"

    # Tag each row with its split and merge; ``assign`` returns new frames, so
    # the caller's inputs are left untouched.
    df = pd.concat(
        [train_df.assign(__split="train"), test_df.assign(__split="test")],
        ignore_index=True,
    )
    df["PoiId"] = df["PoiId"].astype(int) - 1

    # Parse the timestamp once: keep the full value for ordering and derive the
    # 48-slot (half-hour) time-of-day feature from it.
    ts = pd.to_datetime(df["UTCTimeOffset"], errors="coerce")
    df["_ts"] = ts
    df["_slot48"] = ((ts.dt.hour * 60 + ts.dt.minute) // 30).astype("Int64")

    # Chronological key: prefer the epoch column; otherwise use the full parsed
    # timestamp.  The slot alone is only a time of day and would misorder
    # check-ins on different days or across midnight.
    time_key = "UTCTimeOffsetEpoch" if "UTCTimeOffsetEpoch" in df.columns else "_ts"
    df = df.sort_values(["UserId", traj_col, time_key]).reset_index(drop=True)

    # Dense POI index, shared by the sessions and by vid_lookup.
    poi_ids = sorted(df["PoiId"].unique().tolist())
    pid2idx = {pid: i for i, pid in enumerate(poi_ids)}
    df["_poi_idx"] = df["PoiId"].map(pid2idx)

    # POI -> [lon, lat]; coordinates are taken from the first row of each POI.
    first_rows = df.drop_duplicates("PoiId")
    vid_lookup = {
        int(idx): [float(lon), float(lat)]
        for idx, lon, lat in zip(
            first_rows["_poi_idx"], first_rows["Longitude"], first_rows["Latitude"]
        )
    }

    data = {}

    # groupby(sort=True) yields users in ascending UserId order, so the
    # enumerate position is the dense user index.
    for uidx, (_, g_user) in enumerate(df.groupby("UserId", sort=True)):
        # Grouping on (trajectory id, split) separates the train and test parts
        # of a trajectory id that occurs in both, without inventing new ids.
        by_traj = g_user.groupby([traj_col, "__split"], sort=True)

        # Order trajectories by their earliest check-in.  The stable sort keeps
        # ties in (trajectory id, split) order; missing times sort last.
        start_times = by_traj[time_key].min().sort_values(kind="mergesort")

        sessions = {}
        train_idx = []
        test_idx = []
        for traj_key in start_times.index:
            g_traj = by_traj.get_group(traj_key)
            if len(g_traj) == 1:  # drop single-check-in trajectories
                continue

            # Session numbers are assigned only to kept trajectories, so they
            # are consecutive and already ascending inside each split list.
            sess_idx = len(sessions)
            sessions[sess_idx] = (
                g_traj[["_poi_idx", "_slot48"]].astype("Int64").values.tolist()
            )
            if traj_key[1] == "train":
                train_idx.append(sess_idx)
            else:
                test_idx.append(sess_idx)

        data[uidx] = {"train": train_idx, "test": test_idx, "sessions": sessions}

    return data, vid_lookup

In [None]:
# tky: build and pickle the Foursquare-TKY dataset.
train_df = pd.read_csv("../../Foursquare-TKY/train_sample.csv")
# The test split must come from the same dataset as the train split; this
# previously read the Gowalla-CA test file by mistake.
test_df  = pd.read_csv("../../Foursquare-TKY/test_sample.csv")

data_neural, vid_lookup = build_dataset_tkyca(train_df, test_df)

data = {
    "data_neural": data_neural,
    "uid_list": len(data_neural),   # number of users
    # NOTE(review): hard-coded POI count, presumably for Foursquare-TKY —
    # confirm it is consistent with the POI ids produced above.
    "vid_list": 7832,
    "vid_lookup": vid_lookup,
}

with open("dataset/tky_cut_one_day.pkl", "wb") as file:
    pickle.dump(data, file)

In [None]:
# ca: build and pickle the Gowalla-CA dataset.

train_df = pd.read_csv("../../Gowalla-CA/train_sample.csv")
test_df = pd.read_csv("../../Gowalla-CA/test_sample.csv")

data_neural, vid_lookup = build_dataset_tkyca(train_df, test_df)

# Bundle everything that gets pickled into a single dictionary.
data = {
    "data_neural": data_neural,
    "uid_list": len(data_neural),   # number of users
    "vid_list": 9689,               # hard-coded POI count
    "vid_lookup": vid_lookup,
}

with open("dataset/ca_cut_one_day.pkl", "wb") as file:
    pickle.dump(data, file)