In [13]:
import pandas as pd
import os

# nyc

In [7]:
# nyc
df_train = pd.read_csv('../../Foursquare-NYC/train_sample.csv')
df_val = pd.read_csv('../../Foursquare-NYC/valid_sample.csv')
df_test = pd.read_csv('../../Foursquare-NYC/test_sample.csv')

In [8]:
# train_filter following GETNext

from tqdm import tqdm

def train_filter(train_df: pd.DataFrame) -> pd.DataFrame:
    """
    filter short trajectory following GETNext setting (default = 2)
    """
    filtered_rows = []

    for traj_id in tqdm(train_df['trajectory_id'].unique()):
        traj_df = train_df[train_df['trajectory_id'] == traj_id]
        poi_ids = traj_df['PoiId'].to_list()

        input_seq = [(poi_ids[i], 0) for i in range(len(poi_ids) - 1)]
        
        if len(input_seq) >= 2:
            filtered_rows.append(traj_df)

    df_filtered = pd.concat(filtered_rows, ignore_index=True)

    return df_filtered

In [9]:
df_train_filter = train_filter(df_train)

  0%|          | 0/11022 [00:00<?, ?it/s]

 23%|██▎       | 2556/11022 [00:14<00:49, 171.77it/s]


KeyboardInterrupt: 

In [None]:
# valid and test filter following GETNext

from tqdm import tqdm

def val_test_filter(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """
    val and test filter
    1. user id only in train set
    2. poi id only in train set
    3. filter short trajectory
    """
    # user id and poi id from train before short trajectory filter
    train_user_ids = set(train_df['UserId'].unique())
    train_poi_ids = set(train_df['PoiId'].unique())

    # 1. user id filter
    df_filtered = test_df[test_df['UserId'].isin(train_user_ids)]

    # 2. poi id filter
    df_filtered = df_filtered[df_filtered['PoiId'].isin(train_poi_ids)]

    # 3. filter short trajectory following GETNext setting (default = 2)
    #traj_counts = df_filtered.groupby('trajectory_id')['PoiId'].transform('count')
    #df_filtered = df_filtered[traj_counts >= 2]

    # 3. filter short trajectory following GETNext setting (default = 2)
    filtered_rows = []

    for traj_id in tqdm(df_filtered['trajectory_id'].unique()):
        traj_df = df_filtered[df_filtered['trajectory_id'] == traj_id]
        poi_ids = traj_df['PoiId'].to_list()

        input_seq = [(poi_ids[i], 0) for i in range(len(poi_ids) - 1)]
        
        if len(input_seq) >= 2:
            filtered_rows.append(traj_df)

    df_filtered = pd.concat(filtered_rows, ignore_index=True)

    df_filtered = df_filtered.reset_index(drop=True)

    return df_filtered

In [None]:
df_val_filter = val_test_filter(df_train, df_val)

  0%|          | 0/1486 [00:00<?, ?it/s]

100%|██████████| 1486/1486 [00:01<00:00, 997.68it/s] 


In [None]:
df_test_filter = val_test_filter(df_train, df_test)

  0%|          | 0/1447 [00:00<?, ?it/s]

100%|██████████| 1447/1447 [00:01<00:00, 989.20it/s] 


In [None]:
# rename columns

def rename_and_select_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    rename and select columns

    Args:
        df (pd.DataFrame): input data

    Returns:
        pd.DataFrame: processed data
    """
    df_rename = df.rename(
        columns={
            "UserId": "user_id",
            "PoiId": "location_id",
            "PoiCategoryId": "category",
            "PoiCategoryName": "name",
            "Latitude": "latitude",
            "Longitude": "longitude",
            "UTCTimeOffset": "started_at"
        }
    )[
        ["user_id", "location_id", "category", "name", "latitude", "longitude", "started_at", "trajectory_id"]
    ]
    return df_rename

In [None]:
df_train_processed = rename_and_select_columns(df_train_filter)
df_val_processed = rename_and_select_columns(df_val_filter)
df_test_processed = rename_and_select_columns(df_test_filter)

In [None]:
os.makedirs("nyc/", exist_ok=True)

In [None]:
df_train_processed.to_csv('nyc/train.csv', index=False)
df_val_processed.to_csv('nyc/valid.csv', index=False)
df_test_processed.to_csv('nyc/test.csv', index=False)