In [1]:
!pip install transformers tqdm scikit-learn

import gzip
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertModel, AdamW
from tqdm import tqdm
import warnings





In [2]:
import pandas as pd

def load_checkins_txt(file_path):
    """
    Load Brighkite check-in data from a plain text file.
    """
    # Column names based on Gowalla data description
    columns = ["user_id", "check_in_time", "latitude", "longitude", "location_id"]
    data = pd.read_csv(file_path, sep="\t", names=columns)
    # Convert timestamp to datetime for easier manipulation
    data["check_in_time"] = pd.to_datetime(data["check_in_time"])
    return data

checkins_file = "/kaggle/input/brightkite-raw/Brightkite_totalCheckins.txt"  
brightkite_data = load_checkins_txt(checkins_file)
print(brightkite_data.head())


   user_id             check_in_time   latitude   longitude  \
0        0 2010-10-17 01:48:53+00:00  39.747652 -104.992510   
1        0 2010-10-16 06:02:04+00:00  39.891383 -105.070814   
2        0 2010-10-16 03:48:54+00:00  39.891077 -105.068532   
3        0 2010-10-14 18:25:51+00:00  39.750469 -104.999073   
4        0 2010-10-14 00:21:47+00:00  39.752713 -104.996337   

                                location_id  
0          88c46bf20db295831bd2d1718ad7e6f5  
1          7a0f88982aa015062b95e3b4843f9ca2  
2          dd7cd3d264c2d063832db506fba8bf79  
3  9848afcc62e500a01cf6fbf24b797732f8963683  
4          2ef143e12038c870038df53e0478cefc  


In [3]:
def add_spatio_temporal_features(df, lat_multiplier=10, lon_multiplier=10):
    # Ensure numeric
    df["latitude"] = pd.to_numeric(df["latitude"], errors="coerce")
    df["longitude"] = pd.to_numeric(df["longitude"], errors="coerce")

    # Drop rows with NaN in these columns
    df = df.dropna(subset=["latitude", "longitude"])
    
    # Also remove infinite if present
    df = df[~df["latitude"].isin([np.inf, -np.inf])]
    df = df[~df["longitude"].isin([np.inf, -np.inf])]
    
    # Now cast safely
    df["lat_bucket"] = (df["latitude"] * lat_multiplier).astype(int)
    df["lon_bucket"] = (df["longitude"] * lon_multiplier).astype(int)

    # Day of week, hour of day
    df["day_of_week"] = df["check_in_time"].dt.dayofweek
    df["hour_of_day"] = df["check_in_time"].dt.hour
    
    return df

brightkite_data = add_spatio_temporal_features(brightkite_data)
print(brightkite_data.head())


   user_id             check_in_time   latitude   longitude  \
0        0 2010-10-17 01:48:53+00:00  39.747652 -104.992510   
1        0 2010-10-16 06:02:04+00:00  39.891383 -105.070814   
2        0 2010-10-16 03:48:54+00:00  39.891077 -105.068532   
3        0 2010-10-14 18:25:51+00:00  39.750469 -104.999073   
4        0 2010-10-14 00:21:47+00:00  39.752713 -104.996337   

                                location_id  lat_bucket  lon_bucket  \
0          88c46bf20db295831bd2d1718ad7e6f5         397       -1049   
1          7a0f88982aa015062b95e3b4843f9ca2         398       -1050   
2          dd7cd3d264c2d063832db506fba8bf79         398       -1050   
3  9848afcc62e500a01cf6fbf24b797732f8963683         397       -1049   
4          2ef143e12038c870038df53e0478cefc         397       -1049   

   day_of_week  hour_of_day  
0            6            1  
1            5            6  
2            5            3  
3            3           18  
4            3            0  


In [4]:
def construct_trajectories(data, time_window="1D"):
    """
    Group check-ins into user trajectories based on a time window.
    time_window is a Pandas time frequency string (e.g., '1D' for daily).
    """
    trajectories = []

    grouped = data.groupby("user_id")
    for user_id, group in grouped:
        group = group.sort_values("check_in_time")
        # Identify new trajectory if gap > time_window
        group["trajectory_id"] = (group["check_in_time"].diff() > pd.Timedelta(time_window)).cumsum()

        for traj_id, traj_group in group.groupby("trajectory_id"):
            trajectory = {
                "user_id": user_id,
                "trajectory_id": traj_id,
                "timestamps": traj_group["check_in_time"].tolist(),
                "locations": traj_group["location_id"].tolist(),
                "day_of_week": traj_group["day_of_week"].tolist(),
                "hour_of_day": traj_group["hour_of_day"].tolist(),
                "lat_bucket": traj_group["lat_bucket"].tolist(),
                "lon_bucket": traj_group["lon_bucket"].tolist()
            }
            trajectories.append(trajectory)

    return pd.DataFrame(trajectories)

trajectories_df = construct_trajectories(brightkite_data, time_window="1D")
trajectories_df.head()


Unnamed: 0,user_id,trajectory_id,timestamps,locations,day_of_week,hour_of_day,lat_bucket,lon_bucket
0,0,0,"[2009-05-25 20:56:10+00:00, 2009-05-25 21:35:2...","[ee81ef22a22411ddb5e97f082c799f59, 248b82709e6...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[20, 21, 21, 21, 22, 2, 2, 4, 16, 16, 20, 22, ...","[377, 376, 376, 376, 376, 398, 398, 397, 397, ...","[-1224, -1223, -1223, -1223, -1223, -1046, -10..."
1,0,1,"[2009-06-07 19:36:52+00:00, 2009-06-07 19:47:2...","[510bf64ef8ff9ec95b72d6be7f142ddc663ce1ac, 510...","[6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[19, 19, 20, 21, 22, 22, 4, 4, 5, 17, 17, 21, ...","[397, 397, 397, 398, 399, 399, 397, 397, 397, ...","[-1049, -1049, -1049, -1050, -1050, -1050, -10..."
2,0,2,"[2009-07-05 23:53:50+00:00, 2009-07-05 23:53:5...","[ee8b1d0ea22411ddb074dbd65f1665cf, ee8b1d0ea22...","[6, 6, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...","[23, 23, 16, 18, 19, 1, 1, 1, 2, 16, 20, 20, 2...","[397, 397, 397, 397, 397, 397, 397, 397, 397, ...","[-1049, -1049, -1049, -1049, -1049, -1049, -10..."
3,0,3,"[2009-07-20 00:46:19+00:00, 2009-07-20 09:56:5...","[a9aa93f62f88451e110ccd655423f1395996f03d, ee8...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 9, 17, 19, 19, 0, 2, 2, 2, 2, 3, 4, 4, 16,...","[398, 397, 397, 397, 397, 397, 397, 397, 397, ...","[-1051, -1049, -1049, -1049, -1049, -1049, -10..."
4,0,4,"[2009-07-23 03:59:02+00:00, 2009-07-23 04:00:4...","[ee8b1d0ea22411ddb074dbd65f1665cf, ee8b1d0ea22...","[3, 3, 4, 5, 5, 5, 5]","[3, 4, 3, 0, 22, 22, 23]","[397, 397, 397, 397, 397, 397, 396]","[-1049, -1049, -1049, -1049, -1049, -1049, -1049]"


In [5]:
def encode_pois(trajectories):
    """
    Encode POIs into unique numeric IDs.
    """
    all_locations = set(loc for traj in trajectories["locations"] for loc in traj)
    location_mapping = {loc: idx for idx, loc in enumerate(all_locations)}

    trajectories["encoded_locations"] = trajectories["locations"].apply(
        lambda locs: [location_mapping[loc] for loc in locs]
    )
    return trajectories, location_mapping

trajectories_df, poi_mapping = encode_pois(trajectories_df)
print(trajectories_df.head())

# trajectories_df.to_csv("brightkite_trajectories_spatiotemp.csv", index=False)



   user_id  trajectory_id                                         timestamps  \
0        0              0  [2009-05-25 20:56:10+00:00, 2009-05-25 21:35:2...   
1        0              1  [2009-06-07 19:36:52+00:00, 2009-06-07 19:47:2...   
2        0              2  [2009-07-05 23:53:50+00:00, 2009-07-05 23:53:5...   
3        0              3  [2009-07-20 00:46:19+00:00, 2009-07-20 09:56:5...   
4        0              4  [2009-07-23 03:59:02+00:00, 2009-07-23 04:00:4...   

                                           locations  \
0  [ee81ef22a22411ddb5e97f082c799f59, 248b82709e6...   
1  [510bf64ef8ff9ec95b72d6be7f142ddc663ce1ac, 510...   
2  [ee8b1d0ea22411ddb074dbd65f1665cf, ee8b1d0ea22...   
3  [a9aa93f62f88451e110ccd655423f1395996f03d, ee8...   
4  [ee8b1d0ea22411ddb074dbd65f1665cf, ee8b1d0ea22...   

                                         day_of_week  \
0  [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...   
1  [6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...   
2  [6, 6, 0, 0

In [6]:
def build_mixed_trajectory(loc_ids, days, hours, lat_buckets, lon_buckets):
    """
    Convert each check-in to a single token that includes 
    (POI, day_of_week, hour_of_day, lat_bucket, lon_bucket).
    e.g., "POI123_DAY2_H14_LAT39_LON116"
    """
    tokens = []
    for loc_id, d, h, latb, lonb in zip(loc_ids, days, hours, lat_buckets, lon_buckets):
        token = f"POI{loc_id}_DAY{d}_H{h}_LAT{latb}_LON{lonb}"
        tokens.append(token)
    return " ".join(tokens)


In [7]:
class SpatioTemporalBrightkiteDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        """
        df should contain:
          - encoded_locations (list of location IDs)
          - day_of_week (list of day indices)
          - hour_of_day (list of hour indices)
          - lat_bucket (list of lat bucket ints)
          - lon_bucket (list of lon bucket ints)
          - label (integer-encoded user ID)
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.labels = df["label"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        loc_ids = row["encoded_locations"]
        days = row["day_of_week"]
        hours = row["hour_of_day"]
        latb = row["lat_bucket"]
        lonb = row["lon_bucket"]

        # Build spatio-temporal token string
        trajectory_str = build_mixed_trajectory(loc_ids, days, hours, latb, lonb)

        # Tokenize
        inputs = self.tokenizer(
            trajectory_str,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )

        label = self.labels[idx]

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [8]:
# Random subset of users
unique_users = trajectories_df['user_id'].unique()
random.seed(42)
selected_users = random.sample(list(unique_users), 92)

filtered_df = trajectories_df[trajectories_df['user_id'].isin(selected_users)].reset_index(drop=True)

# Label encode user_id
label_encoder = LabelEncoder()
filtered_df["label"] = label_encoder.fit_transform(filtered_df["user_id"])
num_users = len(label_encoder.classes_)
print(f"Number of selected users: {num_users}")

train_df, val_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


Number of selected users: 92


In [9]:
# Instantiate the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create dataset objects
train_dataset = SpatioTemporalBrightkiteDataset(train_df, tokenizer, max_length=128)
val_dataset = SpatioTemporalBrightkiteDataset(val_df, tokenizer, max_length=128)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

class TrajectoryDistilBERT(nn.Module):
    def __init__(self, num_users, dropout_rate=0.1):
        super(TrajectoryDistilBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.dim, num_users)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [10]:
# Initialize model
model = TrajectoryDistilBERT(num_users)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]



In [11]:
def compute_metrics(outputs, labels, top_ks=[1, 5]):
    metrics = {}
    with torch.no_grad():
        for k in top_ks:
            _, topk_indices = torch.topk(outputs, k=k, dim=1)
            correct_topk = 0
            for i in range(labels.size(0)):
                if labels[i].item() in topk_indices[i]:
                    correct_topk += 1
            metrics[f"ACC@{k}"] = correct_topk / labels.size(0)

    # For F1 and classification report
    _, preds = torch.max(outputs, dim=1)
    metrics["top1_preds"] = preds
    return metrics

from sklearn.metrics import classification_report

def evaluate_model(model, loader, device, loss_fn):
    model.eval()
    total_loss = 0.0
    all_preds = []
    all_labels = []
    acc_metrics = {"ACC@1": 0, "ACC@5": 0}
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size

            batch_metrics = compute_metrics(outputs, labels)
            acc_metrics["ACC@1"] += batch_metrics["ACC@1"] * batch_size
            acc_metrics["ACC@5"] += batch_metrics["ACC@5"] * batch_size

            all_preds.extend(batch_metrics["top1_preds"].cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            total_samples += batch_size

    avg_loss = total_loss / total_samples
    acc_metrics["ACC@1"] /= total_samples
    acc_metrics["ACC@5"] /= total_samples

    classification_metrics = classification_report(all_labels, all_preds, output_dict=True)
    return avg_loss, acc_metrics, classification_metrics


In [12]:
def train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1} [Training]", leave=False)
        
        for batch in train_iter:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            train_iter.set_description(f"Epoch {epoch+1} [Training] loss: {loss.item():.4f}")

        # Evaluate on train and val sets
        train_loss, train_acc_metrics, train_classification_metrics = evaluate_model(model, train_loader, device, loss_fn)
        val_loss, val_acc_metrics, val_classification_metrics = evaluate_model(model, val_loader, device, loss_fn)

        # Print epoch summary
        print(f"\nEpoch {epoch+1}:")
        print(f"  Train Loss: {train_loss:.4f}, Train ACC@1: {train_acc_metrics['ACC@1']:.4f}, ACC@5: {train_acc_metrics['ACC@5']:.4f}")
        print(f"  Val   Loss: {val_loss:.4f},   Val ACC@1:   {val_acc_metrics['ACC@1']:.4f}, ACC@5: {val_acc_metrics['ACC@5']:.4f}")
        print(f"  Val Macro F1: {val_classification_metrics['macro avg']['f1-score']:.4f}, Weighted F1: {val_classification_metrics['weighted avg']['f1-score']:.4f}")


In [17]:
epochs = 1
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=epochs)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1:
  Train Loss: 2.2553, Train ACC@1: 0.5850, ACC@5: 0.7405
  Val   Loss: 2.2494,   Val ACC@1:   0.5817, ACC@5: 0.7452
  Val Macro F1: 0.1729, Weighted F1: 0.4746


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Using best parameters for 92 users


In [14]:
# Random subset of users
unique_users = trajectories_df['user_id'].unique()
random.seed(42)
selected_users = random.sample(list(unique_users), 92)

filtered_df = trajectories_df[trajectories_df['user_id'].isin(selected_users)].reset_index(drop=True)

# Label encode user_id
label_encoder = LabelEncoder()
filtered_df["label"] = label_encoder.fit_transform(filtered_df["user_id"])
num_users = len(label_encoder.classes_)
print(f"Number of selected users: {num_users}")

train_df, val_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


Number of selected users: 92


In [15]:
# Best Hyperparams
best_dropout = 0.0
best_lr = 3e-5
best_batch_size = 16
best_max_length = 128

train_dataset = SpatioTemporalBrightkiteDataset(train_df, tokenizer, max_length=best_max_length)
val_dataset = SpatioTemporalBrightkiteDataset(val_df, tokenizer, max_length=best_max_length)

train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)


In [16]:
class TrajectoryDistilBERT(nn.Module):
    def __init__(self, num_users, dropout_rate=0.1):
        super(TrajectoryDistilBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.dim, num_users)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBERT's last_hidden_state: (batch_size, seq_len, hidden_size)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits


In [17]:
# Build the model 
model = TrajectoryDistilBERT(num_users, dropout_rate=best_dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TrajectoryDistilBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [18]:
optimizer = AdamW(model.parameters(), lr=best_lr)
loss_fn = nn.CrossEntropyLoss()




In [19]:
epochs = 10  
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=epochs)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1:
  Train Loss: 1.6637, Train ACC@1: 0.6891, ACC@5: 0.8223
  Val   Loss: 1.6801,   Val ACC@1:   0.6953, ACC@5: 0.8227
  Val Macro F1: 0.2454, Weighted F1: 0.6038


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2:
  Train Loss: 0.7882, Train ACC@1: 0.8425, ACC@5: 0.9563
  Val   Loss: 0.9599,   Val ACC@1:   0.8172, ACC@5: 0.8947
  Val Macro F1: 0.4203, Weighted F1: 0.7711


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 3:
  Train Loss: 0.5799, Train ACC@1: 0.8654, ACC@5: 0.9799
  Val   Loss: 0.8166,   Val ACC@1:   0.8144, ACC@5: 0.9169
  Val Macro F1: 0.4473, Weighted F1: 0.7795


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 4:
  Train Loss: 0.4283, Train ACC@1: 0.8952, ACC@5: 0.9917
  Val   Loss: 0.7194,   Val ACC@1:   0.8421, ACC@5: 0.9446
  Val Macro F1: 0.4900, Weighted F1: 0.8137


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 5:
  Train Loss: 0.3421, Train ACC@1: 0.9216, ACC@5: 0.9979
  Val   Loss: 0.7133,   Val ACC@1:   0.8421, ACC@5: 0.9446
  Val Macro F1: 0.4884, Weighted F1: 0.8294


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 6:
  Train Loss: 0.2699, Train ACC@1: 0.9389, ACC@5: 1.0000
  Val   Loss: 0.6450,   Val ACC@1:   0.8476, ACC@5: 0.9418
  Val Macro F1: 0.5384, Weighted F1: 0.8417


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 7:
  Train Loss: 0.2182, Train ACC@1: 0.9549, ACC@5: 1.0000
  Val   Loss: 0.6414,   Val ACC@1:   0.8670, ACC@5: 0.9446
  Val Macro F1: 0.5708, Weighted F1: 0.8629


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 8:
  Train Loss: 0.1736, Train ACC@1: 0.9646, ACC@5: 1.0000
  Val   Loss: 0.6150,   Val ACC@1:   0.8726, ACC@5: 0.9446
  Val Macro F1: 0.5571, Weighted F1: 0.8639


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 9:
  Train Loss: 0.1261, Train ACC@1: 0.9757, ACC@5: 1.0000
  Val   Loss: 0.6004,   Val ACC@1:   0.8809, ACC@5: 0.9501
  Val Macro F1: 0.5871, Weighted F1: 0.8742


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 10:
  Train Loss: 0.1136, Train ACC@1: 0.9820, ACC@5: 1.0000
  Val   Loss: 0.6217,   Val ACC@1:   0.8726, ACC@5: 0.9474
  Val Macro F1: 0.5649, Weighted F1: 0.8663


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# WITH TEST SET

In [14]:
# Random subset of users
unique_users = trajectories_df['user_id'].unique()
random.seed(42)
selected_users = random.sample(list(unique_users), 92)

filtered_df = trajectories_df[trajectories_df['user_id'].isin(selected_users)].reset_index(drop=True)

# Label encode user_id
label_encoder = LabelEncoder()
filtered_df["label"] = label_encoder.fit_transform(filtered_df["user_id"])
num_users = len(label_encoder.classes_)
print(f"Number of selected users: {num_users}")

# Split data into training, validation, and test sets
train_df, temp_df = train_test_split(filtered_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Best Hyperparams
best_dropout = 0.0
best_lr = 3e-5
best_batch_size = 16
best_max_length = 128

# Create datasets
train_dataset = SpatioTemporalBrightkiteDataset(train_df, tokenizer, max_length=best_max_length)
val_dataset = SpatioTemporalBrightkiteDataset(val_df, tokenizer, max_length=best_max_length)
test_dataset = SpatioTemporalBrightkiteDataset(test_df, tokenizer, max_length=best_max_length)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=best_batch_size, shuffle=False)

# Define the evaluation function
def evaluate_test_set(model, loader, device, loss_fn):
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []
    acc_metrics = {"ACC@1": 0, "ACC@5": 0}
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size

            batch_metrics = compute_metrics(outputs, labels)
            acc_metrics["ACC@1"] += batch_metrics["ACC@1"] * batch_size
            acc_metrics["ACC@5"] += batch_metrics["ACC@5"] * batch_size

            all_preds.extend(batch_metrics["top1_preds"].cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            total_samples += batch_size

    # Compute averages
    avg_loss = total_loss / total_samples
    acc_metrics["ACC@1"] /= total_samples
    acc_metrics["ACC@5"] /= total_samples

    # Generate classification report
    cls_metrics = classification_report(all_labels, all_preds, output_dict=True)
    
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"ACC@1: {acc_metrics['ACC@1']:.4f}")
    print(f"ACC@5: {acc_metrics['ACC@5']:.4f}")
    
    return avg_loss, acc_metrics, cls_metrics

# Define model
class TrajectoryDistilBERT(nn.Module):
    def __init__(self, num_users, dropout_rate=0.1):
        super(TrajectoryDistilBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.dim, num_users)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits

# Build the model 
model = TrajectoryDistilBERT(num_users, dropout_rate=best_dropout)
model.to(device)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=best_lr)
loss_fn = nn.CrossEntropyLoss()

# Train the model
epochs = 10
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=epochs)

# Evaluate on the test set
print("Evaluating the model on the test set...")
evaluate_test_set(model, test_loader, device, loss_fn)


Number of selected users: 92


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1:
  Train Loss: 1.9649, Train ACC@1: 0.6693, ACC@5: 0.7772
  Val   Loss: 1.9214,   Val ACC@1:   0.6778, ACC@5: 0.7852
  Val Macro F1: 0.2570, Weighted F1: 0.5833


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2:
  Train Loss: 0.8699, Train ACC@1: 0.8184, ACC@5: 0.9461
  Val   Loss: 0.9833,   Val ACC@1:   0.8222, ACC@5: 0.8889
  Val Macro F1: 0.4673, Weighted F1: 0.7654


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 3:
  Train Loss: 0.5855, Train ACC@1: 0.8747, ACC@5: 0.9802
  Val   Loss: 0.8306,   Val ACC@1:   0.8407, ACC@5: 0.9296
  Val Macro F1: 0.5160, Weighted F1: 0.8026


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 4:
  Train Loss: 0.4388, Train ACC@1: 0.9096, ACC@5: 0.9913
  Val   Loss: 0.7920,   Val ACC@1:   0.8333, ACC@5: 0.9259
  Val Macro F1: 0.5350, Weighted F1: 0.8035


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 5:
  Train Loss: 0.3436, Train ACC@1: 0.9199, ACC@5: 0.9984
  Val   Loss: 0.7199,   Val ACC@1:   0.8519, ACC@5: 0.9444
  Val Macro F1: 0.5682, Weighted F1: 0.8364


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 6:
  Train Loss: 0.2896, Train ACC@1: 0.9381, ACC@5: 0.9992
  Val   Loss: 0.7337,   Val ACC@1:   0.8556, ACC@5: 0.9333
  Val Macro F1: 0.5668, Weighted F1: 0.8405


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 7:
  Train Loss: 0.2193, Train ACC@1: 0.9556, ACC@5: 1.0000
  Val   Loss: 0.6963,   Val ACC@1:   0.8630, ACC@5: 0.9444
  Val Macro F1: 0.5867, Weighted F1: 0.8523


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 8:
  Train Loss: 0.1662, Train ACC@1: 0.9707, ACC@5: 1.0000
  Val   Loss: 0.6763,   Val ACC@1:   0.8704, ACC@5: 0.9407
  Val Macro F1: 0.6146, Weighted F1: 0.8607


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 9:
  Train Loss: 0.1284, Train ACC@1: 0.9762, ACC@5: 1.0000
  Val   Loss: 0.6728,   Val ACC@1:   0.8667, ACC@5: 0.9370
  Val Macro F1: 0.6178, Weighted F1: 0.8564


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 10:
  Train Loss: 0.0973, Train ACC@1: 0.9881, ACC@5: 1.0000
  Val   Loss: 0.6784,   Val ACC@1:   0.8667, ACC@5: 0.9370
  Val Macro F1: 0.5918, Weighted F1: 0.8553
Evaluating the model on the test set...
Test Loss: 0.5967
ACC@1: 0.8672
ACC@5: 0.9557


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.5967287633471823,
 {'ACC@1': 0.8671586715867159, 'ACC@5': 0.955719557195572},
 {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
  '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 1},
  '2': {'precision': 0.9,
   'recall': 1.0,
   'f1-score': 0.9473684210526316,
   'support': 9},
  '4': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4},
  '5': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 16},
  '6': {'precision': 1.0,
   'recall': 0.75,
   'f1-score': 0.8571428571428571,
   'support': 4},
  '7': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 5},
  '8': {'precision': 0.8076923076923077,
   'recall': 0.9130434782608695,
   'f1-score': 0.8571428571428572,
   'support': 23},
  '11': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2},
  '12': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
  '14': {'precision': 1.0,
   'recall': 0.75,
   'f1-score': 0.8571428571428571,
 

# Using best parameters for 34 users

In [20]:
# Random subset of users
unique_users = trajectories_df['user_id'].unique()
random.seed(42)
selected_users = random.sample(list(unique_users), 34)

filtered_df = trajectories_df[trajectories_df['user_id'].isin(selected_users)].reset_index(drop=True)

# Label encode user_id
label_encoder = LabelEncoder()
filtered_df["label"] = label_encoder.fit_transform(filtered_df["user_id"])
num_users = len(label_encoder.classes_)
print(f"Number of selected users: {num_users}")

train_df, val_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)


Number of selected users: 34


In [21]:
# Best Hyperparams
best_dropout = 0.0
best_lr = 3e-5
best_batch_size = 16
best_max_length = 128

train_dataset = SpatioTemporalBrightkiteDataset(train_df, tokenizer, max_length=best_max_length)
val_dataset = SpatioTemporalBrightkiteDataset(val_df, tokenizer, max_length=best_max_length)

train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)


In [22]:
class TrajectoryDistilBERT(nn.Module):
    def __init__(self, num_users, dropout_rate=0.1):
        super(TrajectoryDistilBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.dim, num_users)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBERT's last_hidden_state: (batch_size, seq_len, hidden_size)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits


In [23]:
# Build the model 
model = TrajectoryDistilBERT(num_users, dropout_rate=best_dropout)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

TrajectoryDistilBERT(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [24]:
optimizer = AdamW(model.parameters(), lr=best_lr)
loss_fn = nn.CrossEntropyLoss()




In [25]:
epochs = 10  
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=epochs)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1:
  Train Loss: 2.2482, Train ACC@1: 0.5196, ACC@5: 0.7102
  Val   Loss: 2.6074,   Val ACC@1:   0.4375, ACC@5: 0.5938
  Val Macro F1: 0.1259, Weighted F1: 0.3393


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2:
  Train Loss: 1.1180, Train ACC@1: 0.7337, ACC@5: 0.9034
  Val   Loss: 1.5943,   Val ACC@1:   0.6354, ACC@5: 0.7604
  Val Macro F1: 0.2662, Weighted F1: 0.5272


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 3:
  Train Loss: 0.6715, Train ACC@1: 0.8381, ACC@5: 0.9765
  Val   Loss: 1.2060,   Val ACC@1:   0.6875, ACC@5: 0.8542
  Val Macro F1: 0.3778, Weighted F1: 0.6069


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 4:
  Train Loss: 0.4641, Train ACC@1: 0.9112, ACC@5: 0.9974
  Val   Loss: 1.0322,   Val ACC@1:   0.7396, ACC@5: 0.9167
  Val Macro F1: 0.4667, Weighted F1: 0.6888


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 5:
  Train Loss: 0.3303, Train ACC@1: 0.9191, ACC@5: 1.0000
  Val   Loss: 0.9584,   Val ACC@1:   0.7500, ACC@5: 0.9479
  Val Macro F1: 0.5370, Weighted F1: 0.7131


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 6:
  Train Loss: 0.2707, Train ACC@1: 0.9217, ACC@5: 1.0000
  Val   Loss: 0.8327,   Val ACC@1:   0.7708, ACC@5: 0.9583
  Val Macro F1: 0.5444, Weighted F1: 0.7243


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 7:
  Train Loss: 0.1951, Train ACC@1: 0.9478, ACC@5: 1.0000
  Val   Loss: 0.7828,   Val ACC@1:   0.7812, ACC@5: 0.9688
  Val Macro F1: 0.5993, Weighted F1: 0.7468


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 8:
  Train Loss: 0.1476, Train ACC@1: 0.9661, ACC@5: 1.0000
  Val   Loss: 0.7773,   Val ACC@1:   0.7917, ACC@5: 0.9583
  Val Macro F1: 0.6300, Weighted F1: 0.7628


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 9:
  Train Loss: 0.1200, Train ACC@1: 0.9791, ACC@5: 1.0000
  Val   Loss: 0.7055,   Val ACC@1:   0.8021, ACC@5: 0.9688
  Val Macro F1: 0.6367, Weighted F1: 0.7725


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 10:
  Train Loss: 0.1021, Train ACC@1: 0.9791, ACC@5: 1.0000
  Val   Loss: 0.6819,   Val ACC@1:   0.8333, ACC@5: 0.9688
  Val Macro F1: 0.6674, Weighted F1: 0.8081


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# WITH TEST SET

In [15]:
# Random subset of users
unique_users = trajectories_df['user_id'].unique()
random.seed(42)
selected_users = random.sample(list(unique_users), 34)

filtered_df = trajectories_df[trajectories_df['user_id'].isin(selected_users)].reset_index(drop=True)

# Label encode user_id
label_encoder = LabelEncoder()
filtered_df["label"] = label_encoder.fit_transform(filtered_df["user_id"])
num_users = len(label_encoder.classes_)
print(f"Number of selected users: {num_users}")

# Split data into training, validation, and test sets
train_df, temp_df = train_test_split(filtered_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Best Hyperparams
best_dropout = 0.0
best_lr = 3e-5
best_batch_size = 16
best_max_length = 128

# Create datasets
train_dataset = SpatioTemporalBrightkiteDataset(train_df, tokenizer, max_length=best_max_length)
val_dataset = SpatioTemporalBrightkiteDataset(val_df, tokenizer, max_length=best_max_length)
test_dataset = SpatioTemporalBrightkiteDataset(test_df, tokenizer, max_length=best_max_length)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=best_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=best_batch_size, shuffle=False)

# Define the evaluation function
def evaluate_test_set(model, loader, device, loss_fn):
    model.eval()
    total_loss = 0.0
    all_preds, all_labels = [], []
    acc_metrics = {"ACC@1": 0, "ACC@5": 0}
    total_samples = 0

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size

            batch_metrics = compute_metrics(outputs, labels)
            acc_metrics["ACC@1"] += batch_metrics["ACC@1"] * batch_size
            acc_metrics["ACC@5"] += batch_metrics["ACC@5"] * batch_size

            all_preds.extend(batch_metrics["top1_preds"].cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            total_samples += batch_size

    # Compute averages
    avg_loss = total_loss / total_samples
    acc_metrics["ACC@1"] /= total_samples
    acc_metrics["ACC@5"] /= total_samples

    # Generate classification report
    cls_metrics = classification_report(all_labels, all_preds, output_dict=True)
    
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"ACC@1: {acc_metrics['ACC@1']:.4f}")
    print(f"ACC@5: {acc_metrics['ACC@5']:.4f}")
    
    return avg_loss, acc_metrics, cls_metrics

# Define model
class TrajectoryDistilBERT(nn.Module):
    def __init__(self, num_users, dropout_rate=0.1):
        super(TrajectoryDistilBERT, self).__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.dim, num_users)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)
        return logits

# Build the model 
model = TrajectoryDistilBERT(num_users, dropout_rate=best_dropout)
model.to(device)

# Optimizer and loss
optimizer = AdamW(model.parameters(), lr=best_lr)
loss_fn = nn.CrossEntropyLoss()

# Train the model
epochs = 10
train_model(model, train_loader, val_loader, device, loss_fn, optimizer, epochs=epochs)

# Evaluate on the test set
print("Evaluating the model on the test set...")
evaluate_test_set(model, test_loader, device, loss_fn)


Number of selected users: 34


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 1:
  Train Loss: 2.4532, Train ACC@1: 0.2388, ACC@5: 0.6866
  Val   Loss: 2.8638,   Val ACC@1:   0.1944, ACC@5: 0.5694
  Val Macro F1: 0.0360, Weighted F1: 0.0885


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 2:
  Train Loss: 1.4102, Train ACC@1: 0.6358, ACC@5: 0.8567
  Val   Loss: 2.0378,   Val ACC@1:   0.5417, ACC@5: 0.6667
  Val Macro F1: 0.1432, Weighted F1: 0.4061


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 3:
  Train Loss: 0.8221, Train ACC@1: 0.7910, ACC@5: 0.9403
  Val   Loss: 1.5721,   Val ACC@1:   0.6389, ACC@5: 0.7917
  Val Macro F1: 0.2316, Weighted F1: 0.5180


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 4:
  Train Loss: 0.5172, Train ACC@1: 0.9015, ACC@5: 0.9701
  Val   Loss: 1.3030,   Val ACC@1:   0.6806, ACC@5: 0.8056
  Val Macro F1: 0.3609, Weighted F1: 0.6025


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 5:
  Train Loss: 0.3672, Train ACC@1: 0.9224, ACC@5: 0.9940
  Val   Loss: 1.1492,   Val ACC@1:   0.6944, ACC@5: 0.8472
  Val Macro F1: 0.4229, Weighted F1: 0.6443


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 6:
  Train Loss: 0.2626, Train ACC@1: 0.9433, ACC@5: 0.9970
  Val   Loss: 1.0163,   Val ACC@1:   0.7500, ACC@5: 0.8750
  Val Macro F1: 0.5176, Weighted F1: 0.7064


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 7:
  Train Loss: 0.1951, Train ACC@1: 0.9582, ACC@5: 1.0000
  Val   Loss: 0.9603,   Val ACC@1:   0.7361, ACC@5: 0.9167
  Val Macro F1: 0.4875, Weighted F1: 0.6909


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 8:
  Train Loss: 0.1483, Train ACC@1: 0.9761, ACC@5: 1.0000
  Val   Loss: 0.9166,   Val ACC@1:   0.7917, ACC@5: 0.9306
  Val Macro F1: 0.6200, Weighted F1: 0.7674


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 9:
  Train Loss: 0.1279, Train ACC@1: 0.9761, ACC@5: 1.0000
  Val   Loss: 0.8162,   Val ACC@1:   0.8056, ACC@5: 0.9306
  Val Macro F1: 0.5966, Weighted F1: 0.7714


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Epoch 10:
  Train Loss: 0.0803, Train ACC@1: 0.9940, ACC@5: 1.0000
  Val   Loss: 0.8509,   Val ACC@1:   0.7917, ACC@5: 0.9306
  Val Macro F1: 0.6239, Weighted F1: 0.7712
Evaluating the model on the test set...
Test Loss: 0.6278
ACC@1: 0.8611
ACC@5: 0.9583


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(0.6278373549381892,
 {'ACC@1': 0.8611111111111112, 'ACC@5': 0.9583333333333334},
 {'0': {'precision': 0.6666666666666666,
   'recall': 0.6666666666666666,
   'f1-score': 0.6666666666666666,
   'support': 3},
  '1': {'precision': 0.9166666666666666,
   'recall': 1.0,
   'f1-score': 0.9565217391304348,
   'support': 11},
  '2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
  '3': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 5},
  '5': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1},
  '6': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6},
  '7': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 15},
  '8': {'precision': 0.6666666666666666,
   'recall': 1.0,
   'f1-score': 0.8,
   'support': 2},
  '11': {'precision': 0.6,
   'recall': 1.0,
   'f1-score': 0.7499999999999999,
   'support': 3},
  '13': {'precision': 1.0,
   'recall': 0.5,
   'f1-score': 0.6666666666666666,
   'support': 4},
  '14': {'precision