# Assignment 2
## Task 2:
*Perform hyperparameter tuning of the number of attention heads and try a deeper embedding of the node features.*

*For the attention heads, just evaluate 2 additional settings.*

*For the embedding of the node features, instead of the linear transformation of the node states as suggested in the tutorial, try to add one fully connected layer with ReLU activation and one additional fully connected layer.*

---

#### Imports & Setup

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

#### Data Aggregation from Multiple Scenes

In [None]:
DATASET_PATH = "dataset"

# Column layout used when parsing a `<scene>.nodes` file.
NODE_COLUMNS = ["node_id", "current_x", "current_y", "prev_x", "prev_y", "future_x", "future_y"]
# Input features; a node missing any of these is discarded.
FEATURE_COLUMNS = ["current_x", "current_y", "prev_x", "prev_y"]

# A scene id is the part of a file name before the first dot, so
# `<id>.nodes` and `<id>.edges` collapse into a single entry.
scene_ids = sorted({f.split(".")[0] for f in os.listdir(DATASET_PATH)})

all_nodes = []
all_edges = []
offset = 0  # number of nodes kept so far == first global id of the next scene

for scene_id in scene_ids:
    nodes_path = os.path.join(DATASET_PATH, f"{scene_id}.nodes")
    edges_path = os.path.join(DATASET_PATH, f"{scene_id}.edges")

    # Skip scenes for which one of the two files is missing.
    if not (os.path.exists(nodes_path) and os.path.exists(edges_path)):
        continue

    nodes_df = pd.read_csv(
        nodes_path, header=None,
        names=NODE_COLUMNS,
        na_values=["_", "NA", "NaN", "nan"]
    )

    # Chaining `astype` onto `dropna` yields a fresh frame, so the column
    # assignment further down does not write into a filtered selection
    # (which can trigger pandas' SettingWithCopyWarning).
    nodes_df = nodes_df.dropna(subset=FEATURE_COLUMNS).astype(
        {column: float for column in FEATURE_COLUMNS}
    )

    # Re-number the surviving nodes consecutively across all scenes.
    local_to_global = {nid: i + offset for i, nid in enumerate(nodes_df["node_id"])}
    nodes_df["global_id"] = nodes_df["node_id"].map(local_to_global)

    # Endpoints that refer to a dropped/unknown node map to NaN here and are
    # removed by the `dropna()` applied to the concatenated edge table below.
    edges_df = pd.read_csv(edges_path, header=None, names=["target", "source"])
    edges_df["source"] = edges_df["source"].map(local_to_global)
    edges_df["target"] = edges_df["target"].map(local_to_global)

    # Add every edge in the opposite direction as well.
    reversed_edges = edges_df.rename(columns={"source": "target", "target": "source"})
    full_edges = pd.concat([edges_df, reversed_edges])

    all_nodes.append(nodes_df)
    all_edges.append(full_edges)

    offset += len(nodes_df)

if not all_nodes:
    # `pd.concat([])` would raise a far less helpful ValueError.
    raise ValueError(
        f"No scene with both a .nodes and a .edges file was found in {DATASET_PATH!r}"
    )

nodes_all = pd.concat(all_nodes).sort_values("global_id")
edges_all = pd.concat(all_edges).dropna().astype(int)

#### Tensor Creation for Model Input

In [None]:
# Future positions are both the regression target and the source of the mask.
_future_xy = nodes_all[["future_x", "future_y"]]
_current_and_prev = nodes_all[["current_x", "current_y", "prev_x", "prev_y"]]

# One row per node: current and previous position.
node_features = tf.convert_to_tensor(_current_and_prev.to_numpy(), dtype=tf.float32)

# Missing future coordinates are replaced by 0.0 so the tensor is dense;
# `mask` records which rows hold a real target.
labels = tf.convert_to_tensor(_future_xy.fillna(0.0).to_numpy(), dtype=tf.float32)

# True where both future coordinates are present.
mask = tf.convert_to_tensor(_future_xy.notna().all(axis=1), dtype=tf.bool)

# One row per edge, columns ordered (target, source).
edges = tf.convert_to_tensor(
    edges_all[["target", "source"]].to_numpy(), dtype=tf.int64
)

#### Train-Test Split

In [None]:
split_ratio = 0.8

# Only nodes with an observed future position carry a real regression target.
# Unlabelled nodes had their label filled with 0.0, so sampling them for
# training or testing would fit and score the model against a fake (0, 0)
# target. They stay in `node_features` / `edges`, which are what the model
# is given at construction time, so they still take part in message passing.
labelled_indices = np.flatnonzero(mask.numpy())

random_indices = np.random.permutation(labelled_indices)
split_idx = int(split_ratio * len(random_indices))
train_indices = random_indices[:split_idx]
test_indices = random_indices[split_idx:]

# `tf.gather` selects rows directly and keeps each tensor's dtype
# (float32 features/labels, bool mask).
train_node_features = tf.gather(node_features, train_indices)
test_node_features = tf.gather(node_features, test_indices)

train_labels = tf.gather(labels, train_indices)
test_labels = tf.gather(labels, test_indices)

# Kept for compatibility; all-True now that only labelled nodes are sampled.
train_mask = tf.gather(mask, train_indices)
test_mask = tf.gather(mask, test_indices)

*Display*:

In [None]:
# Report the shape of every split tensor, training set first.
for caption, tensor in (
    ("Shape of training features:", train_node_features),
    ("Shape of training labels:", train_labels),
    ("Shape of training masks:", train_mask),
    ("Shape of test features:", test_node_features),
    ("Shape of test labels:", test_labels),
    ("Shape of test masks:", test_mask),
):
    print(caption, tensor.shape)

#### Graph Attention Layer Definition (GAT Layer)

In [None]:
class GraphAttention(layers.Layer):
    """Single attention head operating on an edge list.

    Called with ``[node_states, edges]``. ``edges`` is indexed as a 2-column
    tensor: column 0 is the node that receives a message, column 1 the node
    that sends it.

    Args:
        units: Output feature size of the head.
        kernel_initializer: Initializer for both weight matrices.
        kernel_regularizer: Optional regularizer for both weight matrices.
    """

    def __init__(
        self,
        units,
        kernel_initializer="glorot_uniform",
        kernel_regularizer=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.units = units
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)

    def build(self, input_shape):
        """Create the projection kernel and the attention scoring kernel."""
        # input_shape[0] is the shape of `node_states`.
        in_features = input_shape[0][-1]
        common = dict(
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            trainable=True,
        )
        self.kernel = self.add_weight(
            shape=(in_features, self.units), name="kernel", **common
        )
        # Scores the concatenation of two projected node states.
        self.kernel_attention = self.add_weight(
            shape=(self.units * 2, 1), name="kernel_attention", **common
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return, per node, the attention-weighted sum of its neighbours."""
        node_states, edges = inputs
        num_nodes = tf.shape(node_states)[0]
        receivers = edges[:, 0]
        senders = edges[:, 1]

        # Linear projection of every node state.
        projected = tf.matmul(node_states, self.kernel)

        # Per edge: [projected receiver state, projected sender state].
        pair_states = tf.reshape(
            tf.gather(projected, edges), (tf.shape(edges)[0], 2 * self.units)
        )
        raw_scores = tf.squeeze(
            tf.nn.leaky_relu(tf.matmul(pair_states, self.kernel_attention)), -1
        )

        # Softmax over the incoming edges of each receiver; scores are
        # clipped to [-2, 2] before exponentiation.
        exp_scores = tf.exp(tf.clip_by_value(raw_scores, -2, 2))
        per_node_total = tf.math.unsorted_segment_sum(
            data=exp_scores,
            segment_ids=receivers,
            num_segments=num_nodes
        )
        alpha = exp_scores / (
            tf.gather(per_node_total, receivers) + tf.keras.backend.epsilon()
        )

        # Weight each sender state and accumulate it at its receiver.
        messages = tf.gather(projected, senders) * alpha[:, tf.newaxis]
        return tf.math.unsorted_segment_sum(
            data=messages,
            segment_ids=receivers,
            num_segments=num_nodes
        )

#### Define Multi-Head Attention Layer

In [None]:
class MultiHeadGraphAttention(layers.Layer):
    """Runs several `GraphAttention` heads in parallel and merges them.

    Args:
        units: Output feature size of each individual head.
        num_heads: Number of independent attention heads.
        merge_type: ``"concat"`` concatenates the head outputs along the
            feature axis; any other value averages them.
    """

    def __init__(self, units, num_heads=8, merge_type="concat", **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.merge_type = merge_type
        self.attention_heads = [GraphAttention(units) for _ in range(num_heads)]

    def call(self, inputs):
        """Apply every head to ``[node_states, edges]``, merge, then ReLU."""
        node_states, edges = inputs
        per_head = [head([node_states, edges]) for head in self.attention_heads]

        if self.merge_type != "concat":
            # Average the heads: stack on a new last axis, then reduce it.
            merged = tf.reduce_mean(tf.stack(per_head, axis=-1), axis=-1)
        else:
            merged = tf.concat(per_head, axis=-1)

        return tf.nn.relu(merged)

#### Define Graph Attention Network (GAT) Model

In [None]:
class GraphAttentionNetwork(keras.Model):
    """GAT regressor that always runs on the full graph it was built with.

    The node features and the edge list are stored on the model. The custom
    train/test/predict steps receive node *indices* as input, run a forward
    pass over the stored graph, and pick out the rows for those indices.

    Args:
        node_features: Feature tensor for all nodes.
        edges: Edge tensor passed on to the attention layers.
        hidden_units: Feature size of each attention head.
        num_heads: Attention heads per GAT layer.
        num_layers: Number of stacked GAT layers.
        output_dim: Size of the per-node output.
    """

    def __init__(
        self,
        node_features,
        edges,
        hidden_units=32,
        num_heads=4,
        num_layers=2,
        output_dim=2,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.node_features = node_features
        self.edges = edges

        # Same width as the concatenated head outputs, so the residual
        # addition in `call` is shape-compatible.
        embedding_width = hidden_units * num_heads

        # Deeper node embedding: Dense + ReLU followed by a second Dense.
        self.preprocess = keras.Sequential(
            [
                layers.Dense(embedding_width, activation="relu", name="embedding_fc1"),
                layers.Dense(embedding_width, name="embedding_fc2"),
            ],
            name="node_embedding",
        )

        self.gat_layers = [
            MultiHeadGraphAttention(hidden_units, num_heads, merge_type="concat")
            for _ in range(num_layers)
        ]

        self.output_layer = layers.Dense(output_dim)

    def call(self, inputs):
        """Embed the nodes, apply the residual GAT stack, project to outputs."""
        states, edges = inputs
        states = self.preprocess(states)

        for attention_block in self.gat_layers:
            # Residual connection around every attention block.
            states = attention_block([states, edges]) + states

        return self.output_layer(states)

    def _predict_for(self, indices):
        """Run the model on the stored graph and return the rows at `indices`."""
        all_predictions = self([self.node_features, self.edges])
        return tf.gather(all_predictions, indices)

    def train_step(self, data):
        """One optimisation step on a batch of (node indices, targets)."""
        indices, y_true = data

        with tf.GradientTape() as tape:
            y_pred_batch = self._predict_for(indices)
            loss = self.compiled_loss(y_true, y_pred_batch)

        weights = self.trainable_weights
        gradients = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

        self.compiled_metrics.update_state(y_true, y_pred_batch)
        return {metric.name: metric.result() for metric in self.metrics}

    def test_step(self, data):
        """Evaluate a batch of (node indices, targets) without updating weights."""
        indices, y_true = data
        y_pred_batch = self._predict_for(indices)
        # The returned loss value is not used here; the call is kept as in
        # the training step.
        self.compiled_loss(y_true, y_pred_batch)
        self.compiled_metrics.update_state(y_true, y_pred_batch)
        return {metric.name: metric.result() for metric in self.metrics}

    def predict_step(self, data):
        """Return predictions for a batch of node indices."""
        return self._predict_for(data)

#### Define Training Configuration and Hyperparameter Search Space

In [None]:
# --- Model architecture (identical for every run) ---
HIDDEN_UNITS, NUM_LAYERS, OUTPUT_DIM = 32, 2, 2

# --- Optimisation schedule ---
NUM_EPOCHS = 100
BATCH_SIZE = 256
VALIDATION_SPLIT = 0.1   # fraction passed to `fit(validation_split=...)`
LEARNING_RATE = 1e-4
PATIENCE = 30            # epochs without improvement before early stopping

# --- Search space: number of attention heads per GAT layer ---
HEAD_OPTIONS = [2, 4, 8]

# --- Keras objects used when compiling and fitting ---
loss_fn = keras.losses.MeanSquaredError()
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
mae_metric = keras.metrics.MeanAbsoluteError(name="mae")

# Stop once the validation MAE stops improving and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=PATIENCE,
    min_delta=1e-4,
    monitor="val_mae",
)

#### Train and Compare GAT Models with Varying Attention Heads

In [None]:
# Results keyed by the number of attention heads.
histories = {}     # per-epoch training history
mae_results = {}   # test-set MAE
predictions = {}   # test-set predictions

for num_heads in HEAD_OPTIONS:
    print(f"\n--- Training model with {num_heads} attention heads ---")

    # Optimizer and metric both carry state, so every model gets its own
    # instance rather than sharing one across runs. The metric must keep the
    # name "mae" because early stopping monitors "val_mae".
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    run_mae_metric = keras.metrics.MeanAbsoluteError(name="mae")

    gat_model = GraphAttentionNetwork(
        node_features=node_features,
        edges=edges,
        hidden_units=HIDDEN_UNITS,
        num_heads=num_heads,
        num_layers=NUM_LAYERS,
        output_dim=OUTPUT_DIM
    )

    gat_model.compile(
        loss=loss_fn,
        optimizer=optimizer,
        metrics=[run_mae_metric]
    )

    # The model is fed node indices; its custom steps look the nodes up in
    # the full graph it was constructed with.
    history = gat_model.fit(
        x=train_indices,
        y=train_labels,
        validation_split=VALIDATION_SPLIT,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        callbacks=[early_stopping],
        verbose=0,
    )

    loss, mae = gat_model.evaluate(
        x=test_indices,
        y=test_labels,
        verbose=0
    )

    histories[num_heads] = history.history
    mae_results[num_heads] = mae
    predictions[num_heads] = gat_model.predict(x=test_indices)

# Predictions of the last trained configuration; reuse the stored result
# instead of running inference on the same model a second time.
test_preds = predictions[HEAD_OPTIONS[-1]]

#### Validation MAE Curves — Attention Head Comparison

In [None]:
# One validation-MAE curve per head configuration.
for head_count in HEAD_OPTIONS:
    plt.plot(histories[head_count]['val_mae'], label=f"{head_count} heads")

plt.title("Validation MAE per number of attention heads")
plt.xlabel("Epoch")
plt.ylabel("MAE")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

#### Test MAE Comparison by Attention Head Count

In [None]:
# Bar chart of the final test MAE, one bar per head configuration.
bar_labels = [f"{head_count} heads" for head_count in mae_results]
bar_heights = list(mae_results.values())

plt.figure(figsize=(6, 5))
plt.bar(bar_labels, bar_heights, color="skyblue")
plt.title("Test MAE per model")
plt.ylabel("MAE")
plt.grid(axis="y")
plt.tight_layout()
plt.show()

*Display*:

#### Prediction Samples from Best Model

In [None]:
# The best configuration is the one with the lowest test MAE.
best_heads = min(mae_results, key=mae_results.get)
best_preds = np.asarray(predictions[best_heads])
true_values = test_labels.numpy()

print(f"\nBest model: {best_heads} attention heads (Test MAE = {mae_results[best_heads]:.4f})")

# Euclidean distance between prediction and ground truth for every test node,
# computed in one vectorised call instead of one TensorFlow op per row.
l2_errors = np.linalg.norm(best_preds - true_values, axis=1)

errors = [
    {
        "index": i,
        "prediction": pred,
        "ground_truth": true,
        "l2_error": l2_error,
    }
    for i, (pred, true, l2_error) in enumerate(zip(best_preds, true_values, l2_errors))
]

# Ascending order: the examples printed below are the most accurate ones.
errors_sorted = sorted(errors, key=lambda item: item["l2_error"])

num_examples = 5
for i, item in enumerate(errors_sorted[:num_examples]):
    pred = item["prediction"]
    true = item["ground_truth"]
    l2_error = item["l2_error"]
    print(f"Example {i+1}:")
    print(f"\tPrediction   = ({pred[0]:.2f}, {pred[1]:.2f})")
    print(f"\tGround Truth = ({true[0]:.2f}, {true[1]:.2f})")
    print(f"\tL2 Error     = {l2_error:.2f} units")
    print("---" * 20)