## Import packages

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import os
import warnings

import glob

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 6)
pd.set_option("display.max_rows", 6)
np.random.seed(2)

## Get files

In [3]:
edge_files = glob.glob(os.path.join("dataset", "*.edges"))
node_files = glob.glob(os.path.join("dataset", "*.nodes"))


## Create dataset

In [10]:
edge_dfs = []
for file_path in edge_files:
    df = pd.read_csv(
        file_path,
        sep=", ",
        header=None,
        names=["target", "source"]
    )
    edge_dfs.append(df)

node_dfs = []
for file_path in node_files:
    df = pd.read_csv(
        file_path,
        sep=",",
        header=None,
        names=["node id", "current x", "current y", "previous x", "previous y", "future x", "future y"],
        na_values="_"
    )
    node_dfs.append(df)

edges = pd.concat(edge_dfs, ignore_index=True)
nodes = pd.concat(node_dfs, ignore_index=True)

In [11]:
print(edges)
print(nodes)

        target    source
0     19585800  19590700
1     19585800  19595200
2     19590700  19592400
...        ...       ...
3456  19592800        -1
3457  20014600        -1
3458  20015100        -1

[3459 rows x 2 columns]
       node id  current x  current y  ...  previous y  future x  future y
0     19502500    40972.0   -16957.0  ...    -16957.0   41185.0  -16480.0
1     19585800    12688.0    -6816.0  ...     -6816.0   13381.0   -7427.0
2     19590400   -16367.0    21644.0  ...     21644.0       NaN       NaN
...        ...        ...        ...  ...         ...       ...       ...
2306  20015100   -19196.0     3668.0  ...      3041.0  -19196.0    3668.0
2307  20015600    17568.0   -13258.0  ...    -14736.0   17568.0  -13258.0
2308  20015900    16994.0   -12152.0  ...         NaN   16994.0  -12152.0

[2309 rows x 7 columns]


## Split the dataset

In [12]:
# Obtain random indices
random_indices = np.random.permutation(range(nodes.shape[0]))

# 50/50 split
train_data = nodes.iloc[random_indices[: len(random_indices) // 2]]
test_data = nodes.iloc[random_indices[len(random_indices) // 2 :]]

In [13]:
print(train_data)
print(test_data)

       node id  current x  current y  ...  previous y  future x  future y
322   19585800    15484.0    -9938.0  ...     -8963.0   16229.0  -10621.0
1734  20002900    34808.0   -22367.0  ...    -22301.0   34858.0  -22511.0
1447  20002900    34478.0   -21915.0  ...    -22065.0   34414.0  -22177.0
...        ...        ...        ...  ...         ...       ...       ...
1187  19595300    30636.0   -16351.0  ...    -16060.0   31589.0  -16622.0
1480  19595300    37002.0   -18290.0  ...    -18035.0   38039.0  -18755.0
1755  19595800    41817.0   -21716.0  ...    -20817.0   42670.0  -22344.0

[1154 rows x 7 columns]
       node id  current x  current y  ...  previous y  future x  future y
1823  20000700    40022.0   -18541.0  ...    -18926.0   41085.0  -18375.0
812   20001100    47054.0   -19951.0  ...    -20969.0       NaN       NaN
1316  20001800    34571.0   -18728.0  ...    -18346.0   35204.0  -19160.0
...        ...        ...        ...  ...         ...       ...       ...
1618  2000180

In [14]:
print(nodes.sort_values("node id").iloc[:, 1:-1].astype(np.float32).dtypes)

current x     float32
current y     float32
previous x    float32
previous y    float32
future x      float32
dtype: object


## Prepare the graph

In [16]:
# Obtain paper indices which will be used to gather node states
# from the graph later on when training the model
train_indices = train_data["node id"].to_numpy()
test_indices = test_data["node id"].to_numpy()

# Obtain ground truth labels corresponding to each paper_id
train_labels = train_data[["future x", "future y"]].to_numpy()
test_labels = test_data[["future x", "future y"]].to_numpy()

# Define graph, namely an edge tensor and a node feature tensor
edges = tf.convert_to_tensor(edges[["target", "source"]])
node_states = tf.convert_to_tensor(nodes.sort_values("node id").iloc[:, 1:-1])

# Print shapes of the graph
print("Edges shape:\t\t", edges.shape)
print("Node features shape:", node_states.shape)

Edges shape:		 (3459, 2)
Node features shape: (2309, 5)
