In [1]:
import numpy as np
import pandas as pd

In [17]:
# Toy input: one row per (pickup_datetime, edge_indices) pair -- two timestamps
# times three edges -- carrying seven numeric feature columns.
data = dict(
    pickup_datetime=[
        stamp
        for stamp in ('2024-01-01 00:00:00', '2024-01-01 01:00:00')
        for _ in range(3)
    ],
    edge_indices=[0, 1, 2, 0, 1, 2],
    counts=[10, 20, 30, 40, 50, 60],
    total_amount=[100, 200, 300, 400, 500, 600],
    tip_amount=[10, 20, 30, 40, 50, 60],
    fare_amount=[10, 20, 30, 40, 50, 60],
    trip_distance=[10, 15, 20, 25, 30, 35],
    passenger_count=[2, 3, 4, 5, 6, 7],
    trip_duration=[30, 45, 60, 75, 90, 105],
)

# Long-format frame that the later cells pivot and reshape.
ts = pd.DataFrame(data)

In [56]:
# Feature columns to carry into the array. The pivot is built with sort=False,
# and the displayed pivot shows the features in this same order.
output_columns = [
    'counts',
    'total_amount',
    'tip_amount',
    'fare_amount',
    'trip_distance',
    'passenger_count',
    'trip_duration',
]

# Long -> wide: one row per pickup_datetime and one column per
# (feature, edge_indices) pair; aggfunc='first' takes the first value found
# for each cell.
pivoted_ts = pd.pivot_table(
    ts,
    index='pickup_datetime',
    columns='edge_indices',
    values=output_columns,
    aggfunc='first',
    sort=False,
)

# Keep only the values as a 2-D NumPy array (rows = time steps).
edge_features = pivoted_ts.to_numpy()
edge_features.shape

(2, 21)

In [57]:
# Display the wide frame; its columns form a two-level (feature, edge_indices) header.
pivoted_ts

Unnamed: 0_level_0,counts,counts,counts,total_amount,total_amount,total_amount,tip_amount,tip_amount,tip_amount,fare_amount,fare_amount,fare_amount,trip_distance,trip_distance,trip_distance,passenger_count,passenger_count,passenger_count,trip_duration,trip_duration,trip_duration
edge_indices,0,1,2,0,1,2,0,1,2,0,...,2,0,1,2,0,1,2,0,1,2
pickup_datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-01-01 00:00:00,10,20,30,100,200,300,10,20,30,10,...,30,10,15,20,2,3,4,30,45,60
2024-01-01 01:00:00,40,50,60,400,500,600,40,50,60,40,...,60,25,30,35,5,6,7,75,90,105


In [58]:
# Sizes of the three target axes: time steps, edges, and features per edge.
num_times = ts['pickup_datetime'].unique().size
num_edges = ts['edge_indices'].unique().size
num_outputs = len(output_columns)

# Naive attempt: cut each flat row straight into (num_edges, num_outputs).
# The shape comes out as intended, but each flat row is ordered feature by
# feature (all edges of one feature, then the next feature), so the values
# land in the wrong slots -- the following cells show the mismatch and fix it.
edge_features_r = np.reshape(edge_features, (num_times, num_edges, num_outputs))
edge_features_r.shape

(2, 3, 7)

In [59]:
# Flat feature row for the first time step, before any reshaping.
edge_features[0, :]

array([ 10,  20,  30, 100, 200, 300,  10,  20,  30,  10,  20,  30,  10,
        15,  20,   2,   3,   4,  30,  45,  60])

In [60]:
# Same time step after the naive reshape: one row per edge, one column per feature slot.
edge_features_r[0]

array([[ 10,  20,  30, 100, 200, 300,  10],
       [ 20,  30,  10,  20,  30,  10,  15],
       [ 20,   2,   3,   4,  30,  45,  60]])

In [61]:
# Notice that we would have gotten the right result if the i-th row were made up of
# the elements (i, i+3, i+6, ...) of the original edge_features array for each time step
# (the stride of 3 is num_edges).

# We do this by creating the appropriate index array, applying it to each time step, and then reshaping as before.

In [63]:
# Lay the flat column positions out as a (num_outputs, num_edges) grid.
idx = np.arange(num_outputs * num_edges).reshape(num_outputs, num_edges)

# Reading the grid column by column (transpose, then flatten) lists the
# positions (i, i + num_edges, i + 2*num_edges, ...) for i = 0, 1, ...
idx.T.ravel()

array([ 0,  3,  6,  9, 12, 15, 18,  1,  4,  7, 10, 13, 16, 19,  2,  5,  8,
       11, 14, 17, 20])

In [64]:
# Pick every row's columns in the reordered sequence from idx, then split each
# row into (num_edges, num_outputs) blocks.
np.take(edge_features, idx.T.ravel(), axis=1).reshape(num_times, num_edges, num_outputs)

array([[[ 10, 100,  10,  10,  10,   2,  30],
        [ 20, 200,  20,  20,  15,   3,  45],
        [ 30, 300,  30,  30,  20,   4,  60]],

       [[ 40, 400,  40,  40,  25,   5,  75],
        [ 50, 500,  50,  50,  30,   6,  90],
        [ 60, 600,  60,  60,  35,   7, 105]]])

In [65]:
# Original rows grouped by edge (presumably a cross-check against the reshaped array).
# kind='stable' keeps rows that share an edge in their original order; the default
# quicksort does not guarantee the order of ties.
ts.sort_values('edge_indices', kind='stable')

Unnamed: 0,pickup_datetime,edge_indices,counts,total_amount,tip_amount,fare_amount,trip_distance,passenger_count,trip_duration
0,2024-01-01 00:00:00,0,10,100,10,10,10,2,30
3,2024-01-01 01:00:00,0,40,400,40,40,25,5,75
1,2024-01-01 00:00:00,1,20,200,20,20,15,3,45
4,2024-01-01 01:00:00,1,50,500,50,50,30,6,90
2,2024-01-01 00:00:00,2,30,300,30,30,20,4,60
5,2024-01-01 01:00:00,2,60,600,60,60,35,7,105
