In [18]:
from google.transit import gtfs_realtime_pb2
import numpy as np
import gzip
import os
import datetime

# Directory holding the gzip-compressed GTFS-realtime feed files.
directory = '../data/30-small/'

# Rows of [key, ts, delay] are collected in a plain Python list and converted
# to an array once at the end. Calling np.append inside the loop copies the
# whole array for every new row, which makes the load quadratic in row count.
rows = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of `datapoints` reproducible from run to run.
for filename in sorted(os.listdir(directory)):
    # Skip the macOS Finder metadata file, which is not a feed.
    if filename == '.DS_Store':
        continue

    # Uncompress the file; it is closed as soon as the bytes have been read.
    with gzip.open(os.path.join(directory, filename), 'rb') as file:
        response = file.read()

    # Parse the protobuf payload using gtfs_realtime_pb2.
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response)

    # Every entity (trip) in a feed carries many stop_time_updates.
    for trip in feed.entity:
        updates = trip.trip_update.stop_time_update

        # Walk consecutive (stop i, stop i+1) pairs. The last stop is never
        # used as a start point because it has no departing edge; an empty
        # list yields an empty range.
        for i in range(len(updates) - 1):
            current_stop = updates[i]
            next_stop = updates[i + 1]

            # Contextual metadata for this delay data point: the arrival time
            # at the second stop and the ids of both stops on the edge.
            ts = int(next_stop.arrival.time)
            stop_id1 = current_stop.stop_id
            stop_id2 = next_stop.stop_id

            # Edge key: the two stop ids concatenated and read as one integer.
            # NOTE(review): this raises ValueError for non-numeric stop ids and
            # is only unambiguous if all ids have the same length - confirm.
            key = int(str(stop_id1) + str(stop_id2))

            # Delay picked up on the edge: arrival delay at the second stop
            # minus arrival delay at the first.
            delay = int(next_stop.arrival.delay - current_stop.arrival.delay)

            rows.append([key, ts, delay])

# Build the (n, 3) array in one step. Keys too large for a 64-bit integer make
# NumPy fall back to dtype=object (as in the recorded output of this notebook).
# With no rows, keep the original empty (0, 3) integer array.
datapoints = np.array(rows) if rows else np.empty((0, 3), int)

In [34]:
import pandas as pd

# Pull the three columns out of `datapoints` in one step:
# column 0 -> ID_col, column 1 -> TS_col, column 2 -> Delay_col.
ID_col, TS_col, Delay_col = (datapoints[:, col] for col in range(3))

# Column-name -> values mapping used to build the frame.
d = dict(Timestamp=TS_col, Delay=Delay_col)

# One row per data point, indexed by the values in ID_col.
df = pd.DataFrame(d, index=ID_col)
df

Unnamed: 0,Timestamp,Delay
90220037931390029022003660600002,1580424310,-81
90220037812240029022003781221002,1580424223,-2
90220037812210029022003781051002,1580424282,20
90220037810510029022003781216002,1580424336,-7
90220037812160029022003781210001,1580424450,-69
...,...,...
90220037000520029022003700051002,1580426401,0
90220037000510029022003700008002,1580426520,0
90220037000080029022003700445004,1580426627,0
90220037004450049022003700444002,1580426728,0


array([90220037931390029022003660600002, 90220037812240029022003781221002,
       90220037812210029022003781051002, ...,
       90220037000080029022003700445004, 90220037004450049022003700444002,
       90220037004440029022003700441001], dtype=object)

In [9]:
# Add the row [2, 24, 33, 44] below the existing rows of `arr`.
# `arr` must already exist (it is not defined in this cell), and re-running
# this cell adds the row again.
arr = np.concatenate((arr, np.array([[2, 24, 33, 44]])), axis=0)

In [10]:
# Display the current contents of `arr`.
arr

array([[ 1,  2,  3,  4],
       [ 2, 24, 33, 44]])

In [13]:
# Display `arr` again; the recorded output matches the previous display.
arr

array([[ 1,  2,  3,  4],
       [ 2, 24, 33, 44]])