# GTFS Static & Realtime parser

#### 1. Loop through all directories within _directory_ of (GTFS) static network data to:
* create an aggregated dictionary with all types of _trips_ in the public transport network (10737 in total in our case)
* create an aggregated dictionary with all active _stops_ in the public transport network (12854 in total in our case)


In [None]:
import csv
import os

# Build two lookup tables from every GTFS static feed found under `directory`:
#   stops       -- truncated stop_id -> {'stop_id', 'stop_name', 'stop_lat', 'stop_lon'}
#   stopsOnTrip -- trip_id -> list of {'stop_sequence', 'stop_id', 'arr_time', 'dep_time'}
stops = {}
ci=13  # number of leading stop_id characters kept as the stop key
stopsOnTrip={}

directory = '../data/static/'

for mapp in os.listdir(directory):
    print("Reading directory",mapp)

    try:
        # Paths are derived from `directory` so that changing it in one place is enough.
        # newline='' is what the csv module asks for; utf-8-sig reads UTF-8 with or
        # without a byte-order mark.
        with open(os.path.join(directory, mapp, 'stops.txt'), newline='', encoding='utf-8-sig') as file:
            # One dictionary entry per distinct (truncated) stop_id; the first
            # occurrence across all feeds wins.
            for row in csv.DictReader(file):
                stop_key = row['stop_id'][0:ci]
                if stop_key not in stops:
                    print("\t","Stop",stop_key,"added to stops dictionary")
                    stops[stop_key] = {'stop_id': stop_key, 'stop_name': row['stop_name'],
                                       'stop_lat': row['stop_lat'], 'stop_lon': row['stop_lon']}

        with open(os.path.join(directory, mapp, 'stop_times.txt'), newline='', encoding='utf-8-sig') as file:
            # Collect the stops of every unique trip.
            for row in csv.DictReader(file):
                trip_id = row['trip_id']
                stop_key = row['stop_id'][0:ci]
                stop_entry = {'stop_sequence': row['stop_sequence'], 'stop_id': stop_key,
                              'arr_time': row['arrival_time'], 'dep_time': row['departure_time']}

                trip_stops = stopsOnTrip.get(trip_id)
                if trip_stops is None:
                    # First row seen for this trip.
                    stopsOnTrip[trip_id] = [stop_entry]
                    print("\n","Key",trip_id,"added to stopsOnTrip dictionary")
                elif int(row['stop_sequence']) > len(trip_stops):
                    # Only accept a row whose sequence number lies beyond what is
                    # already stored; rows for a trip that was already loaded
                    # (e.g. from another feed directory) are skipped.
                    trip_stops.append(stop_entry)
                    print("\t","Stop",stop_key,"added to trip",trip_id)

    except NotADirectoryError:
        # `mapp` is a plain file inside `directory`, not a feed folder.
        continue

In [None]:
from google.transit import gtfs_realtime_pb2
import gzip
import os
from datetime import datetime
import csv
import numpy as np
import pandas as pd

# For every day-folder under `directory`: sample the stored GTFS-realtime
# TripUpdate responses, turn each complete trip into per-edge (stop -> next stop)
# delay data points, and write one CSV per folder to ../data/csv/.
# Relies on `stops`, `stopsOnTrip` and `ci` built by the static-data cell.
directory = '../data/tripu/'
hms = '%H:%M:%S'  # no longer used in this cell; kept defined in case other cells reference it


def _gtfs_seconds(clock):
    """Return the number of seconds represented by an 'H:MM:SS' / 'HH:MM:SS' string.

    The hour is not limited to 0-23: GTFS schedules express times after
    midnight of the service day as 24:xx:xx, 25:xx:xx, ... so the value is
    computed arithmetically instead of being parsed as a wall-clock time.
    """
    hours, minutes, seconds = (int(part) for part in clock.split(':'))
    return hours * 3600 + minutes * 60 + seconds


# Loop through all folders (days) in directory
for folder in os.listdir(directory):
    print("Reading directory",folder)
    datapoints = []
    read_trips = set()

    try:
        # Only every 15th stored request is processed.
        filenames = os.listdir(directory + folder)[::15]
    except NotADirectoryError:
        # `folder` is a plain file, not a day-folder.
        continue

    # Loop through the sampled files (requests) of this folder (day)
    for filename in filenames:
        try:
            # Uncompress the stored response; unreadable entries are skipped
            with gzip.open(directory + folder + "/" + filename, 'rb') as file:
                response = file.read()
        except OSError:
            continue

        # Parse protobuf payload using gtfs_realtime_pb2
        feed = gtfs_realtime_pb2.FeedMessage()
        feed.ParseFromString(response)

        print("Reading filename: " + filename, "(" + str(len(feed.entity)) + " entities)")

        for trip in feed.entity:
            this_trip_update = trip.trip_update
            this_trip_id = this_trip_update.trip.trip_id
            print('\t','Reading trip',this_trip_id)
            if this_trip_id in read_trips:
                continue

            try:
                updates = list(this_trip_update.stop_time_update)
                scheduled_stops = stopsOnTrip[this_trip_id]  # KeyError -> trip unknown to static data

                # Only use an update that covers as many stops as the static
                # schedule lists for the trip, to avoid duplicate data
                if len(updates) != len(scheduled_stops):
                    continue
                print('\t','Adding data from trip',this_trip_id)

                # Walk over consecutive (current, next) stop_time_update pairs
                for cur_update, nxt_update in zip(updates, updates[1:]):
                    cur_stop_id = cur_update.stop_id[0:ci]
                    nxt_stop_id = nxt_update.stop_id[0:ci]
                    cur_stop_metadata = stops[cur_stop_id]
                    nxt_stop_metadata = stops[nxt_stop_id]

                    # Delay data point: arrival-delay difference between the two stops
                    delay = nxt_update.arrival.delay - cur_update.arrival.delay

                    # Scheduled duration between current and next stop. The modulo keeps
                    # the previous wrap-around result for ordinary times while handling
                    # hours >= 24 correctly (they used to be rewritten to hour 00).
                    arrTimeNxt = scheduled_stops[nxt_update.stop_sequence-1]['arr_time']
                    arrTimeCur = scheduled_stops[cur_update.stop_sequence-1]['arr_time']
                    scheduled_time = (_gtfs_seconds(arrTimeNxt) - _gtfs_seconds(arrTimeCur)) % 86400

                    # Contextual metadata
                    ts = int(nxt_update.arrival.time)
                    key = "{}/{}".format(cur_stop_id, nxt_stop_id)

                    # Field order here fixes the column indices used below
                    datapoints.append((
                        key,
                        ts,
                        cur_stop_metadata['stop_name'],
                        nxt_stop_metadata['stop_name'],
                        delay,
                        cur_stop_metadata['stop_lon'],
                        cur_stop_metadata['stop_lat'],
                        nxt_stop_metadata['stop_lon'],
                        nxt_stop_metadata['stop_lat'],
                        cur_stop_metadata['stop_id'],
                        nxt_stop_metadata['stop_id'],
                        scheduled_time,
                    ))

                read_trips.add(this_trip_id)
                print("\t","Storing delays for trip_id",this_trip_id,"with",len(updates),"out of",len(scheduled_stops),"total stops")

            except KeyError:
                # Trip or stop missing from the static lookup tables
                continue

    if not datapoints:
        # An empty list would give a 1-D array and make the column slicing below fail
        print("No data points collected in directory",folder,"- skipping CSV export")
        continue

    print("==========================================")
    print("Creating np.array from data in directory",folder,"...")
    npdata = np.array(datapoints, dtype=str)

    print("Creating pandas dataframe with",len(npdata[:,0]),"rows ...")
    # Convert the timestamps to numbers before to_datetime: combining unit='s'
    # with string input is deprecated in recent pandas releases
    timestamps = pd.to_datetime(pd.to_numeric(npdata[:,1]), unit='s', origin='unix')

    d = {'edge_id': npdata[:,0],
         'source_id': npdata[:,9],
         'source_name': npdata[:,2],
         'source_lon': npdata[:,5],
         'source_lat': npdata[:,6],
         'sink_id': npdata[:,10],
         'sink_name': npdata[:,3],
         'sink_lon': npdata[:,7],
         'sink_lat': npdata[:,8],
         'UNIX_TS': npdata[:,1],
         'TIME': timestamps.time,
         'DATE': timestamps.date,
         'scheduled_duration': pd.to_numeric(npdata[:,11]),
         'delay': pd.to_numeric(npdata[:,4])}
    df = pd.DataFrame(data=d)
    # drop_duplicates returns a new frame; the result has to be kept
    df = df.drop_duplicates()

    print("Creating " + folder + ".csv")
    df.to_csv('../data/csv/' + (str(folder) + '.csv'), encoding='utf-8', index=False)
    print("Done!")
    print("==========================================")