# Determining the Available Shape IDs of a Given Route in the GTFS Realtime Data Sample

## The sample was requested from the supervisor; the data shared most likely covers a single day.
## The sample was extracted from the TransLink Open Data Portal using an application created by Lim. A.

In [1]:
# Import pandas for using dataframe
import pandas as pd
# Import glob libraries
import glob
# Import re to enable regex
import re

# import StringIO for reading files from your machine
from io import StringIO

# Cap the number of rows pandas will print for a DataFrame at 200
pd.options.display.max_rows = 200

In [2]:
# Directory holding the TU feed CSV files. Three daily samples are available;
# leave exactly one `tu_source_path` assignment uncommented to select a sample.
# The 26-8-2021 sample is the one used for the experimental Tableau visualisation.
# NOTE(review): an earlier note here said the 2-8-2021 feed was chosen to avoid
# overwriting the 26-8-2021 data in the database, yet the active path below is
# 26-8-2021 -- confirm which sample is intended before re-running.

# tu_source_path = r'/Users/francisroberto/OneDrive - Queensland University of Technology/QUT/IFN712/GTFS DATA/TU feeds 2-8-2021'
# tu_source_path = r'/Users/francisroberto/OneDrive - Queensland University of Technology/QUT/IFN712/GTFS DATA/TU feeds 3-8-2021'
tu_source_path = r'/Users/francisroberto/OneDrive - Queensland University of Technology/QUT/IFN712/GTFS DATA/TU feeds 26-8-2021'

# Collect every CSV file in the chosen directory. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# anything built from this list reproducible between runs and machines.
tu_csv_files = sorted(glob.glob(tu_source_path + "/*.csv"))

## Input the route you want to search for in the `route` variable

In [3]:
# Route to search for: change the value assigned to `route` to look up a different route

route = "333-1886"



In [4]:
# Gather the rows belonging to the selected route from every feed file.
# The per-file matches are kept in a list and concatenated once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside the loop re-copies the accumulated frame on every iteration.
route_frames = []
for file in tu_csv_files:
    # Skip the first CSV column (position 0) and read the following 16 columns
    # as strings, so values such as route_id and shape_id are not parsed as numbers.
    df = pd.read_csv(file, usecols=range(1, 17), dtype='unicode', low_memory=False)
    newdf = df[df.route_id == route]
    route_frames.append(newdf)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when no
# CSV files were found; ignore_index=True renumbers the combined rows from 0.
if route_frames:
    result_df = pd.concat(route_frames, ignore_index=True)
else:
    result_df = pd.DataFrame()

In [5]:
# Preview the first rows of the combined result for the selected route
result_df.head()

Unnamed: 0,trip_id,start_time,start_date,route_id,stop_id,stop_sequence,arrival_delay,arrival_time,arrival_uncertainty,departure_delay,departure_time,departure_uncertainty,schedule_relationship,id,timestamp,shape_id
0,18316923-BT 21_22-AUG_FUL-Weekday-01,13:15:00,20210826,333-1886,3944,1.0,,,,0.0,1629947700.0,30.0,0.0,TU-18316923-BT 21_22-AUG_FUL-Weekday-01,1629945976,3330071
1,18316923-BT 21_22-AUG_FUL-Weekday-01,13:15:00,20210826,333-1886,3943,2.0,49.0,1629947809.0,30.0,49.0,1629947809.0,30.0,0.0,TU-18316923-BT 21_22-AUG_FUL-Weekday-01,1629945976,3330071
2,18316923-BT 21_22-AUG_FUL-Weekday-01,13:15:00,20210826,333-1886,10900,3.0,-16.0,1629947984.0,30.0,-16.0,1629947984.0,30.0,0.0,TU-18316923-BT 21_22-AUG_FUL-Weekday-01,1629945976,3330071
3,18316923-BT 21_22-AUG_FUL-Weekday-01,13:15:00,20210826,333-1886,11270,4.0,-85.0,1629948215.0,30.0,-85.0,1629948215.0,30.0,0.0,TU-18316923-BT 21_22-AUG_FUL-Weekday-01,1629945976,3330071
4,18316923-BT 21_22-AUG_FUL-Weekday-01,13:15:00,20210826,333-1886,11272,5.0,-84.0,1629948276.0,30.0,-84.0,1629948276.0,30.0,0.0,TU-18316923-BT 21_22-AUG_FUL-Weekday-01,1629945976,3330071


In [6]:
# Summarise the columns, non-null counts and dtypes of the combined result
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157761 entries, 0 to 157760
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   trip_id                157761 non-null  object
 1   start_time             157761 non-null  object
 2   start_date             157761 non-null  object
 3   route_id               157761 non-null  object
 4   stop_id                157567 non-null  object
 5   stop_sequence          157567 non-null  object
 6   arrival_delay          149610 non-null  object
 7   arrival_time           149610 non-null  object
 8   arrival_uncertainty    149610 non-null  object
 9   departure_delay        157529 non-null  object
 10  departure_time         157529 non-null  object
 11  departure_uncertainty  157529 non-null  object
 12  schedule_relationship  157567 non-null  object
 13  id                     157761 non-null  object
 14  timestamp              157761 non-null  object
 15  

## Listing the available shape IDs for a given route

In [7]:
# Print the distinct shape_id values present in the combined result
print(list(result_df['shape_id'].unique()))

['3330071', '3330083', '3330072', nan]


In [8]:
# Keep only the rows whose shape_id is '3330071'. Filtering first and copying the
# smaller result avoids duplicating the whole of result_df beforehand, while still
# producing an independent frame that can be changed without affecting result_df.
result_df_3330071 = result_df.loc[result_df['shape_id'] == '3330071'].copy()



In [9]:
# Print the distinct shape_id values remaining in result_df_3330071
print(list(result_df_3330071['shape_id'].unique()))

['3330071']


In [10]:
# Keep only the rows whose shape_id is '3330072'. Filtering first and copying the
# smaller result avoids duplicating the whole of result_df beforehand, while still
# producing an independent frame that can be changed without affecting result_df.
result_df_3330072 = result_df.loc[result_df['shape_id'] == '3330072'].copy()


In [11]:
# Keep only the rows whose shape_id is '3330083'. Filtering first and copying the
# smaller result avoids duplicating the whole of result_df beforehand, while still
# producing an independent frame that can be changed without affecting result_df.
result_df_3330083 = result_df.loc[result_df['shape_id'] == '3330083'].copy()


In [12]:
# Write result_df_3330071 to RT_3330071.csv in the working directory, omitting the index column
result_df_3330071.to_csv('RT_3330071.csv', index=False)

In [13]:
# Write result_df_3330072 to RT_3330072.csv in the working directory, omitting the index column
result_df_3330072.to_csv('RT_3330072.csv', index=False)

In [14]:
# Write result_df_3330083 to RT_3330083.csv in the working directory, omitting the index column
result_df_3330083.to_csv('RT_3330083.csv', index=False)