In [1]:
import urllib.request as urllib
import zipfile
# from zipfile import ZipFile
from io import BytesIO
import pandas as pd
from datetime import date, timedelta
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
import os
import gc
import math
import openpyxl
from os.path import basename
from datetime import time,timedelta
from nptime import nptime

In [2]:
def convert_time(time_string):
    time_split = time_string.split(':')
    hour = int(time_split[0])
    if hour >= 24:
        hour -= 24
        time_split[0] = '%.2d'% (hour)
    else:
        hour = '%.2d'% (hour) 
    hour_var = ':'.join(time_split)

    return hour_var

# intervals needs to be integer that can be divided by 1440 (minutes in day) with remainder as 0
def time_to_intervals_string(time_string,interval_in_minutes):
    total_minutes_in_a_day = 24 * 60

    if total_minutes_in_a_day%interval_in_minutes != 0:
        print('Invalid Intervals!')
    else:
        time_split = time_string.split(':')
        hour = int(time_split[0])
        minute = int(time_split[1])
        second = int(time_split[2])

        interval =  (hour*60 + minute) // interval_in_minutes

        start_total_minutes = interval_in_minutes * interval
        # end_total_minutes = interval_in_minutes * (interval + 1)

        start_hour = start_total_minutes // 60
        start_minute = start_total_minutes % 60
        start_time = nptime(start_hour,start_minute,second)
        end_time = start_time + timedelta(minutes=interval_in_minutes)
        interval_string = str(start_time) # +' - ' + str(end_time)
        # end_hour = end_total_minutes // 60
        # end_minute = end_total_minutes % 60

        # interval_string = '%.2d:%.2d:00 - %.2d:%.2d:00'% (start_hour,start_minute,end_hour,end_minute)

    return interval_string

def dateRange_to_dates(raw_calendar):
    num_service_id = raw_calendar.shape[0] # number of service_ids
    list = []
    i = 0
    while i < num_service_id:
        start_date = str(raw_calendar.iloc[i,-2])
        end_date = str(raw_calendar.iloc[i,-1])
        dates = pd.date_range(start_date,end_date)
        i += 1
        list.append(dates)

    return list

def load_file(file_folder_location, file_name):
    with open(os.path.join(file_folder_location,file_name), 'rb') as f:
        zip_file = zipfile.ZipFile(BytesIO(f.read()))
        
    dfst = {}
    for name in zip_file.namelist():
        dfst[name.lower()[:-4]] = pd.read_csv(zip_file.open(name))

    return dfst

def download_zipfile(file, file_folder_location, file_name):
        with open(os.path.join(file_folder_location,file_name), 'wb') as f:
    # Set the chunk size
            chunk_size = 4096

  # Read the file in chunks
            while True:
    # Read a chunk
                chunk = file.read(chunk_size)

    # Break if the chunk is empty
                if not chunk:
                    break

    # Write the chunk to the file
                f.write(chunk)

def save_csvFile(file_folder_location, file_name, df):
    file_path = os.path.join(file_folder_location,file_name)
    df.to_csv(file_path, index = False, chunksize=1000)

def delete_file (file_folder_location, file_name):
    file_path = os.path.join(file_folder_location,file_name)
    try:
    # Try to delete the file
        os.remove(file_path)
    except FileNotFoundError:
    # Do nothing if the file doesn't exist
        pass

def id_to_string(dfst):
    dfst['stop_times']['stop_id'] = dfst['stop_times']['stop_id'].astype('string')
    dfst['stop_times']['trip_id'] = dfst['stop_times']['trip_id'].astype('string')
    dfst['stops']['stop_id'] = dfst['stops']['stop_id'].astype('string')
    dfst['trips']['trip_id'] = dfst['trips']['trip_id'].astype('string')
    dfst['trips']['service_id'] = dfst['trips']['service_id'].astype('string')
    dfst['trips']['route_id'] = dfst['trips']['route_id'].astype('string')
    dfst['calendar']['service_id'] = dfst['calendar']['service_id'].astype('string')
    dfst['routes']['route_id'] = dfst['routes']['route_id'].astype('string')
    dfst['routes']['agency_id'] = dfst['routes']['agency_id'].astype('string')
    dfst['stop_times']['departure_time'] = dfst['stop_times']['departure_time'].astype('string')

def new_calendar(calendar_dfst):
    date_list = dateRange_to_dates(calendar_dfst)
    service_id = calendar_dfst['service_id'].tolist()
    cal = pd.DataFrame({'service_id': service_id, 'date':date_list}).explode('date')
    #add date column, explode by date
    calendar_file = calendar_dfst.merge(cal) 

    #convert 1 to the respective column name (day name)
    for col in calendar_file.columns[1:8]:
        calendar_file[col] = calendar_file[col].apply(lambda x: col if x == 1 else 0)

    calendar_file['day'] = calendar_file['date'].apply(lambda x: x.strftime("%A").lower())
    calendar_file = calendar_file.loc[calendar_file.iloc[:,1:8].apply(lambda x: x == calendar_file['day']).any(1)]


    # calendar_file = calendar_file.iloc[:,[0,-2]] # trim calendar file
    calendar_file = calendar_file.loc[:,['service_id','date']]
    
    #merge with calendar_dates file

    return calendar_file

def filtered_calendar (calendar_dfst, calendar_dates_dfst):
    calendar_file = new_calendar(calendar_dfst)

    #convert date type to datetime
    calendar_dates_dfst['date'] = pd.to_datetime(calendar_dates_dfst['date'],format='%Y%m%d')

    #merge calendar_file with calendar_dates file
    calendar_file = pd.merge(calendar_file, calendar_dates_dfst, how='outer', left_on=['service_id','date'], right_on=['service_id','date'])

    #filter out the service dates that are removed (exception type = 2)
    calendar_file = calendar_file.loc[calendar_file['exception_type'] != 2]
    calendar_file = calendar_file.loc[:,['service_id','date']]
        
    return calendar_file

def dedup_calendar(start_date_from_last_file,calendar_dfst):
    #filter out the servcie dates by start date from the previous file to remove duplicates    
    if start_date_from_last_file != '':
        calendar_dfst = calendar_dfst[calendar_dfst['date'] < start_date_from_last_file]      

    return calendar_dfst

def routes_by_agencyId (route_dfst, agencyIds_list): #filter routes table by agency id
    frames = []
    for agency_id in agencyIds_list:
        frames.append(route_dfst[route_dfst['agency_id'] == agency_id])
    #routes = route_dfst[route_dfst['agency_id'] == agencyId]
    # train agency id 1 & 2
    routes = pd.concat(frames,ignore_index=True)
    
    return routes

def add_to_zipfile(file_folder_location, zip_name, csv_name):
    filepath = os.path.join(file_folder_location, zip_name)
    #append to the existing zip file.
    #create a new zip file if doesn't exsit already
    #zipfile.ZIP_DEFLATED <- to compress the file
    with zipfile.ZipFile(filepath, 'a', zipfile.ZIP_DEFLATED) as z:
        path = os.path.join(file_folder_location, csv_name)
        z.write(path, basename(path))
    

In [3]:
#Get stop_times information by dates, agency_id
def get_merged_file(dfst, calendar_file, agencyIds_list):

    routes = routes_by_agencyId(route_dfst=dfst['routes'], agencyIds_list=
    agencyIds_list)
    #train_route_ids = train_route['route_id']

    # Get trip_ids by filtering Trips table using route_id & service_id
    trips = dfst['trips'].loc[dfst['trips']['route_id'].isin(routes['route_id'])]
    trips = trips.loc[trips['service_id'].isin(calendar_file['service_id'])]
    trips = trips.merge(routes, how = 'left', on = 'route_id')

    # get the final stop_times by filtering stop_time table using trip_id
    stop_times = dfst['stop_times'].loc[dfst['stop_times']['trip_id'].isin(trips['trip_id'])]
    # merge with stops table to get stop name
    stop_times = stop_times.merge(dfst['stops'], how = 'left', on = 'stop_id')
    stop_times = stop_times.loc[:,['trip_id','stop_id','stop_name','departure_time']]

    #final merge
    #merge trip table with stop_times table
    final_merge = stop_times.merge(trips, how = 'left', on = 'trip_id')
    final_merge = final_merge.loc[:,['stop_id','stop_name','service_id','trip_id','departure_time','trip_headsign','agency_id']]

    final_merge = final_merge.merge(calendar_file, how = 'left', on = 'service_id')
    
    return final_merge

def datetime_formatting(merged_file):
    #add column 'next_day' if departure_time is after 23:59:00, assign 1 as true, 0 as false
    merged_file['next_day'] = merged_file['departure_time'].apply(lambda x: 1 if int(x.split(':')[0]) > 23 else 0)

    #add column 'service_date'. Its value is date + next_day (i.e. if date is 2022/01/01 and the departure_time is after 24:10:00, next_day is 1 and service_date is 2022/01/02)
    merged_file['service_date'] = merged_file['date'] + pd.to_timedelta(merged_file['next_day'],unit='D')

    #convert departure_time to 24HR format
    merged_file['departure_time'] = merged_file['departure_time'].apply(lambda x: convert_time(x))

def add_intervals(merged_file, interval):
    #add intervals based on departure_time.
    merged_file['interval'] = merged_file['departure_time'].apply(lambda x: time_to_intervals_string(x, interval))

def add_direction(merged_file):
    #add direction column to the file
    #into city is 1 | out of city is 0
    city = re.compile('.*City')
    to_city_headsign = list(filter(city.match, merged_file['trip_headsign'].unique()))
    to_city_headsign.extend(['Melbourne','Flinders Street','Southern Cross'])
    merged_file['direction'] = merged_file['trip_headsign'].apply(lambda x: 1 if x in(to_city_headsign) else 0)


In [4]:
page_number = 15
page_link = 'https://transitfeeds.com/p/ptv/497?p='
folder_location = 'C:/Users/Administrator/Desktop/gtfs/'
head = 'https://transitfeeds.com'
start_date = ''

# agency_id for train
Vline = '1'
Metro = '2'
ids = [Vline, Metro]

interval_mins = 30

page = 1
while page <= page_number:
    url = page_link + str(page)
    req = requests.get(url)
    soup = BeautifulSoup(req.text,'html.parser')


    for link in soup.find_all('a', text='Download'):
        #download the zipfile into the directory
        name = str(link).split('/')
        name = name[4]
        zipFile_name = name +'.zip'
        result_filename = name + '.csv'

        print(zipFile_name)
        link = head + link.get('href')
        print(link)
        zipFile = urllib.urlopen(link)
        download_zipfile(zipFile, folder_location, zipFile_name)
        
        #load files
        dfst = load_file(folder_location,zipFile_name)    

        #setup files
        id_to_string(dfst)

        #setup calendar file   
        calendar = filtered_calendar(calendar_dfst=dfst['calendar'], calendar_dates_dfst=dfst['calendar_dates'])
        start = calendar['date'].min()
        calendar = dedup_calendar(start_date_from_last_file=start_date,calendar_dfst=calendar)
        start_date = start
        print('Calendar - Done!')

        final_merge = get_merged_file(dfst=dfst,calendar_file=calendar,agencyIds_list=ids)
        datetime_formatting(final_merge)
        add_intervals(final_merge,interval_mins)
        add_direction(final_merge)
        print('Merge done!')

        #save mergedfile just in case program crashes
        # merged_filename = name + '_merged.csv'
        # save_csvFile(file_folder_location=folder_location,file_name=merged_filename,df=final_merge)

        #get the count number of services by stop, date, interval, directiobn
        result = final_merge.groupby(['stop_id','stop_name','date','interval','trip_headsign','direction','agency_id'])['interval'].size().to_frame(name = 'count').reset_index()
        print('Result file done!')
        #save the result as csv
        save_csvFile(file_folder_location=folder_location,file_name=result_filename,df=result)
        add_to_zipfile(file_folder_location=folder_location, zip_name='result.zip', csv_name=result_filename)
        print('Result saved!')


        print(name + ' - Completed')
        delete_file(file_folder_location=folder_location, file_name=zipFile_name)
        # delete_file(file_folder_location=folder_location,file_name=merged_filename)
        gc.collect()
    

    print('Page %d is completed'% (page))
    page += 1

print('All done')

  calendar_file = calendar_file.loc[calendar_file.iloc[:,1:8].apply(lambda x: x == calendar_file['day']).any(1)]


Calendar - Done!
Merge done!
Result file done!
Result saved!
20200110 - Completed
Page 15 is completed
All done


In [5]:
#concat all individual files into one

file_folder_location = 'C:/Users/Administrator/Desktop/gtfs/'
file_name = 'results.zip'

with open(os.path.join(file_folder_location,file_name), 'rb') as f:
    zip_file = zipfile.ZipFile(BytesIO(f.read()))
    
dfst = {}
frames = []
for name in zip_file.namelist():
    df_name = name.lower()[:-4]
    dfst[df_name] = pd.read_csv(zip_file.open(name))
    frames.append(dfst[df_name])

final_data = pd.concat(frames,ignore_index=True)

save_csvFile(file_folder_location=file_folder_location,file_name='final_result.csv',df=final_data)
add_to_zipfile(file_folder_location=file_folder_location, zip_name='final_result.zip', csv_name='final_result.csv')

In [41]:
# test = final_data.loc[final_data['trip_headsign'] == 'Flinders Street Southern Cross Station']
test = final_data.loc[final_data['agency_id'] == 2]

In [25]:
uni_headsign = final_data['trip_headsign'].unique()
uni_headsign

array(['Sunbury', 'City (Flinders Street)', 'Seymour', 'Melbourne',
       'Shepparton', 'Frankston', 'Stony Point', 'Glen Waverley',
       'Hurstbridge', 'Sandringham', 'Belgrave', 'Alamein', 'Lilydale',
       'Craigieburn', 'Mernda', 'Pakenham', 'Werribee', 'Upfield',
       'Williamstown', 'Flinders Street', 'Cranbourne', 'Ballarat',
       'Ararat', 'Geelong', 'Bendigo', 'Echuca/Moama', 'Albury',
       'Warrnambool', 'Swan Hill', 'Nhill', 'Maryborough',
       'Showgrounds/Flemington', 'Traralgon', 'Bairnsdale', 'City',
       'Clockwise', 'Richmond (MCG)', 'City (Southern Cross)',
       'South Morang/Mernda', 'South Morang', 'Marshall',
       'Melbourne Broadmeadows', 'Melbourne Seymour',
       'Seymour Broadmeadows', 'Shepparton Seymour',
       'Flinders Street Southern Cross Station', 'Melbourne Melton',
       'Ballarat Melton', 'Ararat Ballarat', 'Melbourne Ballarat',
       'Melbourne Sunbury', 'Bendigo Sunbury',
       'Melbourne Bendigo or Heathcote',
       'Echuca/

In [42]:
test['trip_headsign'].unique()

array(['Sunbury', 'City (Flinders Street)', 'Frankston', 'Stony Point',
       'Glen Waverley', 'Hurstbridge', 'Sandringham', 'Belgrave',
       'Alamein', 'Lilydale', 'Craigieburn', 'Mernda', 'Pakenham',
       'Werribee', 'Upfield', 'Williamstown', 'Flinders Street',
       'Cranbourne', 'Showgrounds/Flemington', 'Clockwise',
       'South Morang/Mernda', 'South Morang',
       'Flinders Street Southern Cross Station',
       'Showgrounds/Flemington Southern Cross Station'], dtype=object)

In [4]:
def interval_30m(time_string): 
    time_split = time_string.split(' - ')
    start_time = time_split[0]
    start = start_time.split(':')
    start_hr = int(start[0])
    start_min = int(start[1])

    end_time = time_split[1]
    end = end_time.split(':')    
    end_hr = int(end[0])
    end_min = int(end[1])

    #00:00 - 29:59 ; 30:00 - 59:59
    if start_min < 30:
        start_min = 0
        end_min = 29
    else:
        start_min = 30
        end_min = 59

    new_interval = '%.2d:%.2d:00 - %.2d:%.2d:59'% (start_hr,start_min,end_hr,end_min)

    return new_interval

In [5]:
folder_location = 'C:/Users/Administrator/Desktop/gtfs/'
file_name = 'final_result.zip'
result_filename = 'new_result.csv'

#dfst = load_file(file_folder_location,file_name)
with open(os.path.join(folder_location,file_name), 'rb') as f:
    zip_file = zipfile.ZipFile(BytesIO(f.read()))
    
dfst = pd.read_csv(zip_file.open('final_result.csv'))

In [6]:
dfst['30m_interval'] = dfst['interval'].apply(lambda x: interval_30m(x))

In [7]:
result = dfst.groupby(['stop_id','stop_name','date','30m_interval','direction'])['count'].sum().to_frame(name = '30m_count').reset_index()


In [10]:
save_csvFile(file_folder_location=folder_location,file_name=result_filename,df=result)
add_to_zipfile(file_folder_location=folder_location, zip_name='new_result.zip', csv_name=result_filename)

In [25]:
stops = result.loc[:,['stop_id','stop_name']]


In [27]:
stops = stops.groupby(['stop_id','stop_name']).size().to_frame(name = 'count').reset_index()

In [35]:
save_csvFile(file_folder_location=folder_location,file_name='stops.csv',df=stops)