In [1]:
import pandas as pd
import partridge as ptg
import datetime
import numpy as np

import gtfs_utils

In [2]:
from os.path import join

In [3]:
import re
import datetime

In [4]:
def single_timestr_to_seconds(x, *, inverse=False, mod24=False, only_mins=False):
    """
    Convert a single GTFS-style time string to seconds past midnight,
    or (with ``inverse``) convert seconds back to an ``HH:MM:SS`` string.

    In keeping with GTFS standards, the hours entry may be greater than
    23.

    Parameters
    ----------
    x : str or int-like
        ``HH:MM:SS`` string (``HH:MM`` when ``only_mins``), or a number of
        seconds when ``inverse`` is set.
    inverse : bool
        If true, treat ``x`` as seconds and return an ``HH:MM:SS`` string.
        ``only_mins`` is ignored in this direction.
    mod24 : bool
        If true, reduce the number of seconds modulo ``24*3600``
        (before formatting, when ``inverse`` is also set).
    only_mins : bool
        If true, ``x`` is expected to be ``HH:MM`` (no seconds field).

    Returns
    -------
    int, str or float
        Seconds past midnight, or the formatted string when ``inverse``.
        ``np.nan`` is returned when ``x`` cannot be parsed.
    """
    if not inverse:
        try:
            fields = x.split(":")
            if only_mins:
                # Unpacking enforces exactly two fields; a mismatch raises
                # ValueError and is reported as NaN below.
                hours, mins = fields
                result = int(hours) * 3600 + int(mins) * 60
            else:
                hours, mins, seconds = fields
                result = int(hours) * 3600 + int(mins) * 60 + int(seconds)
            if mod24:
                result %= 24 * 3600
        except Exception:
            # Best-effort parsing: malformed or missing values become NaN.
            result = np.nan
    else:
        try:
            seconds = int(x)
            if mod24:
                seconds %= 24 * 3600
            hours, remainder = divmod(seconds, 3600)
            mins, secs = divmod(remainder, 60)
            result = "{:02d}:{:02d}:{:02d}".format(hours, mins, secs)
        except Exception:
            result = np.nan
    return result

In [5]:
def timestr_to_seconds(x, *, only_mins=False):
    """
    Vectorised conversion of a Series of time strings to seconds.

    Parameters
    ----------
    x : pandas.Series of str
        ``HH:MM:SS`` strings, or ``HH:MM`` strings when ``only_mins``.
        Hours may exceed 23.
    only_mins : bool
        If true, only the first two colon-separated fields are used.

    Returns
    -------
    pandas.Series or float
        Integer seconds per row. If anything goes wrong (``x`` has no
        ``.str`` accessor, a field is not an integer, a field is missing)
        the scalar ``np.nan`` is returned instead of a Series.
    """
    try:
        parts = x.str.split(':', expand=True)
        result = parts.iloc[:, 0].astype(int) * 3600 + parts.iloc[:, 1].astype(int) * 60
        if not only_mins:
            result = result + parts.iloc[:, 2].astype(int)
    except Exception:
        # Deliberately best-effort, but no longer traps KeyboardInterrupt /
        # SystemExit the way a bare ``except:`` did.
        result = np.nan

    return result

In [6]:
# Relative (Windows-style) directory holding the October 2018 SIRI logs.
FOLDER = r'data\siri\2018-10'
# Name of a single gzipped SIRI real-time log inside that directory.
file = "siri_rt_data.2018-10-08.11.log.gz"

In [7]:
from glob import glob
import os


In [8]:
# NOTE(review): machine-specific absolute path to one sample SIRI log; kept
# unchanged because a later cell reads ``tf``.
tf = r"C:\dev\ds\open-bus-explore\data\siri\2018-11\siri_rt_data.2018-11-20.0.log.gz"


def _read_siri_csv(path, names):
    """Read a headerless SIRI log into a DataFrame, skipping malformed rows."""
    try:
        # pandas >= 1.3 spelling; 'warn' matches the old
        # ``error_bad_lines=False`` behaviour (skip the row and warn).
        df = pd.read_csv(path, header=None, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know ``on_bad_lines``.
        df = pd.read_csv(path, header=None, error_bad_lines=False)
    df.columns = names
    return df


def create_trip_df(path, drop=['timestamp', 'desc'],
                   convert_timestr_to_seconds=True, add_date=True,
                   add_trailing_zeros=True):
    """
    Load one SIRI real-time log file into a typed DataFrame.

    Parameters
    ----------
    path : str
        CSV log file (optionally compressed; pandas infers it from the
        extension). When ``add_date`` is set, the name must contain
        ``siri_rt_data.<YYYY-MM-DD>.<n>.log``.
    drop : list of str or None
        Columns to drop after loading; ``None`` keeps every column.
        The default list is never mutated.
    convert_timestr_to_seconds : bool
        Convert ``planned_start_time`` / ``predicted_end_time`` (``HH:MM``)
        and ``time_recorded`` (``HH:MM:SS``) to integer seconds via
        ``timestr_to_seconds``.
    add_date : bool
        Add a ``date`` column holding the date parsed from the file name.
    add_trailing_zeros : bool
        Append ``':00'`` to ``planned_start_time`` / ``predicted_end_time``
        so they read ``HH:MM:00``. Only applies while those columns are
        still strings, i.e. when ``convert_timestr_to_seconds`` is false.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``add_date`` is set and no date can be found in ``path``.
    """
    header = ["timestamp", "desc", "agency_id",
              "route_id", "route_short_name", "service_id",
              "planned_start_time", "bus_id", "predicted_end_time",
              "time_recorded", "lat", "lon"]

    date = None
    if add_date:
        # Parse before reading so a badly named file fails fast.
        match = re.search(r'siri_rt_data\.([^\.]+)\.\d+\.log', path)
        if match is None:
            raise ValueError(
                'cannot extract a date from SIRI log path: {!r}'.format(path))
        date = datetime.datetime.strptime(match.group(1), '%Y-%m-%d')

    df = _read_siri_csv(path, header)
    if drop is not None:
        df = df.drop(drop, axis=1)

    df = df.astype({'agency_id': int, 'service_id': int, 'route_id': int,
                    'lat': float, 'lon': float})

    if convert_timestr_to_seconds:
        df = df.assign(
            planned_start_time=timestr_to_seconds(df.planned_start_time, only_mins=True),
            predicted_end_time=timestr_to_seconds(df.predicted_end_time, only_mins=True),
            time_recorded=timestr_to_seconds(df.time_recorded),
        )
    elif add_trailing_zeros:
        # Previously this ran even after the conversion above, adding a str
        # to integer columns and raising TypeError with the default flags.
        df = df.assign(
            planned_start_time=df.planned_start_time + ':00',
            predicted_end_time=df.predicted_end_time + ':00',
        )

    if add_date:
        df = df.assign(date=date)

    return df
    

In [9]:
from glob import glob
import os
# Convert every pickled route-stats frame into a gzip-compressed CSV.
# Paths are built with os.path.join: the old literals relied on invalid
# escape sequences ('\g', '\*') and only resolved on Windows.
out_folder = os.path.join('data', 'gtfs_stats_csv_hack')  # uncompressed export target (currently unused)
gz_folder = os.path.join('data', 'gtfs_stats_csv_gz_hack')
# to_csv does not create missing directories.
os.makedirs(gz_folder, exist_ok=True)

for file in glob(os.path.join('data', 'gtfs_stats_hack', '*route_stats*')):
    base = os.path.basename(file).split('.')[0]
    gz_out_path = os.path.join(gz_folder, base + '.csv.gz')
    # Skip files already converted so the cell is cheap to re-run.
    if os.path.exists(gz_out_path):
        continue
    print (base)
    # NOTE(security): read_pickle can execute arbitrary code; only run this
    # on pickles produced by a trusted pipeline.
    r = pd.read_pickle(file, compression='gzip')
    print (r.shape)
    r.to_csv(gz_out_path, compression='gzip')
        

2019-03-26_route_stats
(6651, 51)
2019-03-27_route_stats
(6667, 51)
2019-03-28_route_stats
(6742, 51)


In [None]:
# Load the sample log at ``tf`` keeping the raw time strings (only the
# ``desc`` column is dropped) and preview the first rows.
t = create_trip_df(
    tf,
    drop=['desc'],
    convert_timestr_to_seconds=False,
)
t.head()

In [None]:
# Convert every gzipped SIRI log of November 2018 into a gzip-compressed CSV.
# os.path.join replaces the old literals, which contained an invalid escape
# sequence ('\s') and mixed '\\' with '/' separators.
FOLDER = os.path.join('data', 'siri', '2018-11')
out_folder = os.path.join('data', 'siri_csv_v2')
# makedirs also creates missing parents and is a no-op when the folder exists.
os.makedirs(out_folder, exist_ok=True)

for file in glob(os.path.join(FOLDER, '*.log.gz')):
    print(file)
    df = create_trip_df(file, drop=['desc'], convert_timestr_to_seconds=False)
    # Strip the trailing '.log.gz' but keep the dotted date/index part.
    base = '.'.join(os.path.basename(file).split('.')[:-2])

    out_path = os.path.join(out_folder, base + '.csv.gz')
    df.to_csv(out_path, compression='gzip', index=False)

In [10]:
import os
import sys

# Directory containing the open-bus GTFS retriever modules (s3_wrapper).
# Override with the OPEN_BUS_RETRIEVER_DIR environment variable; the default
# is the original machine-specific location.
RETRIEVER_DIR = os.environ.get(
    'OPEN_BUS_RETRIEVER_DIR',
    'C:/dev/ds/open-bus-explore/open-bus/gtfs/retriever',
)
# Guarded so re-running the cell does not keep appending duplicates.
if RETRIEVER_DIR not in sys.path:
    sys.path.append(RETRIEVER_DIR)

In [11]:
import s3_wrapper

import os
import warnings

# S3 credentials must never live in the notebook: they are read from the
# environment instead. The key pair that used to be hard-coded here has been
# exposed in the file history and should be revoked/rotated.
aki = os.environ.get('OBUS_S3_ACCESS_KEY_ID')
sak = os.environ.get('OBUS_S3_SECRET_ACCESS_KEY')
bucket = os.environ.get('OBUS_S3_BUCKET', 'obus-do1')

if not (aki and sak):
    warnings.warn(
        'OBUS_S3_ACCESS_KEY_ID / OBUS_S3_SECRET_ACCESS_KEY are not set; '
        'S3 access will fail until they are exported.'
    )

In [None]:
# Open the bucket and list the keys whose names contain a date between
# 2018-11-25 and 2018-11-29.
crud = s3_wrapper.S3Crud(aki, sak, bucket)
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
s3_wrapper.list_content(crud, regex_argument=r'(.*\.2018-11-2[5-9]\..*)')

In [None]:
# Download the 2018-11-25 .. 2018-11-29 files from the bucket into FOLDER,
# skipping anything that is already present locally.
FOLDER = os.path.join('data', 'siri', '2018-11')
# Make sure the target directory exists before downloading into it.
os.makedirs(FOLDER, exist_ok=True)
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
for k in s3_wrapper.list_content(crud, regex_argument=r'(.*\.2018-11-2[5-9]\..*)'):
    # Keep only the last '/'-separated component of the key as the file name.
    file_name = k.split('/')[-1]
    output_path = os.path.join(FOLDER, file_name)
    if not os.path.exists(output_path):
        print (f'Downloading {file_name}')
        s3_wrapper.download(crud, output_path, k)
    