In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import partridge as ptg
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import datetime
import numpy as np

import gtfs_utils

# Altair: switch to the 'notebook' renderer and the 'json' data transformer.
alt.renderers.enable('notebook')
alt.data_transformers.enable('json')

# Seaborn: white style, "talk" context, and a 10-colour 'Set2' palette.
sns.set_style("white")
sns.set_context("talk")
sns.set_palette('Set2', 10)

In [3]:
import gzip

def parse_trips(path):
    """
    Parse a gzip-compressed, comma-separated log file into a list of dicts.

    Each non-empty line is split on commas; the first two fields are
    discarded and the remaining ones are decoded as UTF-8. The first ten
    remaining fields are mapped, in order, to the keys ``agency``,
    ``route_id``, ``line_num``, ``service_id``, ``start_time``, ``bus_id``,
    ``end_time``, ``time_recorded`` and ``coordinates`` (a 2-tuple built
    from the ninth and tenth fields). All values are left as strings.

    Parameters
    ----------
    path : str
        Path of the gzip file to read.

    Returns
    -------
    list of dict
        One dict per non-empty line, in file order.

    Raises
    ------
    IndexError
        If a non-empty line has fewer than twelve comma-separated fields.
    UnicodeDecodeError
        If a kept field is not valid UTF-8.
    """
    # The context manager closes the file; no explicit close() is needed.
    with gzip.open(path) as f:
        raw_lines = f.read().strip().splitlines()

    trips = []
    for raw in raw_lines:
        if not raw:
            # Blank lines inside the file carry no record.
            continue
        fields = [part.decode("utf-8") for part in raw.split(b',')[2:]]
        trips.append({
            "agency": fields[0],
            "route_id": fields[1],
            "line_num": fields[2],
            "service_id": fields[3],
            "start_time": fields[4],
            "bus_id": fields[5],
            "end_time": fields[6],
            "time_recorded": fields[7],
            "coordinates": (fields[8], fields[9]),
        })
    return trips

In [4]:
from os.path import join

In [5]:
import re
import datetime

In [6]:
def single_timestr_to_seconds(x, *, inverse=False, mod24=False, only_mins=False):
    """
    Convert between a time string and a number of seconds past midnight.

    Forward direction (``inverse=False``): ``x`` is an ``HH:MM:SS`` string,
    or an ``HH:MM`` string when ``only_mins`` is true, and the result is the
    number of seconds past midnight that it represents. In keeping with
    GTFS standards, the hours entry may be greater than 23.

    Inverse direction (``inverse=True``): ``x`` is anything ``int()``
    accepts and the result is a zero-padded ``HH:MM:SS`` string.
    ``only_mins`` is ignored in this direction.

    If ``mod24``, the number of seconds is taken modulo ``24*3600`` (after
    parsing in the forward direction, before formatting in the inverse).

    Returns ``np.nan`` when ``x`` cannot be converted.
    """
    seconds_per_day = 24 * 3600
    try:
        if inverse:
            total = int(x)
            if mod24:
                total %= seconds_per_day
            hours, remainder = divmod(total, 3600)
            mins, secs = divmod(remainder, 60)
            return "{:02d}:{:02d}:{:02d}".format(hours, mins, secs)

        if only_mins:
            # HH:MM input: there is no seconds component to add.
            hours, mins = x.split(":")
            total = int(hours) * 3600 + int(mins) * 60
        else:
            hours, mins, seconds = x.split(":")
            total = int(hours) * 3600 + int(mins) * 60 + int(seconds)
        if mod24:
            total %= seconds_per_day
        return total
    except Exception:
        # Deliberate best-effort: unparseable input maps to NaN. Catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        return np.nan

In [7]:
def timestr_to_seconds(x, *, only_mins=False):
    """
    Vectorised conversion of time strings to seconds past midnight.

    Parameters
    ----------
    x : pandas.Series of str
        ``HH:MM:SS`` strings, or ``HH:MM`` strings when ``only_mins`` is
        true. Hours are not capped at 23.
    only_mins : bool, keyword-only
        If true, only the first two ``:``-separated components are used.

    Returns
    -------
    pandas.Series of int, or float
        Seconds past midnight per element. If the conversion fails for any
        element (or ``x`` has no ``.str`` accessor), the scalar ``np.nan``
        is returned instead of a Series.
    """
    try:
        parts = x.str.split(':', expand=True)
        total = parts.iloc[:, 0].astype(int) * 3600 + parts.iloc[:, 1].astype(int) * 60
        if not only_mins:
            total = total + parts.iloc[:, 2].astype(int)
        return total
    except Exception:
        # Deliberate best-effort: any failure maps to NaN. Catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        return np.nan

In [14]:
def create_trip_df(path):
    """
    Load one gzipped log file into a typed DataFrame.

    The records come from ``parse_trips(path)``. The date is taken from the
    ``siri_rt_data.<date>.<n>.log`` part of ``path`` and parsed as
    ``%Y-%m-%d``. The ``coordinates`` tuples are split into float ``lat``
    and ``lon`` columns; ``agency``, ``service_id`` and ``route_id`` are
    cast to int; ``start_time`` / ``end_time`` (HH:MM) and
    ``time_recorded`` (HH:MM:SS) are converted to seconds past midnight via
    ``timestr_to_seconds``. A constant ``date`` column is appended.
    """
    trips = pd.DataFrame(parse_trips(path))

    # First capture group of the first match is the date part of the name.
    date_str = re.findall('siri_rt_data\\.([^\\.]+)\\.\\d+\\.log', path)[0]
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d')

    coords = pd.DataFrame(trips.coordinates.values.tolist()).astype(float)
    trips[['lat', 'lon']] = coords

    typed = (trips.drop('coordinates', axis=1)
                  .astype({'agency': int, 'service_id': int, 'route_id': int}))

    return typed.assign(
        start_time=timestr_to_seconds(typed.start_time, only_mins=True),
        end_time=timestr_to_seconds(typed.end_time, only_mins=True),
        time_recorded=timestr_to_seconds(typed.time_recorded),
        date=date,
    )

In [15]:
# Build the folder path with os.path.join so it resolves on every platform;
# a literal with backslashes is treated as one single name on POSIX systems.
FOLDER = join('data', 'siri', '2018-08')
file = 'siri_rt_data.2018-08-01.0.log.gz'

In [16]:
# Parse the single sample log file (FOLDER / file) into a DataFrame.
df = create_trip_df(join(FOLDER, file))

In [17]:
# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,agency,bus_id,end_time,line_num,route_id,service_id,start_time,time_recorded,lat,lon,date
0,5,7591,660,19,2340,33441985,84300,86348,32.035385,34.767807,2018-08-01
1,5,7666,2340,19,2340,33643429,0,86109,0.0,0.0,2018-08-01
2,5,10196,300,16,2323,33425076,84000,86372,32.045105,34.809189,2018-08-01
3,5,7853,1200,16,2323,33425077,85200,86380,32.055714,34.78186,2018-08-01
4,5,10159,2100,16,2323,33425078,0,86109,0.0,0.0,2018-08-01


In [18]:
# Inspect the column dtypes produced by create_trip_df.
df.dtypes

agency                    int32
bus_id                   object
end_time                  int32
line_num                 object
route_id                  int32
service_id                int32
start_time                int32
time_recorded             int32
lat                     float64
lon                     float64
date             datetime64[ns]
dtype: object

In [19]:
# Write the parsed frame to a Parquet file inside FOLDER.
df.to_parquet(join(FOLDER, 'siri_rt_data.2018-08-01.0.parq'))

In [20]:
# Load the same Parquet file back into a separate frame, pdf.
pdf = pd.read_parquet(join(FOLDER, 'siri_rt_data.2018-08-01.0.parq'))

In [21]:
# Preview the first rows of the frame read back from Parquet.
pdf.head()

Unnamed: 0,agency,bus_id,end_time,line_num,route_id,service_id,start_time,time_recorded,lat,lon,date
0,5,7591,660,19,2340,33441985,84300,86348,32.035385,34.767807,2018-08-01
1,5,7666,2340,19,2340,33643429,0,86109,0.0,0.0,2018-08-01
2,5,10196,300,16,2323,33425076,84000,86372,32.045105,34.809189,2018-08-01
3,5,7853,1200,16,2323,33425077,85200,86380,32.055714,34.78186,2018-08-01
4,5,10159,2100,16,2323,33425078,0,86109,0.0,0.0,2018-08-01


In [22]:
# Inspect the column dtypes of the frame read back from Parquet.
pdf.dtypes

agency                    int32
bus_id                   object
end_time                  int32
line_num                 object
route_id                  int32
service_id                int32
start_time                int32
time_recorded             int32
lat                     float64
lon                     float64
date             datetime64[ns]
dtype: object

In [24]:
from glob import glob
import os

# Convert every gzipped log in FOLDER to a Parquet file next to it.
# join() avoids mixing path separators; sorted() fixes the processing order.
for file in sorted(glob(join(FOLDER, '*.log.gz'))):
    print(file)
    df = create_trip_df(file)
    # Strip the whole '.log.gz' suffix. os.path.splitext would remove only
    # '.gz' and yield '<name>.log.parq', which does not match the
    # '<name>.parq' naming used for the single-file export in this notebook.
    bn = file[:-len('.log.gz')]
    df.to_parquet(bn + '.parq')

data\siri\2018-08\siri_rt_data.2018-08-01.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-01.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-01.2.log.gz
data\siri\2018-08\siri_rt_data.2018-08-01.3.log.gz
data\siri\2018-08\siri_rt_data.2018-08-01.4.log.gz
data\siri\2018-08\siri_rt_data.2018-08-02.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-02.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-02.2.log.gz
data\siri\2018-08\siri_rt_data.2018-08-03.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-03.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-03.2.log.gz
data\siri\2018-08\siri_rt_data.2018-08-04.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-05.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-05.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-05.2.log.gz
data\siri\2018-08\siri_rt_data.2018-08-05.3.log.gz
data\siri\2018-08\siri_rt_data.2018-08-06.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-06.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-06.2.log.gz
data\siri\2018-08\siri_rt_data.

data\siri\2018-08\siri_rt_data.2018-08-27.9.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.10.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.11.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.12.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.2.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.3.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.4.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.5.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.6.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.7.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.8.log.gz
data\siri\2018-08\siri_rt_data.2018-08-28.9.log.gz
data\siri\2018-08\siri_rt_data.2018-08-29.0.log.gz
data\siri\2018-08\siri_rt_data.2018-08-29.1.log.gz
data\siri\2018-08\siri_rt_data.2018-08-29.10.log.gz
data\siri\2018-08\siri_rt_data.2018-08-29.11.log.gz
data\siri\2018-08\siri_rt_data.2018-08-29.12.log.gz
data\siri\2018-08\siri_rt