Notebook 6/1/22 (June 1, 2022) -- initial data analysis: aggregate real-time (RT) bus vehicle data into hourly counts of distinct vehicles per route

In [None]:
import boto3
import json
import pandas as pd

In [None]:
# Open the S3 bucket that holds the scraped files and list its objects.
# You must be locally authenticated with AWS for this to work:
# https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html

bucket_name = 'chn-ghost-buses-private'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
objects = bucket.objects.all()

In [None]:
# Download every listed object and decode it as JSON.
# data_dict maps each object's key (its name in the bucket) to the parsed JSON body.

data_dict = {}

for obj in objects:
    print(f"loading {obj}")
    # Read the whole body as bytes, then decode to text before parsing:
    # https://stackoverflow.com/questions/31976273/open-s3-object-as-a-string-with-boto3
    raw_bytes = obj.get()['Body'].read()
    data_dict[obj.key] = json.loads(raw_bytes.decode('utf-8'))

In [None]:
# Parse the raw JSON into two DataFrames:
#   data   -- one row per entry under a chunk's 'bustime-response' -> 'vehicle'
#   errors -- one row per entry under a chunk's 'bustime-response' -> 'error'
# Both get a 'scrape_file' column naming the file each row came from.
#
# Frames are collected in lists and concatenated once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies all
# accumulated rows on every iteration.

data_frames = []
error_frames = []

# data_dict maps filename -> full dict of JSON for that file
for scrape_name, scrape_json in data_dict.items():
    print(f"processing {scrape_name}")
    file_vehicle_frames = []
    file_error_frames = []
    # expect ~12 "chunks" per JSON
    for contents in scrape_json.values():
        response = contents['bustime-response']
        if 'vehicle' in response:
            file_vehicle_frames.append(pd.DataFrame(response['vehicle']))
        if 'error' in response:
            file_error_frames.append(pd.DataFrame(response['error']))
    # Tag rows with their source file after combining the file's chunks, so
    # 'scrape_file' stays the last column as in the original layout.
    if file_vehicle_frames:
        data_frames.append(
            pd.concat(file_vehicle_frames, ignore_index=True).assign(scrape_file=scrape_name)
        )
    if file_error_frames:
        error_frames.append(
            pd.concat(file_error_frames, ignore_index=True).assign(scrape_file=scrape_name)
        )

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# that still carries the 'scrape_file' column.
data = (
    pd.concat(data_frames, ignore_index=True)
    if data_frames
    else pd.DataFrame(columns=['scrape_file'])
)
errors = (
    pd.concat(error_frames, ignore_index=True)
    if error_frames
    else pd.DataFrame(columns=['scrape_file'])
)

In [None]:
# Parse the raw 'tmstmp' strings (e.g. '20220601 13:45') into real datetimes.
timestamp_format = '%Y%m%d %H:%M'
data['data_time'] = pd.to_datetime(data['tmstmp'], format=timestamp_format)

In [None]:
# Split each timestamp into its hour of day and its calendar date.
time_parts = data['data_time'].dt
data['data_hour'] = time_parts.hour
data['data_date'] = time_parts.date

In [None]:
# For every (date, hour, route) group, collect the vehicle ids into a set so
# repeated sightings of the same vehicle collapse: https://stackoverflow.com/a/45925961
group_keys = ['data_date', 'data_hour', 'rt']
vids_by_group = data.groupby(group_keys)['vid']
hourly_summary = vids_by_group.apply(set).reset_index()

In [None]:
# Number of distinct vehicles in each row: the size of that row's set of vids.
hourly_summary['count'] = hourly_summary['vid'].map(len)

In [None]:
# Preview the first five rows of the summary.
hourly_summary.head(n=5)

In [None]:
# Write the summary to a CSV in the working directory, leaving out the row index.
summary_csv_path = 'realtime_data_summary_20220601.csv'
hourly_summary.to_csv(summary_csv_path, index=False)