In [47]:
import json
import numpy as np
import pandas as pd

In [48]:
# Location of the processed transports table (parquet), relative to this notebook.
DATA_PATH = '../../data/processed/transports_v2.parquet'

# Read the full table into memory.
transports = pd.read_parquet(path=DATA_PATH)

# Preview the first five rows.
transports.head(n=5)

Unnamed: 0,trip_id,product_id,line_text,transport_type,stop_id,arrival_time,departure_time,mean_arrival_delay,mean_departure_delay,median_arrival_delay,median_departure_delay,std_arrival_delay,std_departure_delay,n_arrival_delay,n_departure_delay,n_cancelled,n_through_trip,n_additional_trip,n_entries
0,80:06____:17171:000,Train,RB,RB,8500090,14:50:00,,293.939394,,120.0,,388.229414,,68,0,4,0,0,104
1,80:06____:17261:000,Train,RB,RB,8500090,,15:53:00,,61.621622,,0.0,,129.218022,0,9,1,0,0,104
2,80:800693:3053:000,Train,IRE3,IRE,8503424,11:58:00,12:00:00,151.539474,127.605263,41.0,19.0,627.797068,622.499501,60,73,2,0,0,78
4,80:sbg034:14004,Bus,Bus7349,B,8573327,09:07:00,,2.4,,0.0,,29.44332,,5,0,0,0,0,100
6,80:sbg034:55413,Bus,Bus7349,B,8503474,00:19:00,00:20:00,138.0,96.0,180.0,120.0,107.02591,84.852814,70,68,0,0,0,100


In [49]:
# Columns that identify one line within the product -> type -> line hierarchy.
group_keys = ['product_id', 'transport_type', 'line_text']

# For every row, the number of rows that share its (product, type, line) triple.
rows_per_line = transports.groupby(group_keys)['line_text'].transform('count')

# Keep only the hierarchy columns and attach the per-line count as `size`.
transports = transports[group_keys].assign(size=rows_per_line)

transports.head()

Unnamed: 0,product_id,transport_type,line_text,size
0,Train,RB,RB,267
1,Train,RB,RB,267
2,Train,IRE,IRE3,124
4,Bus,B,Bus7349,806
6,Bus,B,Bus7349,806


In [50]:
# Nested {'name', 'children'} tree: root -> product -> transport type -> line.
# Leaves carry a 'size' instead of 'children'.
data = {'name': 'transports', 'children': []}

for product_id, product in transports.groupby('product_id'):
    # Collect the transport-type nodes of this product before attaching them.
    type_nodes = []

    for transport_type, transport in product.groupby('transport_type'):
        # Collect the leaf (line) nodes of this transport type.
        line_nodes = []

        for line_text, line in transport.groupby('line_text'):
            # `size` is constant within a line group, so the first value is enough.
            line_size = line['size'].iloc[0]
            line_nodes.append({'name': line_text, 'size': line_size})

        type_nodes.append({'name': transport_type, 'children': line_nodes})

    data['children'].append({'name': product_id, 'children': type_nodes})


In [51]:
# Destination of the JSON export of `data`, written at the end of this cell.
DATA_SRC_PATH = '../../data/circle_packing_data.json'


def handle_datatypes(obj):
    """Convert a NumPy scalar to the matching built-in Python type.

    Meant to be passed as the ``default=`` hook of ``json.dump`` /
    ``json.dumps``, which call it for every object the encoder cannot
    serialise by itself.

    Any NumPy integer, floating-point or boolean scalar is accepted, not only
    the 64-bit variants, so values such as ``np.int32`` or ``np.float32`` do
    not abort the export.

    Parameters
    ----------
    obj : object
        The value the JSON encoder could not serialise.

    Returns
    -------
    int, float or bool
        The built-in equivalent of ``obj``.

    Raises
    ------
    TypeError
        If ``obj`` is not a supported NumPy scalar.
    """
    # np.bool_ is not a subclass of np.integer, so it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    # Covers every signed/unsigned integer width (int8 ... int64, uint8 ...).
    if isinstance(obj, np.integer):
        return int(obj)
    # Covers float16/float32/float64 and platform extended floats.
    if isinstance(obj, np.floating):
        return float(obj)
    raise TypeError(f"Unhandled data type: {type(obj)}")

# Write the hierarchy to disk; values the encoder cannot serialise natively
# are routed through handle_datatypes.
with open(DATA_SRC_PATH, mode='w') as json_file:
    json.dump(obj=data, fp=json_file, default=handle_datatypes)