In [6]:
import pandas as pd
import numpy as np
from scipy import stats
from datetime import datetime as dt

In [7]:
# Load the slotted interval data from disk, printing a wall-clock timestamp
# immediately before and after the read so the load time is visible.
load_started = dt.now()
print(load_started)
df_all = pd.read_csv('data/intervals_slotted.csv')
load_finished = dt.now()
print(load_finished)

2019-02-16 11:27:46.276297
2019-02-16 11:27:53.239725


In [93]:
def _percentile_higher(values, q):
    """Return the q-th percentile of `values`, rounded up to an actual data point."""
    try:
        # NumPy 1.22 renamed the keyword to `method`; `interpolation` is deprecated.
        return np.percentile(values, q, method='higher')
    except TypeError:
        # Older NumPy releases only accept the original `interpolation` keyword.
        return np.percentile(values, q, interpolation='higher')


def make_output(a, df, trim_value, s, o, pc, pl, rl):
    """Append one row of per-slot interval statistics for one agency to `o`.

    Rows of `df` whose 'interval' is not strictly below `trim_value` are
    treated as outliers and discarded.  The remaining rows are grouped by
    'SLOT', and for every slot a row is appended to `o` holding the agency,
    the slot, the number of intervals, their min and max, and for every
    percentile in `pc` both the percentile value and a count.

    Parameters
    ----------
    a : value stored in the 'agency' column of every appended row.
    df : DataFrame with at least the columns 'interval' and 'SLOT'.
    trim_value : exclusive upper bound on 'interval'; rows at or above it
        are dropped before any statistic is computed.
    s : unused; kept so existing callers keep working.
    o : output DataFrame, modified in place.  Each new row is written at
        label ``len(o)``.
    pc : sequence of percentiles (0-100) to compute.
    pl : column labels for the percentile values, parallel to `pc`.
    rl : column labels for the counts, parallel to `pc`; each count is
        ``size - int(size * p / 100)``.

    Returns
    -------
    None.  The result is the rows added to `o`.
    """
    # Keep only the rows to the left of the threshold (the non-outliers).
    trimmed = df[df['interval'] < trim_value]

    for slot, group in trimmed.groupby('SLOT'):
        intervals = group['interval']
        count = intervals.size

        row = {
            'agency': a,
            'slot': slot,
            'size': count,
            'max': np.max(intervals),
            'min': np.min(intervals),
        }
        for i, p in enumerate(pc):
            row[pl[i]] = _percentile_higher(intervals, p)
            row[rl[i]] = count - int(count * p / 100)

        # Append to the output frame in place; callers read the result from `o`.
        o.loc[len(o)] = row

In [102]:
def get_agency_list(method, agency_list_file='agency-list-001.txt'):
    """Return the list of agency identifiers to process.

    Parameters
    ----------
    method : 'f' reads the identifiers from `agency_list_file`, one per
        line; any other value returns a small built-in list.
    agency_list_file : path of the text file used when `method` is 'f'.
        Defaults to the file name this notebook has always used.

    Returns
    -------
    list of str
        Agency identifiers in file order.  Lines that are empty after
        removing the trailing newline are skipped, so a blank line in the
        file does not produce an empty agency name.
    """
    if method != 'f':
        return ['34003-1', '34003-2']

    # The context manager closes the file even if reading fails part-way.
    with open(agency_list_file, "r") as fo:
        names = (line.rstrip('\n') for line in fo)
        return [name for name in names if name]

In [103]:
# Percentiles reported for every agency/slot combination.
pctiles = [50, 75, 85, 90, 95, 96, 97, 98, 99, 100]

# Output columns: the fixed descriptive columns first, then for each
# percentile a '<p>-p' column followed by a '<p>-r' column.
colnames = ['agency', 'slot', 'size', 'min', 'max']
plist = []
rlist = []

for p in pctiles:
    value_label = '{}-p'.format(p)
    remainder_label = '{}-r'.format(p)
    plist.append(value_label)
    rlist.append(remainder_label)
    colnames.extend([value_label, remainder_label])

out_df = pd.DataFrame(columns=colnames)

# Process each agency in turn; make_output appends its rows to out_df.
for agency in get_agency_list('f'):

    outlier_threshold = 100000000
    saveoutput = 1

    # Restrict the full data set to the rows belonging to this agency.
    agency_bool = df_all['uniquename'].eq(agency)
    agency_df = df_all.loc[agency_bool]

    print("agency: {}, time start: {}".format(agency, dt.now()))
    make_output(
        a=agency,
        df=agency_df,
        trim_value=outlier_threshold,
        s=saveoutput,
        o=out_df,
        pc=pctiles,
        pl=plist,
        rl=rlist,
    )


agency: 34003-2, time start: 2019-02-16 14:06:38.628875
agency: 34003-3, time start: 2019-02-16 14:06:38.933003
agency: 34003-4, time start: 2019-02-16 14:06:39.244877
agency: 34003-5, time start: 2019-02-16 14:06:39.564594
agency: 34003-6, time start: 2019-02-16 14:06:39.883819
agency: 34003-7, time start: 2019-02-16 14:06:40.204460
agency: 34003-8, time start: 2019-02-16 14:06:40.523170
agency: 34003-9, time start: 2019-02-16 14:06:40.838591
agency: 34003-10, time start: 2019-02-16 14:06:41.157880
agency: 34003-11, time start: 2019-02-16 14:06:41.480136
agency: 34003-12, time start: 2019-02-16 14:06:41.803060
agency: 34003-13, time start: 2019-02-16 14:06:42.124503
agency: 34003-14, time start: 2019-02-16 14:06:42.446359
agency: 34003-15, time start: 2019-02-16 14:06:42.777934
agency: 34003-17, time start: 2019-02-16 14:06:43.107881
agency: 34003-18, time start: 2019-02-16 14:06:43.431889
agency: 34003-19, time start: 2019-02-16 14:06:43.752970
agency: 34003-20, time start: 2019-02-1

In [104]:
# Write the collected percentile table to disk as CSV.
out_df.to_csv(path_or_buf='output/percentile-data-all.csv')