In [21]:
%matplotlib notebook
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.figure as fig
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
from datetime import datetime as dt
from dateutil.parser import parse
import re
import csv

In [22]:
# Restrict the analysis to the interval data of one agency.
# other ones for NM:  32000-2, 32000-3, 32000-901, 32000-4, 32000-61, 32000-8, 32000-9, 32000-18
# NC:  34003-31, 34003-32, 34003-35, ...
agency = "34003-2"

# Stream the CSV in chunks and keep only the rows belonging to that agency;
# the two timestamps printed around the load show how long it took.
print(dt.now())
iter_csv = pd.read_csv('data/intervals.csv', iterator=True, chunksize=1000)
agency_chunks = []
for chunk in iter_csv:
    belongs_to_agency = chunk['uniquename'] == agency
    agency_chunks.append(chunk[belongs_to_agency])
df = pd.concat(agency_chunks)
print(dt.now())

2019-02-14 12:26:59.800406
2019-02-14 12:27:17.618647


In [23]:
# Derive the time-based columns from the raw 'timestamp' strings.
#
# Each transform only needs a single column, so Series.map is used instead of a
# row-wise DataFrame.apply(axis=1): it avoids building a Series object for every
# row, and it still yields a Series when df is empty (apply(axis=1) on an empty
# frame returns a DataFrame, which cannot be assigned to a single column).

# add a new column which is the timestamp as a datetime object
df['ts'] = df['timestamp'].map(parse)

# add a new column for the hour of the day
df['HR'] = df['ts'].map(lambda ts: ts.hour)

# add a new column for the day of the week (0=Monday)
df['DOW'] = df['ts'].map(lambda ts: ts.weekday())

# this function will return a 'code' for (W)eekend, (N)ight, or (D)ay
def timeslot(h,d):
    """Return a one-letter code: (W)eekend, (N)ight, or (D)ay.

    h - hour of day
    d - day of week (0=Monday)
    """
    # day 5 or 6 (saturday or sunday) is Weekend, whatever the hour
    if d > 4:
        return "W"
    # on the remaining days, hours below 8 or above 17 are Night, the rest Day
    outside_day_hours = h < 8 or h > 17
    return "N" if outside_day_hours else "D"

# make a new column with the timeslot code in it
# (the hour and day-of-week columns are paired up directly rather than going
# through a row-wise DataFrame.apply, which is slow on large frames and returns
# a DataFrame instead of a Series when df has no rows)
df['SLOT'] = [timeslot(hour, weekday) for hour, weekday in zip(df['HR'], df['DOW'])]

In [32]:
# drop the rows considered to be 'outliers':
# trimmed keeps only the rows whose interval is below the threshold (the non-outliers)
trim_value = 150000000
is_below_threshold = df['interval'] < trim_value
trimmed = df[is_below_threshold]

# number of bins to use
bins_to_use = 100

# group the trimmed rows by their timeslot code
by_slot = trimmed.groupby('SLOT')

In [33]:
# flag checked by the plotting cells: when truthy they also write their
# stats CSVs / figure images under 'output/'
saveoutput = 1

In [34]:
# show all the slots in one plot
fig, ax = plt.subplots(figsize=(10, 6))  # one figure/axes pair, 10x6 inches
ax.grid(True, which='both')  # set a grid

# rug display parameters
# NOTE: a matplotlib hex colour needs the leading '#'. The bare string "999999"
# is read as a grayscale level, which must lie within 0-1, so it is rejected
# (ValueError) by current matplotlib instead of giving the intended mid-gray.
rugkws = {"height": .025, "color": "#999999"}
histkws = {"linewidth": 1, "alpha": 0.5}  # histogram display parameters

for name, group in by_slot:
    # draw every slot onto the same axes, passed explicitly
    sns.distplot(group.interval, label=name, kde=False, bins=bins_to_use,
                 hist_kws=histkws, rug=True, rug_kws=rugkws, ax=ax)
    # compute the basic stats once and reuse them for the printout and the CSV
    slot_stats = group["interval"].describe().apply(lambda x: format(x, '.2f'))
    print(name, slot_stats)
    if saveoutput:
        slot_stats.to_csv('output/' + agency + '-' + name + '.csv')

ax.set_title(agency)  # output the title
ax.legend()           # display a legend
if saveoutput:
    fig.savefig('output/' + agency)
    

<IPython.core.display.Javascript object>

D count     1682.00
mean      1476.40
std       1492.05
min          2.00
25%        893.00
50%        912.00
75%       1790.00
max      26084.00
Name: interval, dtype: object
N count     1467.00
mean      2118.83
std       2381.08
min        126.00
25%        896.00
50%        938.00
75%       2672.50
max      18903.00
Name: interval, dtype: object
W count     1065.00
mean      2105.98
std       2044.06
min        131.00
25%        897.00
50%        950.00
75%       2696.00
max      18568.00
Name: interval, dtype: object


In [35]:
# show the slots in separate graphs

# display parameters are the same for every slot, so build them once
# NOTE: a matplotlib hex colour needs the leading '#'. The bare string "999999"
# is read as a grayscale level, which must lie within 0-1, so it is rejected
# (ValueError) by current matplotlib instead of giving the intended mid-gray.
rugkws = {"height": .025, "color": "#999999"}  # rug display parameters
histkws = {"linewidth": 1, "alpha": 0.5}       # histogram display parameters

for slot in 'DNW':
    ddf = trimmed[trimmed.SLOT == slot]
    fig, ax = plt.subplots(figsize=(10, 6))  # a new 10x6 inch figure per slot
    ax.grid(True, which='both')  # set a grid

    # draw onto this slot's own axes, passed explicitly
    sns.distplot(ddf.interval, label=slot, kde=False, bins=bins_to_use,
                 hist_kws=histkws, rug=True, rug_kws=rugkws, ax=ax)
    # print the basic stats
    print(slot, ddf["interval"].describe().apply(lambda x: format(x, '.2f')))

    title = agency + "-" + slot
    ax.set_title(title)  # output the title
    ax.legend()          # display a legend
    if saveoutput:
        fig.savefig('output/' + title)

<IPython.core.display.Javascript object>

D count     1682.00
mean      1476.40
std       1492.05
min          2.00
25%        893.00
50%        912.00
75%       1790.00
max      26084.00
Name: interval, dtype: object


<IPython.core.display.Javascript object>

N count     1467.00
mean      2118.83
std       2381.08
min        126.00
25%        896.00
50%        938.00
75%       2672.50
max      18903.00
Name: interval, dtype: object


<IPython.core.display.Javascript object>

W count     1065.00
mean      2105.98
std       2044.06
min        131.00
25%        897.00
50%        950.00
75%       2696.00
max      18568.00
Name: interval, dtype: object
