In [2]:
%matplotlib notebook
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.figure as fig
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
from datetime import datetime as dt

In [35]:
# restrict the analysis to the interval records of a single agency
# other ones for NM:  32000-2, 32000-3, 32000-901, 32000-4, 32000-61, 32000-8, 32000-9, 32000-18
agency = "34003-14"
# walk through the csv 1000 rows at a time, keep only the rows of each chunk that
# belong to the agency, then stitch the kept rows into one frame;
# the timestamps printed before and after show how long the load took
print(dt.now())
iter_csv = pd.read_csv('data/intervals.csv', iterator=True, chunksize=1000)
agency_rows = [chunk.loc[chunk['uniquename'] == agency] for chunk in iter_csv]
df = pd.concat(agency_rows)
print(dt.now())

2019-01-17 15:47:10.021911
2019-01-17 15:47:27.503529


In [36]:
# report how many rows were kept for the agency, then preview the first five
record_count = df.shape[0]
print("{} records for agency: {}".format(record_count, agency))
print("first 5:")
df.head(5)

4228 records for agency: 34003-14
first 5:


Unnamed: 0.1,Unnamed: 0,interval,origindex,timestamp,uniquename
2481352,2481352,922.0,4455.0,2018-09-13 02:03:15,34003-14
2481353,2481353,894.0,4950.0,2018-09-13 02:18:09,34003-14
2481354,2481354,917.0,5477.0,2018-09-13 02:33:26,34003-14
2481355,2481355,888.0,5917.0,2018-09-13 02:48:14,34003-14
2481356,2481356,880.0,6355.0,2018-09-13 03:02:54,34003-14


In [37]:
# summary statistics of the interval column, each value rendered with two decimals
print("some basic stats:")
interval_summary = df["interval"].describe()
interval_summary.map("{:.2f}".format)

some basic stats:


count     4228.00
mean      1827.04
std       2142.35
min          2.00
25%        890.00
50%        913.00
75%       1798.00
max      28690.00
Name: interval, dtype: object

In [38]:
# number of histogram bins, shared by the binning calculation and the plot
bins_to_use = 100
# the column under analysis
series_of_interest = df["interval"]

In [39]:
# use the stats module to do the same histogram calculations as sns.distplot
# using the same bucketing parameters (same data, same number of bins), so the
# bins computed here are meant to line up with the bars drawn by the plot
#   bin_maxes - largest interval that fell into each bin (NaN for an empty bin)
#   bin_edges - the bin boundaries, one more entry than there are bins
#   binnumber - the bin index scipy assigned to every sample
# scipy's built-in 'max' statistic is used instead of handing it python's max()
# as a callable, which scipy would have to invoke from python once per bin
bin_maxes, bin_edges, binnumber = stats.binned_statistic(series_of_interest, series_of_interest, statistic='max', bins=bins_to_use)

  result = result[core]


In [40]:
# graph the distribution of intervals using the .distplot function
# the .patches objects represent each graphed bin
sns.set_style('ticks')
# NOTE: this rebinds the name `fig`, which the import cell also uses as an alias
# for the matplotlib.figure module; from here on `fig` is the Figure object
fig, ax = plt.subplots()
fig.set_size_inches(18, 6)
# hex colours need the leading '#': a bare "999999" is read by matplotlib as a
# grayscale level, and 999999 is not a valid level (must be within 0-1)
rugkws = {"height": .025, "color": "#999999"}
histkws = {"linewidth": 2, "alpha": 1, "color": "b"}
# draw onto the axes created above explicitly rather than relying on whatever
# the "current axes" happen to be; kde=False keeps the bars as raw counts
sns.distplot(series_of_interest, bins=bins_to_use, rug=True, rug_kws=rugkws, hist_kws=histkws, kde=False, ax=ax)
ax.set_ylabel("count")
ax.set_title("distribution of intervals for agency {}".format(agency))
# one patch per histogram bar, in bin order
p = ax.patches

<IPython.core.display.Javascript object>

In [41]:
# get the heights of the bins
heights = [item.get_height() for item in p]
# the highest
print("the highest bar is {}".format(max(heights)))
# sort the list, biggest first (kept for inspection; the ranking below does not need it)
sorted_heights = sorted(heights, reverse=True)
# rank the bin *indices* by height, biggest first.
# looking a height up with heights.index() always returns the first bin having that
# height, so two bars of equal height would report the same bin twice and hide the
# other one; ranking indices directly gives every bin its own row.
# sorted() is stable even with reverse=True, so tied bins stay in left-to-right order
bins_by_height = sorted(range(len(heights)), key=heights.__getitem__, reverse=True)
# top X
x = 5
print("info for top {}:".format(x))
# slicing (rather than range(0, x)) simply reports fewer rows when there are fewer than x bars
for i, thebin in enumerate(bins_by_height[:x]):
    print("# {} - height: {}, bin: {}, max interval in bin: {}, right-edge of bin: {}".format(i+1, heights[thebin], thebin, bin_maxes[thebin], bin_edges[thebin+1]))
    

the highest bar is 2378.0
info for top 5:
# 1 - height: 2378.0, bin: 3, max interval in bin: 1148.0, right-edge of bin: 1149.52
# 2 - height: 444.0, bin: 6, max interval in bin: 2002.0, right-edge of bin: 2010.1599999999999
# 3 - height: 388.0, bin: 2, max interval in bin: 862.0, right-edge of bin: 862.64
# 4 - height: 194.0, bin: 18, max interval in bin: 5452.0, right-edge of bin: 5452.72
# 5 - height: 181.0, bin: 9, max interval in bin: 2868.0, right-edge of bin: 2870.8
