In [1]:
import numpy as np
import pandas as pd
import datetime as dt

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns

import sys
import operator

import fastparquet
import snappy

import ipywidgets as widgets

  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


In [2]:
 def showall(df):
     """Display ``df`` in full, with no row or column truncation.

     Refuses (via ``assert``) frames with 5000 or more rows.
     """
     row_count = df.shape[0]
     assert row_count < 5000
     # Other display options could be appended to this tuple as well.
     unlimited = ('display.max_rows', None, 'display.max_columns', None)
     with pd.option_context(*unlimited):
         display(df)

In [3]:
# Load the timestamped observations and order them by the 'Timestamp' index level.
alldays_timestamped = (
    pd.read_parquet('TimestampToSep11new.parquet')
    .sort_index(level='Timestamp')
)

In [4]:
# Settings handed to CurbData, which reads each key through self.format[...].
format_dict = {
    # Location-column values whose rows are dropped before counting.
    'unrestricted spaces': ['Space 3'],
    # Name of the column holding each observation's location.
    'location column': 'Vehicle Location',
    # Name of the column holding each observation's violator label.
    'violator column': 'Violator',
    # Enforcement window as 24-hour 'hh:mm' strings: (start, end).
    'enf start/end': ('18:00', '22:00'),
}

In [10]:
class CurbData:
    """Stores curb utilization data and associated functions.

    Attributes set by ``__init__``:
        format: the ``format_dict`` passed in. Keys read by this class are
            'unrestricted spaces', 'location column', 'violator column' and
            'enf start/end'.
        df_all: the timestamped frame exactly as passed in (never modified).
        df_filtered: rows of ``df_all`` inside the enforcement interval.
        viol_counts: per-timestamp table built by ``violator_timestamp_table``.
        blockers_exceed_viol: number of timestamps where blockers outnumber
            violators, divided by 60 (minutes, assuming one timestamp per
            second -- TODO confirm against the data source).
    """
    def __init__(self, timestamped_df, format_dict):
        self.format = format_dict
        self.df_all = timestamped_df
        self.df_filtered = self.time_filter(self.df_all)
        self.viol_counts = self.violator_timestamp_table(self.df_filtered)
        blockers_exceed = self.viol_counts['Blockers Less Violators'] > 0
        self.blockers_exceed_viol = int(blockers_exceed.sum()) / 60

    def violator_timestamp_table(self, timestamped_df):
        """Count violator-column labels per timestamp and derive summary columns.

        Rows whose location is listed in format['unrestricted spaces'] are
        dropped first. The result is indexed by the first index level of
        ``timestamped_df`` and has one column per distinct (non-null) label of
        the violator column, holding that label's row count at each timestamp
        (NaN where the label is absent), plus:

            'Total Violators'         sum of columns whose name contains the
                                      violator column name
            'Total Blockers'          sum of columns whose name contains 'Blocking'
            'Blockers Less Violators' Total Blockers - Total Violators
            'Any Violator'            1 if Total Violators > 0 else 0
            'Any Blocking'            1 if Total Blockers > 0 else 0
        """
        location_col = self.format['location column']
        violator_col = self.format['violator column']

        restricted = timestamped_df[
            ~timestamped_df[location_col].isin(self.format['unrestricted spaces'])]

        # MultiIndex Series: (timestamp, label) -> number of rows.
        viol_counts = restricted.groupby(level=0)[violator_col].value_counts()

        outputdf = pd.DataFrame()
        # value_counts() drops nulls, so only non-null labels can be looked up.
        for label in restricted[violator_col].dropna().unique():
            # Name the column explicitly instead of relying on the name of the
            # value_counts() result, which differs between pandas versions.
            label_counts = viol_counts.xs(label, level=-1).rename(label).to_frame()
            outputdf = outputdf.join(label_counts, how='outer')

        is_violator = outputdf.columns.str.contains(violator_col)
        is_blocker = outputdf.columns.str.contains('Blocking')
        violdf = outputdf.loc[:, is_violator].fillna(value=0)
        blockdf = outputdf.loc[:, is_blocker].fillna(value=0)
        outputdf['Total Violators'] = violdf.sum(axis=1)
        outputdf['Total Blockers'] = blockdf.sum(axis=1)

        outputdf['Blockers Less Violators'] = outputdf['Total Blockers'] - outputdf['Total Violators']
        outputdf['Any Violator'] = outputdf['Total Violators'].gt(0).astype(int)
        outputdf['Any Blocking'] = outputdf['Total Blockers'].gt(0).astype(int)

        return outputdf

    def time_filter(self, df):
        """Return the rows of ``df`` inside the enforcement interval.

        The interval comes from format['enf start/end'] as 24-hour 'hh:mm'
        strings. Both bounds are exclusive: a row stamped exactly at the start
        or end time is dropped. ``df`` itself is left untouched; the result is
        a new frame sorted by the 'Timestamp' index level.
        """
        def to_time(hhmm):
            parts = hhmm.split(':')
            return dt.time(int(parts[0]), int(parts[1]))

        enf_start, enf_end = (to_time(text) for text in self.format['enf start/end'][:2])

        # Sort a copy rather than in place so the caller's frame is not mutated.
        df_sorted = df.sort_index(level='Timestamp')
        times = df_sorted.index.time
        in_interval = (times > enf_start) & (times < enf_end)

        return df_sorted.loc[in_interval]

    def blk_viol_times(self, condition=None):
        """Total the 'Any Blocking' / 'Any Violator' flags of ``viol_counts``.

        condition:
            'Blocking' -- only timestamps with at least one blocker
            'Violator' -- only timestamps with at least one violator
            anything else (default None) -- all timestamps

        Returns a dict with 'block_sec' and 'viol_sec' (flag totals) and
        'block_min' and 'viol_min' (those totals divided by 60, truncated to
        int). The sec/min naming assumes one timestamp per second -- TODO
        confirm.
        """
        df = self.viol_counts
        if condition == 'Blocking':
            df = df[df['Any Blocking'] == 1]
        elif condition == 'Violator':
            df = df[df['Any Violator'] == 1]

        totals = df.sum()  # computed once; both flags are read from it
        block_sec = totals['Any Blocking']
        viol_sec = totals['Any Violator']
        return {'block_sec': block_sec, 'block_min': int(block_sec / 60),
                'viol_sec': viol_sec, 'viol_min': int(viol_sec / 60)}

    def conditional(self, condition):
        """Print a one-sentence conditional summary and return the totals.

        ``condition`` is passed to ``blk_viol_times``. A sentence is printed
        only for 'Blocking' or 'Violator'. The percentage is computed from the
        truncated minute totals and is reported as 0 when the denominator is 0
        minutes.
        """
        times = self.blk_viol_times(condition=condition)

        def percent(part, whole):
            # A period with zero qualifying minutes must not raise.
            return int((part / whole) * 100) if whole else 0

        if condition == 'Blocking':
            print(("Out of the {} minutes the bike lane was blocked in the study period,"
                   " at least one violator was parked in the loading zone for {} minutes ({} % of the time!)".format(
                       times['block_min'], times['viol_min'],
                       percent(times['viol_min'], times['block_min']))))
        elif condition == 'Violator':
            print(("Out of the {} minutes that at least one violator was parked in the loading zone,"
                   " the bike lane was blocked for {} minutes ({} % of the time!)".format(
                       times['viol_min'], times['block_min'],
                       percent(times['block_min'], times['viol_min']))))
        return times
    
    

In [11]:
test1 = CurbData(alldays_timestamped, format_dict)

In [12]:
test1.blk_viol_times()

{'block_sec': 47675.0,
 'block_min': 794,
 'viol_sec': 272191.0,
 'viol_min': 4536}

In [13]:
test1.format

{'unrestricted spaces': ['Space 3'],
 'location column': 'Vehicle Location',
 'violator column': 'Violator',
 'enf start/end': ('18:00', '22:00')}

In [14]:
test1.conditional('Violator')

Out of the 4536 minutes that at least one violator was parked in the loading zone, the bike lane was blocked for 781 minutes (17 % of the time!)


{'block_sec': 46862.0,
 'block_min': 781,
 'viol_sec': 272191.0,
 'viol_min': 4536}

In [15]:
test1.blockers_exceed_viol

27.433333333333334

In [17]:
test1.df_filtered.index[-1] - test1.df_filtered.index[0]

Timedelta('20 days 03:59:58')

In [None]:
## Consider dropping the subclass and just using a method on CurbData to filter!
##     Calls could then chain: curbdata.subset('Tuesday').aggregate_activities(by='hour'), or be reassigned, of course.
## Could this be sped up if it always builds from a CurbData object?
## Or do we need to keep the build-from-df functionality?
class SubsetCurbData(CurbData):
    """CurbData restricted to a subset of the study period (work in progress).

    ``subset_df`` and ``aggregate_activities`` are still stubs: their nested
    helpers are declared but not called, and both currently return None.
    """
    def __init__(self, timestamped_df, format_dict, subset):
        # super() already binds self; passing it again raises TypeError.
        super().__init__(timestamped_df, format_dict)
        # Must exist before subset_df runs, which is meant to append to it.
        self.oneday_dfs = []
        self.df_subset = self.subset_df(subset)

    def subset_df(self, subset):
        """Placeholder for building the subset frame; currently returns None."""
        def from_weekday(weekday):
            '''returns df_subset of all matching weekday
            (or, weekdays and weekends)
            also, appends each single day df to self.oneday_dfs
            '''
        def from_timestamps(timestamps):
            '''simply returns df_subset between two timestamps
            opt, if needed split single days if we accept
            input > 1day...
            '''

    def aggregate_activities(self, by=None):
        '''return counts and total time for each type of activity in filter interval
        also pct blocking, pct viol presence

        by: None, 'day' or 'hour'; any other value raises ValueError.
        Not implemented yet -- currently returns None.
        '''
        if by not in (None, 'day', 'hour'):
            raise ValueError("by must be None, 'day' or 'hour', got {!r}".format(by))

## keep on truckin'!
* fully abstract all functions, include in class
* write better, more intuitive stats summary functions
* write documentation/utility/guesser to provide format info
* class-based plotting system
* ???
* profit

In [None]:
def lz_total_time(lz_onlydf):
    """Return the seconds of loading zone time contained in a filtered dataframe.

    For each calendar date in the frame's datetime index, the span between that
    date's earliest and latest timestamp is taken; the whole-second parts of
    those spans are summed.

    Returns 0 for an empty frame. Raises AssertionError unless the overall
    latest timestamp has hour < 22 and the overall earliest has hour > 6.
    """
    index = lz_onlydf.index
    if len(index) == 0:
        return 0
    assert index.max().hour < 22 and index.min().hour > 6, \
        'timestamps fall outside the expected 07:00-22:00 window'

    # Key each timestamp by its calendar date once, instead of re-formatting
    # the whole index for every date.
    stamps_by_day = pd.Series(index, index=index.normalize()).groupby(level=0)
    day_spans = stamps_by_day.max() - stamps_by_day.min()
    return int(day_spans.dt.seconds.sum())

In [None]:
##recheck -- can be greatly simplified with new format?
##TODO allow split on hour-- better graph?
def stats_from_df(df, lzonly=True, split_on='dates'):
    """Summarise violator presence and bike-lane blocking per date or per hour.

    df: timestamped observations, passed to ``time_filter`` (when ``lzonly``)
        and then to ``violator_timestamp_table``.
    lzonly: when True, filter ``df`` with ``time_filter`` first.
    split_on: 'dates' (one row per calendar date) or 'hours' (one row per
        date and hour). Any other value raises ValueError.

    Returns a DataFrame indexed by the start of each period with columns
    'Seconds Violator Present', 'Seconds Bike Lane Blocked', 'Total Seconds',
    'Minutes Violator Present' and 'Minutes Bike Lane Blocked'. Each period's
    first and last timestamp is printed as it is processed.

    NOTE(review): ``time_filter`` and ``violator_timestamp_table`` are called
    here as module-level functions, and ``time_filter`` is given ``hour`` /
    ``opr`` arguments. Confirm those functions are defined elsewhere; the
    CurbData methods of the same names take different arguments.
    """
    period_formats = {'dates': '%m%d%Y', 'hours': '%m%d%Y%H'}
    if split_on not in period_formats:
        raise ValueError("split_on must be 'dates' or 'hours', got {!r}".format(split_on))
    dt_string = period_formats[split_on]

    if lzonly:
        df = time_filter(df)
        df = time_filter(df, hour=22, opr='<')

    viol_df = violator_timestamp_table(df)

    # Format the index once and group on it; sort=False keeps periods in order
    # of first appearance.
    period_keys = viol_df.index.strftime(dt_string)
    rows = []
    period_starts = []
    for period, one_viol_df in viol_df.groupby(period_keys, sort=False):
        print(one_viol_df.index.min(), '–', one_viol_df.index.max())
        totals = one_viol_df.sum()
        rows.append([totals['Any Violator'],
                     totals['Any Blocking'],
                     lz_total_time(one_viol_df)])
        period_starts.append(dt.datetime.strptime(period, dt_string))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0,
    # and this also yields a well-formed empty result when there are no periods.
    output_df = pd.DataFrame(
        rows,
        index=period_starts,
        columns=['Seconds Violator Present', 'Seconds Bike Lane Blocked', 'Total Seconds'])

    output_df['Minutes Violator Present'] = output_df['Seconds Violator Present'] / 60
    output_df['Minutes Bike Lane Blocked'] = output_df['Seconds Bike Lane Blocked'] / 60

    return output_df
    

In [None]:
bigstats = stats_from_df(alldays_timestamped, split_on='dates')

In [None]:
bigstats.shape

In [None]:
bigstats

In [None]:
##TODO color day of study, see if compliance improved at all over time?
# Regress minutes of bike-lane blockage on minutes of violator presence
# (one point per row of bigstats) and save the figure to disk.
sns.set()
fig, ax = plt.subplots()
sns.regplot(
    x='Minutes Violator Present',
    y='Minutes Bike Lane Blocked',
    data=bigstats,
    ci=95,
    ax=ax,
)
fig.savefig('Figures/scatter.png', dpi=600, format="png", bbox_inches='tight')

In [None]:
bigstats_reindexed = bigstats.reset_index()

In [None]:
# Minutes the bike lane was blocked, one point per row of bigstats_reindexed.
# The figure and axes are created first so the scatter and all of the date
# formatting below land on the same axes.
fig, ax = plt.subplots()
sns.scatterplot(x='index', y='Minutes Bike Lane Blocked', data=bigstats_reindexed, ax=ax)

start = bigstats_reindexed['index'].min().to_pydatetime()
end = bigstats_reindexed['index'].max().to_pydatetime()
ax.set_xlim([start, end])

ax.xaxis_date()
# bigstats is built with split_on='dates', so the x values are calendar dates;
# label ticks with the date rather than a time of day.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
fig.autofmt_xdate()
plt.show()

In [None]:
ax

## Moving Forwards
### short-term
* general speed-up (see Pandas advice, for loops, cython?, weird parallel thing, etc?)
    * Pandas 1.0 should help some too
* loading zone ends at 10pm (done)
* show TNC/CNS activity on plot? (done)
* Scatter/fit line of seconds blocked+seconds any violator? (done)
* send visual/summary package to Juan for city meeting (Google Slides) (draft 1-2wk) (drafted)

### longer-term
* rewrite to call blocking stats explicitly for each interval, support datetimeindex? (essentially done)
* count # of cars in bike lane, compare to 1 violator, 2 violator  (done!)
* --> model counterfactual: bike lane availability under perfect enforcement (get a sizing recommendation: is 2 enough? What about if there is no enforcement -- bigger?)
* OpenCV (w/ David?) (working with Alberto@Strada Labs)
* interactive visualization tool, summary stats (late Feb?) (goal of project!) (IP)
    * input data standards for viz tool
* eventual paper on data/stats
* eventual tool description


In [None]:
alldays_timestamped.head()