In [20]:
import numpy as np
import pandas as pd
import datetime as dt

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook

import sys
import operator

import fastparquet
import snappy

%load_ext cython

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [21]:
alldays = pd.read_csv('VehtoSep11new.csv')

In [22]:
def showall(df):
    #shows entire dataframe
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        display(df)

In [23]:
%%cython

def zeropad_dates(str_date):
    """pads month and day with zeroes"""
    padded_date = ''
    split_date = str_date.split('/')
    for value in split_date:
        if len(value) < 2:
            value = '0' + value
        padded_date += (value + '/')
    padded_date = padded_date[:-1]
    return padded_date

In [24]:
def classify_violator(row, st_time='18:00:00', end_time='22:00:00', legal_minutes=15):
    ##TODO speedup candidate?
    ##TODO rewrite cleaner?
    """classify a row as a violator or not
    considers +15min parked from 18:00-22:30 a violation
    """
    begin, end = row['Begin Date'] + row['Begin Time'], row['Begin Date'] + row['End Time']
    time_parked = {'Start':datetime_from_str(begin), 'End':datetime_from_str(end)}
    datestr = begin[:10]
    
    enf_start = datestr + st_time
    enf_end = datestr + end_time
    enforcement_times = {'Start':datetime_from_str(enf_start), 'End':datetime_from_str(enf_end)}
    
    latest_start = max(time_parked['Start'], enforcement_times['Start'])
    earliest_end = min(time_parked['End'], enforcement_times['End'])
    delta = (earliest_end - latest_start).seconds
    
    suffix = ''
    if row['CNS?']:
        suffix = ', CNS'
    elif row['TNC?']:
        suffix = ', TNC'
    
    #maximum parking duration during enforcement interval
    legal_duration_seconds = legal_minutes*60
    max_observation = 60**2 * 11
    #Space 3 not part of loading zone, so vehicles there aren't violators
    if (delta > legal_duration_seconds and
            delta < max_observation and
            row['Vehicle Location'] in ['Space 1', 'Space 2']):
        return 'Violator' + suffix
    elif row['Vehicle Location'] == 'SB bike lane':
        return 'Bike Lane Blocking' + suffix
    else:
        return 'Likely Non-Violator' + suffix
    
def classify_violators(df):
    #add boolean violator column to original (pre-timestamp) df. Also zeropads dates.
    df.dropna(subset=['Begin Time', 'End Time'], inplace=True)
    #added unique id to each activity
    df = df.reset_index().rename(columns={'index':'Activity Id'})
    df['Begin Date'] = df['Begin Date'].apply(zeropad_dates)
    df['Violator'] = df.apply(classify_violator, axis=1)
    return df


In [25]:
%%cython
import pandas as pd
import numpy as np
import datetime as dt
cimport numpy

def datetime_from_str(string):
    """converts string format dates/times from spreadsheet
    into Python Datetime objects
    """
    try:
        #print('ran, string:{}'.format(string))
        format_str = '%m/%d/%Y%I:%M:%S %p'
        to_dt = dt.datetime.strptime(string, format_str)
    except ValueError:
        #handles small portion of values in 24hr format
        try:
            #print('excepted, string:{}'.format(string))
            format_str = '%m/%d/%Y%H:%M:%S'
            to_dt = dt.datetime.strptime(string, format_str)
        
        except ValueError:
            #use dict to handle both text-described times
            text_times_dict = {'before 12':'12:00:00', 'end of the day':'22:30:00'}
            #print('excepted2, string:{}'.format(string))
            texttime = string[10:]
            time = text_times_dict[texttime]
            combined = string[:10] + time
            return datetime_from_str(combined)
    return to_dt

def cy_timestamps_from_interval(dt_start, dt_end):
    """creates range of np.datetime64 for every second in interval"""
    dt64_st = np.datetime64(dt_start)
    #add one second to include end value in range
    dt64_end = np.datetime64(dt_end) + np.timedelta64(1, 's')
    return np.arange(dt64_st, dt64_end, dtype='datetime64[s]')

def timestamps_from_row(row):
    """extracts interval from a row of the original spreadsheet"""
    start = row[1]['Begin Date'] + row[1]['Begin Time'] 
    end = row[1]['Begin Date'] + row[1]['End Time'] 
    
    dt_start = datetime_from_str(start)
    dt_end = datetime_from_str(end)
    
    return cy_timestamps_from_interval(dt_start, dt_end)

def index_for_row(row):
    """creates pd.DateTimeIndex for a single row of the original spreadsheet"""
    timestamps = timestamps_from_row(row)
    return pd.Index(timestamps, name='Timestamp')
    
def df_for_row(row):
    """generates timestamped dataframe from a 
    single row of the violator-classified dataframe
    """
    try:
        ind = index_for_row(row)
        row_df = pd.DataFrame(row[1]).swapaxes('index', 'columns')
        row_df = pd.concat([row_df]*len(ind))
        row_df.index = ind
    except ValueError:
        display(row_df)
        print(ind)
        return
    return row_df

def cy_timestamp_df(df):
    """Generates timestamped dataframe from violator-classified dataframe."""
    i = 0
    df_list = []
    timestamped_df = pd.DataFrame()
    for row in df.iterrows():
        if i % 100 == 0:
            #TODO rewrite to include %complete, ETA?
            print('Processing Row: {}, {}% complete'.format(i, int((i/df.shape[0])*100)))
        i += 1
        df_list += [df_for_row(row)]
        
    timestamped_df = timestamped_df.append(df_list)
    return timestamped_df

In [26]:
def timestamp_and_classify_violators(df):
    """simple wrapper to classify biolators then timestamp"""
    return cy_timestamp_df(classify_violators(df)) 

In [27]:
def reset_viol_time(df, hour=22, opr='=='):
    """switches Violator to Likely Non-Violator
    for rows in specified range of timestamped dataframe
    """
##TODO reduce redunancy? 
##TODO also, currently doesn't handle TNC/CNS violators. 
##Not a huge deal since they rarely span interval, but may want to fix
    ops = {'>': operator.gt,
           '<': operator.lt,
           '>=': operator.ge,
           '<=': operator.le,
           '==': operator.eq}

    df.sort_index(level='Timestamp', inplace=True)
    times = df.index.get_level_values(level='Timestamp')
    hr = ops[opr](times.hour, hour)
    filtered = df.loc[hr]
    filtered.loc[:, 'Violator'] = filtered.loc[:, 'Violator'].apply(lambda x: 'Likely Non-Violator' if x == 'Violator' else x)
    df = df.loc[~hr]
    df = df.append(filtered)
    return df

In [28]:
alldays_timestamped = timestamp_and_classify_violators(alldays)

Processing Row: 0, 0% complete
Processing Row: 100, 4% complete
Processing Row: 200, 8% complete
Processing Row: 300, 13% complete
Processing Row: 400, 17% complete
Processing Row: 500, 22% complete
Processing Row: 600, 26% complete
Processing Row: 700, 31% complete
Processing Row: 800, 35% complete
Processing Row: 900, 39% complete
Processing Row: 1000, 44% complete
Processing Row: 1100, 48% complete
Processing Row: 1200, 53% complete
Processing Row: 1300, 57% complete
Processing Row: 1400, 62% complete
Processing Row: 1500, 66% complete
Processing Row: 1600, 70% complete
Processing Row: 1700, 75% complete
Processing Row: 1800, 79% complete
Processing Row: 1900, 84% complete
Processing Row: 2000, 88% complete
Processing Row: 2100, 93% complete
Processing Row: 2200, 97% complete


In [29]:
alldays_timestamped = reset_viol_time(alldays_timestamped)
alldays_timestamped = reset_viol_time(alldays_timestamped, hour=18, opr='<')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [30]:
alldays_timestamped.shape

(1595953, 16)

In [31]:
alldays_timestamped.dropna(subset=['Duration']).shape

(1581475, 16)

In [32]:

## TODO ensure sort happens in timestamper...
alldays_timestamped.sort_index(level='Timestamp', inplace=True)
alldays_timestamped.to_parquet('TimestampToSep11new.parquet')