In [None]:
import csv
import datetime, pytz
from datetime import datetime, timedelta
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
def bin_by_time(input_csv, time):
    """
    Groups all temperature violations (as defined by NY law) in the csv file by one component
    of their creation timestamp, after converting that timestamp from UTC to US/Eastern
    :param input_csv: path to input csv
    :type input_csv: str
    :param time: name of the datetime attribute to bin by, e.g. month, day, hour.
    :type time: str
    :return: dictionary of lists where keys are values of the chosen time component, values are
             lists of the csv rows where a violation occurred
    :rtype: dict(list)
    """

    # maps each value of the requested time component to the rows that fall in it
    binned = defaultdict(list)

    with open(input_csv, newline='') as handle:
        reader = csv.reader(handle)
        # the first row holds the column headers, so drop it before looping
        next(reader, None)
        for record in reader:
            # column 5 flags whether this reading is a violation
            if record[5].lower() != 'true':
                continue
            # column 3 is the creation timestamp; it is parsed as naive and then tagged as UTC
            utc_created = datetime.strptime(record[3], "%Y-%m-%d %H:%M:%S").replace(tzinfo=pytz.utc)
            # shift UTC to EST or EDT as appropriate.
            # for EDA only! stick to UTC for actual automation.
            # hardcoding timezone since our domain is very constrained.
            local_created = utc_created.astimezone(pytz.timezone('US/Eastern'))
            binned[getattr(local_created, time)].append(record)
    return binned
            

def plot_time_bins(time_bins, label):
    """
    Draw a bar graph of violations with one bar per time bin, and print the count for each bin
    :param time_bins: a dictionary of lists keyed by time, value is list of violations occurring during that time
    :type time_bins: dict[list]
    :param label: the label for the x axis of the plot (what type of time slice this is)
    :type label: str
    """

    # walk the bins in ascending key order so bars and tick labels line up
    ordered_slices = sorted(time_bins)
    counts = [len(time_bins[time_slice]) for time_slice in ordered_slices]
    for time_slice, count in zip(ordered_slices, counts):
        print(str(time_slice) + ': ' + str(count))

    bar_positions = range(len(counts))
    plt.bar(bar_positions, counts, tick_label=ordered_slices)
    plt.xlabel(label)
    plt.ylabel('total violations')
    plt.show()
        

In [None]:
# Count violations per calendar month (month taken from the US/Eastern timestamp) and plot them.
time_bins = bin_by_time('./data/clean_100117_013118.csv', 'month')
plot_time_bins(time_bins, 'month')

In [None]:
# Count violations per hour of day (hour taken from the US/Eastern timestamp) and plot them.
# Note this rebinds the `time_bins` name used for the monthly bins.
time_bins = bin_by_time('./data/clean_100117_013118.csv', 'hour')
plot_time_bins(time_bins, 'hour (US/Eastern)')

This is exactly what we would expect: it is easier to avoid violations at night, when the temperature is allowed to be lower, so fewer violations fall in the nighttime hours.

In [None]:
def get_violations_by_user(input_csv):
    """
    Collect the violation records in the csv file, grouped by user
    :param input_csv: path to input csv
    :type input_csv: str
    :returns: dictionary of lists where key is the user id (as int), value is list of all
              violation rows for that user, in file order
    :rtype: dict(list)
    """

    grouped = defaultdict(list)

    with open(input_csv, newline='') as handle:
        reader = csv.reader(handle)
        # the first row holds the column headers, so drop it before looping
        next(reader, None)
        for record in reader:
            # only rows flagged 'true' (any casing) in column 5 are violations
            if record[5].lower() != 'true':
                continue
            # column 0 holds the numeric user id
            grouped[int(record[0])].append(record)
    return grouped


def datetime_replace_under(datetime, time_slice, replace_val=0):
    """
    Truncate a datetime to the start of the given time slice, i.e. reset every field
    smaller than the slice to its lowest valid value
    :param datetime: the timestamp to truncate (note: this name shadows the imported
                     datetime class inside this function)
    :type datetime: datetime.datetime
    :param time_slice: granularity to keep; one of 'years', 'months', 'days', 'hours',
                       'minutes', 'seconds'
    :type time_slice: str
    :param replace_val: currently unused; kept so existing calls that pass it keep working
    :type replace_val: int
    :returns: a copy of the datetime with all smaller fields reset (tzinfo is left untouched)
    :rtype: datetime.datetime
    :raises ValueError: if time_slice is not one of the supported values
    """
    # month and day are 1-based, so their floor is 1 rather than 0 (replace() rejects 0);
    # microseconds are always cleared so timestamps with sub-second precision collapse too
    floors = {
        'years': dict(month=1, day=1, hour=0, minute=0, second=0, microsecond=0),
        'months': dict(day=1, hour=0, minute=0, second=0, microsecond=0),
        'days': dict(hour=0, minute=0, second=0, microsecond=0),
        'hours': dict(minute=0, second=0, microsecond=0),
        'minutes': dict(second=0, microsecond=0),
        'seconds': dict(microsecond=0),
    }
    if time_slice not in floors:
        # fail loudly instead of silently handing None back to the caller
        raise ValueError(
            "unsupported time_slice {0!r}; expected one of {1}".format(time_slice, sorted(floors))
        )
    return datetime.replace(**floors[time_slice])
    
           
def _longest_run(sorted_times, step):
    """Return the length of the longest run of entries in sorted_times spaced exactly step apart."""
    longest = 1
    current = 1
    for earlier, later in zip(sorted_times, sorted_times[1:]):
        if earlier + step == later:
            current += 1
            if current > longest:
                longest = current
        else:
            # streak is broken, start counting again from this entry
            current = 1
    return longest


def get_longest_consecutive_violations(input_csv, time_slice):
    """
    Get longest consecutive violations over time slice for each unit
    :param input_csv: path to input csv
    :type input_csv: str
    :param time_slice: the unit to check over; must be a fixed-length unit that is both a
                       timedelta keyword and understood by datetime_replace_under, e.g. days,
                       hours, minutes. Calendar units such as months or years are not
                       supported because timedelta cannot express them.
    :type time_slice: str
    :returns: dictionary where key is the user, value is longest consecutive streak of violations by time slice
    :rtype: dict
    :raises ValueError: if time_slice is not a supported unit
    """

    # TODO: this is an inefficient way to do things, be more clever if input data becomes too large

    # build the step once: it is the same for every comparison, and an unsupported slice
    # fails here with a clear message rather than part-way through the data
    try:
        step = timedelta(**{time_slice: 1})
    except TypeError as err:
        raise ValueError(
            "unsupported time_slice {0!r}: expected a fixed-length unit such as "
            "'days', 'hours' or 'minutes'".format(time_slice)
        ) from err

    consecutive_violations_by_user = {}

    for user, violations in get_violations_by_user(input_csv).items():
        # compress violations by time slice to make checking consecutive violations simple
        compressed_violation_times = set()
        for violation in violations:
            # timestamps are used as parsed (naive), with no timezone conversion
            created = datetime.strptime(violation[3], "%Y-%m-%d %H:%M:%S")
            # truncate to the slice so each slice is only added to the set once
            created = datetime_replace_under(created, time_slice)
            if created is None:
                # the truncation helper did not recognise the slice; counting None values
                # would silently report a streak of 1 for every user
                raise ValueError(
                    "unsupported time_slice {0!r}: timestamps cannot be truncated to it".format(time_slice)
                )
            compressed_violation_times.add(created)

        # slices are full datetimes, so adding the step carries across day, month and year
        # boundaries (e.g. 23:00 + 1 hour is 00:00 of the next day)
        consecutive_violations_by_user[user] = _longest_run(sorted(compressed_violation_times), step)
    return consecutive_violations_by_user


def plot_consecutive_violations(consec_violations_by_user, label):
    """
    Draw a bar graph of each user's longest streak of consecutive violations, and print
    the streak for each user
    :param consec_violations_by_user: dictionary where key is the user, value is longest consecutive streak of violations by time slice
    :type consec_violations_by_user: dict
    :param label: the time slice name inserted into the y axis label (what type of time slice this is)
    :type label: str
    """

    # walk the users in ascending id order so bars and tick labels line up
    ordered_users = sorted(consec_violations_by_user)
    streak_lengths = [consec_violations_by_user[user_id] for user_id in ordered_users]
    for user_id, streak_length in zip(ordered_users, streak_lengths):
        print(str(user_id) + ': ' + str(streak_length))

    # wide figure so one bar per user stays readable
    plt.figure(figsize=(15,5))
    bar_positions = range(len(streak_lengths))
    plt.bar(bar_positions, streak_lengths, tick_label=ordered_users)
    plt.xlabel('User ID')
    plt.ylabel('Consecutive {0} with Violation'.format(label))
    plt.show()
    


In [None]:
# Per user: longest run of back-to-back hours that each contain at least one violation.
consec = get_longest_consecutive_violations('./data/clean_100117_013118.csv', 'hours')
plot_consecutive_violations(consec, 'hours')

In [None]:
# Per user: longest run of back-to-back days that each contain at least one violation.
# Note this rebinds the `consec` name used for the hourly streaks.
consec = get_longest_consecutive_violations('./data/clean_100117_013118.csv', 'days')
plot_consecutive_violations(consec, 'days')