In [None]:
import sys, os
import collections
import csv
import glob as gb
import datetime, pytz
from datetime import datetime, timedelta
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd

In [None]:
def get_records(input_csv):
    """
    Load one csv file into a lookup table keyed by record creation time
    :param input_csv: path to input csv
    :type input_csv: str
    :return: dictionary where keys are the timezone-aware (US/Eastern) timestamps read from column 3,
             values are (user id, lower-cased violation flag) tuples read from columns 0 and 5
    :rtype: dict(tuple)
    """
    records = {}
    with open(input_csv, newline='') as csv_file:
        data_reader = csv.reader(csv_file)
        # the first row carries the headers / field names, so consume it before the data loop
        next(data_reader, None)
        for row in data_reader:
            # the stored timestamp carries no timezone; it is tagged as UTC and then converted
            # to EST or EDT as appropriate.
            # hardcoding the timezone here is acceptable since our domain is very constrained.
            created_utc = datetime.strptime(row[3], "%Y-%m-%d %H:%M:%S").replace(tzinfo=pytz.utc)
            created_eastern = created_utc.astimezone(pytz.timezone('US/Eastern'))
            # rows that share a timestamp overwrite each other: the last one read wins
            records[created_eastern] = (row[0], row[5].lower())
    return records

def read_all_user(inFilePathPerUser):
    """
    Parses through each csv file found in the file path to get all per user records
    :param inFilePathPerUser: path to per-user data directory
    :type inFilePathPerUser: str
    :return: list of dictionaries for per-user records, one per csv file, in sorted file-name order
    :rtype: list
    """
    recordsAllUsers = []
    # os.listdir returns entries in arbitrary order; sort so that results (and the plots
    # built from them) come out in the same order on every run
    for fileName in sorted(os.listdir(inFilePathPerUser)):
        input_csv = os.path.join(inFilePathPerUser, fileName)
        # only regular *.csv files are per-user data; skip sub-directories and stray files
        # (e.g. .ipynb_checkpoints, .DS_Store) that would otherwise fail to open or parse
        if not fileName.lower().endswith('.csv') or not os.path.isfile(input_csv):
            continue
        recordsAllUsers.append(get_records(input_csv))  # add each user's dictionary of records to the list
    return recordsAllUsers

# Load every user's records once at module level; later code in this file iterates over this list.
all_user_records = read_all_user('./data/per_user')

In [None]:
def consecutive_violation_hours(user_record):
    """
    Counts and records the length in hours of all consecutive violations

    A run is a maximal stretch of neighbouring records that are all violations and whose
    timestamps are at least one hour but less than two hours apart. The length of a run is
    the number of records in it (recordings are commonly taken once an hour).

    :param user_record: dictionary where keys are the timestamps of the inputted file, values are
                        (user id, violation occurance) tuples; the user id is taken from the first entry
    :type user_record: dict
    :return: dictionary where key is user id, value is a list of the lengths in hours of all consecutive
             violations, in record order. The list is never empty: it is [0] when the user has no
             violations, so callers that take max() of it keep working and report 0 hours.
             An empty user_record yields an empty dictionary.
    :rtype: dict
    """
    records = list(user_record.items())  # materialise once, not once per loop iteration
    if not records:
        return {}
    user_id = records[0][1][0]
    min_gap = timedelta(hours=1)
    max_gap = timedelta(hours=2)

    violations = list()
    count = 0  # number of records in the run that is currently open (0 = no open run)
    previous_time = None
    for time, value in records:  # every record is examined, including the last one
        if value[1] == 'true':  # if there is a violation...
            # abs() keeps the gap positive whether the file is ordered oldest-first or newest-first;
            # a negative timedelta has days == -1 and would never look like a 1-2 hour gap.
            if count and min_gap <= abs(time - previous_time) < max_gap:
                count = count + 1  # the record extends the open run
            else:
                # first violation, or too far from the previous record: close the open run
                # (if any) and start a new one with this record
                if count:
                    violations.append(count)
                count = 1
        elif count:
            # if there stops being a violation, add the current count of consecutive violation
            # hours to the list and reset the count.
            violations.append(count)
            count = 0
        previous_time = time
    if count:
        violations.append(count)  # a run that lasts until the final record
    if not violations:
        violations.append(0)  # keep the list non-empty for callers that take max()
    return {user_id: violations}
    # NOTE:
    # There are some cases in the users' records where there are pieces of 'missing data.'
    # These might be a result of the heat seek being turned off for a period of time.
    # Commonly, a recording is taken every hour, but in these cases, there may be records taken at intervals longer than 1 hr.
    # Hence, it is important to consider the time interval between consecutive records.
    # At the extreme, one record may have been taken 10 days after the prior record. And of course, even if both records show
    # a violation, these would not be considered consecutive hours of violation.
    # In some cases, the time interval between two consecutive records was more than an hour but less than two hours.
    # These were still considered to be consecutive hours of violation, but any time interval of two hours or longer was
    # not considered consecutive hours of violations, because we know too little about what occurred between the interval.
    # NOTE(review): a gap shorter than one hour (e.g. 59 minutes) also ends a run, as it always has - confirm this is intended.

    # Because of this aforementioned issue of 'missing data,' this might be a sign that we shouldn't be considering
    # or recording consecutive violation hours to make conclusions. Rather, we may resort to day-wise binning.

def plot_hist_consecutive_violation(consecutive_violations):
    """
    Make a bar graph of consecutive hours of violations
    :param consecutive_violations: dictionary where key is user id, value is a list of the lengths in hours
                                   of all consecutive violations; only the first entry is plotted
    :type consecutive_violations: dict
    """
    # read the input before creating a figure so a bad input does not leave an empty figure behind
    user_id, run_lengths = list(consecutive_violations.items())[0]

    # tally once; plain lists are handed to matplotlib instead of dict views
    tally = collections.Counter(run_lengths)
    consecutive_hours = list(tally.keys())
    # x axis data: length of consecutive violation
    frequency = list(tally.values())
    # y axis data: frequency

    fig, ax = plt.subplots()
    # plot of the data
    ax.bar(consecutive_hours, frequency, width = 1)
    ax.set_xlabel('Consecutive Violation Hours')
    ax.set_ylabel('Frequency')
    ax.set_title('Histogram of Consecutive violation hours for user ID ' + str(user_id))
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    # default=0 keeps the axis valid when the user has no recorded runs at all
    ax.set_xbound(0, max(consecutive_hours, default=0) + 1)
    plt.show()

In [None]:
# Draw one histogram of consecutive-violation run lengths per user.
for user_record in all_user_records:
    consecutive_violations = consecutive_violation_hours(user_record)
    plot_hist_consecutive_violation(consecutive_violations)

In [None]:
def get_max_consecutive_violation(user_record):
    """
    Find the longest run of consecutive violations for the inputted user
    :param user_record: Dictionary where keys are the timestamps of the inputted file, values are user id and violation occurance
    :type user_record: dictionary
    :returns: the longest consecutive violation in hours
    :rtype: Integer
    """
    runs_by_user = consecutive_violation_hours(user_record)
    # the result holds a single user; take that user's list of run lengths
    run_lengths = list(runs_by_user.values())[0]
    # the largest distinct run length is simply the largest run length
    return max(run_lengths)

def plot_max_consecutive_violation():
    """
    Draw a bar chart of each user's longest consecutive period of violation, in hours.
    Reads the module-level all_user_records and also prints one "[user id: hours]" entry per user.
    """
    ids_for_x_axis = []
    longest_runs_for_y_axis = []
    for record in all_user_records:
        # every value in a record carries the user id in position 0; use the first one
        first_value = list(record.values())[0]
        uid = first_value[0]
        longest = get_max_consecutive_violation(record)
        ids_for_x_axis.append(uid)
        longest_runs_for_y_axis.append(longest)
        entry = "[" + uid + ": " + str(longest) + "]"
        print(entry, end="\t")

    # plot of the data
    plt.figure(figsize=(20, 5))
    plt.bar(ids_for_x_axis, longest_runs_for_y_axis)
    plt.xlabel('User ID')
    plt.ylabel('Longest Consecutive Hours of Violation')
    plt.show()

In [None]:
# Plot (and print) each user's longest run of consecutive violation hours.
plot_max_consecutive_violation()
# NOTE: compared the results of this cell to those of Daniel and found some discrepancies.
# TODO: confirm the validity of these results by checking manually against the per-user data.

In [None]:
def bin_by_time(input_csv, time, day_night):
    """
    Groups the violation rows of a csv file by one attribute of their US/Eastern timestamp
    :param input_csv: path to input csv
    :type input_csv: str
    :param time: the aspect of timestamp to bin by, e.g. month, day, hour.
    :type time: str
    :param day_night: "day" keeps hours 6-21, "night" keeps hours 22-5, any other value keeps all hours.
    :type day_night: str
    :return: dictionary of lists where keys are time slices, values are lists of rows where a violation occured
    :rtype: dict(list)
    """
    # keys are values of the time parameter, values are lists of rows
    violations = defaultdict(list)

    with open(input_csv, newline='') as csv_file:
        data_reader = csv.reader(csv_file)
        next(data_reader, None)  # skip headers
        for row in data_reader:
            # only violation rows are parsed and binned
            if row[5].lower() != 'true':
                continue
            # datetime formatting to convert UTC to EST or EDT as appropriate
            # for EDA only! stick to UTC for actual automation.
            # hardcoding timezone since our domain is very constrained.
            created = datetime.strptime(row[3], "%Y-%m-%d %H:%M:%S")
            created = created.replace(tzinfo=pytz.utc).astimezone(pytz.timezone('US/Eastern'))
            if day_night == "day":
                # looking only at day time hours
                keep = 6 <= created.hour <= 21
            elif day_night == "night":
                # looking only at night time hours
                keep = created.hour >= 22 or created.hour <= 5
            else:
                keep = True
            if keep:
                violations[getattr(created, time)].append(row)
    return violations

def plot_time_bins(time_bins, label):
    """
    Draw a bar chart with one bar per time slice, showing how many violations fall in it,
    and print the same counts as "slice: count" lines
    :param time_bins: a dictionary of lists keyed by time, value is list of violations occurring during that time
    :type time_bins: dict[list]
    :param label: the label for the x axis of the plot (what type of time slice this is)
    :type label: str
    """
    # time slices in ascending order double as the tick labels
    tick_labels = sorted(time_bins)
    violations_per_time = [len(time_bins[time_slice]) for time_slice in tick_labels]
    for time_slice, count in zip(tick_labels, violations_per_time):
        print(f"{time_slice}: {count}")

    bar_positions = range(len(violations_per_time))
    plt.bar(bar_positions, violations_per_time, tick_label=tick_labels)
    plt.xlabel(label)
    plt.ylabel('total violations')
    plt.show()

In [None]:
# Bin the violations in the combined data file by hour of day and plot them three times:
# day-time hours only, night-time hours only, and all hours.
time_bins = bin_by_time('./data/clean_100117_013118.csv', 'hour', 'day')
plot_time_bins(time_bins, 'day hours')
time_bins = bin_by_time('./data/clean_100117_013118.csv', 'hour', 'night')
plot_time_bins(time_bins, 'night hours')
# any value other than 'day' or 'night' selects every hour
time_bins = bin_by_time('./data/clean_100117_013118.csv', 'hour', 'all')
plot_time_bins(time_bins, 'all hours')