In [14]:
'''
Investigates the correlation between frequency and severity of violations
through per-user analyses

1. Find frequency and severity for each user
    - freq = (# hours of violation) / (total # hours recorded)
    - sevr = sum of (required temperature - measured temperature)
                for all violations
2. Plot each user as a datapoint and perform regression
3. Conclude
'''

import csv
import sys
import os
from datetime import datetime

In [41]:
def calculate_sevr(time, measured_temp, outside_temp=None):
    '''
    Calculates (required temperature - measured temperature) for one reading,
    choosing the required temperature from the hour of the reading.
    Returns 0 if measured_temp is not a violation.

    Rules applied:
    - day (6 AM - 10 PM): 68 is required, but only when outside_temp is below 55
    - night (10 PM - 6 AM): 62 is required regardless of outside_temp

    :type time: datetime
    :type measured_temp: int (anything int() accepts, e.g. a numeric string)
    :type outside_temp: int (optional for nighttime); a value int() cannot
                        convert (None, '', ...) is treated as "unknown"
    :rtype: int
    :raises ValueError, TypeError: if measured_temp cannot be converted to int
    '''
    day_start_hour = 6          # day: 6 AM - 10 PM, night is everything else
    day_end_hour = 22
    day_required_temp = 68
    night_required_temp = 62
    outside_temp_threshold = 55

    measured_temp = int(measured_temp)
    try:
        outside_temp = int(outside_temp)
    except (TypeError, ValueError, OverflowError):
        outside_temp = None

    if day_start_hour <= time.hour < day_end_hour:
        # Daytime calculation requires outside_temp, but csv files sometimes don't
        # satisfy this condition, so not-enough-info is treated as a non-violation
        # for now. The check must be `is None`: an outside temperature of 0 is a
        # valid reading and must not be mistaken for missing data.
        if outside_temp is None or outside_temp >= outside_temp_threshold:
            return 0
        required_temp = day_required_temp
    else:
        required_temp = night_required_temp

    # Turning non-violating (negative) diffs into 0
    return max(required_temp - measured_temp, 0)
        

In [42]:
class Row:
    '''
    One record of the dataset: a single temperature reading for a user.
    Values are stored exactly as passed in; no conversion happens here.
    '''

    def __init__(self, user_id, sensor_id, temp, created_at, outdoor_temp, violation):
        self.user_id = user_id
        self.sensor_id = sensor_id
        self.temp = temp
        self.created_at = created_at
        self.outdoor_temp = outdoor_temp
        self.violation = violation

    def __repr__(self):
        return f'<Row Object> user_id: {self.user_id}'

    def __str__(self):
        # A bare `__repr__(self)` is not a resolvable name inside a method
        # (class scope is not searched), so it raised NameError on print().
        return repr(self)

    def __eq__(self, another_row):
        '''
        Two Row objects are equal if they have the same user_id.
        Objects without a user_id attribute are simply "not equal"
        instead of raising AttributeError.
        '''
        # NOTE: because __eq__ is defined without __hash__, Row instances
        # are unhashable (cannot be put in a set or used as dict keys).
        try:
            return self.user_id == another_row.user_id
        except AttributeError:
            return NotImplemented
    

In [43]:
class User:
    '''
    Groups all rows that share one user_id and keeps a running count of
    how many of those rows are flagged as violations.
    '''

    def __init__(self, row):
        '''
        Creates the user from its first row; the user_id is taken from that row.
        '''
        self.user_id = row.user_id
        self.row_list = []
        self.num_violation = 0
        # Route the first row through add_row so the violation bookkeeping
        # lives in exactly one place.
        self.add_row(row)

    def add_row(self, row):
        '''
        Appends a row to this user and updates the violation count.
        :raises ValueError: if row.user_id differs from this user's user_id
        '''
        if self.user_id != row.user_id:
            raise ValueError('This row does not belong to this user')
        self.row_list.append(row)
        if row.violation:
            self.num_violation += 1

    def __repr__(self):
        return f'<User Object> user_id: {self.user_id}'

    def __str__(self):
        # A bare `__repr__(self)` is not a resolvable name inside a method
        # (class scope is not searched), so it raised NameError on print().
        return repr(self)

    def freq(self):
        '''
        Calculates the frequency of violation for this user
        freq = (# hours of violation) / (total # hours recorded)

        Implemented as (# rows flagged as violation) / (# rows); row_list is
        never empty because the constructor always stores the first row.
        '''
        return self.num_violation / len(self.row_list)

    def sevr(self):
        '''
        Calculates the severity of violation for this user: the sum of
        calculate_sevr(created_at, temp, outdoor_temp) over all of its rows.
        '''
        return sum(
            calculate_sevr(row.created_at, row.temp, row.outdoor_temp)
            for row in self.row_list
        )
        

In [44]:
def import_file(path):
    '''
    Imports dataset in a standardized manner.

    Reads the csv file at path, skips the first (header) line, and converts
    each remaining line into a Row. Columns are taken positionally, in the
    order of Row's constructor:
        0 user_id (int), 1 sensor_id (kept as text), 2 temp (int),
        3 created_at ("%Y-%m-%d %H:%M:%S" -> datetime),
        4 outdoor_temp (kept as text), 5 violation ('true'/'false' -> bool)
    Rows are grouped into one User per user_id.

    :type path: str
    :return: list of User objects, ordered by first appearance in the file;
             an empty list if the file has no data rows
    :raises ValueError: if a numeric/date column cannot be parsed or the
                        violation column is neither 'true' nor 'false'
    '''
    users = []
    # user_id -> User, so each row is matched in O(1) rather than by
    # scanning the whole users list for every line of the file.
    users_by_id = {}

    with open(path, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        for fields in csv_reader:
            if not fields:
                # csv.reader yields [] for a blank line (e.g. trailing newline)
                continue
            fields[0] = int(fields[0])
            fields[2] = int(fields[2])
            fields[3] = datetime.strptime(fields[3], "%Y-%m-%d %H:%M:%S")
            if fields[5] == 'true':
                fields[5] = True
            elif fields[5] == 'false':
                fields[5] = False
            else:
                raise ValueError('unexpected value for violation')
            row = Row(*fields)

            user = users_by_id.get(row.user_id)
            if user is None:
                user = User(row)
                users_by_id[row.user_id] = user
                users.append(user)
            else:
                user.add_row(row)
    return users

In [45]:
def compute_freq_sevr(users):
    '''
    Builds one (freq, sevr) tuple per user, in the same order as users.
    For each user, freq() is evaluated before sevr().
    '''
    return [(person.freq(), person.sevr()) for person in users]

In [46]:
# Load the dataset file and group its rows into one User object per user_id.
users = import_file('./data/dataset/clean_013118_053118.csv')
# Print the full list of (freq, sevr) datapoints, one tuple per user.
print(compute_freq_sevr(users))

[(0.0024104683195592287, 40), (0.027123483226266953, 206), (0.09607438016528926, 1489), (0.022382920110192838, 257), (0.0, 0), (0.009986225895316805, 55), (0.008861320336730172, 44), (0.04509466437177281, 319), (0.4056473829201102, 4551), (0.0, 0), (0.04511019283746556, 336), (0.0027041644131963224, 2), (0.0165, 81), (0.0038461538461538464, 47), (0.0019851116625310174, 6), (0.1771274547554871, 2405), (0.04338842975206612, 205), (0.09641873278236915, 771), (0.13739669421487602, 1158), (0.23519283746556474, 2382), (0.0, 0), (0.046153846153846156, 179), (0.09645194626248708, 1012), (0.008634646519158122, 47), (0.0010334137099552187, 4), (0.04581467447468136, 410), (0.03548053737512918, 410), (0.3644505683775405, 4895), (0.0, 0), (0.08307264511644073, 289), (0.0, 0), (0.0004847309743092584, 2), (0.18281845112145578, 1003), (0.005475989890480202, 27), (0.0014800197335964479, 6), (0.002530044275774826, 1), (0.38546458141674333, 3701), (0.017937219730941704, 31), (0.0024937655860349127, 0), (