### Notes
* Some rows don't have outdoor temperature data

In [None]:
import sys, os
import csv
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def import_file(filepath):
    """
    Imports dataset into a standard format for investigation.

    The first row of the file is treated as a header and skipped. Columns are
    read by position: user_id, sensor_id, temp, created_at, outdoor_temp,
    violation.

    :param filepath: filepath to dataset csv
    :type filepath: str
    :return: one dict per data row with the keys ``user_id``, ``sensor_id``,
        ``temp`` (all kept as the raw strings from the file), ``created_at``
        (datetime), ``outdoor_temp`` (int, or NaN when the cell is empty) and
        ``violation`` (1 or 0)
    :rtype: list(dict)
    :raises ValueError: if ``created_at`` does not match
        ``%Y-%m-%d %H:%M:%S``, if ``outdoor_temp`` is non-empty but not an
        integer, or if ``violation`` is neither 'true' nor 'false'
    """
    # list of dicts
    dataset = []
    # open the file
    with open(filepath, 'r', newline='') as csv_file:
        # define a csv reader that will parse the csv format for us
        csv_reader = csv.reader(csv_file)
        # skip the header row; the default keeps a completely empty file
        # from raising StopIteration
        next(csv_reader, None)
        # iterate through the rest
        for row in csv_reader:
            # csv.reader yields an empty list for blank lines
            if not row:
                continue
            temp_dict = {}
            temp_dict['user_id'] = row[0]
            temp_dict['sensor_id'] = row[1]
            temp_dict['temp'] = row[2]
            temp_dict['created_at'] = datetime.strptime(row[3], "%Y-%m-%d %H:%M:%S")
            # sometimes outdoor_temp is empty; record it as NaN rather than 0
            # because 0 is a legitimate reading and would be averaged in as
            # if it had been measured (pandas mean() skips NaN)
            if row[4] == '':
                temp_dict['outdoor_temp'] = float('nan')
            else:
                temp_dict['outdoor_temp'] = int(row[4])
            if row[5] == 'true':
                temp_dict['violation'] = 1
            elif row[5] == 'false':
                temp_dict['violation'] = 0
            else:
                raise ValueError('unexpected violation boolean')
            dataset.append(temp_dict)

    return dataset

In [None]:
def vio_per_user(dataset):
    """
    Count how many violation rows each user has.
    Users without any violation row do not appear in the result.
    :param dataset: rows carrying at least 'user_id' and 'violation' keys
    :type dataset: list(dict)
    :return: mapping of user_id to number of rows with a truthy 'violation'
    :rtype: dict
    """
    tally = {}
    for record in dataset:
        # rows without a violation contribute nothing
        if not record['violation']:
            continue
        uid = record['user_id']
        # start from 0 the first time a user shows up
        tally[uid] = tally.get(uid, 0) + 1
    return tally

In [None]:
def summary_stats(dataset):
    """
    Print some very basic information about the provided dataset.
    Currently this is only the number of distinct user ids.
    :param dataset: returned by import_file()
    :type dataset: list(dict)
    """
    # a set keeps one copy of each user id, so its size is the unique-user count
    distinct_users = {entry['user_id'] for entry in dataset}
    user_total = len(distinct_users)
    print('Unique users: ', str(user_total))

    # more to come!

In [None]:
def violation_hist(dataset, bin_width=10, max_violations=700):
    """
    Plot the distribution of violations per user as a histogram.

    The title's date range is taken from the earliest and latest
    'created_at' values in the dataset, so it stays correct whichever file
    was loaded.

    :param dataset: returned by import_file()
    :type dataset: list(dict)
    :param bin_width: number of violations covered by each histogram bin
    :type bin_width: int
    :param max_violations: upper limit of the x axis; users with more
        violations than this are not shown
    :type max_violations: int
    """
    # get violations per user
    user_dict = vio_per_user(dataset)
    # we actually only need the user counts
    counts = list(user_dict.values())

    # explicit edges make every bin exactly bin_width wide, matching the
    # x-axis label; a bare bin count would be spread over the data's own
    # min..max instead
    bin_edges = list(range(0, max_violations + bin_width, bin_width))

    title = 'Histogram of Violations'
    timestamps = [row['created_at'] for row in dataset]
    if timestamps:
        title += ' from {} to {}'.format(
            min(timestamps).strftime('%m/%d/%y'),
            max(timestamps).strftime('%m/%d/%y'),
        )

    # plot histogram
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    ax.hist(counts, bins=bin_edges)
    ax.set_title(title)
    ax.set_xlim([0, max_violations])
    ax.set_ylabel('Number of Tenants')
    ax.set_xlabel('Number of Violations ({} per bin)'.format(bin_width))
    plt.show()

In [None]:
def timeseries_line(dataset):
    """
    Plot the timeseries of daily violations, one line per user.
    This is intended to show the overall distribution, not individual users,
    so every user's line is drawn on the same axes.
    :param dataset: returned by import_file()
    :type dataset: list(dict)
    """
    # split up the dataset so it's per user
    rows_by_user = {}
    for row in dataset:
        rows_by_user.setdefault(row['user_id'], []).append(row)

    # bin by date and sum violations
    # https://stackoverflow.com/questions/39002122/binning-time-series-with-pandas
    binned_by_user = {}
    for user, userdata in rows_by_user.items():
        user_df = pd.DataFrame(userdata).set_index('created_at')
        daily = user_df.groupby(pd.Grouper(level='created_at', freq='D'))
        binned_by_user[user] = daily['violation'].sum()

    # plot everything on one set of axes; a new figure per user would leave
    # this figure empty and scatter the overall picture over many windows
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    for daily_violations in binned_by_user.values():
        ax.plot(daily_violations.index.values, daily_violations.values)
    ax.set_xlabel('Date')
    ax.set_ylabel('Violations per Day')
    plt.show()

In [None]:
def timeseries_with_outdoor_temp(dataset):
    """
    Plot daily violations together with the mean outdoor temp on that day.
    One figure is produced per user: violations on the left y axis and the
    daily mean of 'outdoor_temp' on a second (right) y axis.
    :param dataset: returned by import_file()
    :type dataset: list(dict)
    """
    # split up the dataset so it's per user
    rows_by_user = {}
    for row in dataset:
        rows_by_user.setdefault(row['user_id'], []).append(row)

    # bin by date: sum the violations and average the outdoor temp in one pass
    # https://stackoverflow.com/questions/39002122/binning-time-series-with-pandas
    binned_by_user = {}
    for user, userdata in rows_by_user.items():
        user_df = pd.DataFrame(userdata).set_index('created_at')
        daily = user_df.groupby(pd.Grouper(level='created_at', freq='D'))
        # NOTE: mean() skips NaN, but placeholder values such as 0 for a
        # missing reading would be averaged in like real measurements
        binned_by_user[user] = daily.agg({'violation': 'sum', 'outdoor_temp': 'mean'})

    # plot everything
    for user, user_df in binned_by_user.items():
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
        vio_lines = ax.plot(user_df.index.values, user_df['violation'], 'b-', label='violation')
        ax2 = ax.twinx()
        temp_lines = ax2.plot(user_df.index.values, user_df['outdoor_temp'], '-', color='orange', label='out_temp')
        # one combined legend; a separate legend per axis would be drawn on
        # top of the other
        lines = vio_lines + temp_lines
        ax.legend(lines, [line.get_label() for line in lines])
        ax.set_title('User {}'.format(user))

In [None]:
# Load the first CSV (file name suggests 10/01/17 through 01/31/18 -- confirm),
# then print the unique-user count, show the violations-per-user histogram
# and plot the daily violation timeseries.
dataset = import_file('./data/clean_100117_013118.csv')
summary_stats(dataset)
violation_hist(dataset)
timeseries_line(dataset)

In [None]:
# Check out results for expanded data set
# (file name suggests 10/01/17 through 05/31/18 -- confirm).
# Rebinds `dataset`, then repeats the same three reports as for the first file.
dataset = import_file('./data/clean_100117_053118.csv')
summary_stats(dataset)
violation_hist(dataset)
timeseries_line(dataset)

In [None]:
# Include outdoor temp
# Reloads the expanded CSV so this cell can run on its own, prints the
# unique-user count, then plots daily violations against mean outdoor temp.
dataset = import_file('./data/clean_100117_053118.csv')
summary_stats(dataset)
timeseries_with_outdoor_temp(dataset)

It seems that sudden downward spikes in outdoor temperature (not just low temperatures in general) are correlated with an increase in violations.