# Exploratory Data Analysis Overview

## Table of Contents

* I Data Overview
    * I.1 What does our tenant data look like?
    * I.2 What is the quality of the data?
    * I.3 Conclusion & Impact
* II Quantifying Violations


In [None]:
import sys, os
import csv
from datetime import datetime
import pytz
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd

## I.1 What does our tenant data look like?

In [None]:
def user_import(filepath):
    """
    Load a tenant CSV and split it into one DataFrame per user.

    The 'created_at' column is parsed as UTC and then converted to
    US/Eastern before the data is split.

    :param filepath: path to the CSV file (its first row is the header)
    :type filepath: str
    :return: the complete DataFrame, and a list holding one re-indexed
             DataFrame for every distinct 'user_id'
    :rtype: tuple(DataFrame, list(DataFrame))
    """
    eastern = pytz.timezone('US/Eastern')

    with open(filepath, 'r') as inputfile:
        file_df = pd.read_csv(inputfile, header=0)

    # timestamps are stored in UTC; localise them for the analysis
    parsed_utc = pd.to_datetime(file_df['created_at'], format="%Y-%m-%d %H:%M:%S", utc=True)
    file_df['created_at'] = parsed_utc.dt.tz_convert(eastern)

    # one sub-frame per distinct user, each with a fresh 0-based index
    unique_ids = set(file_df['user_id'].tolist())
    user_list = [
        file_df.loc[file_df['user_id'] == uid].reset_index(drop=True)
        for uid in unique_ids
    ]

    return file_df, user_list

def summary_stats(file_dataset, user_dataset):
    """
    Print the headline numbers of a dataset: how many tenants, how many
    rows, and the first/last timestamp.

    :param file_dataset: first dataset returned by user_import()
    :type file_dataset: DataFrame
    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"

    print(f'Number of unique tenants:  {len(user_dataset)}')
    print(f'Number of datapoints:  {len(file_dataset)}')

    timestamps = file_dataset['created_at']
    earliest = min(timestamps)
    latest = max(timestamps)
    print(f'Range of datapoints:  {earliest.strftime(stamp_format)}  to  {latest.strftime(stamp_format)}')

In [None]:
def coverage_plot(user_dataset):
    """
    Plot the time coverage of individual user data as a horizontal bar
    chart, one bar per user, longest coverage at the top.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    :raises ValueError: if no user frame contains any rows
    """
    # Collect one summary row per user. The rows are gathered in a list and
    # turned into a DataFrame once, because DataFrame.append() no longer
    # exists in pandas >= 2.0.
    rows = []
    for user_df in user_dataset:
        if len(user_df) == 0:
            # nothing to summarise for a user without measurements
            continue
        stamps = user_df['created_at']
        user_from = stamps.min()
        user_to = stamps.max()
        rows.append({'from': user_from, 'to': user_to, 'len': user_to - user_from,
                     'user_id': user_df['user_id'].iloc[0]})

    if not rows:
        raise ValueError('coverage_plot() needs at least one user with data')

    # Sort users by range, so the user with the longest coverage is first
    coverage_df = pd.DataFrame(rows, columns=['from', 'to', 'len', 'user_id'])
    coverage_df = coverage_df.sort_values('len', ascending=False).reset_index(drop=True)

    # y-axis labels and bar geometry in matplotlib date-number format
    y_labels = [str(uid) for uid in coverage_df['user_id']]
    lefts = [mdates.date2num(ts) for ts in coverage_df['from']]
    rights = [mdates.date2num(ts) for ts in coverage_df['to']]
    deltas = [right - left for left, right in zip(lefts, rights)]
    print(y_labels)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17, 22))
    ax.xaxis_date()

    # Plot using "horizontal bar graph" format
    positions = range(len(coverage_df))
    ax.barh(positions, deltas, left=lefts, height=0.7, align='center', zorder=3)
    # Pad by 10 days around the earliest start / latest end over ALL users,
    # so no bar is clipped when the longest-running user is not also the
    # earliest or the latest one.
    ax.set_xlim((min(lefts) - 10, max(rights) + 10))
    ax.set_yticks(positions)
    ax.set_yticklabels(y_labels)
    ax.set_ylabel('user_id', fontweight='bold', fontsize='large')
    ax.set_xlabel('time', fontweight='bold', fontsize='large')
    ax.set_title('2017-2018 Heat Season Tenant Data Time-Coverage Summary', fontweight='bold', fontsize='large')
    ax.invert_yaxis()
    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible`, and the positional form works with both.
    ax.grid(True, which='major', axis='both', linestyle='--', zorder=1)
    return

In [None]:
# Load the heat-season CSV (full frame + per-user frames) and print its
# headline statistics.
file_dataset, user_dataset = user_import('data/clean_100117_053118.csv')
summary_stats(file_dataset, user_dataset)

In [None]:
# Draw the per-user time-coverage bar chart.
coverage_plot(user_dataset)

## I.2 What is the quality of the data?

In [None]:
# determine whether data is actually coming in hourly; calculate all data intervals
def interval_calc(user_dataset):
    """
    Iterate through all users and calculate the intervals between
    neighbouring data points.

    Each interval is ``row[i] - row[i + 1]`` expressed in hours, so it is
    positive when a user's rows are ordered newest-first. For hourly data
    every value is expected to be 1.0.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    :return: one list of intervals (in hours) per user, in input order;
             a user with fewer than two rows yields an empty list
    :rtype: list(list(float))
    """
    intervals = []
    for user in user_dataset:
        # Pull the timestamps out once instead of materialising a full row
        # with .iloc for every single measurement.
        stamps = list(user['created_at'])
        intervals.append([
            (current - following).total_seconds() / 3600.0
            for current, following in zip(stamps, stamps[1:])
        ])
    return intervals

# def plot_intervals(intervals):
#     fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5))
#     ax.hist(all_intervals, bins=np.logspace(np.log10(0.001),np.log10(1000), 100))
#     #ax.set_yscale('log')
#     ax.set_ylim(0,50)
#     ax.set_xscale('log')

In [None]:
# One list of intervals (in hours) per user ...
intervals = interval_calc(user_dataset)
# ... flattened into a single array covering every user.
all_intervals = np.array([intv for sub in intervals for intv in sub])

In [None]:
# Share of intervals falling inside each tolerance band around one hour.
# Each entry is (label, lower bound, upper bound), with bounds in hours.
for label, lower, upper in (
        ('Intervals exactly 1 hour apart:', 1.00, 1.00),
        ('Intervals 1 hour +- 1 second apart:', 0.999722222, 1.000277778),
        ('Intervals 1 hour +- 2 seconds apart:', 0.999444444, 1.000555556)):
    print(label)
    within = (all_intervals >= lower) & (all_intervals <= upper)
    print(np.count_nonzero(within) / len(all_intervals))

In [None]:
def outdoor_uptime(file_dataset):
    """
    Return the fraction of all measurements that carry an outdoor
    temperature (i.e. whose 'outdoor_temp' is not NaN).

    :param file_dataset: first dataset returned by user_import()
    :type file_dataset: DataFrame
    :return: rows with an outdoor temperature divided by all rows
    :rtype: float
    """
    has_outdoor_reading = file_dataset['outdoor_temp'].notna()
    return int(has_outdoor_reading.sum()) / len(file_dataset)

# Fraction of all rows that have an outdoor temperature reading.
od_up = outdoor_uptime(file_dataset)
print('The outdoor_temp uptime is ', od_up )

In [None]:
def overall_uptime(user_dataset):
    """
    Calculate the uptime of measurements taken by each user's sensor.
    We expect one measurement per hour, or 24 measurements every day.

    Uptime is the number of rows a user has divided by the number of
    hourly rows expected between that user's first and last timestamp.
    It can exceed 1.0 when a sensor reports more often than hourly.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    :return: one uptime ratio per user, in input order; 0.0 for a user
             without any rows
    :rtype: list(float)
    """
    uptimes = []
    for user in user_dataset:
        if len(user) == 0:
            # no measurements at all: nothing was "up"
            uptimes.append(0.0)
            continue

        # Time span the sensor was installed for
        stamps = user['created_at']
        date_range = stamps.max() - stamps.min()

        # Expected data points; note the fenceposted +1, which also means
        # this is always >= 1 and the division below is safe.
        exp_len = int(date_range.total_seconds() / 3600.0) + 1

        # number of datapoints / expected number of datapoints
        uptimes.append(len(user) / exp_len)
    return uptimes

def plot_uptime(uptime):
    """
    Draw a 20-bin histogram of the per-user sensor uptime.

    :param uptime: output from overall_uptime()
    :type uptime: list
    """
    figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
    axes.hist(uptime, bins=20)
    axes.set(
        title='2017-2018 Heat Season Tenant Data Uptime',
        xlabel='Uptime Percentage (assuming data every 1hr)',
        ylabel='Number of Tenants',
    )
    
# Per-user uptime ratios: print the raw values, then show their histogram.
total_up = overall_uptime(user_dataset)
print(total_up)
plot_uptime(total_up)

Notes:
* Uptime is pretty good overall; one tenant's data was collected twice as often as expected (every 30 minutes)
* This variation is the motivation for using cumulative rather than consecutive metrics

In [None]:
def split_daynight(file_dataset):
    """
    Split the overall dataset into daytime rows (hours 6-21) and
    nighttime rows (hours 22-5), based on the hour of 'created_at'.

    :param file_dataset: first dataset returned by user_import()
    :type file_dataset: DataFrame
    :return: the daytime rows and the nighttime rows
    :rtype: tuple(DataFrame, DataFrame)
    """
    # day and night time definitions
    day_hours = list(range(6, 22))
    night_hours = [22, 23, 0, 1, 2, 3, 4, 5]

    # hour-of-day of every row, computed once and reused for both masks
    hour_of_day = file_dataset['created_at'].dt.hour

    day_dataset = file_dataset.loc[hour_of_day.isin(day_hours)]
    night_dataset = file_dataset.loc[hour_of_day.isin(night_hours)]
    return day_dataset, night_dataset

def _indoor_outdoor_scatter(file_dataset, room_cutoff, title, outdoor_cutoff=None):
    """
    Shared implementation of day_scatter() and night_scatter().

    Draws two side-by-side indoor-vs-outdoor scatterplots: the left one
    coloured by violation status, the right one coloured by timestamp.

    :param file_dataset: rows to plot; needs the columns 'violation',
                         'temp', 'outdoor_temp' and 'created_at'
    :type file_dataset: DataFrame
    :param room_cutoff: indoor temperature drawn as a horizontal line
    :type room_cutoff: int
    :param title: figure title
    :type title: str
    :param outdoor_cutoff: outdoor temperature drawn as a vertical line,
                           or None for no such line
    :type outdoor_cutoff: int
    :raises ValueError: if there is no violation or non-violation row at all
    """
    # Split the dataset into violation and non-violation.
    # (element-wise pandas comparison, so `== True` is intentional)
    vio_true = file_dataset.loc[file_dataset['violation'] == True]
    vio_false = file_dataset.loc[file_dataset['violation'] == False]

    # For both datasets, get the Series for indoor/outdoor temperatures,
    # as well as the timestamp for heatmapping
    vt_yval = vio_true['temp']
    vt_xval = vio_true['outdoor_temp']
    vt_c = mdates.date2num(vio_true['created_at'])

    vf_yval = vio_false['temp']
    vf_xval = vio_false['outdoor_temp']
    vf_c = mdates.date2num(vio_false['created_at'])

    # Min and max timestamps for accurate heatmapping. They are taken over
    # both subsets together, so one subset being empty (e.g. a dataset
    # without any violation) no longer breaks the plot.
    all_c = np.concatenate([np.atleast_1d(vf_c), np.atleast_1d(vt_c)])
    if all_c.size == 0:
        raise ValueError('no violation or non-violation rows to plot')
    cb_min = all_c.min()
    cb_max = all_c.max()

    # Datapoints for the reference lines
    base = range(0, 100)
    room_line = [room_cutoff] * len(base)

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
    fig.suptitle(title, fontsize='x-large')

    # left: coloured by violation status
    ax1.scatter(vf_xval, vf_yval, c='g', marker='.', label='nominal')
    ax1.scatter(vt_xval, vt_yval, c='r', marker='.', label='violation')

    # right: coloured by timestamp
    sc = ax2.scatter(vf_xval, vf_yval, c=vf_c, marker='.', cmap='winter', vmin=cb_min, vmax=cb_max)
    ax2.scatter(vt_xval, vt_yval, c=vt_c, marker='.', cmap='winter', vmin=cb_min, vmax=cb_max)

    for ax in (ax1, ax2):
        # indoor == outdoor diagonal, then the temperature cutoffs
        ax.plot(base, base, 'b-')
        ax.plot(base, room_line, 'k-')
        if outdoor_cutoff is not None:
            ax.plot([outdoor_cutoff] * len(base), base, 'k-')
        ax.set_xlabel('outdoor temp. (F)')
        ax.set_ylabel('indoor temp. (F)')
    ax1.legend()

    # colorbar labelled with the first and last date only
    eastern = pytz.timezone('US/Eastern')
    cbar = fig.colorbar(sc, ax=[ax1, ax2], ticks=[cb_min, cb_max])
    cbar.ax.set_yticklabels([mdates.num2date(cb_min, eastern).strftime("%Y-%m-%d"),
                             mdates.num2date(cb_max, eastern).strftime("%Y-%m-%d")])


def day_scatter(file_dataset):
    """
    Plot a scatterplot of the daytime dataset, with reference lines at
    68 F indoor and 55 F outdoor.

    :param file_dataset: first dataset returned by split_daynight()
    :type file_dataset: DataFrame
    :raises ValueError: if there is no violation or non-violation row at all
    """
    _indoor_outdoor_scatter(
        file_dataset,
        room_cutoff=68,
        outdoor_cutoff=55,
        title='2017-2018 Heat Season Tenant Daytime Indoor vs Outdoor Temp',
    )


def night_scatter(file_dataset):
    """
    Plot a scatterplot of the nighttime dataset, with a reference line at
    62 F indoor.

    :param file_dataset: second dataset returned by split_daynight()
    :type file_dataset: DataFrame
    :raises ValueError: if there is no violation or non-violation row at all
    """
    _indoor_outdoor_scatter(
        file_dataset,
        room_cutoff=62,
        title='2017-2018 Heat Season Tenant Nighttime Indoor vs Outdoor Temp',
    )
    

In [None]:
# Split the full dataset by hour of day and plot each half separately.
day, night = split_daynight(file_dataset)
day_scatter(day)
night_scatter(night)

Notes:
* Both during day and night, we see severe violations where indoor=outdoor temp. We want to capture this severity if possible.
* At night, in some cases, it is significantly colder indoors than outdoors. Why?
* In other cases, we see more persistent yet less severe violations. Capture these too.

## What do Violations Look Like?
Should we be looking at:
* Consecutive violation hours in the past 24 hrs
* Consecutive violation hours in the past 72 hrs
* Consecutive violation days in the past week
* Percentage violation hours in the past 24 hrs
* Percentage violation hours in the past 72 hrs
* Percentage violation days in the past week
* Severity of each violation
* etc...

In [None]:
# exclude some outlier users with small uptime for this analysis
exclusions=[428, 327, 386]

def exclude_users(user_dataset, exclusions):
    """
    Return a new list without the frames of the excluded users. The input
    list is left untouched.

    A frame is identified by the 'user_id' of its first row (looked up by
    position, so the frame does not need a 0-based index). Empty frames
    cannot match an exclusion and are kept.

    :param user_dataset: second dataset from user_import()
    :param exclusions: list of users to remove
    :type user_dataset: list(DataFrame)
    :type exclusions: list
    :return: the frames whose user is not in exclusions, in input order
    :rtype: list(DataFrame)
    """
    clean_dataset = []
    for user_df in user_dataset:
        if len(user_df) > 0 and user_df['user_id'].iloc[0] in exclusions:
            continue
        clean_dataset.append(user_df)
    return clean_dataset

# Sanity check: exclude_users() returns a new list, so the length of
# user_dataset is printed before and after to show it is unchanged.
print(len(user_dataset))
user_dataset_c = exclude_users(user_dataset, exclusions)
print(len(user_dataset))
print(len(user_dataset_c))

### Consecutive Violations?

In [None]:
def get_records(input_csv):
    """
    Read one per-user CSV into a dictionary keyed by timestamp.

    The header row is skipped. For every other row, column 3 is parsed as
    a UTC timestamp and converted to US/Eastern (EST or EDT as appropriate;
    hardcoding the timezone is acceptable since our domain is very
    constrained), column 0 is taken as the user id and column 5,
    lower-cased, as the violation flag.

    :param input_csv: path to input csv
    :type input_csv: str
    :return: dictionary mapping each timestamp to (user id, violation)
    :return type: dict(tuple)
    Contributed by Daniel J., Justin L.
    """
    eastern = pytz.timezone('US/Eastern')
    records = {}
    with open(input_csv, newline='') as csv_file:
        data_reader = csv.reader(csv_file)
        next(data_reader, None)  # the first line only holds the field names
        for row in data_reader:
            created_utc = datetime.strptime(row[3], "%Y-%m-%d %H:%M:%S").replace(tzinfo=pytz.utc)
            records[created_utc.astimezone(eastern)] = (row[0], row[5].lower())
    return records

def read_all_user(inFilePathPerUser, skip):
    """
    Parses each ``<user id>.csv`` file found in the directory to get all
    per-user records.

    Directory entries that are not CSV files, or whose name is not an
    integer user id (hidden files, checkpoints, ...), are ignored. Files
    are visited in sorted name order so the result is reproducible.

    :param inFilePathPerUser: path to per-user data directory
    :type inFilePathPerUser: str
    :param skip: user ids whose files must not be read
    :type skip: list
    :return: list of dictionaries for per-user records
    :rtype: list
    Contributed by Jake L., Justin L.
    """
    recordsAllUsers = []
    for file_name in sorted(os.listdir(inFilePathPerUser)):
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != '.csv':
            continue
        try:
            user_id = int(stem)
        except ValueError:
            # not a per-user file
            continue
        if user_id in skip:
            continue
        # add each user's dictionary of records to the list
        recordsAllUsers.append(get_records(os.path.join(inFilePathPerUser, file_name)))
    return recordsAllUsers

def consecutive_violation_hours(user_record):
    """
    Counts and records the length in hours of all consecutive violations.

    Records are walked in dictionary order and each one is compared with
    the record that follows it. Two neighbouring records belong to the same
    run only when the first is at least one hour and less than two hours
    later than the second, so the records are expected newest-first.

    :param user_record: dictionary where keys are the timestamps of the inputted file, values are user id and violation occurance
    :type user_record: dict
    :return: dictionary where key is user id, value is a list of the lengths in hours of all consecutive violations;
             an empty dictionary when user_record is empty
    :rtype: dict
    Contributed by Justin L.
    """
    if not user_record:
        return {}

    # Materialise keys and values once instead of on every loop iteration.
    timestamps = list(user_record.keys())
    entries = list(user_record.values())
    user_id = entries[0][0]

    violations = []
    # While a run is open, `count` is the number of violation records seen
    # in it plus one: the record after the latest gap is counted ahead of
    # time, before it is known whether it is a violation. 0 means no run.
    count = 0
    for i in range(len(entries) - 1):  # every record except the last one
        if entries[i][1] != 'true':
            # The violation stopped: the record counted ahead of time was
            # not a violation, so take it back off.
            if count != 0:
                violations.append(count - 1)
            count = 0
            continue

        if count == 0:  # this violation opens a new run
            count = 1
        gap = timestamps[i] - timestamps[i + 1]
        if gap.days == 0 and 3600 <= gap.seconds < 7200:
            # between one and two hours apart: the run may continue
            count += 1
        else:
            # Any other gap: we know too little about what happened in
            # between, so close the run here.
            # NOTE(review): this also closes a run on a gap just under one
            # hour (e.g. 59 min 59 s) - confirm that this is intended.
            violations.append(count)
            count = 0

    # The last record is never visited by the loop; settle it here.
    last_is_violation = entries[-1][1] == 'true'
    if count != 0:
        # `count` already includes the last record, so it is exact when
        # that record is a violation and one too high when it is not.
        violations.append(count if last_is_violation else count - 1)
    elif last_is_violation:
        # isolated violation in the very last record
        violations.append(1)

    return {user_id: violations}
    # NOTE:
    # There are some cases in the users' records where there are pieces of 'missing data.'
    # These might be a result of the heat seek being turned off for a period of time.
    # Commonly, a recording is taken every hour, but in these cases, there may be records taken at intervals longer than 1 hr.
    # Hence, it is important to consider the time interval between consecutive records.
    # At the extreme, one record may have been taken 10 days after the prior record. And of course, even if both records show
    # a violation, these would not be considered consecutive hours of violation.
    # In some cases, the time interval between two consecutive records was more than an hour but less than two hours.
    # These were still considered to be consecutive hours of violation, but any time interval longer than two hours was
    # not considered consecutive hours of violations, because we know too little about what occurred between the interval.

    # Because of this aforementioned issue of 'missing data,' this might be a sign that we shouldn't be considering
    # or recording consecutive violation hours to make conclusions. Rather, we may resort to day-wise binning.

def plot_hist_consecutive_violation(consecutive_violations):
    """
    Show an 89-bin histogram of the lengths of consecutive violations.

    :param consecutive_violations: list of the lengths in hours of all consecutive violations
    :type consecutive_violations: list
    Contributed by Justin L.
    Modified by Jake L.
    """
    figure, axes = plt.subplots()
    axes.hist(consecutive_violations, bins=89)
    axes.set(
        xlabel='Consecutive Violation Hours',
        ylabel='Frequency',
        title='2017-2018 Heat Season Histogram of Consecutive Violation Hours',
    )
    # Earlier variants, kept for reference:
    #   axes.set_title('Histogram of Consecutive violation hours for user ID ' + list(consecutive_violations.keys())[0])
    #   axes.xaxis.set_major_locator(MaxNLocator(integer=True))
    #   if consecutive_hours:
    #       axes.set_xbound(0, max(consecutive_hours) + 1)
    plt.show()

In [None]:
all_user_records = read_all_user('./data/per_user', exclusions)

# one {user_id: [run lengths]} dictionary per user ...
consec_vios = [consecutive_violation_hours(record) for record in all_user_records]

# ... folded into the first of them, giving a single dictionary ...
consec_dict = consec_vios[0]
for other_user in consec_vios[1:]:
    consec_dict.update(other_user)

# ... whose lists are chained into one combined list of run lengths
total_consec = []
for run_lengths in consec_dict.values():
    total_consec.extend(run_lengths)

In [None]:
# Print the longest run of consecutive violations, then plot the
# distribution of all run lengths.
print(max(total_consec))
plot_hist_consecutive_violation(total_consec)

Notes:
* Note that consecutive hours can be across days.
* This also doesn't capture when there are no violations.

### Percentage Violations (over 24 hrs)?

In [None]:
def vio_per_day(user_dataset):
    """
    Count the number of violations detected per day for everyone.

    Each user's rows are binned into calendar days of 'created_at' and the
    'violation' flags are summed per bin. Days between a user's first and
    last measurement that have no rows show up as 0.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    :return: violation counts, one value per user and day, user after user
    :rtype: list(float)
    """
    count_list = []
    # Iterate the argument, not a notebook-level global, so the result
    # depends only on what the caller passed in.
    for user in user_dataset:
        if len(user) == 0:
            # no rows, hence no days to bin
            continue
        daily_violations = (
            user.set_index('created_at')
                .groupby(pd.Grouper(level='created_at', freq='D'))['violation']
                .sum()
        )
        count_list.extend(float(total) for total in daily_violations)
    return count_list

def plot_hist_count(count_vio):
    """
    Show a 24-bin histogram of the per-day violation counts, with the
    x-axis running from 0 to 24.

    :param count_vio: list of binned violations returned by vio_per_day()
    :type count_vio: list
    """
    figure, axes = plt.subplots()
    axes.set_xlim(0, 24)
    axes.hist(count_vio, bins=24)
    axes.set(
        xlabel='Violation Measurements per Day',
        ylabel='Frequency',
        title='2017-2018 Heat Season Histogram of Violation Hours per Day',
    )
    plt.show()
def plot_hist_count_vio(count_vio):
    """
    Show a 24-bin histogram of the per-day violation counts with the
    x-axis starting at 1, which hides the bar for days without violations.

    :param count_vio: list of binned violations returned by vio_per_day()
    :type count_vio: list
    """
    figure, axes = plt.subplots()
    axes.set_xlim(1, 24)
    axes.hist(count_vio, bins=24)
    axes.set(
        xlabel='Violation Measurements per Day',
        ylabel='Frequency',
        title='2017-2018 Heat Season Histogram of Violation Hours per Day, >1 Violations',
    )
    plt.show()

In [None]:
# Violations per user and day, for the users left after the exclusions.
count_vio = vio_per_day(user_dataset_c)
# Keep only the days with at least one violation ...
only_vio = [a for a in count_vio if a != 0]
# ... plus a single 0.0 - presumably so the histogram range still starts
# at 0 and its bins line up with the first plot (TODO confirm).
only_vio.append(0.0)
plot_hist_count(count_vio)
plot_hist_count_vio(only_vio)

# Share of violation days with 10 or more violations; the -1 takes the
# 0.0 appended above back out of the denominator.
geq_ten = [a for a in only_vio if a >= 10]
print('geq_ten: ', len(geq_ten) / (len(only_vio)-1))

Notes:
* The spike at 16 is if there is a violation during the entirety of daytime.
    * I would consider that 'duration severe'

### Correlation between Severity and Frequency? (Show Justin W.'s plot)

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Users sorted by duration, determined a priori. Listed top row first.
_SORTED_USERS = ['389', '387', '381', '380', '399', '408', '350', '297', '390', '391', '385', '395', '396', '397', '398', '401', '394', '404', '405', '403', '410', '339', '411', '413', '415', '406', '417', '422', '420', '423', '425', '412', '426', '430', '432', '431', '407', '382', '427', '437', '379', '308', '167', '414', '383', '421', '424', '163', '436', '386', '327', '428']


def _daily_violation_heatmap(user_dataset, cmap):
    """
    Shared implementation of longterm_plot() and custom_plot(): one row of
    dots per user, one dot per day, coloured by that day's violation count.

    Only the users named in _SORTED_USERS get a row. A listed user that is
    missing from user_dataset keeps its (empty) row, so the y-axis labels
    stay aligned; users in user_dataset that are not listed are not drawn.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    :param cmap: colormap (name or object) for the 0-24 violation count
    """
    # New dataset with binned data for each day
    # As in, number of violations per day per user
    grouped_users = {}
    for user_df in user_dataset:
        if len(user_df) == 0:
            continue
        daily = (
            user_df.set_index('created_at')
                   .groupby(pd.Grouper(level='created_at', freq='D'))['violation']
                   .sum()
        )
        grouped_users[user_df['user_id'].iloc[0]] = daily

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17, 22))
    ax.xaxis_date()

    # rows are numbered bottom-up, so walk the list in reverse
    row_labels = _SORTED_USERS[::-1]
    sc = None
    for row, user in enumerate(row_labels, start=1):
        daily = grouped_users.get(int(user))
        if daily is None:
            # e.g. a user removed by exclude_users()
            continue
        sc = ax.scatter(daily.index.values, [row] * len(daily), c=daily.values,
                        cmap=cmap, vmin=0, vmax=24)

    ax.set_yticks(range(1, len(row_labels) + 1))
    ax.set_yticklabels(row_labels)
    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible`, and the positional form works with both.
    ax.grid(True, which='major', axis='both', linestyle='--', zorder=1)

    # without any plotted user there is nothing a colorbar could describe
    if sc is not None:
        plt.colorbar(sc, ax=ax, ticks=range(25), orientation='vertical')


def longterm_plot(user_dataset):
    """
    Plot a time coverage plot heatmapped for number of violations per day,
    using the continuous 'plasma' colormap.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    """
    _daily_violation_heatmap(user_dataset, 'plasma')
    return


def custom_plot(user_dataset):
    """
    Plot a time coverage plot heatmapped for number of violations per day,
    using a three-colour (green / yellow / red) severity colormap.

    :param user_dataset: second dataset returned by user_import()
    :type user_dataset: list(DataFrame)
    """
    # Define a color mapping for number of violations
    # This says:
    # Green from 0~2 violations
    # Yellow from 3~9 violations
    # Red from 10~24 violations
    colors = [(0, 1, 0)] * 3 + [(1, 1, 0)] * 7 + [(1, 0, 0)] * 14
    n_bins = 24
    cm = LinearSegmentedColormap.from_list('custom_cm', colors, N=n_bins)

    _daily_violation_heatmap(user_dataset, cm)
    return

In [None]:
# Violations per day and user, drawn with the 'plasma' colormap.
longterm_plot(user_dataset)


In [None]:
# Same plot, drawn with the custom green / yellow / red colormap.
custom_plot(user_dataset)