# FMD Scorecard KPIs

## Setup

### Import packages

In [1]:
# workhorse modules
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
import re
from pathlib import Path
import datadotworld as dw
import pyodbc

# local utility functions
from utils import *
from private.config import config

### Set pandas options
This makes Pandas print all rows and columns to the output when requested.

In [2]:
# apply the project's pandas display settings
# (`set_pd_params` is not defined in this notebook — presumably it comes from `from utils import *`)
set_pd_params()

# silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

### Import the data from Archibus database
Data is a copy of Archibus's `wrhwr` table, with some irrelevant columns left out. 

In [9]:
# Get private credentials from the local config mapping
server = config["SERVER"]
user = config["USER"]
password = config["PASSWORD"]
db = config["DB"]

# Read our basic SQL query from disk; read_text() opens and closes the file for us
query_path = Path.cwd() / "sql" / "input_for_FMD_KPIs.sql"
sqlFile = query_path.read_text()

# Connect to Archibus database
conn = pyodbc.connect(
    f"DRIVER=ODBC Driver 17 for SQL Server;SERVER={server};DATABASE={db};UID={user};PWD={password}"
)

# Query the database; the connection is closed even if the query raises
try:
    kpis_raw = pd.read_sql(
        sqlFile, conn, parse_dates=["date_requested", "date_completed", "date_closed"]
    )
finally:
    conn.close()

print(f"The KPIs raw dataframe has {kpis_raw.shape[0]:,} rows.")
kpis_raw.sample(3, random_state=444)

The KPIs raw dataframe has 104,351 rows.


Unnamed: 0,wr_id,date_requested,time_requested,date_completed,time_completed,date_closed,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,pmp_id.1,status
37227,60917,2016-02-15,1899-12-30 15:42:54,2016-02-17,1899-12-30 05:56:43,2016-02-17 06:22:34.370,,B00120,266.8,266.8,0.0,SNOW_REMOVAL,ANN.BRAUN,ANTHONY.PATTERSON,,,,Waxter Senior Center,,Clo
71394,101434,2018-02-20,1899-12-30 10:28:06,2018-02-20,1899-12-30 10:28:55,2018-03-14 08:47:08.477,,B00061,66.7,66.7,0.0,HVAC,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,Clarence M. Mitchell Courthouse,,Clo
59266,88411,2017-05-24,1899-12-30 08:41:41,2017-05-30,1899-12-30 14:53:40,2017-05-31 06:36:28.290,,B00038,325.84,266.8,59.04,DOOR,MAXINE.BROWN,ANTHONY.PATTERSON,,,,Pimlico Academy Public Safety Training Fire Fa...,,Clo


## Data cleaning

### Basic cleaning
- strips leading and trailing whitespace from strings to facilitate matching,
- drops rows with no problem type, 
- renames a few columns

In [10]:
# spot-check a single work request in the raw extract
cond = kpis_raw["wr_id"].eq(39638)
kpis_raw.loc[cond]

Unnamed: 0,wr_id,date_requested,time_requested,date_completed,time_completed,date_closed,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,pmp_id.1,status
26828,39638,2015-07-01,1899-12-30 10:49:50,2015-07-01,1899-12-30 10:50:44,2015-07-01,,B00026,66.7,66.7,0.0,LOCK,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,Baltimore City Police North Western District,,Clo


In [11]:
def tidy_up_wr(df):
    """Return a cleaned copy of the raw work-request extract.

    Steps, in order:

    1. keep only the first of any identically named columns,
    2. drop rows that are missing ``wr_id`` or ``problem_type``,
    3. strip leading/trailing whitespace from every string value,
    4. cast ``wr_id`` to integer,
    5. drop rows whose ``problem_type`` contains the substring "TEST",
    6. recode the status value "A" as "AA".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``wr_id``, ``problem_type`` and ``status``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    df = df.loc[:, ~df.columns.duplicated()]
    # explicit copy so the column assignments below act on an independent frame
    df = df.dropna(subset=["wr_id", "problem_type"]).copy()

    # Strip strings column by column instead of DataFrame.applymap: applymap is
    # deprecated in recent pandas and would also run the lambda over every
    # numeric and datetime cell, where it can never change anything.
    for col in df.columns:
        is_text_like = pd.api.types.is_object_dtype(
            df[col]
        ) or pd.api.types.is_string_dtype(df[col])
        if is_text_like:
            df[col] = df[col].map(lambda x: x.strip() if isinstance(x, str) else x)

    df["wr_id"] = df["wr_id"].astype(int)

    # literal substring match; na=False keeps (rather than crashes on) any
    # problem_type value that is not a string
    is_test = df["problem_type"].str.contains("TEST", regex=False, na=False)
    df = df.loc[~is_test].copy()

    df["status"] = df["status"].replace("A", "AA", regex=False)
    return df

# run the cleaning step on the raw extract
wr_tidy = tidy_up_wr(kpis_raw)

print(f"The tidied work orders dataframe has {len(wr_tidy):,} rows.")

The tidied work orders dataframe has 103,962 rows.


### Remove duplicate work orders
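Removes rows where the technician says in the description that the WR is a duplicate, and the status is "Canceled", "Closed", or "Rejected". Note that this step is currently disabled: the `drop_dupes()` call below is commented out, so the data passes through unchanged.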

In [12]:
# NOTE(review): de-duplication is currently switched off — the `drop_dupes` call is
# commented out, so `wr_deduped` is simply an unmodified copy of `wr_tidy`.
# wr_deduped = drop_dupes(wr_tidy)
wr_deduped = wr_tidy.copy()
# print(f"The deduped work orders dataframe has {wr_deduped.shape[0]:,} rows.")
# print(
#     f"Removing duplicates has cut {wr_tidy.shape[0] - wr_deduped.shape[0]:,} rows from the work orders dataframe."
# )

### Combine date and time columns to get timestamps
This takes the date from a date column and the time from a time column and combines them into a single timestamp.

This transformation allows us to know the time to completion with greater precision. 

In [13]:
# glue the date and time for request
wr_dt = glue_date_time(wr_deduped, "date_requested", "time_requested", "requested_dt")

# glue the date and time for completion
wr_dt = glue_date_time(wr_dt, "date_completed", "time_completed", "completed_dt")

# make sure "date_closed" is a datetime column; pd.to_datetime is used because
# casting to the unit-less "datetime64" dtype is rejected by pandas 2.x
wr_dt["date_closed"] = pd.to_datetime(wr_dt["date_closed"])

In [14]:
# preview the combined timestamps next to the identifying columns
wr_dt.loc[
    :,
    ["wr_id", "problem_type", "requested_dt", "completed_dt", "date_closed", "status"],
].sample(n=6, random_state=451)

Unnamed: 0,wr_id,problem_type,requested_dt,completed_dt,date_closed,status
44643,70328,OTHER,2016-07-22 11:38:44,2016-07-22 11:40:51,2016-08-02 17:11:15.973,Clo
35453,58155,HVAC,2016-01-05 09:30:13,2016-02-05 13:29:50,2016-04-05 14:53:24.263,Clo
30961,51562,DOOR,2015-09-24 11:53:26,2016-01-07 20:11:10,2016-01-11 17:08:02.087,Clo
90893,124014,PREVENTIVE MAINT,2019-07-30 08:48:29,2019-09-05 09:37:58,2020-03-26 11:03:46.650,Clo
72578,102615,_DELIVERY,2018-03-15 09:16:21,2018-03-15 13:27:46,2018-04-17 12:35:52.420,Clo
88749,120841,DOOR,2019-04-22 15:48:29,2019-09-10 09:46:03,NaT,Com


## Data preparation

### Include days to completion

In [15]:
def compute_days_to_completion(df):
    """Add a ``days_to_completion`` column and index the frame by request time.

    ``days_to_completion`` is ``completed_dt - requested_dt`` expressed in
    (fractional) days and rounded to two decimals. Rows with a missing
    timestamp get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``requested_dt`` and ``completed_dt``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra column, indexed by ``requested_dt``
        (which is also kept as a regular column). The input is not modified.
    """
    df = df.copy()
    # column-wise subtraction: same result as a row-by-row apply, but it runs
    # in compiled code and also works when the frame has no rows
    elapsed = df["completed_dt"] - df["requested_dt"]
    df["days_to_completion"] = (elapsed / np.timedelta64(1, "D")).round(2)
    # request timestamps can repeat, so the index is not required to be unique
    df = df.set_index(keys="requested_dt", verify_integrity=False, drop=False)
    return df


# add the `days_to_completion` column and index the frame by request timestamp
wr_durations = compute_days_to_completion(wr_dt)

In [19]:
# preview the new duration column next to the timestamps it was derived from
wr_durations.loc[
    :,
    [
        "wr_id",
        "problem_type",
        "requested_dt",
        "completed_dt",
        "date_closed",
        "days_to_completion",
        "status",
    ],
].sample(n=6, random_state=446)

Unnamed: 0_level_0,wr_id,problem_type,requested_dt,completed_dt,date_closed,days_to_completion,status
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-01-08 10:38:33,98483,HVAC,2018-01-08 10:38:33,2018-01-31 10:37:26,2018-02-21 12:57:44.720,23.0,Clo
2018-03-01 10:59:03,101954,PREVENTIVE MAINT,2018-03-01 10:59:03,2018-03-05 14:34:51,NaT,4.15,Com
2014-03-24 11:52:01,7764,PLUMB/UNCLOG,2014-03-24 11:52:01,2014-03-24 11:53:05,2014-03-25 00:00:00.000,0.0,Clo
2020-09-18 10:49:44,137156,LANDSCAPING,2020-09-18 10:49:44,NaT,NaT,,AA
2015-08-05 08:11:16,42287,ELEC/LIGHT,2015-08-05 08:11:16,2015-09-01 06:10:21,2015-09-01 00:00:00.000,26.92,Clo
2015-01-08 13:52:09,19840,HVAC,2015-01-08 13:52:09,2018-04-10 10:02:35,2018-05-01 10:30:31.450,1187.84,Clo


### Decision point: fiscal year
Note that the function `entirely_within_fiscal_year()` keeps only those rows where the work order was requested and completed in the same fiscal year. __Rows that straddle two fiscal years, and rows with no completion date, are dropped__.

For comparison, I've included the function `add_fiscal_year()`, which derives the fiscal year from the completion date and drops no rows; work orders that have not been completed yet get no fiscal year.

In [32]:
FISCAL_YEAR_START_MONTH = 7  # fiscal year N starts in July of calendar year N - 1


def _fiscal_year_of(timestamps):
    """Return the fiscal year of each timestamp in a datetime Series.

    Months from July onward count toward the next calendar year's fiscal year.
    Missing timestamps (NaT) give NaN, which makes the result float-typed.
    """
    # adding the boolean bumps the year by one for July-December
    return timestamps.dt.year + (timestamps.dt.month >= FISCAL_YEAR_START_MONTH)


def entirely_within_fiscal_year(df):
    """Keep only work orders requested and completed in the same fiscal year.

    Rows whose request and completion fall in different fiscal years are
    dropped, as are rows with a missing ``requested_dt`` or ``completed_dt``
    (a missing fiscal year never compares equal).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``requested_dt`` and ``completed_dt``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows and an integer ``fiscal_year``
        column. The input is not modified.
    """
    requested_fy = _fiscal_year_of(df["requested_dt"])
    completed_fy = _fiscal_year_of(df["completed_dt"])
    # positional mask: the frame's index (request timestamps) may hold duplicates
    same_fy = (requested_fy == completed_fy).to_numpy()
    kept = df.loc[same_fy].copy()
    # every surviving row has a known fiscal year, so the integer cast is safe
    kept["fiscal_year"] = requested_fy.to_numpy()[same_fy].astype(int)
    return kept


def add_fiscal_year(df):
    """Add the fiscal year of completion to every row, dropping nothing.

    Adds three columns derived from ``completed_dt``: ``calendar_year``,
    ``month`` and ``fiscal_year``. Rows without a completion timestamp get
    NaN in all three (the columns are then float-typed).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime column ``completed_dt``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three extra columns. The input is not modified.
    """
    df = df.copy()
    df["calendar_year"] = df["completed_dt"].dt.year
    df["month"] = df["completed_dt"].dt.month
    # computed arithmetically; no round trip through pd.to_datetime(format="%Y")
    df["fiscal_year"] = _fiscal_year_of(df["completed_dt"]).to_numpy()
    return df

In [38]:
# Choose how fiscal years are assigned; swap the two lines to compare approaches.
# wr_fy = entirely_within_fiscal_year(wr_durations)
wr_fy = add_fiscal_year(wr_durations)

dropped_row_count = len(wr_durations) - len(wr_fy)
# guard against an empty input frame so the percentage never divides by zero
percent_rows_dropped = (
    dropped_row_count / len(wr_durations) * 100 if len(wr_durations) else 0.0
)
print(
    f"Limiting analysis to work orders entirely within one FY drops {dropped_row_count:,} rows from the data."
)
# two decimals and an explicit percent sign so the figure reads as a percentage
print(
    f"The dropped rows account for {percent_rows_dropped:.2f}% of the data."
)

wr_fy[
    [
        "wr_id",
        "problem_type",
        "days_to_completion",
        "requested_dt",
        "completed_dt",
        "fiscal_year",
    ]
].sample(6, random_state=444)

Limiting analysis to work orders entirely within one FY drops 0 rows from the data.
The dropped rows account for 0.0 of the data.


Unnamed: 0_level_0,wr_id,problem_type,days_to_completion,requested_dt,completed_dt,fiscal_year
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-11-08 12:13:51,95968,_DELIVERY,5.84,2017-11-08 12:13:51,2017-11-14 08:24:31,2018.0
2016-09-01 06:50:23,73413,PAINTING,7.02,2016-09-01 06:50:23,2016-09-08 07:17:29,2017.0
2016-08-24 08:31:35,72977,ELEC/LIGHT,2.3,2016-08-24 08:31:35,2016-08-26 15:44:39,2017.0
2015-05-14 15:53:35,31800,OTHER,54.77,2015-05-14 15:53:35,2015-07-08 10:24:11,2016.0
2019-02-12 06:01:45,117241,PREVENTIVE MAINT,,2019-02-12 06:01:45,NaT,
2017-01-08 11:15:58,81990,OTHER,0.0,2017-01-08 11:15:58,2017-01-08 11:21:24,2017.0


In [39]:
# look up the work request(s) with one specific total cost
cond = wr_fy["cost_total"].eq(3038.21)
wr_fy.loc[cond]

Unnamed: 0_level_0,wr_id,date_closed,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,status,requested_dt,date_requested,time_requested,completed_dt,date_completed,time_completed,days_to_completion,calendar_year,month,fiscal_year
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2015-07-01 11:36:43,39659,2015-08-21,,B06064,3038.21,0.0,0.0,ROOF,AHEBRON,EARL.WILLIAMS2,529224,14s-21bc0030,21,EPFL No. 10 Northwood Library,Clo,2015-07-01 11:36:43,NaT,NaT,2015-08-06 08:43:56,NaT,NaT,35.88,2015.0,8.0,2016.0


### Filter to PM only, and for relevant fiscal years only

In [40]:
# problem types treated as preventive maintenance for the on-time KPI
# (the repeated "FUEL INSPECTION" entry was removed; membership is unchanged)
PM_list = [
    "HVAC|PM",
    "BUILDING|PM",
    "PREVENTIVE MAINT",
    "FUEL INSPECTION",
    "BUILDING INTERIOR INSPECTION",
    "INSPECTION",
]

# fiscal years 2016 through 2020 (the upper bound of range() is exclusive)
cond_fy = wr_fy["fiscal_year"].isin(range(2016, 2021))
cond_pm = wr_fy["problem_type"].isin(PM_list)

wr_filtered = wr_fy[cond_fy & cond_pm]

print(f"The filtered work orders dataframe has {wr_filtered.shape[0]:,} rows.")

The filtered work orders dataframe has 6,727 rows.


## KPI: % PMs completed on time 
The goal here is to filter the data down to preventive maintenance only, and then show how many are completed before a given benchmark.

### Reproduce last year's work

#### Consider the counts

#### Compute the benchmark and add 'is_on_time' column

In [41]:
def compute_is_on_time(df, benchmark):
    """Flag each work order as on time or not.

    Returns a copy of ``df`` with a boolean ``is_on_time`` column that is True
    where ``days_to_completion`` is less than or equal to ``benchmark``. Rows
    with a missing duration are flagged False. The input is not modified.
    """
    on_time_flags = df["days_to_completion"].le(benchmark)
    return df.assign(is_on_time=on_time_flags)


# NOTE(review): 26 is a hard-coded benchmark in days — presumably last year's value; confirm
wr_on_time = compute_is_on_time(wr_filtered, 26)

#### Group by fiscal year and get % on time

In [54]:
# per fiscal year: percentage of work orders completed on time, and how many were counted
pm_compliance = wr_on_time.groupby("fiscal_year")["is_on_time"].agg(["mean", "count"])
# convert the share to a percentage and round once; the earlier pre-rounding step was
# a chained assignment on a sub-frame, which either has no effect or loses precision
pm_compliance["mean"] = (pm_compliance["mean"] * 100).round(2)

pm_compliance

Unnamed: 0_level_0,mean,count
fiscal_year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016.0,80.69,865
2017.0,55.58,1371
2018.0,76.42,1900
2019.0,69.62,1139
2020.0,33.61,1452


Note that there are some __large differences__ between the results reported last year and the results we get here. For reference, this is what DGS reported last year:


| Year|Last year's reported result|New computed result|
|---|---|---|
|2016 |61|41|
|2017|49|37|
|2018|66|66|
|2019|73|78|

### Compute this year's results
We move the benchmarking period forward by one year. That gives us a slightly lower benchmark.

In [None]:
# benchmark = median days to completion over fiscal years 2016 through 2019
cond_new_benchmark = wr_filtered["fiscal_year"].isin(range(2016, 2020))
on_time_benchmark = wr_filtered.loc[cond_new_benchmark, "days_to_completion"].median()
print(f"PM work orders are on time if completed within {on_time_benchmark} days.")

# re-flag every filtered work order against the new benchmark
wr_on_time = compute_is_on_time(wr_filtered, on_time_benchmark)

In [None]:
# share of on-time work orders per fiscal year (mean of the boolean `is_on_time` flag)
wr_on_time.groupby("fiscal_year")[["is_on_time"]].mean()

## KPI: % of preventative maintenance out of HVAC work orders

The two lists below contain the exact same problem types mentioned in last year's scorecard. So we would expect to be able to replicate last year's results closely.

In [None]:
# HVAC-related problem types that are NOT preventive maintenance
# (presumably "CM" stands for corrective maintenance — confirm)
CM_list = [
    "BOILER",
    "CHILLERS",
    "COOLING TOWERS",
    "HVAC",
    "HVAC INFRASTRUCTURE",
    "HVAC|REPAIR",
]

# Problem types counted as preventive maintenance for this KPI.
# NOTE: this reassigns `PM_list`, replacing the longer list defined for the on-time KPI;
# the commented-out entries are the ones from that earlier list that are left out here.
PM_list = [
    "HVAC|PM",
    "PREVENTIVE MAINT",
    # "BUILDING|PM",
    # "FUEL INSPECTION",
    # "BUILDING INTERIOR INSPECTION",
    # "INSPECTION",
]

### Filter to HVAC rows only

In [None]:
# a row is HVAC-related when its problem type appears in either list
cond_cm = wr_fy["problem_type"].isin(CM_list)
cond_pm = wr_fy["problem_type"].isin(PM_list)

wr_HVAC = wr_fy.loc[cond_cm | cond_pm]

print(f"We've gone from {wr_fy.shape[0]:,} rows to {wr_HVAC.shape[0]:,} rows.")

### Compute all PM/CM stats by fiscal year

In [None]:
def compute_pm_cm(df, PM_list):
    """Summarise preventive (PM) versus other (CM) work orders per fiscal year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``fiscal_year`` and ``problem_type``. Rows
        with a missing ``fiscal_year`` are ignored.
    PM_list : list of str
        Problem types that count as PM; every other row counts as CM.

    Returns
    -------
    pandas.DataFrame
        One row per fiscal year, in ascending order, with the columns
        ``year``, ``percent_pm``, ``pm_cm_ratio``, ``count_cm``, ``count_pm``
        and ``count_hvac`` (all rows for that year). Floats are rounded to
        two decimals. ``pm_cm_ratio`` is NaN for a year with no CM rows.
    """
    columns = [
        "year",
        "percent_pm",
        "pm_cm_ratio",
        "count_cm",
        "count_pm",
        "count_hvac",
    ]
    # rows without a fiscal year cannot be assigned to any year
    known_fy = df.dropna(subset=["fiscal_year"])
    if known_fy.empty:
        return pd.DataFrame(columns=columns)

    # plain arrays sidestep index alignment; the caller's index may hold duplicates
    tallies = pd.DataFrame(
        {
            "year": known_fy["fiscal_year"].to_numpy().astype(int),
            "is_pm": known_fy["problem_type"].isin(PM_list).to_numpy(),
        }
    )
    # groupby sorts by year, so the result comes out in ascending order
    per_year = (
        tallies.groupby("year")["is_pm"]
        .agg(count_pm="sum", count_hvac="size")
        .astype(int)
        .reset_index()
    )
    per_year["count_cm"] = per_year["count_hvac"] - per_year["count_pm"]
    per_year["percent_pm"] = per_year["count_pm"] / per_year["count_hvac"] * 100
    # a year with zero CM rows has no defined ratio: mask the zero so the result is NaN
    nonzero_cm = per_year["count_cm"].where(per_year["count_cm"] != 0)
    per_year["pm_cm_ratio"] = per_year["count_pm"] / nonzero_cm
    return per_year[columns].round(2)


# PM/CM counts and ratios per fiscal year for the HVAC-related work orders
pm_cm_results = compute_pm_cm(wr_HVAC, PM_list)

In [None]:
# keep only fiscal years up to and including 2020
cond_complete_FY = pm_cm_results["year"].le(2020)
pm_cm_results = pm_cm_results.loc[cond_complete_FY]

pm_cm_results

In [None]:
# reshape to long format: one row per (year, count column) so both counts share one plot
count_plot_data = pd.melt(
    pm_cm_results, id_vars=["year"], value_vars=["count_cm", "count_pm"]
)

# NOTE(review): `sns` is not imported in the visible import cell — presumably it is
# re-exported by `from utils import *`; confirm, or import seaborn explicitly.
sns.lineplot(data=count_plot_data, y="value", x="year", hue="variable")

sns.despine()

In [None]:
# line chart of the PM share per fiscal year
ax = sns.lineplot(x="year", y="percent_pm", data=pm_cm_results)
ax.set_title("Percent PM By Fiscal Year")
sns.despine()