# FMD Scorecard KPIs

## Setup

### Import packages

In [65]:
# workhorse modules
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
import re
from pathlib import Path
import datadotworld as dw
import pyodbc

# local utility functions
from utils import *
from private.config import config

### Set pandas options
This makes Pandas print all rows and columns to the output when requested.

In [66]:
# Apply the notebook's pandas display settings (set_pd_params is not defined
# in this notebook; presumably it comes from the `utils` wildcard import).
set_pd_params()

# Turn off the chained-assignment warning for the session (pandas default is "warn").
pd.set_option("mode.chained_assignment", None)

### Import the data
Data is a copy of Archibus's `wrhwr` table, with some irrelevant columns left out. 

In [68]:
# Get the private database credentials from the config mapping
server = config["SERVER"]
user = config["USER"]
password = config["PASSWORD"]
db = config["DB"]

# Read our basic SQL query from disk before touching the database, so a
# missing file fails early. read_text() opens and closes the file itself.
query_path = Path.cwd() / "sql" / "input_for_FMD_KPIs.sql"
sqlFile = query_path.read_text()

# Connect to Archibus database
conn = pyodbc.connect(
    f"DRIVER=ODBC Driver 17 for SQL Server;SERVER={server};DATABASE={db};UID={user};PWD={password}"
)

# Query the database; the finally clause releases the connection even if
# the query itself raises.
# NOTE(review): the sample output shows a column named "date_closed_request",
# not "date_closed" -- confirm the parse_dates names against the SQL file.
try:
    kpis_raw = pd.read_sql(
        sqlFile, conn, parse_dates=["date_requested", "date_completed", "date_closed"]
    )
finally:
    conn.close()

print(f"The KPIs raw dataframe has {kpis_raw.shape[0]:,} rows.")
kpis_raw.sample(3, random_state=444)

The KPIs raw dataframe has 104,339 rows.


Unnamed: 0,wr_id,date_requested,time_requested,date_completed,time_completed,date_closed_request,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,pmp_id.1,status
67790,96854,2017-12-01,1899-12-30 11:08:50,2017-12-01,1899-12-30 11:09:45,2017-12-19 14:43:40.053,,B00057,66.7,66.7,0.0,OTHER,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,Abel Wolman Muncipal Building,,Clo
15411,15600,2014-09-17,1899-12-30 11:15:15,2014-09-17,1899-12-30 11:16:05,2014-09-30 00:00:00.000,,B00064,66.7,66.7,0.0,ELEC/GENERAL,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,War Memorial Building,,Clo
74988,105016,2018-05-04,1899-12-30 12:38:25,2018-08-24,1899-12-30 09:37:22,NaT,,B06031,0.0,0.0,0.0,HVAC,LORETTA.BROWN,,,,,Quarantine Road Landfill Operations Building 1,,Com


## Data cleaning

### Basic cleaning
- removes white spaces in strings to facilitate matching, 
- drops rows with no problem type, 
- renames a few columns

In [70]:
# Spot-check a single work request by its ID.
cond = kpis_raw["wr_id"].eq(39638)
kpis_raw.loc[cond]

Unnamed: 0,wr_id,date_requested,time_requested,date_completed,time_completed,date_closed_request,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,pmp_id.1,status
26828,39638,2015-07-01,1899-12-30 10:49:50,2015-07-01,1899-12-30 10:50:44,2015-07-01,,B00026,66.7,66.7,0.0,LOCK,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,Baltimore City Police North Western District,,Clo


In [16]:
# apply the tidy up function
# (tidy_up_wr is not defined in this notebook; presumably it comes from `utils`)
wr_tidy = tidy_up_wr(kpis_raw)

# report how many rows remain after tidying
print(f"The tidied work orders dataframe has {wr_tidy.shape[0]:,} rows.")

The tidied work orders dataframe has 132,547 rows.


### Fix repeated ID numbers

In [17]:
def unique_ids(df):
    """Return a new frame with one row per (wr_id, date_requested) pair.

    Rows are ordered by ``wr_id`` and then ``supervisor``; within each pair
    the first row in that order is kept. The input frame is not modified.
    """
    ordered = df.sort_values(by=["wr_id", "supervisor"])
    is_repeat = ordered.duplicated(subset=["wr_id", "date_requested"], keep="first")
    return ordered.loc[~is_repeat]


# keep a single row per (wr_id, date_requested) pair
wr_unique_ids = unique_ids(wr_tidy)

# report the row count after removing repeated IDs
print(f"The tidied work orders dataframe has {wr_unique_ids.shape[0]:,} rows.")
# print(wr_unique_ids.info())
# preview three rows; the fixed seed keeps the preview reproducible
wr_unique_ids.sample(3, random_state=444)

The tidied work orders dataframe has 102,434 rows.


Unnamed: 0,wo_id,date_requested,time_requested,date_completed,time_completed,date_closed_request,date_closed_order,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,status
9636,4866,2014-01-27,1899-12-30 13:50:25,2014-01-30,1899-12-30 06:07:59,2014-01-30 00:00:00.000,NaT,,B00036,0.0,0.0,0.0,DOOR,JOHN.RICE,MICHAEL.JONES2,,,,Baltimore City Police Department Warrant Task ...,Clo
78775,76324,2016-10-31,1899-12-30 14:55:57,2017-04-25,1899-12-30 08:56:22,2017-04-26 07:07:38.263,2017-04-26,,B00025,266.8,266.8,0.0,DOOR,NATALIE.PRESTON,TERRY.HOWELL,,,,Baltimore City Police North Eastern District,Clo
23924,12125,2014-06-23,1899-12-30 14:30:55,2014-06-23,1899-12-30 14:31:40,2014-06-23 00:00:00.000,NaT,,B00062,0.0,0.0,0.0,OTHER,JIMMY.HOLTHAUS,JIMMY.HOLTHAUS,,,,Elijah E. Cummings Courthouse and Old Post Office,Clo


### Remove duplicate work orders
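This step is meant to remove rows where the technician says in the description that the WR is a duplicate, and the status is "Canceled", "Closed", or "Rejected". __It is currently disabled__: the call to `drop_dupes()` is commented out, so the data passes through unchanged.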

In [18]:
# NOTE: the duplicate-removal step is currently disabled. The drop_dupes()
# call below is commented out, so wr_deduped is just a copy of wr_unique_ids.
# wr_deduped = drop_dupes(wr_unique_ids)
wr_deduped = wr_unique_ids.copy()
# print(f"The deduped work orders dataframe has {wr_deduped.shape[0]:,} rows.")
# print(
#     f"Removing duplicates has cut {wr_tidy.shape[0] - wr_deduped.shape[0]:,} rows from the work orders dataframe."
# )

### Combine date and time columns to get timestamps
This takes the date from a date column and the time from a time column and combines them into a single timestamp.

This transformation allows us to know the time to completion with greater precision. 

In [19]:
# glue the date and time for request
wr_dt = glue_date_time(wr_deduped, "date_requested", "time_requested", "requested_dt")

# glue the date and time for completion
wr_dt = glue_date_time(wr_dt, "date_completed", "time_completed", "completed_dt")

# Convert the two "closed" columns to datetimes. pd.to_datetime is used
# instead of astype("datetime64") because pandas 2.x rejects that unit-less
# dtype string.
wr_dt["date_closed_request"] = pd.to_datetime(wr_dt["date_closed_request"])
# "date_closed_order" has no time information
wr_dt["date_closed_order"] = pd.to_datetime(wr_dt["date_closed_order"])

In [29]:
# Preview the glued timestamps next to the close dates for six random rows.
wr_dt.loc[
    :,
    [
        "wr_id",
        "problem_type",
        "requested_dt",
        "completed_dt",
        "date_closed_request",
        "date_closed_order",
        "status",
    ],
].sample(n=6, random_state=451)

Unnamed: 0,wo_id,problem_type,requested_dt,completed_dt,date_closed_request,date_closed_order,status
81206,80723,OTHER,2016-12-23 09:42:59,2017-02-16 11:33:18,NaT,NaT,Clo
11369,5743,SNOW_REMOVAL,2014-02-11 11:27:07,2014-02-11 11:28:44,2014-02-11 00:00:00.000,NaT,Clo
15523,7832,ROOF,2014-03-25 12:28:27,2014-03-25 12:29:43,2014-03-25 00:00:00.000,NaT,Clo
108470,108185,ELEC/LIGHT,2018-08-15 08:18:45,2018-10-17 08:28:11,2018-10-17 12:18:56.317,2018-10-17,Clo
64752,59672,OTHER,2016-01-21 15:08:36,2016-03-17 06:37:36,2016-03-18 06:12:57.190,2016-03-18,Clo
83721,83221,OTHER,2017-02-16 10:56:50,2017-02-16 10:59:14,NaT,NaT,Clo


## Data preparation

### Include days to completion

In [30]:
def compute_days_to_completion(df):
    """Add elapsed-time columns and index the frame by request timestamp.

    Two float columns are added, both measured in days and rounded to two
    decimals:

    * ``days_to_completion``: ``completed_dt - requested_dt``
    * ``days_to_close``: ``date_closed_request - requested_dt``

    A missing timestamp on either side yields NaN. The result is indexed by
    ``requested_dt`` (the column is kept as well); the index may contain
    duplicates. The input frame is not modified.
    """
    df = df.copy()
    one_day = pd.Timedelta(days=1)
    # Column-wise arithmetic replaces the former row-by-row apply(): it gives
    # the same values, is much faster, and also works on an empty frame.
    df["days_to_completion"] = (
        (df["completed_dt"] - df["requested_dt"]) / one_day
    ).round(2)
    df["days_to_close"] = (
        (df["date_closed_request"] - df["requested_dt"]) / one_day
    ).round(2)
    # set the index
    df = df.set_index(keys="requested_dt", verify_integrity=False, drop=False)
    return df


# add days_to_completion / days_to_close and index the frame by requested_dt
wr_durations = compute_days_to_completion(wr_dt)

In [34]:
# Preview the new duration columns alongside their source timestamps.
wr_durations.loc[
    :,
    [
        "wr_id",
        "problem_type",
        "requested_dt",
        "completed_dt",
        "date_closed_request",
        "date_closed_order",
        "days_to_completion",
        "days_to_close",
        "status",
    ],
].sample(n=6, random_state=445)

Unnamed: 0_level_0,wo_id,problem_type,requested_dt,completed_dt,date_closed_request,date_closed_order,days_to_completion,days_to_close,status
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-05 13:19:24,93683,HVAC,2017-10-05 13:19:24,2017-10-16 08:26:44,2020-03-27 09:22:44.740,2020-03-27,10.8,903.84,Clo
2014-01-08 11:44:56,4025,ELEC/GENERAL,2014-01-08 11:44:56,NaT,2014-01-08 00:00:00.000,NaT,,-0.49,Clo
2018-12-10 08:11:39,112657,PLUMB/LEAK,2018-12-10 08:11:39,2019-09-25 11:57:54,NaT,NaT,289.16,,Com
2014-03-20 11:30:42,7609,PLUMB/OTHER,2014-03-20 11:30:42,2014-03-21 14:36:40,2014-03-21 00:00:00.000,NaT,1.13,0.52,Clo
2014-07-07 10:27:56,12702,ENVIR/ASBESTOS,2014-07-07 10:27:56,2014-07-07 10:28:42,2014-07-07 00:00:00.000,NaT,0.0,-0.44,Clo
2017-02-03 07:54:42,82582,PLUMBING-TOILET,2017-02-03 07:54:42,2017-02-03 14:32:46,2017-02-06 11:39:58.080,2017-02-06,0.28,3.16,Clo


### Decision point: fiscal year
Note that the function `entirely_within_fiscal_year()` keeps only those rows where the work order was requested and closed in the same fiscal year. __Rows that straddle two fiscal years are dropped__, and so are rows with no close date, because their closing fiscal year is undefined.

For comparison, I've included the function `add_fiscal_year()`, which derives the fiscal year from the request date and drops no rows.

In [40]:
def entirely_within_fiscal_year(df):
    """Keep only work orders requested and closed within the same fiscal year.

    The fiscal year is derived from the calendar date: months July through
    December count toward the following calendar year's fiscal year (for
    example, 2016-09-21 falls in fiscal year 2017).

    Rows are dropped when the fiscal year of ``requested_dt`` differs from
    the fiscal year of ``date_closed_request``, and also when either
    timestamp is missing, because a missing year never compares equal.

    Returns a new frame with an integer ``fiscal_year`` column appended; the
    input frame is not modified.
    """

    def _fiscal_year(timestamps):
        # Adding the boolean bumps July-December dates into the next year;
        # missing timestamps stay NaN.
        return timestamps.dt.year + (timestamps.dt.month >= 7)

    requested_fy = _fiscal_year(df["requested_dt"])
    closed_fy = _fiscal_year(df["date_closed_request"])
    # NaN == NaN is False, so rows lacking a request or close date fall out here
    same_fy = (requested_fy == closed_fy).to_numpy()

    # Take an explicit copy so the new column is written to our own frame and
    # not to a slice of the caller's data.
    result = df.loc[same_fy].copy()
    # Assign positionally: the frame's index may contain duplicate timestamps.
    result["fiscal_year"] = requested_fy.to_numpy()[same_fy].astype(int)
    return result

In [50]:
# Decision point: keep only work orders requested and closed in the same
# fiscal year. The commented-out alternative below is the variant the text
# above describes as dropping no rows.
wr_fy = entirely_within_fiscal_year(wr_durations)
# wr_fy = add_fiscal_year(wr_durations)

# report how many rows the fiscal-year restriction removed
print(
    f"Limiting analysis to work orders entirely within one FY drops {len(wr_durations) - len(wr_fy):,} rows from the data."
)

# preview six rows with the new fiscal_year column
wr_fy[
    [
        "wr_id",
        "problem_type",
        "days_to_completion",
        "days_to_close",
        "requested_dt",
        "completed_dt",
        "date_closed_request",
        "fiscal_year",
    ]
].sample(6, random_state=444)

Limiting analysis to work orders entirely within one FY drops 32,672 rows from the data.


Unnamed: 0_level_0,wo_id,problem_type,days_to_completion,days_to_close,requested_dt,completed_dt,date_closed_request,fiscal_year
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-09-21 09:34:48,74435,ELEC/LIGHT,0.25,22.33,2016-09-21 09:34:48,2016-09-21 15:40:40,2016-10-13 17:28:32.997,2017
2017-05-22 14:56:21,87573,LOCK,1.95,8.65,2017-05-22 14:56:21,2017-05-24 13:46:23,2017-05-31 06:34:58.280,2017
2015-06-26 14:14:06,39516,SERV/CUSTODIAL,0.0,2.41,2015-06-26 14:14:06,2015-06-26 14:15:01,2015-06-29 00:00:00.000,2015
2018-04-05 14:02:56,102393,OTHER,2.85,5.9,2018-04-05 14:02:56,2018-04-08 10:27:00,2018-04-11 11:36:22.950,2018
2017-02-01 06:22:08,82454,PLUMB/OTHER,0.35,0.98,2017-02-01 06:22:08,2017-02-01 14:48:47,2017-02-02 05:57:23.927,2017
2013-11-21 10:53:56,1652,_DELIVERY,2.03,1.55,2013-11-21 10:53:56,2013-11-23 11:43:30,2013-11-23 00:00:00.000,2014


In [64]:
# Look up the work order(s) with a specific total cost. cost_total is a float
# column, so match to the nearest cent instead of testing exact equality,
# which can miss a value that differs only by binary rounding.
cond = (wr_fy["cost_total"] - 3038.21).abs() < 0.005
#cond_1 = wr_fy["problem_type"] == "OTHER"
#cond_2 = wr_fy["bl_id"] == "B00163"
wr_fy[cond]

Unnamed: 0_level_0,wo_id,date_closed_request,date_closed_order,pmp_id,bl_id,cost_total,cost_labor,cost_parts,problem_type,requestor,supervisor,po_number,invoice_number,release_number,name,status,requested_dt,date_requested,time_requested,completed_dt,date_completed,time_completed,days_to_completion,days_to_close,fiscal_year
requested_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1


### Filter to PM only, and for relevant fiscal years only

In [51]:
# Problem types counted as preventive maintenance (PM) for the on-time KPI.
# ("FUEL INSPECTION" used to be listed twice; isin() ignores duplicates, so
# removing the repeat does not change the filter.)
PM_list = [
    "HVAC|PM",
    "BUILDING|PM",
    "PREVENTIVE MAINT",
    "FUEL INSPECTION",
    "BUILDING INTERIOR INSPECTION",
    "INSPECTION",
]

# fiscal years 2016 through 2020 inclusive (range() excludes its stop value)
cond_fy = wr_fy["fiscal_year"].isin(range(2016, 2021))
cond_pm = wr_fy["problem_type"].isin(PM_list)

wr_filtered = wr_fy[cond_fy & cond_pm]

print(f"The filtered work orders dataframe has {wr_filtered.shape[0]:,} rows.")

The filtered work orders dataframe has 3,836 rows.


## KPI: % PMs completed on time 
The goal here is to filter the data down to preventive maintenance only, and then show how many are completed before a given benchmark.

### Reproduce last year's work

#### Consider the counts

The on-time benchmark will be computing using only rows from years [2018, 2016, 2017]


#### Compute the benchmark and add 'is_on_time' column

In [54]:
# Rows used to derive the benchmark: fiscal years 2016, 2017 and 2018, as
# stated in the text above. Defined here so the cell runs on a fresh kernel;
# no visible cell assigns cond_benchmark otherwise.
cond_benchmark = wr_filtered["fiscal_year"].isin([2016, 2017, 2018])
# the benchmark is the median days-to-completion over those years
on_time_benchmark = wr_filtered.loc[cond_benchmark, "days_to_completion"].median()
print(f"PM work orders are on time if completed within {on_time_benchmark} days.")


def compute_is_on_time(df, benchmark):
    """Return a copy of ``df`` with a boolean ``is_on_time`` column.

    A row is on time when its ``days_to_close`` is less than or equal to
    ``benchmark``. Rows with a missing ``days_to_close`` are marked False.
    """
    within_benchmark = df["days_to_close"].le(benchmark)
    return df.assign(is_on_time=within_benchmark)


# NOTE(review): the threshold here is a hard-coded 26 days, not the
# on_time_benchmark value printed above, and compute_is_on_time() compares
# days_to_close whereas the benchmark is a median of days_to_completion.
# Presumably 26 reproduces last year's figure -- confirm before reuse.
wr_on_time = compute_is_on_time(wr_filtered, 26)

PM work orders are on time if completed within 11.07 days.


#### Group by fiscal year and get % on time

In [55]:
# the mean of a boolean column is the share of rows flagged on time per fiscal year
wr_on_time.groupby("fiscal_year")[["is_on_time"]].mean().round(2)

Unnamed: 0_level_0,is_on_time
fiscal_year,Unnamed: 1_level_1
2016,0.71
2017,0.42
2018,0.26
2019,0.24
2020,0.45


Note that there are some __large differences__ between the results reported last year and the results we get here. For reference, this is what DGS reported last year:


| Year|Last year's reported result|New computed result|
|---|---|---|
|2016 |61|41|
|2017|49|37|
|2018|66|66|
|2019|73|78|

### Compute this year's results
We move the benchmarking period forward by one year. That gives us a slightly lower benchmark.

In [None]:
# range(2016, 2020) selects fiscal years 2016-2019 inclusive.
# NOTE(review): this widens the benchmark window by a year; it does not shift
# it forward as the text above says -- confirm which was intended.
cond_new_benchmark = wr_filtered["fiscal_year"].isin(range(2016, 2020))
# benchmark = median days-to-completion over the selected years
on_time_benchmark = wr_filtered[cond_new_benchmark]["days_to_completion"].median()
print(f"PM work orders are on time if completed within {on_time_benchmark} days.")

# flag each PM work order against the recomputed benchmark
# (compute_is_on_time compares days_to_close with the benchmark)
wr_on_time = compute_is_on_time(wr_filtered, on_time_benchmark)

In [None]:
# share of PM work orders flagged on time in each fiscal year (unrounded)
wr_on_time.groupby("fiscal_year")[["is_on_time"]].mean()

## KPI: % of preventive maintenance out of HVAC work orders

The two lists below contain the exact same problem types mentioned in last year's scorecard. So we would expect to be able to replicate last year's results closely.

In [None]:
# Problem types counted on the "CM" side of the HVAC PM/CM comparison.
CM_list = [
    "BOILER",
    "CHILLERS",
    "COOLING TOWERS",
    "HVAC",
    "HVAC INFRASTRUCTURE",
    "HVAC|REPAIR",
]

# Problem types counted as PM for the HVAC KPI. This rebinds the PM_list used
# for the on-time KPI earlier in the notebook; the commented-out entries are
# excluded from this narrower list.
PM_list = [
    "HVAC|PM",
    "PREVENTIVE MAINT",
    # "BUILDING|PM",
    # "FUEL INSPECTION",
    # "BUILDING INTERIOR INSPECTION",
    # "INSPECTION",
]

### Filter to HVAC rows only

In [None]:
# flag rows whose problem type is in either list
cond_cm = wr_fy["problem_type"].isin(CM_list)
cond_pm = wr_fy["problem_type"].isin(PM_list)

# keep rows matching either list (all fiscal years in wr_fy are retained here)
wr_HVAC = wr_fy[cond_cm | cond_pm]

print(f"We've gone from {len(wr_fy):,} rows to {len(wr_HVAC):,} rows.")

### Compute all PM/CM stats by fiscal year

In [None]:
def compute_pm_cm(df, PM_list):
    """Summarize PM versus CM work-order counts for each fiscal year.

    Every row of ``df`` is counted as PM when its ``problem_type`` is in
    ``PM_list`` and as CM otherwise.

    Returns a DataFrame with one row per fiscal year, in ascending order,
    and these columns:

    * ``year``: the fiscal year (int)
    * ``percent_pm``: PM rows as a percentage of all rows that year
    * ``pm_cm_ratio``: PM count divided by CM count; NaN when the year has
      no CM rows, since the ratio is undefined
    * ``count_cm``, ``count_pm``, ``count_hvac``: row counts (int)

    Float columns are rounded to two decimals. Rows with a missing
    ``fiscal_year`` are ignored. The input frame is not modified.
    """
    columns = [
        "year",
        "percent_pm",
        "pm_cm_ratio",
        "count_cm",
        "count_pm",
        "count_hvac",
    ]
    is_pm = df["problem_type"].isin(PM_list)

    # Collect one plain dict per year and build the frame once at the end.
    # DataFrame.append(), used here previously, no longer exists in pandas 2.x.
    rows = []
    for year, pm_flags in is_pm.groupby(df["fiscal_year"], sort=True):
        count_hvac = len(pm_flags)
        count_pm = int(pm_flags.sum())
        count_cm = count_hvac - count_pm
        rows.append(
            {
                "year": year,
                "percent_pm": (count_pm / count_hvac) * 100,
                # guard against a year made up entirely of PM work orders
                "pm_cm_ratio": count_pm / count_cm if count_cm else np.nan,
                "count_cm": count_cm,
                "count_pm": count_pm,
                "count_hvac": count_hvac,
            }
        )

    results_df = pd.DataFrame(rows, columns=columns)
    count_columns = ["year", "count_cm", "count_pm", "count_hvac"]
    results_df = results_df.astype(dict.fromkeys(count_columns, int))
    return results_df.round(2)


pm_cm_results = compute_pm_cm(wr_HVAC, PM_list)

In [None]:
# keep fiscal years up to and including 2020
# (per the variable name, these are the fiscal years treated as complete)
cond_complete_FY = pm_cm_results["year"] <= 2020
pm_cm_results = pm_cm_results[cond_complete_FY]

# display the per-year summary
pm_cm_results

In [None]:
# Reshape to long form: one row per (year, count type), ready for a hue-split plot.
count_plot_data = pm_cm_results.melt(
    id_vars=["year"], value_vars=["count_cm", "count_pm"]
)

# NOTE(review): `sns` is not imported in the visible import cell; presumably it
# arrives through `from utils import *` -- confirm.
sns.lineplot(data=count_plot_data, x="year", y="value", hue="variable")

sns.despine()

In [None]:
# Line chart of the PM share per fiscal year.
ax = sns.lineplot(data=pm_cm_results, x="year", y="percent_pm")
ax.set_title("Percent PM By Fiscal Year")
sns.despine()