In [1]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from src.utils import dept_list, program_list
from main import get_config

# Directory the notebook is executed from, and the directory one level above it
base_dir = Path.cwd()
parent_dir = base_dir.parent

In [2]:
# Parsing options shared by both release files:
# - semicolon-delimited
# - only empty cells are read as NaN (literal text such as "NA" is kept as-is)
# - the last line of each file is skipped; skipfooter needs the python engine
release_csv_options = dict(
    keep_default_na=False,
    na_values='',
    delimiter=';',
    engine='python',
    skipfooter=1,
)

# Service inventory (si) and service standards (ss) from the latest published release
si = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/latest/download/si.csv",
    **release_csv_options
)
ss = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/latest/download/ss.csv",
    **release_csv_options
)

# Project configuration, passed to the loader and utility helpers
config = get_config()

In [3]:
# === SETUP ===
# Supplementary tables, read through the project loader with snapshot=False
org_var, sid_registry = (
    load_csv(filename, config, snapshot=False)
    for filename in ('org_var.csv', 'sid_registry.csv')
)

# Department and program lists from the project utilities
dept, program = dept_list(config), program_list(config)

# Current timestamp and calendar date in the America/Montreal timezone
timezone = pytz.timezone('America/Montreal')
current_datetime = pd.Timestamp.now(tz=timezone)
current_date = current_datetime.date()

# Coerce all numeric fields
# Count columns, grouped by the dataframe that holds them
si_count_columns = [
    'num_phone_enquiries',
    'num_applications_by_phone',
    'num_website_visits',
    'num_applications_online',
    'num_applications_by_mail',
    'num_applications_by_email',
    'num_applications_by_fax',
    'num_applications_by_other',
    'num_applications_total',
]
ss_count_columns = ['volume_meeting_target', 'total_volume']

# Lookup of column name -> owning dataframe
int_cols = {
    **dict.fromkeys(si_count_columns, si),
    **dict.fromkeys(ss_count_columns, ss),
}

# Values that cannot be parsed (or are missing) become 0 so that every
# column can be stored with a plain integer dtype
for column, df in int_cols.items():
    parsed_counts = pd.to_numeric(df[column], errors='coerce')
    df[column] = parsed_counts.fillna(0).astype(int)

# Give org_id the same nullable-integer dtype in every dataframe that carries it;
# values that cannot be parsed are replaced with 0
org_id_df = [si, ss, dept, org_var, program, sid_registry]

for df in org_id_df:
    parsed_org_id = pd.to_numeric(df['org_id'], errors='coerce')
    df['org_id'] = parsed_org_id.fillna(0).astype('Int64')


# Numeric versions of the ids: drop the leading text prefix, then convert
# (anything left that is not a number becomes NaN)
si['service_id_numeric'] = pd.to_numeric(
    si['service_id'].str.replace(r'^SRV', '', regex=True),
    errors='coerce'
)

ss['service_standard_id_numeric'] = pd.to_numeric(
    ss['service_standard_id'].str.replace(r'^STAN', '', regex=True),
    errors='coerce'
)


# === QUALITY ASSURANCE CHECKS ===
# =================================


In [4]:
def fy_to_num(fiscal_yr):
    """Return the year in which the fiscal year ends, as a number.

    Parameters
    ----------
    fiscal_yr : str
        Fiscal year label whose last hyphen-separated part is the end year,
        e.g. '2023-2024'. Surrounding whitespace is ignored.

    Returns
    -------
    number
        The end year (2024 for '2023-2024'), or NaN when `fiscal_yr` is
        missing (None / NaN / pd.NA).

    Raises
    ------
    ValueError
        Propagated from `pd.to_numeric` when the last part of the label
        cannot be parsed as a number.
    """
    # The source files are read with na_values='', so a blank fiscal year
    # arrives here as NaN rather than as a string; report it as missing
    # instead of failing on `.split`.
    if not isinstance(fiscal_yr, str) and pd.isna(fiscal_yr):
        return float("nan")

    end_year = fiscal_yr.strip().split('-')[-1]
    return pd.to_numeric(end_year)

# Working copy with only the fields the variance check needs
variance_fields = ['fiscal_yr', 'org_id', 'service_id', 'num_applications_total']
si_variance_qa = si[variance_fields].copy()

# Numeric end-year of each fiscal year, so that years can be compared
si_variance_qa['fy_num'] = si_variance_qa['fiscal_yr'].apply(fy_to_num)

# Attach the latest fiscal year reported by each service as 'fy_num_max'
latest_fy_by_service = (
    si_variance_qa
    .groupby(['org_id', 'service_id'], as_index=False)['fy_num']
    .max()
)
si_variance_qa = pd.merge(
    si_variance_qa,
    latest_fy_by_service,
    on=['org_id', 'service_id'],
    suffixes=('', '_max')
)

# Restrict the analysis to services with at least 4 years of non-zero
# application volume (the latest year plus 3 earlier ones)

# Step 1: drop records that report no application volume
has_application_volume = si_variance_qa['num_applications_total'] > 0
si_variance_qa = si_variance_qa.loc[has_application_volume]

# Step 2: count the distinct fiscal years remaining for each service
years_by_service = (
    si_variance_qa
    .groupby(['org_id', 'service_id'], as_index=False)
    .agg(years_reported=("fiscal_yr", "nunique"))
)
si_variance_qa = pd.merge(
    si_variance_qa,
    years_by_service,
    on=['org_id', 'service_id']
)

# Step 3: keep only services with 4 or more reported years
has_enough_history = si_variance_qa['years_reported'] >= 4
si_variance_qa = si_variance_qa.loc[has_enough_history]


# Flag the rows that belong to each service's latest fiscal year
si_variance_qa = si_variance_qa.assign(
    latest_fy_bool=si_variance_qa['fy_num'] == si_variance_qa['fy_num_max']
)

# Baseline per service: mean and standard deviation of the application
# volume over every fiscal year except the latest one
earlier_years = si_variance_qa.loc[~si_variance_qa['latest_fy_bool']]
baseline_by_service = (
    earlier_years
    .groupby(['org_id', 'service_id'], as_index=False)
    .agg(
        std_dev=('num_applications_total', 'std'),
        mean=('num_applications_total', 'mean')
    )
)

si_variance_qa = pd.merge(
    si_variance_qa,
    baseline_by_service,
    on=['org_id', 'service_id']
)

# If yearly application volumes are assumed to be normally distributed, the
# distance of a value from the baseline mean, expressed in standard
# deviations, shows how unusual that value is.
distance_from_mean = (
    si_variance_qa['num_applications_total'] - si_variance_qa['mean']
).abs()
si_variance_qa['apps_stdevs_away_from_mean'] = distance_from_mean / si_variance_qa['std_dev']

# Issues to identify:
# 1. No variation (std_dev = 0): every year except the latest reports
#    exactly the same num_applications_total
si_variance_qa['qa_no_volume_variation'] = si_variance_qa['std_dev'].eq(0)

# 2. Extreme variation: the value sits more than the threshold number of
#    standard deviations away from the mean. These are big swings that need
#    to be investigated. Services already flagged for no variation are excluded.
stdevs_away_from_mean_threshold = 20
beyond_threshold = si_variance_qa['apps_stdevs_away_from_mean'].gt(stdevs_away_from_mean_threshold)
si_variance_qa['qa_extreme_volume_variation'] = (
    beyond_threshold & ~si_variance_qa['qa_no_volume_variation']
)

# Copy both checks onto the si dataframe.
# For each check, si is left-merged with the latest-fiscal-year rows of
# si_variance_qa that fail that check. The merge indicator reads 'both' only
# where an si record has a match, so comparing it to 'both' yields the
# true/false flag for the check.
qa_merge_keys = ['fiscal_yr', 'service_id', 'org_id']

for qa_flag in ('qa_no_volume_variation', 'qa_extreme_volume_variation'):
    flagged_latest_rows = si_variance_qa.loc[
        si_variance_qa['latest_fy_bool'] & si_variance_qa[qa_flag],
        qa_merge_keys
    ]

    si = pd.merge(
        si,
        flagged_latest_rows,
        on=qa_merge_keys,
        how='left',
        indicator=qa_flag
    )

    si[qa_flag] = (si[qa_flag] == 'both')

# Build a "(fiscal_yr: num_applications_total)" label for every si record
si['fy_num_applications_total'] = "("+si['fiscal_yr']+": "+si['num_applications_total'].astype('str')+")"

# Combine the labels of each service into one sorted, comma-separated string.
# A record without a fiscal year has a missing label; it is skipped here so
# that sorting/joining does not fail on a non-string value.
si_apps_by_fy = si.groupby(['org_id', 'service_id'], as_index=False).agg(
    {'fy_num_applications_total': lambda labels: ', '.join(sorted(labels.dropna()))}
)

# Replace the per-record label with the per-service summary.
# how='left' keeps every si record: groupby leaves out records whose
# service_id is missing, so the default inner merge would silently remove
# those records from si. With a left merge they stay, with an empty summary.
si = pd.merge(
    si.drop(columns=['fy_num_applications_total']),
    si_apps_by_fy,
    on=['org_id', 'service_id'],
    how='left'
)