In [27]:
import pandas as pd
import numpy as np
import re
import pytz
import os
from pathlib import Path
import sys
sys.path.append("/workspaces/service-data")
import requests

from src.clean import clean_percentage, normalize_string, standardize_column_names, clean_fiscal_yr
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

# Current working directory of the kernel process, and the directory one level above it.
# NOTE(review): neither name is used in the cells visible here — confirm they are still needed.
base_dir = Path.cwd()
parent_dir = base_dir.parent

In [28]:
# Parsing options shared by both release files: fields are ';'-separated and
# pandas' default NA markers are switched off, so only empty fields become NaN
# while literal strings such as 'NA' are kept as text.
csv_read_options = dict(keep_default_na=False, na_values='', delimiter=';')

# `si` is the table the indicator cells below filter by application volume.
si = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/download/service_data-425af09/si.csv",
    **csv_read_options,
)

# `ss` carries the `target_met` flag per `fy_org_id_service_id`
# (presumably one row per service standard — verify against the dataset docs).
ss = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/download/service_data-425af09/ss.csv",
    **csv_read_options,
)


# print(si.info())
# print(si['num_applications_total'].dtype)

In [61]:
# DRR Indicator ID DR-2467: Fraction of high-volume services that are fully available online
#
# Steps:
#   1. keep services whose num_applications_total reaches the high-volume threshold;
#   2. count, per service and fiscal year, how many online interaction points are
#      'Y', 'N' and 'NA';
#   3. drop services whose interaction points are all 'NA';
#   4. a remaining service is end-to-end online when none of its points is 'N';
#   5. score = end-to-end online services / remaining high-volume services * 100.

# Define high-volume threshold
HIGH_VOLUME_THRESHOLD = 45000

# Define online interaction point (OIP) columns
OIP_COLS = [
    'os_account_registration',
    'os_authentication',
    'os_application',
    'os_decision',
    'os_issuance',
    'os_issue_resolution_feedback',
]

# Filter service inventory for high-volume services
si_hv = si[si['num_applications_total'] >= HIGH_VOLUME_THRESHOLD].copy()

# Reshape to one row per (service, interaction point) so activation states can be counted
oip_long = si_hv.melt(
    id_vars=['fiscal_yr', 'fy_org_id_service_id'],
    value_vars=OIP_COLS,
    var_name='online_interaction_point',
    value_name='activation'
)

# Flag each activation state, then count the flags per service and fiscal year.
# NOTE(review): a value that is none of 'Y'/'N'/'NA' (e.g. a blank field read as NaN)
# is counted in no bucket, so it does not prevent a service from being treated as
# fully online — confirm this is intended.
oip_counts = (
    oip_long
    .assign(
        activation_y=oip_long['activation'].eq('Y'),
        activation_n=oip_long['activation'].eq('N'),
        activation_na=oip_long['activation'].eq('NA'),
    )
    .groupby(['fiscal_yr', 'fy_org_id_service_id'], as_index=False)
    .agg(
        activation_y=('activation_y', 'sum'),
        activation_n=('activation_n', 'sum'),
        activation_na=('activation_na', 'sum')
    )
)

# Remove services with all NA activation states *before* deriving the flag, so the
# flag is a plain boolean column. (The previous version routed a None through
# np.select and then cast to bool, which silently turned None into False and only
# worked because the all-NA rows were filtered out afterwards.)
oip_counts = oip_counts[oip_counts['activation_na'] < len(OIP_COLS)]

# A service is end-to-end online when none of its interaction points is 'N'
oip_counts = oip_counts.assign(online_e2e=oip_counts['activation_n'].eq(0))

# Aggregate at the fiscal year level
dr2467 = oip_counts.groupby('fiscal_yr', as_index=False).agg(
    online_e2e_count=('online_e2e', 'sum'),
    high_volume_service_count=('fy_org_id_service_id', 'nunique')
)

# Compute DR-2467 score (percentage)
dr2467['dr2467_score'] = (dr2467['online_e2e_count'] / dr2467['high_volume_service_count']) * 100

print(dr2467)


   fiscal_yr  online_e2e_count  high_volume_service_count  dr2467_score
0  2018-2019                32                         88     36.363636
1  2019-2020                43                        100     43.000000
2  2020-2021                53                         93     56.989247
3  2021-2022                65                        107     60.747664
4  2022-2023                53                        100     53.000000
5  2023-2024                53                         98     54.081633


In [None]:
# DRR Indicator ID DR-2468:
# Fraction of high-volume (+phone enquiries) services that meet one or more service standard
# High volume: num_applications_total_plus_phone_enquiries >= 45000

# Define high-volume threshold
HIGH_VOLUME_THRESHOLD = 45000

# Select relevant columns; work on a copy so `si` itself is never modified
si_hvte = si[['service_id', 'fiscal_yr', 'org_id', 'fy_org_id_service_id', 'num_applications_total', 'num_phone_enquiries']].copy()

# Coerce both volume columns to numbers and treat missing/non-numeric values as 0.
# Both operands need this: a NaN in either one would make the sum NaN, and NaN never
# satisfies `>= HIGH_VOLUME_THRESHOLD`, silently dropping the service.
si_hvte['num_applications_total'] = pd.to_numeric(si_hvte['num_applications_total'], errors='coerce').fillna(0)
si_hvte['num_phone_enquiries'] = pd.to_numeric(si_hvte['num_phone_enquiries'], errors='coerce').fillna(0)

# Compute total applications including phone enquiries
si_hvte['num_applications_total_plus_phone_enquiries'] = si_hvte['num_applications_total'] + si_hvte['num_phone_enquiries']

# Filter for high-volume services. The explicit .copy() makes the filtered frame
# independent, so the column assignment below cannot raise SettingWithCopyWarning
# (same pattern as the other indicator cells).
si_hvte = si_hvte[si_hvte['num_applications_total_plus_phone_enquiries'] >= HIGH_VOLUME_THRESHOLD].copy()

# Keep the identifiers of `ss` rows whose target was met
ss_met = ss.loc[ss['target_met'] == 'Y', ['fy_org_id_service_id']]
# Note: this prints the number of matching `ss` rows; an identifier that appears on
# several rows is counted once per row.
print(f"Services meeting target: {ss_met.shape[0]}")

# Flag high-volume services that have at least one `ss` row with the target met
si_hvte['ss_target_met'] = si_hvte['fy_org_id_service_id'].isin(ss_met['fy_org_id_service_id'])

# Aggregate data at fiscal year level: summing the boolean flag counts the services
# meeting a target; counting identifiers gives the number of high-volume rows
dr2468 = si_hvte.groupby('fiscal_yr').agg(
    hvte_services_count_meeting_target=('ss_target_met', 'sum'),
    hvte_services_count=('fy_org_id_service_id', 'count')
).reset_index()

# Compute DR-2468 score (percentage)
dr2468['dr2468_score'] = (dr2468['hvte_services_count_meeting_target'] / dr2468['hvte_services_count']) * 100

print(dr2468)


Services meeting target: 7074
   fiscal_yr  hvte_services_count_meeting_target  hvte_services_count  \
0  2018-2019                                  54                  126   
1  2019-2020                                  65                  135   
2  2020-2021                                  59                  131   
3  2021-2022                                  56                  142   
4  2022-2023                                  68                  125   
5  2023-2024                                  65                  128   

   dr2468_score  
0     42.857143  
1     48.148148  
2     45.038168  
3     39.436620  
4     54.400000  
5     50.781250  


In [51]:
# DRR indicator ID DR-2469:
# Fraction of online applications over total applications for high volume services
# High volume: num_applications_total >= 45000
HIGH_VOLUME_THRESHOLD = 45000

# Keep only the services at or above the high-volume threshold
si_hv = si.loc[si['num_applications_total'] >= HIGH_VOLUME_THRESHOLD].copy()

# Per fiscal year, total up online and overall application counts.
# Online counts are coerced to numbers first; anything non-numeric or missing counts as 0.
dr2469 = (
    si_hv[['service_id', 'fiscal_yr', 'num_applications_total', 'num_applications_online']]
    .assign(
        num_applications_online=lambda frame: pd.to_numeric(
            frame['num_applications_online'], errors='coerce'
        ).fillna(0)
    )
    .groupby('fiscal_yr', as_index=False)
    .agg(
        online_applications=('num_applications_online', 'sum'),
        total_applications=('num_applications_total', 'sum'),
    )
)

# Score: share of applications submitted online, as a percentage
dr2469['dr2469_score'] = dr2469['online_applications'].div(dr2469['total_applications']).mul(100)

print(dr2469)

   fiscal_yr  online_applications  total_applications  dr2469_score
0  2018-2019          183172998.0         375468932.0     48.785128
1  2019-2020          167916288.0         332890936.0     50.441832
2  2020-2021          214391664.0         265063800.0     80.883042
3  2021-2022          196470237.0         276925384.0     70.946995
4  2022-2023          269060992.0         403844178.0     66.624953
5  2023-2024          266174489.0         405902576.0     65.575955
