In [1]:
import pandas as pd
import numpy as np
import re
import pytz
import os
from pathlib import Path
import sys
sys.path.append("/workspaces/service-data")
import requests

from src.clean import clean_percentage, normalize_string, standardize_column_names, clean_fiscal_yr
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

# Current working directory of the running process, and the directory one
# level above it. Both are pathlib.Path objects.
# NOTE(review): presumably the notebook is run from a sub-folder of the
# repository so that parent_dir is the repo root — confirm against usage.
base_dir = Path.cwd()
parent_dir = base_dir.parent

In [None]:
# Input: the semicolon-delimited `si.csv` published with the
# gcperformance/service-data release pinned in the URL below.
file_path = "https://github.com/gcperformance/service-data/releases/download/service_data-425af09/si.csv"

# keep_default_na=False together with na_values='' means that only empty
# fields are read as NaN; literal codes such as 'NA' are kept as strings,
# which the indicator cells further down compare against.
si = pd.read_csv(file_path, keep_default_na=False, na_values='', delimiter=';')

# "High volume" rows: num_applications_total is at least this value.
high_volume_threshold = 45000
si_hv = si[si['num_applications_total'] >= high_volume_threshold]

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() also emitted a stray "None" line.
si_hv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8935 entries, 0 to 8934
Data columns (total 53 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   fiscal_yr                                            8935 non-null   object 
 1   service_id                                           8934 non-null   object 
 2   service_name_en                                      8934 non-null   object 
 3   service_name_fr                                      8934 non-null   object 
 4   service_description_en                               8933 non-null   object 
 5   service_description_fr                               8933 non-null   object 
 6   service_type                                         8934 non-null   object 
 7   service_recipient_type                               8934 non-null   object 
 8   service_scope                                        8934 non-null  

In [None]:
# DRR indicator ID DR-2467:
# Fraction of high volume services that are fully available online
# High volume: num_applications_total >= 45000 (`si_hv` is already filtered
# to those rows when it is built from `si`).

# Online interaction point columns that are evaluated for each service
oip_cols = [
    'os_account_registration',
    'os_authentication',
    'os_application',
    'os_decision',
    'os_issuance',
    'os_issue_resolution_feedback',
]
n_interaction_points = len(oip_cols)

# Reshape to one row per (fiscal_yr, service_id, online interaction point)
dr2467_long = pd.melt(
    si_hv,
    id_vars=['fiscal_yr', 'service_id'],
    value_vars=oip_cols,
    var_name='online_interaction_point',
    value_name='activation')

# Flag each activation state so the flags can be counted per service
dr2467_long = dr2467_long.assign(
    activation_y=dr2467_long['activation'] == 'Y',
    activation_n=dr2467_long['activation'] == 'N',
    activation_na=dr2467_long['activation'] == 'NA',
)

# Count the activation states for each service in each fiscal year
dr2467_by_service = (
    dr2467_long
    .groupby(['fiscal_yr', 'service_id'])
    .agg(
        activation_y=('activation_y', 'sum'),
        activation_n=('activation_n', 'sum'),
        activation_na=('activation_na', 'sum'),
    )
    .reset_index()
)

# Exclude services where every interaction point is the string 'NA'
# (not applicable): they are out of scope for this indicator. The previous
# version tried to mark these with None via np.select and then dropna(),
# but the .astype(bool) turned None into False, so only this filter ever
# removed them.
dr2467_by_service = dr2467_by_service[
    dr2467_by_service['activation_na'] < n_interaction_points
].copy()

# A service is online end-to-end when none of its interaction points is 'N'.
# NOTE(review): any value other than 'N' (blank, or a code other than
# 'Y'/'NA') does not prevent a service from counting as end-to-end — confirm
# that this is intended.
dr2467_by_service['online_e2e'] = dr2467_by_service['activation_n'] == 0

# Fiscal-year-level counts of online end-to-end services and of all
# in-scope services
dr2467 = dr2467_by_service.groupby(['fiscal_yr']).agg(
    online_e2e_count=('online_e2e', 'sum'),
    service_count_dr2467=('service_id', 'nunique')
).reset_index()

# Score: percentage of in-scope services that are online end-to-end
dr2467['dr2467_score'] = (dr2467['online_e2e_count'] / dr2467['service_count_dr2467']) * 100

print(dr2467)

   fiscal_yr  online_e2e_count  service_count_dr2467  dr2467_score
0  2018-2019                32                    88     36.363636
1  2019-2020                43                   100     43.000000
2  2020-2021                53                    93     56.989247
3  2021-2022                65                   107     60.747664
4  2022-2023                53                   100     53.000000
5  2023-2024                53                    98     54.081633


In [30]:
# DRR indicator ID DR-2469:
# Fraction of online applications over total applications for high volume services
# High volume: num_applications_total >= 45000 (`si_hv` is already filtered
# to those rows when it is built from `si`).

# Keep only the columns needed for this indicator. The explicit .copy()
# makes this an independent frame, so the cleaning below can never write
# through to `si_hv`.
dr2469_apps = si_hv.loc[
    :, ['service_id', 'fiscal_yr', 'num_applications_total', 'num_applications_online']
].copy()

# The codes 'NA' and 'ND' are treated as zero online applications, then the
# column is converted to float so it can be summed.
dr2469_apps['num_applications_online'] = (
    dr2469_apps['num_applications_online']
    .replace(['NA', 'ND'], 0)
    .astype(float)
)

# Sanity checks (these replace the earlier dtype debug prints): both
# application counts must be numeric before they are aggregated.
assert pd.api.types.is_numeric_dtype(dr2469_apps['num_applications_total'])
assert pd.api.types.is_numeric_dtype(dr2469_apps['num_applications_online'])

# Fiscal-year-level totals of online and overall applications
dr2469 = dr2469_apps.groupby(['fiscal_yr']).agg(
    online_applications=('num_applications_online', 'sum'),
    total_applications=('num_applications_total', 'sum')
).reset_index()

# Score: online applications as a percentage of all applications
dr2469['dr2469_score'] = (dr2469['online_applications'] / dr2469['total_applications']) * 100

print(dr2469)

float64
float64
   fiscal_yr  online_applications  total_applications  dr2469_score
0  2018-2019          183172998.0         375468932.0     48.785128
1  2019-2020          167916288.0         332890936.0     50.441832
2  2020-2021          214391664.0         265063800.0     80.883042
3  2021-2022          196470237.0         276925384.0     70.946995
4  2022-2023          269060992.0         403844178.0     66.624953
5  2023-2024          266174489.0         405902576.0     65.575955
