# Quality assurance and automated service data review

This notebook reviews published service data for common mistakes. It relies on the outputs of gc-service-data-script to function.

In [1]:
import pandas as pd
import numpy as np
import requests
import pytz
from tqdm.notebook import tqdm  # Use tqdm.notebook for JupyterLab progress bars
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
# Capture the moment this notebook runs, expressed in the America/Montreal timezone.
# The formatted string is what gets reported below and reused by later cells.
timezone = pytz.timezone('America/Montreal')
current_datetime = pd.Timestamp.now(tz=timezone)
current_datetime_str = f"{current_datetime:%Y-%m-%d_%H:%M:%S}"
print(f'Current datetime: {current_datetime_str}')

Current datetime: 2024-12-02_11:45:08


In [3]:
# Load the service inventory (si) and service standards (ss) exports; both are ';'-delimited
si, ss = (pd.read_csv(csv_path, sep=';') for csv_path in ('si.csv', 'ss.csv'))

# The first cell of the last si row carries the generation timestamp.
# The parse below expects "<label>:<date>_<time>" and keeps only the date part.
timestamp_cell = si.iloc[-1, 0]
generation_stamp = timestamp_cell.split(':')[1]
date = pd.to_datetime(generation_stamp.split('_')[0])

# The trailing timestamp row is not data: drop it from both dataframes
si, ss = si.iloc[:-1], ss.iloc[:-1]

ss.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12477 entries, 0 to 12476
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fiscal_yr                   12477 non-null  object 
 1   service_id                  12477 non-null  object 
 2   service_name_en             12477 non-null  object 
 3   service_name_fr             12477 non-null  object 
 4   service_standard_id         12477 non-null  object 
 5   service_standard_en         12459 non-null  object 
 6   service_standard_fr         12406 non-null  object 
 7   type                        12477 non-null  object 
 8   gcss_tool_fiscal_yr         5724 non-null   object 
 9   channel                     12477 non-null  object 
 10  channel_comments_en         4286 non-null   object 
 11  channel_comments_fr         4326 non-null   object 
 12  target_type                 10424 non-null  object 
 13  target                      121

In [4]:
# Function to check if a URI's format is valid
def is_valid_format(uri):
    """Return True when `uri` is a string that starts with an http(s) scheme.

    Anything that is not a string (None, NaN, pd.NA, numbers, ...) and the
    empty string are reported as invalid.

    Parameters:
        uri: candidate value taken from a URI column.

    Returns:
        bool: True only for strings beginning with 'http://' or 'https://'.
    """
    # Check the type first: truth-testing some missing-value markers (e.g. pd.NA)
    # raises instead of returning a boolean.
    if not isinstance(uri, str):
        return False
    # An empty string simply fails the prefix test below
    return uri.startswith(('http://', 'https://'))  # Ensure valid web URL scheme

# Function to check if a URI's address is valid
def is_valid_uri(uri):
    """Request `uri` and report whether it ultimately answers with HTTP 200.

    A HEAD request is tried first (following redirects). If the server answers
    405 or 501 (HEAD not allowed / not implemented), the check is retried with
    a streamed GET so that the body is not downloaded.

    Parameters:
        uri (str): address to check.

    Returns:
        tuple: (uri, is_valid) where is_valid is True only for a final status
        code of 200. Any requests.RequestException (timeout, connection error,
        invalid URL, ...) yields (uri, False).
    """
    try:
        # The context manager closes the session (and its pooled connections)
        # once the check is done, instead of leaking one session per URI.
        with requests.Session() as session:
            response = session.head(uri, allow_redirects=True, timeout=60)
            if response.status_code in (405, 501):
                # The server rejects HEAD itself; that says nothing about the page
                response = session.get(uri, allow_redirects=True, timeout=60, stream=True)
            return uri, response.status_code == 200
    except requests.RequestException:  # Handle exceptions
        return uri, False

# Load existing validation results (if the file exists) so URIs checked in a
# previous run are not requested again
try:
    previous_results_df = pd.read_csv('uri_validation_results.csv')
    previous_validation_results = dict(zip(previous_results_df['uri'], previous_results_df['is_valid']))
except FileNotFoundError:
    # If no previous results file exists, start with an empty dictionary
    previous_validation_results = {}

# Define URI columns and their corresponding DataFrames
uri_cols = {
    'service_uri_en': si,
    'service_uri_fr': si,
    'standards_targets_uri_en': ss,
    'standards_targets_uri_fr': ss,
    'performance_results_uri_en': ss,
    'performance_results_uri_fr': ss
}

# Step 1: Consolidate all URIs into a single list
all_uris = pd.concat([df[col] for col, df in uri_cols.items()], ignore_index=True)
unique_uris = all_uris.dropna().unique()  # Drop NaN and get unique URIs
valid_format_uris = [uri for uri in unique_uris if is_valid_format(uri)]  # Filter valid formats

# Step 2: Identify new URIs that need validation
new_uris = [uri for uri in valid_format_uris if uri not in previous_validation_results]

print(f'checking {len(new_uris)}')

# Step 3: Validate new URIs with multithreading
validation_results = {}
with ThreadPoolExecutor(max_workers=500) as executor:  # Adjust max_workers based on system capacity
    future_to_uri = {executor.submit(is_valid_uri, uri): uri for uri in new_uris}
    for future in tqdm(as_completed(future_to_uri), total=len(new_uris), desc="Validating URIs"):
        uri, is_valid = future.result()
        validation_results[uri] = is_valid

# Combine previous and new results
validation_results.update(previous_validation_results)

# Step 4: Map validation results back to each DataFrame column.
# A URI with no recorded result (blank cell, malformed URI that was never
# requested) must come out as False. Casting the mapped column with
# .astype(bool) before filling would turn every NaN into True, so the lookup
# defaults to False instead.
for column, df in uri_cols.items():
    df[f'qa_{column}_is_valid'] = df[column].map(
        lambda uri: bool(validation_results.get(uri, False))
    )

# Step 5: Save updated validation results to a CSV file
validation_results_df = pd.DataFrame(list(validation_results.items()), columns=['uri', 'is_valid'])
validation_results_df.to_csv('uri_validation_results.csv', index=False)

# Number of URIs whose recorded result is True
valid_results = int(validation_results_df['is_valid'].sum())

print(f"All URI checks complete. {valid_results} were valid")

checking 0


Validating URIs: 0it [00:00, ?it/s]

All URI checks complete. 3763 were valid


In [82]:
# Duplicate service ID conflict
# Step 1: Provisional flag - rows whose 'service_id' occurs more than once in the same 'fiscal_yr'
sid_key = ['fiscal_yr', 'service_id']
si['qa_duplicate_sid'] = si.duplicated(subset=sid_key, keep=False)

# Step 2: Distinct service IDs carried by the provisionally flagged rows
duplicate_ids = si.loc[si['qa_duplicate_sid'], 'service_id'].unique()

# Step 3: For every row using one of those IDs, count how many distinct fiscal
# years each (service_id, department_en) pair appears in
rows_with_duplicate_ids = si.loc[
    si['service_id'].isin(duplicate_ids),
    ['fiscal_yr', 'service_id', 'department_en'],
]
duplicate_groups = (
    rows_with_duplicate_ids
    .groupby(['service_id', 'department_en'])['fiscal_yr']
    .nunique()
)

# Steps 4-5: Pairs seen in exactly one fiscal year are the problematic cases;
# keep only their identifying columns
single_year_groups = duplicate_groups[duplicate_groups == 1]
problematic_duplicates = single_year_groups.reset_index()[['service_id', 'department_en']]

# Step 6: Set of (service_id, department_en) tuples for fast membership tests
problematic_set = set(zip(problematic_duplicates['service_id'], problematic_duplicates['department_en']))

# Step 7: Replace the provisional flag - a row is flagged when its
# (service_id, department_en) pair is one of the problematic pairs
si['qa_duplicate_sid'] = [
    (service_id, department) in problematic_set
    for service_id, department in zip(si['service_id'], si['department_en'])
]


# check:
#si.loc[:, ['fiscal_yr', 'department_en', 'service_id', 'qa_duplicate_sid']][si['qa_duplicate_sid']]

{('4', 'Office of the Commissioner for Federal Judicial Affairs Canada'), ('2', 'Office of the Commissioner for Federal Judicial Affairs Canada'), ('7', 'Office of the Commissioner for Federal Judicial Affairs Canada'), ('3', 'Office of the Commissioner for Federal Judicial Affairs Canada'), ('1', 'Office of the Commissioner for Federal Judicial Affairs Canada')}


In [6]:
# Duplicate Service Standard ID conflict
# Flag every row whose 'service_standard_id' occurs more than once within the
# same 'fiscal_yr' (keep=False marks all occurrences, not just the repeats)
stdid_key = ['fiscal_yr', 'service_standard_id']
ss['qa_duplicate_stdid'] = ss.duplicated(subset=stdid_key, keep=False)

#ss.loc[:, ['fiscal_yr', 'department_en', 'service_id','service_standard_id', 'qa_duplicate_stdid']][ss['qa_duplicate_stdid']].sort_values(by='service_standard_id')

In [7]:
# Record is reported for a fiscal year that is incomplete or in the future.
# 'fiscal_yr_end_date' is April 1 of the year after the '-' in 'fiscal_yr',
# i.e. the first day AFTER a fiscal year that closes on March 31.
si['fiscal_yr_end_date'] = pd.to_datetime(si['fiscal_yr'].str.split('-').str[1]+'-04-01')
# Strict comparison: when the data was generated on April 1 itself, the fiscal
# year has already ended and must not be flagged.
si['qa_fiscal_yr_in_future'] = si['fiscal_yr_end_date'] > date

In [8]:
# Record has contradiction between client feedback channels and online interaction points for feedback

# True where the online channel (ONL) is listed for client feedback.
# na=False: a missing channel list counts as "ONL not listed"; without it the
# NaN returned by .str.contains makes the ~ negation below fail.
accepts_online_feedback = si['client_feedback_channel'].str.contains('ONL', na=False)

# Online issue resolution or feedback is not applicable (missing) or not activated ('N')
online_feedback_off = (
    si['os_issue_resolution_feedback'].isna() |
    (si['os_issue_resolution_feedback'] == 'N')
)

# Online issue resolution or feedback is activated
online_feedback_on = si['os_issue_resolution_feedback'] == 'Y'

si['qa_client_feedback_contradiction'] = (
    # Service accepts client feedback via the online channel (ONL) but online issue resolution or feedback is not applicable or not activated
    (accepts_online_feedback & online_feedback_off) |
    # Service has not listed the online channel (ONL) for client feedback but online issue resolution or feedback is activated
    (~accepts_online_feedback & online_feedback_on)
)

# si[['client_feedback_channel', 'os_issue_resolution_feedback', 'qa_client_feedback_contradiction']].loc[si['qa_client_feedback_contradiction'] == True]

In [9]:
# Service standards have volume but no volume indicated at service level

# Total service standard volume per (fiscal_yr, service_id)
ss_vol_by_service = (
    ss.groupby(['fiscal_yr', 'service_id'])['total_volume']
    .sum()
    .reset_index()
    .rename(columns={'total_volume':'total_volume_ss'})
)

# Attach that total to each service; services without any standard get 0.
# Any 'total_volume_ss' left over from a previous run of this cell is dropped
# first, otherwise the merge would produce '_x'/'_y' suffixed columns and the
# lookup below would fail.
si = (
    si.drop(columns='total_volume_ss', errors='ignore')
    .merge(ss_vol_by_service, on=['fiscal_yr', 'service_id'], how='left')
    .fillna({'total_volume_ss': 0})
)

# Flag services whose standards report volume while the service itself reports zero
si['qa_ss_vol_without_si_vol'] = (
    (si['total_volume_ss'] > 0) & (si['num_applications_total'] == 0)
)

In [10]:
# Services whose recipient type is society are not expected to report specific
# interaction volume, so a positive application total is flagged.
# Note that this assumption may be false
is_society_service = si['service_recipient_type'] == 'SOCIETY'
has_interactions = si['num_applications_total'] > 0
si['qa_service_recipient_type_society_with_interactions'] = is_society_service & has_interactions

In [11]:
# Services where 'persons' are a client type should not be 'NA' for SIN as ID:
# flag rows that target PERSON clients while 'sin_usage' is missing
targets_persons = si['client_target_groups'].str.contains('PERSON')
sin_usage_missing = si['sin_usage'].isna()
si['qa_use_of_sin_applicable'] = targets_persons & sin_usage_missing

In [12]:
# Services where 'econom' (business) are a client type should not be 'NA' for CRA BN as ID:
# flag rows that target ECONOM clients while 'cra_bn_identifier_usage' is missing
targets_businesses = si['client_target_groups'].str.contains('ECONOM')
cra_bn_usage_missing = si['cra_bn_identifier_usage'].isna()
si['qa_use_of_cra_bn_applicable'] = targets_businesses & cra_bn_usage_missing

In [13]:
# DataFrames to export, keyed by the base name of the CSV file each one is written to
csv_exports = dict(si_qa=si, ss_qa=ss)

for export_name, frame in csv_exports.items():
    out_path = f"{export_name}.csv"

    # Write the data as a ';'-delimited CSV without the index column
    frame.to_csv(out_path, index=False, sep=';')

    # Re-open in append mode and add a trailing line recording this run's timestamp
    with open(out_path, 'a') as out_file:
        out_file.write(f"\nTimestamp:{current_datetime_str}\n")
