# Quality assurance and automated service data review: URIs only

This notebook reviews published service data for issues with reported URIs. It relies on the outputs of gc-service-data-script to function.

In [None]:
import pandas as pd
import numpy as np
import requests, pytz, os, re
from tqdm.notebook import tqdm  # Use tqdm.notebook for JupyterLab progress bars
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
# Record when this review was run, expressed in the Montreal timezone
timezone = pytz.timezone('America/Montreal')
current_datetime = pd.Timestamp.now(timezone)

# Same stamp layout as before: date and time joined by an underscore
current_datetime_str = f"{current_datetime:%Y-%m-%d_%H:%M:%S}"
print(f'Current datetime: {current_datetime_str}')

In [None]:
# Import service inventory and service standards to dataframes.
# keep_default_na=False with an empty na_values list means no cell is
# converted to NaN on load; blank cells stay as empty strings.
si = pd.read_csv('si.csv', sep=';',  na_values=[], keep_default_na=False)
ss = pd.read_csv('ss.csv', sep=';',  na_values=[], keep_default_na=False)

# Extract date of generation from timestamp on last line
# (text between the first and second ':' of the first cell, up to the first '_')
date = pd.to_datetime(si.iloc[-1, 0].split(':')[1].split('_')[0])

# Remove last line with datestamp from dataframes.
# Take explicit copies: columns are added to these frames later on, and
# assigning into an iloc slice raises SettingWithCopyWarning on pandas < 3.
si = si.iloc[:-1].copy()
ss = ss.iloc[:-1].copy()

In [None]:
# Function to check if a URI's format is a problem
def is_problem_format(uri):
    """Return True when *uri* is not a well-formed http(s) URI.

    The value is converted to ``str`` and stripped of surrounding whitespace
    before checking, so non-string inputs (e.g. ``None``) are simply reported
    as problematic rather than raising.

    A URI is considered problematic when any of the following holds:

    * it contains more than one ``http://`` / ``https://`` scheme marker in
      total (typically two URIs pasted together, including mixed cases such
      as ``https://a.cahttp://b.ca``);
    * it does not start with ``http://`` or ``https://``;
    * it contains a character outside the set allowed by the pattern below
      (spaces, non-ASCII letters, quotes, brackets, ...).

    Returns:
        bool: True if the format is a problem, False otherwise.
    """
    # Normalize URI by converting to string and stripping whitespace
    uri = str(uri).strip()

    # Count both schemes together: counting them separately misses a
    # concatenation of one http:// URI with one https:// URI.
    if len(re.findall(r'https?://', uri)) > 1:
        return True  # Problematic if more than one scheme

    # Check if input starts with valid schemes
    if not uri.startswith(('http://', 'https://')):
        return True  # Problematic if it does not start with a valid scheme

    # Problematic if any character outside the allowed set is found
    invalid_characters_pattern = r'[^a-zA-Z0-9\-._~:/?#@!$&\'()*+,;=%]'
    return re.search(invalid_characters_pattern, uri) is not None


# Function to check if a URI's address is a problem
def is_problem_uri(uri):
    """Send a HEAD request to *uri* and report whether it looks broken.

    Redirects are followed, and the request times out after 60 seconds.

    Returns:
        tuple: ``(uri, is_problem, detail)`` where ``is_problem`` is True when
        the final status code is not 200 or the request failed, and ``detail``
        is the status code (int) or the exception message (str).
    """
    try:
        # Use the session as a context manager so its connection pool is
        # released when the check finishes; this function is run from many
        # worker threads and unclosed sessions would pile up.
        with requests.Session() as session:
            response = session.head(uri, allow_redirects=True, timeout=60)
            return uri, response.status_code != 200, response.status_code
    except requests.RequestException as e:
        return uri, True, str(e)

# Consolidate all URIs from DataFrames: each URI column mapped to the
# DataFrame that holds it
uri_cols = {
    'service_uri_en': si,
    'service_uri_fr': si,
    'standards_targets_uri_en': ss,
    'standards_targets_uri_fr': ss,
    'performance_results_uri_en': ss,
    'performance_results_uri_fr': ss
}

# Set up dictionaries to store validation results and descriptions of issues.
# Keys will be URIs, values will be results (True = problem) or details
validation_results = {}
details = {}

# Generate a list of unique URIs to check
all_uris = pd.concat([df[col] for col, df in uri_cols.items()], ignore_index=True)
print(f'total uri records: {len(all_uris)}')

unique_uris = all_uris.dropna().sort_values().unique()
print(f'unique uris: {len(unique_uris)}')

# Filter out problematic formats so we don't validate them with requests
problem_format_uris = [uri for uri in unique_uris if is_problem_format(uri)]
print(f'problem format uris: {len(problem_format_uris)}')

# All problem format URIs are assigned the same description
for uri in problem_format_uris:
    validation_results[uri] = True
    details[uri] = "Incorrect URI format"

# Remove problematic format URIs from unique_uris.
# Membership is tested against a set to avoid a quadratic list scan.
problem_format_set = set(problem_format_uris)
unique_uris = [uri for uri in unique_uris if uri not in problem_format_set]
print(f'valid format uris: {len(unique_uris)}')

# Use ThreadPoolExecutor for parallel validation of remaining URIs
with ThreadPoolExecutor(max_workers=1000) as executor:
    results = executor.map(is_problem_uri, unique_uris)

    # Process results from executor.map (yielded in input order)
    for uri, is_problem, detail in results:
        validation_results[uri] = is_problem
        details[uri] = detail

print('done checks.')

# Filter validation_results for only results with errors:
filtered_results = {uri: is_problem for uri, is_problem in validation_results.items() if is_problem}

print(f'problem uris: {len(filtered_results)}')

# Build the table of problem URIs. The collected detail (status code, error
# message or format description) is included as an extra column so the saved
# file says why each URI was flagged.
filtered_results_df = pd.DataFrame(
    [(uri, is_problem, details.get(uri)) for uri, is_problem in filtered_results.items()],
    columns=['uri', 'is_problem', 'detail']
)

# Map validation results to DataFrames.
# A row is flagged exactly when its URI is one of the problem URIs. Using
# isin() avoids the previous merge + astype(bool), which turned the NaN of
# every unmatched row into True and so flagged all rows as problems.
problem_uris = set(filtered_results)
for column, df in uri_cols.items():
    # Add validation status
    df[f'qa_{column}_is_problem'] = df[column].isin(problem_uris)

print('Dataframes updated')


# Save validation errors
filtered_results_df.to_csv('uri_validation_errors.csv', index=False)
print('Results saved')