In [None]:
import requests
import pandas
from tqdm.auto import tqdm
import numpy

In [None]:
# Enable tqdm's pandas integration.
# NOTE(review): no cell visible in this notebook calls a `progress_*` method — confirm
# whether this is still needed.
tqdm.pandas()

In [None]:
import pprint
import re
import json

In [None]:
from db_utils import minio_utils

In [None]:
# Load credentials (MinIO, CKAN API key, proxy) from the mounted secrets file.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open("/home/jovyan/secrets/secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

## Getting inventory data

In [None]:
# Fetch the inventory workbook from the "data-inventory" MinIO bucket, authenticating
# with the EDGE credentials from `secrets`. The next cell reads the workbook from the
# same ../tempdata path.
# NOTE(review): presumably `filename` is the local destination and ../tempdata must
# already exist — confirm against db_utils.minio_utils.
minio_utils.minio_to_file(
    filename="../tempdata/WIP-Data-Inventory-November-2019.xlsx",
    minio_bucket="data-inventory",
    minio_key=secrets["minio"]["edge"]["access"],
    minio_secret=secrets["minio"]["edge"]["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Parse the downloaded workbook into a DataFrame; every later cell iterates over
# `inventory_df` (one row per inventoried data set).
inventory_df = pandas.read_excel("../tempdata/WIP-Data-Inventory-November-2019.xlsx")

In [None]:
# Store the parsed inventory, unmodified, in the "data-inventory.raw" bucket using the
# same EDGE credentials and classification as the download above.
minio_utils.dataframe_to_minio(
    inventory_df,
    minio_bucket="data-inventory.raw",
    minio_key=secrets["minio"]["edge"]["access"],
    minio_secret=secrets["minio"]["edge"]["secret"],
    data_classification=minio_utils.DataClassification.EDGE
)

In [None]:
# Show the workbook's column names; the cells below use them verbatim as dictionary keys.
inventory_df.columns

In [None]:
# Count rows per directorate after title-casing and trimming whitespace — the same
# normalisation the package-create loop applies before looking a name up in `directorates`.
inventory_df['DS Directorate'].str.title().str.strip().value_counts()

## Creating Organisations

In [None]:
# Title-cased directorate names that get their own CKAN organisation.
# Membership in this set is also what the package-create loop tests a row's
# (title-cased) 'DS Directorate' value against.
directorates = {
    "Community Services And Health",
    "Corporate Services",
    "Economic Opportunities And Asset Management",
    "Energy And Climate Change",
    "Finance",
    "Human Settlements",
    "Office Of The City Manager",
    "Safety And Security",
    "Spatial Planning And Environment",
    "Transport",
    "Urban Management",
    "Water And Waste",
}

In [None]:
# API key sent in the X-CKAN-API-Key header of every CKAN request below.
ckan_api_key = secrets["city-ckan"]["ckan-api-key"]

In [None]:
# URL template for the catalogue's action API; `.format(<action name>)` fills in the
# action, e.g. 'package_create' or 'resource_delete'.
api_action_path_template = 'https://ds3.capetown.gov.za/data-catalogue/api/action/{}'

In [None]:
# Register each directorate as a CKAN organisation. The organisation's `name` is the
# directorate lower-cased with spaces turned into hyphens; the response object is
# printed next to the directorate so failures are visible in the cell output.
organization_create_url = api_action_path_template.format('organization_create')
organization_create_headers = {"X-CKAN-API-Key": ckan_api_key}

for directorate in directorates:
    directorate_slug = "-".join(directorate.lower().split(" "))

    organization_fields = {
        "name": directorate_slug,
        "title": directorate,
        "description": f"Organisation for data sets that are under the stewardship of the {directorate} directorate",
    }
    resp = requests.post(
        organization_create_url,
        data=organization_fields,
        headers=organization_create_headers,
    )

    print(directorate, resp)

In [None]:
# Teardown: purge the organisation of every directorate in `directorates`.
# This cell sits directly after the create cell, so running the notebook top to bottom
# removes the organisations that were just created — run it only when a reset is wanted.
organization_purge_url = api_action_path_template.format('organization_purge')
organization_purge_headers = {"X-CKAN-API-Key": ckan_api_key}

for directorate in directorates:
    directorate_slug = "-".join(directorate.lower().split(" "))

    resp = requests.post(
        organization_purge_url,
        data={"id": directorate_slug},
        headers=organization_purge_headers,
    )

    print(directorate, resp)

## Loading inventory into CKAN

In [None]:
# Matches any single character that is NOT a lowercase letter, digit, underscore or
# hyphen; substituting matches with "" leaves a slug made only of those characters.
bad_char_pattern = re.compile(r"[^a-z0-9_-]")

In [None]:
# Matches an e-mail address: a local part, '@', then one or more dot-separated host
# labels. The whole address is capture group 1 (and therefore also group 0).
# Written as a raw string so that `\.` reaches the regex engine as-is instead of being
# an invalid escape sequence in an ordinary string literal.
email_pattern = re.compile(r"([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)")

Package (dataset) create [API reference](https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create)

In [None]:
# Inventory columns that must NOT be copied into a package's `extras`.
# Names are matched against the raw column headers, so stray whitespace (for example
# the trailing space in 'Location Type Description ') is significant.
metadata_field_blacklist = {
    # Free-text / descriptive columns
    'Additional Data Sourcing Method Description (Free Text)  x1, x2, xn',
    'Comments',
    'Data Format Description',
    'Data Set Description (old)',
    'Data Set Purpose',
    'Event-based frequency description',
    'Location Type Description ',
    'Object Type Description',
    'Person Type Description',
    # Sourcing, typing and infrastructure details
    'Data Source Name',
    'Data Sourcing Method',
    'Data Type',
    'Master Data Content Type',
    'Physical Locations',
    'Server Name',
    'Update Frequency',
    # Contact details (the package-create loop reads this column separately)
    'DS/TR Contact Information',
    # Header-less spreadsheet columns
    'Unnamed: 35',
    'Unnamed: 36',
    'Unnamed: 37',
}

In [None]:
# Create one CKAN package (dataset) per inventory row.
#
# For each row:
#   * the alias is turned into a URL-safe slug, which becomes the package `name`;
#   * the title-cased directorate selects the owning organisation; rows whose directorate
#     is not in `directorates` are filed under the 'under-maintenence' organisation;
#   * every column not in `metadata_field_blacklist` is attached as a stringified extra;
#   * the first e-mail address found in the contact column becomes `maintainer_email`;
#   * rows whose slug is shorter than two characters are reported and skipped.
#
# NOTE(review): no cell in this notebook creates the 'under-maintenence' organisation —
# presumably it already exists in CKAN; confirm before running.


def _none_if_missing(value):
    """Return None when `value` is a pandas missing value, otherwise `value` unchanged.

    json.dumps writes a float NaN as the bare token ``NaN``, which is not valid JSON,
    so empty spreadsheet cells are replaced before the metadata is encoded.
    """
    return None if pandas.isna(value) else value


package_create_url = api_action_path_template.format('package_create')
package_create_headers = {"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'}

http = requests.Session()
for dataset in tqdm(inventory_df.to_dict(orient='records')):
    dataset_alias = dataset['Data Set Alias (Commonly known name)']

    # An empty alias cell is NaN; slug it as "" so the row is reported as too short
    # instead of becoming a package literally named "nan".
    alias_text = "" if pandas.isna(dataset_alias) else str(dataset_alias)
    dataset_slug = bad_char_pattern.sub("", alias_text.strip().lower().replace(" ", "-"))

    directorate = str(dataset['DS Directorate']).strip().title()
    directorate_slug = directorate.lower().replace(" ", "-")

    dataset_metadata = {
        'name': dataset_slug,
        'title': _none_if_missing(dataset_alias),
        'private': False,
        'author': _none_if_missing(dataset['Data Owner']),
        'maintainer': _none_if_missing(dataset['Data Steward (DS)']),
        'notes': _none_if_missing(dataset['Data Set Description']),
        'owner_org': directorate_slug,
        'extras': [
            {'key': column.strip(), 'value': str(dataset[column])}
            for column in inventory_df.columns.values
            if column not in metadata_field_blacklist
        ],
    }

    # `search`, not `match`: the address may appear anywhere in the free-text contact
    # cell (e.g. after a name), not only at its very start.
    contact_details = str(dataset['DS/TR Contact Information']) if pandas.notna(dataset['DS/TR Contact Information']) else ""
    contact_email_match = email_pattern.search(contact_details)
    if contact_email_match:
        dataset_metadata["maintainer_email"] = contact_email_match.group(0)

    if len(dataset_slug) < 2:
        print(f"data set '{dataset['Data Set Alias (Commonly known name)']}' is too short")
        continue

    if directorate not in directorates:
        print(f"data set '{dataset['Data Set Alias (Commonly known name)']}' in unknown directorate '{directorate}', putting in under maintenance...")
        directorate = 'Under Maintenence'
        directorate_slug = 'under-maintenence'
        dataset_metadata['owner_org'] = directorate_slug

    resp = http.post(
        package_create_url,
        data=json.dumps(dataset_metadata),
        headers=package_create_headers,
    )
    # Surface rejected packages instead of dropping the response silently.
    if not resp.ok:
        print(f"failed to create data set '{dataset_slug}': HTTP {resp.status_code}")

In [None]:
# Teardown: purge the CKAN package of every inventory row, addressing each package by
# the same alias-derived slug the create loop uses as its `name`.
# This cell follows the create cell, so a top-to-bottom run removes what was just loaded.
http = requests.Session()
dataset_purge_url = api_action_path_template.format('dataset_purge')
dataset_purge_headers = {"X-CKAN-API-Key": ckan_api_key}

for dataset in tqdm(inventory_df.to_dict(orient='records')):
    alias_text = str(dataset['Data Set Alias (Commonly known name)'])
    dataset_slug = bad_char_pattern.sub("", alias_text.lower().strip().replace(" ", "-"))

    resp = http.post(
        dataset_purge_url,
        data={"id": dataset_slug},
        headers=dataset_purge_headers,
    )

## Linking

### MPortal

In [None]:
# Proxy URL (with credentials embedded) passed as both the http and https proxy for the
# MPortal and Open Data Portal requests below.
# NOTE(review): username and password are interpolated verbatim — characters such as
# '@', ':' or '/' would presumably need percent-encoding; confirm against the stored values.
city_proxy = f"http://{secrets['proxy']['username']}:{secrets['proxy']['password']}@internet05.capetown.gov.za:8080/"

In [None]:
# Ask MPortal for the services under Single_Layers and index them by the lower-cased
# last path segment of each service name, so inventory aliases can be looked up
# case-insensitively. The value is the full service name as MPortal reports it.
mportal_proxies = {"http": city_proxy, "https": city_proxy}
mportal_feature_list_request = requests.get(
    "http://mportal.capetown.gov.za/agsint/rest/services/Single_Layers?f=pjson",
    proxies=mportal_proxies,
)

mportal_feature_dict = {}
for mportal_service in mportal_feature_list_request.json()['services']:
    service_name = mportal_service['name']
    mportal_feature_dict[service_name.rsplit("/", 1)[-1].lower()] = service_name

In [None]:
# Attach a 'Link to MPortal Layer' resource to every package whose alias (lower-cased,
# spaces as underscores) matches an MPortal service name.
# NOTE(review): only the first 10 inventory rows are processed ([:10]) while the other
# loops in this notebook walk the whole inventory — looks like a leftover test limit; confirm.
http = requests.Session()
mportal_template = "http://mportal.capetown.gov.za/agsint/rest/services/{}/MapServer"
mportal_resource_create_url = api_action_path_template.format('resource_create')
mportal_resource_headers = {"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'}

for dataset in tqdm(inventory_df.to_dict(orient='records')[:10]):
    alias_text = str(dataset['Data Set Alias (Commonly known name)']).lower().strip()
    dataset_title = alias_text.replace(" ", "_")

    # Nothing to link when MPortal has no layer under this name.
    if dataset_title not in mportal_feature_dict:
        continue

    resource_url = mportal_template.format(mportal_feature_dict[dataset_title])
    print(f'"{dataset_title}" is present at "{resource_url}", creating resource...')

    # Same slug the package was created under.
    dataset_slug = bad_char_pattern.sub("", alias_text.replace(" ", "-"))

    resource_metadata = {
        'package_id': dataset_slug,
        'url': resource_url,
        'resource_type': 'api',
        'format': 'API',
        'name': 'Link to MPortal Layer',
    }

    resp = http.post(
        mportal_resource_create_url,
        data=json.dumps(resource_metadata),
        headers=mportal_resource_headers,
    )

In [None]:
# Teardown: delete every resource named 'Link to MPortal Layer' from the catalogue.
# Packages are listed in pages of `batch_size` (at most 10 pages); paging stops at the
# first empty page instead of always issuing all 10 requests.
# NOTE(review): deleting a resource presumably changes the package's modification time,
# which may reorder later pages when the catalogue holds more than `batch_size`
# packages — confirm against the CKAN API documentation.
http = requests.Session()
ckan_json_headers = {"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'}

batch_size = 1000
for i in range(10):
    # Kept under its own name so the delete responses below cannot clobber it.
    page_resp = http.post(
        api_action_path_template.format('current_package_list_with_resources'),
        data=json.dumps({'limit': batch_size, 'offset': i*batch_size}),
        headers=ckan_json_headers,
    )
    page_datasets = page_resp.json()['result']
    if not page_datasets:
        break

    for dataset in tqdm(page_datasets):
        for resource in dataset['resources']:
            if resource['name'] == 'Link to MPortal Layer':
                resp = http.post(
                    api_action_path_template.format('resource_delete'),
                    data=json.dumps({"id": resource['id'], "package_id": dataset['id']}),
                    headers=ckan_json_headers,
                )

### Open Data Portal

In [None]:
# Fetch the layers of the Open Data Service map server and collect their names in slug
# form: lower-cased, spaces as hyphens, and any "---" collapsed to a single hyphen.
odp_proxies = {"http": city_proxy, "https": city_proxy}
odp_feature_list_request = requests.get(
    "https://citymaps.capetown.gov.za/agsext1/rest/services/Theme_Based/Open_Data_Service/MapServer/?f=pjson",
    proxies=odp_proxies,
)

odp_feature_set = set()
for odp_layer in odp_feature_list_request.json()['layers']:
    layer_slug = odp_layer['name'].lower().replace(" ", "-")
    odp_feature_set.add(layer_slug.replace("---", "-"))

In [None]:
# Attach a 'Link to Open Data Portal data set' resource to every package whose alias,
# in slug form, matches a layer name collected in `odp_feature_set`.
http = requests.Session()
# Named for what it holds: the original reused `mportal_template` here, which left an
# Open Data Portal URL behind under the MPortal name.
odp_template = "https://odp-cctegis.opendata.arcgis.com/datasets/{}"
for dataset in tqdm(inventory_df.to_dict(orient='records')[:]):
    dataset_title = str(dataset['Data Set Alias (Commonly known name)']).lower().strip().replace(" ","-").replace("---", "-")

    if dataset_title in odp_feature_set:
        resource_url = odp_template.format(dataset_title)
        print(f'"{dataset_title}" is present at "{resource_url}", creating resource...')

        # Same slug the package was created under (no "---" collapsing here).
        dataset_slug = str(dataset['Data Set Alias (Commonly known name)']).lower().strip().replace(" ","-")
        dataset_slug = bad_char_pattern.sub("", dataset_slug)

        resource_metadata = {
            'package_id': dataset_slug,
            'url': resource_url,
            'resource_type': 'api',
            'format': 'API',
            'name': 'Link to Open Data Portal data set',
        }

        resp = http.post(
            api_action_path_template.format('resource_create'),
            data=json.dumps(resource_metadata),
            headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
        )

In [None]:
# Teardown: delete every resource named 'Link to Open Data Portal data set'.
# Packages are listed in pages of `batch_size` (at most 10 pages); paging stops at the
# first empty page instead of always issuing all 10 requests. Both endpoints are built
# from `api_action_path_template`, like the other CKAN calls in this notebook.
# NOTE(review): deleting a resource presumably changes the package's modification time,
# which may reorder later pages when the catalogue holds more than `batch_size`
# packages — confirm against the CKAN API documentation.
http = requests.Session()
ckan_json_headers = {"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'}

batch_size = 1000
for i in range(10):
    # Kept under its own name so the delete responses below cannot clobber it.
    page_resp = http.post(
        api_action_path_template.format('current_package_list_with_resources'),
        data=json.dumps({'limit': batch_size, 'offset': i*batch_size}),
        headers=ckan_json_headers,
    )
    page_datasets = page_resp.json()['result']
    if not page_datasets:
        break

    for dataset in tqdm(page_datasets):
        for resource in dataset['resources']:
            if resource['name'] == 'Link to Open Data Portal data set':
                resp = http.post(
                    api_action_path_template.format('resource_delete'),
                    data=json.dumps({"id": resource['id'], "package_id": dataset['id']}),
                    headers=ckan_json_headers,
                )