In [None]:
import requests
import pandas
from tqdm.auto import tqdm
import numpy

In [None]:
# from ldap3 import Server, Connection, ALL, NTLM 

In [None]:
# Register tqdm's progress-bar integration with pandas (enables `progress_apply`).
tqdm.pandas()

In [None]:
import pprint
import re
import json
from difflib import SequenceMatcher
import time

In [None]:
from db_utils import minio_utils

In [None]:
# Load the credentials used throughout the notebook (the "minio", "city-ckan"
# and "proxy" sections are read further down). The context manager makes sure
# the file handle is closed as soon as the JSON has been parsed.
with open("/home/jovyan/secrets/secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

## Getting inventory data

In [None]:
# Fetch the inventory workbook from the "data-inventory" bucket using the EDGE
# credentials. `minio_to_file` is a project helper defined outside this
# notebook; presumably it writes the object to `filename`, which is the path
# the next cell reads from.
minio_utils.minio_to_file(
    filename="../tempdata/WIP-Data-Inventory-November-2019.xlsx",
    minio_bucket="data-inventory",
    minio_key=secrets["minio"]["edge"]["access"],
    minio_secret=secrets["minio"]["edge"]["secret"],
    data_classification=minio_utils.DataClassification.EDGE,
)

In [None]:
# Parse the inventory workbook into a DataFrame; every later cell works from `inventory_df`.
inventory_df = pandas.read_excel("../tempdata/WIP-Data-Inventory-November-2019.xlsx")

In [None]:
# Hand the parsed inventory to the project's `dataframe_to_minio` helper,
# targeting the "data-inventory.raw" bucket with the same EDGE credentials.
# NOTE(review): the stored object name/format is decided inside the helper — confirm there.
minio_utils.dataframe_to_minio(
    inventory_df,
    minio_bucket="data-inventory.raw",
    minio_key=secrets["minio"]["edge"]["access"],
    minio_secret=secrets["minio"]["edge"]["secret"],
    data_classification=minio_utils.DataClassification.EDGE
)

In [None]:
# Show the spreadsheet's column headers; the cells below index rows by these exact labels.
inventory_df.columns

In [None]:
# Count data sets per directorate after normalising case and surrounding
# whitespace, to see which spellings occur before defining `directorates`.
inventory_df['DS Directorate'].str.title().str.strip().value_counts()

## Creating Organisations

In [None]:
# Title-cased directorate names that get their own CKAN organisation.
# The package-creation loop compares `str(...).strip().title()` of the
# "DS Directorate" column against this set, so entries must stay title-cased.
# "Under Construction" is the organisation that loop falls back to.
directorates = {
   "Water And Waste",
   "Spatial Planning And Environment",
   "Corporate Services",
   "Finance",
   "Economic Opportunities And Asset Management",
   "Energy And Climate Change",
   "Community Services And Health",
   "Transport", 
   "Safety And Security",
   "Human Settlements", 
   "Urban Management",
   "Office Of The City Manager",
   "Under Construction"
}

In [None]:
# API key sent in the "X-CKAN-API-Key" header of every CKAN request below.
ckan_api_key = secrets["city-ckan"]["ckan-api-key"]

In [None]:
# URL template for CKAN action endpoints; `.format(<action name>)` fills in the placeholder.
api_action_path_template = 'https://ds3.capetown.gov.za/data-catalogue/api/action/{}'

In [None]:
# Create one CKAN organisation per directorate. The organisation name is the
# lower-cased directorate with spaces turned into underscores; the raw
# response object is printed so failures are visible in the cell output.
organization_create_url = api_action_path_template.format('organization_create')
for directorate in directorates:
    directorate_slug = directorate.lower().replace(" ","_")
    organisation_payload = {
        "name": directorate_slug,
        "title": directorate,
        "description": f"Organisation for data sets that are under the stewardship of the {directorate} directorate"
    }

    resp = requests.post(
        organization_create_url,
        data=organisation_payload,
        headers={"X-CKAN-API-Key": ckan_api_key},
    )

    print(directorate, resp)

In [None]:
# WARNING: teardown cell. It purges every organisation named after an entry in
# `directorates` — i.e. exactly the ones the previous cell creates. Only run it
# when the catalogue has to be reset.
organization_purge_url = api_action_path_template.format('organization_purge')
for directorate in directorates:
    directorate_slug = directorate.lower().replace(" ","_")
    purge_payload = {"id": directorate_slug}

    resp = requests.post(
        organization_purge_url,
        data=purge_payload,
        headers={"X-CKAN-API-Key": ckan_api_key},
    )

    print(directorate, resp)

## Getting email addresses
With the power of LDAP*!

\*no ADs were harmed in the making of this email list

In [None]:
# Stack the three people columns (steward, custodian, technical reference) into
# one Series and normalise every entry with `label_sanitise`.
# NOTE(review): `label_sanitise` is defined in a later cell of this notebook, so
# on a fresh kernel this cell raises NameError unless that cell is run first.
people = pandas.concat([inventory_df[col] for col in ["Data Steward (DS)", "Data Custodian", "Technical Reference"]]).apply(label_sanitise)

### From my laptop
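The LDAP ports still need to be unblocked for this environment, so for now the unique names are exported to CSV below and the file with the looked-up email addresses is read back in.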

In [None]:
# Export the distinct sanitised names (one per row, no index column) for the offline email lookup.
pandas.DataFrame(people.unique()).to_csv("inventory_people.csv", index=False)

In [None]:
# Read the lookup result back in. The file has no header row, so the columns are
# labelled 0 and 1; the next cell uses column 0 as the key and column 1 as the value.
inventory_emails = pandas.read_csv("inventory_people_with_email.csv", header=None)

In [None]:
# Mapping from column 0 (used as the lookup key) to column 1 of the email CSV.
emails_dict = inventory_emails.set_index(0)[1].to_dict()

In [None]:
# Manual entry added on top of the values loaded from the CSV.
emails_dict["Adri Janse Van Rensburg"] = "Adri.JansevanRensburg@capetown.gov.za"

### How the sausage is made

In [None]:
# raw_df = people[0].str.split(r"[/\n\-\:]", expand=True).dropna(how="all")

In [None]:
# server = Server('capetown.gov.za', get_info=ALL, use_ssl=True) 
# conn = Connection(
#     server, 
#     user=f"CAPETOWN\\{secrets["proxy"]["username"]}", password=f"{secrets["proxy"]["password"]}", 
#     authentication=NTLM, auto_bind=True
# )

In [None]:
# @functools.lru_cache(1000) 
# def lookup_email(common_name):
#     common_name = common_name.strip() if pandas.notna(common_name) else None
#     if common_name is None:
#         return None
#     time.sleep(0.1)
#     try:
#         conn.search('DC=capetown,DC=gov,DC=za',f'(&(objectClass=user)(cn={common_name}))', attributes=['displayName', 'mail']) 
#         email = conn.entries[0]["mail"].value 
#         return email 
#     except: 
#         print(f"Couldn't find for {common_name}") 
#         return None

In [None]:
# result_df = pandas.DataFrame()
# for col in raw_df.columns:
#     result_df.loc[:,col] = raw_df[col].apply(lookup_email)

In [None]:
# inventory_emails = {name: email for name, email in zip(raw_df.values.flatten(), result_df.values.flatten()) if email is not None}

## Loading inventory into CKAN

In [None]:
# Matches every character that is NOT a lowercase letter, digit, hyphen or
# underscore. The hyphen sits last in the class so it cannot be misread as a range.
bad_char_pattern = re.compile(r'[^a-z0-9_-]')

In [None]:
# Matches a single email address: a local part built from the usual permitted
# characters, "@", then one or more dot-separated domain labels (each at most
# 63 characters, not starting or ending with a hyphen).
# Written as a raw string so that `\.` reaches the regex engine unchanged
# instead of relying on Python passing through an invalid string escape.
email_pattern = re.compile(r"([a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)")

Package (dataset) create [API reference](https://docs.ckan.org/en/2.8/api/#ckan.logic.action.create.package_create)

In [None]:
# Spreadsheet columns to leave out when copying columns into a package's
# `extras`. Entries must match the column headers exactly (note the trailing
# space in 'Location Type Description ').
# NOTE(review): within this notebook the only reader of this set is the
# commented-out loader cell below.
metadata_field_blacklist = {
    'Data Set Description (old)',
    'Data Set Purpose',
    'Master Data Content Type',
    'Person Type Description',
    'Location Type Description ', 
    'Object Type Description',
    'Comments',
    'Server Name',
    'Physical Locations', 
    'Unnamed: 35', 
    'Unnamed: 36', 
    'Unnamed: 37',
    'Additional Data Sourcing Method Description (Free Text)  x1, x2, xn',
    'DS/TR Contact Information',
    'Data Format Description',
    'Data Source Name',
    'Data Sourcing Method',
    'Data Type',
    'Event-based frequency description',
}

In [None]:
# http = requests.Session()
# for dataset in tqdm(inventory_df.to_dict(orient='records')[:]):
#     dataset_slug = label_to_value(dataset['Data Set Alias (Commonly known name)']).strip().lower().replace(" ","-")
#     dataset_slug = re.sub(bad_char_pattern, "", dataset_slug,)
    
#     directorate = str(dataset['DS Directorate']).strip().title()
#     directorate_slug = directorate.lower().replace(" ","-")
#     #print(directorate_slug)
    
#     dataset_metadata = {
#         'name': dataset_slug,
#         'title': dataset['Data Set Alias (Commonly known name)'],
#         'private': False,
#         'author': dataset['Data Owner'], 
#         'maintainer': dataset['Data Steward (DS)'],
#         'notes': dataset['Data Set Description'],
#         'owner_org': directorate_slug,
#         'extras': [
#            {'key': column.strip(), 'value': str(dataset[column])}
#            for column in inventory_df.columns.values
#            if not column in metadata_field_blacklist
#         ]
#     }
#     #print(pprint.pformat(dataset_metadata))
    
#     #print(dataset['DS/TR Contact Information'])
#     contact_details = str(dataset['DS/TR Contact Information']) if pandas.notna(dataset['DS/TR Contact Information']) else ""
#     contact_email_match = re.match(email_pattern, contact_details)
#     if contact_email_match:
#         dataset_metadata["maintainer_email"] = contact_email_match.group(0)
    
#     if directorate in directorates and len(dataset_slug) > 1:
#         resp = http.post(
#             api_action_path_template.format('package_create'),
#             data=json.dumps(dataset_metadata),
#             headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},   
#         )
#     elif len(dataset_slug) < 2:
#         print(f"data set '{dataset['Data Set Alias (Commonly known name)']}' is too short")
#     else:
#         print(f"data set '{dataset['Data Set Alias (Commonly known name)']}' in unknown directorate '{directorate}', putting in under maintenance...")
#         directorate = 'Under Maintenence'
#         directorate_slug = 'under-maintenence'
#         dataset_metadata['owner_org'] = directorate_slug
        
#         resp = http.post(
#             api_action_path_template.format('package_create'),
#             data=json.dumps(dataset_metadata),
#             headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},   
#         )
    
#     #print(dataset_slug, resp)
#     #print("\n")
#     #print(resp.json())

In [None]:
# Column headers again, for reference while mapping them onto CKAN fields below.
inventory_df.columns

In [None]:
def label_to_value(label):
    """Turn a free-text label into a slug-like machine value.

    The label is converted with ``str()``, stripped of surrounding whitespace,
    lower-cased, and spaces become underscores. Every remaining non-word
    character (anything outside letters, digits and ``_``) is then dropped, so
    hyphens and punctuation disappear.

    Missing values are not special-cased: ``NaN`` becomes ``"nan"`` and
    ``None`` becomes ``"none"``.
    """
    underscored = str(label).strip().lower().replace(" ", "_")
    return re.sub(r'\W', "", underscored)

In [None]:
def label_sanitise(label):
    """Return a cleaned, human-readable version of a spreadsheet cell.

    Parameters
    ----------
    label : any scalar
        Raw cell value; may be a string, a number, ``None`` or ``NaN``.

    Returns
    -------
    str or None
        ``str(label)`` with surrounding whitespace removed, or ``None`` when
        the cell is missing (``None``/``NaN``) or contains only whitespace.

    Notes
    -----
    An earlier version also called ``.replace("^\\s$", "")``. ``str.replace``
    works on literal text, not regular expressions, so that call never removed
    whitespace (stripping already handles it) and relied on an invalid string
    escape. It has been dropped.
    """
    if pandas.isna(label):
        return None

    stripped = str(label).strip()
    return stripped if stripped != "" else None

In [None]:
# Separators used to break a multi-person cell into individual names:
# "/", newline, "-" and ":".
split_pattern = re.compile(r'[/\n\-\:]')

In [None]:
# Create one CKAN package (data set) per inventory row.
#
# For each row:
#   1. build sanitised (human-readable) and slugged (machine) views of the row;
#   2. map the row onto the CKAN metadata fields;
#   3. drop missing values, remembering them so they can be listed in `notes`;
#   4. POST to `package_create`, retrying when CKAN reports validation errors —
#      each retry files the data set under the "under_construction"
#      organisation and appends the reported problems to `notes`.

# Upper bound on POSTs per data set, so a validation error that cannot be
# resolved by retrying does not hammer the server forever.
MAX_CREATE_ATTEMPTS = 5

# `label_to_value` removes every non-word character (including "-"), so the
# slugged spreadsheet value is mapped back onto the choice string that is sent.
update_frequency_choices = {
    "historical": "historical",
    "eventbased": "event-based",
    "event_based": "event-based",
}
data_access_rights_choices = {
    "open_public",
    "internal_open",
    "internal_restricted",
    "secret",
}

http = requests.Session()
for i, dataset in enumerate(tqdm(inventory_df.to_dict(orient='records')[:])):
    # Applying string sanitising
    dataset_sanitised = {
        key: label_sanitise(value)
        for key, value in dataset.items()
    }
    dataset_values = {
        key: label_to_value(value)
        for key, value in dataset.items()
    }
    # Package name is the slugged alias, cut to 100 characters
    dataset_slug = dataset_values['Data Set Alias (Commonly known name)'][:100]

    # Choices validation: anything outside the known choices becomes None and
    # is reported as a missing value further down
    dataset_values["Update Frequency"] = update_frequency_choices.get(
        dataset_values["Update Frequency"]
    )
    dataset_values["Data Access Rights"] = (
        dataset_values['Data Access Rights']
        if dataset_values['Data Access Rights'] in data_access_rights_choices
        else None
    )

    # Forming Org values
    directorate = str(dataset['DS Directorate']).strip().title()
    under_construction = directorate not in directorates

    directorate_slug = label_to_value(directorate) if not under_construction else directorate
    department_slug = (
        "_".join([directorate_slug, dataset_values['DS Department']])
        if pandas.notna(dataset['DS Department']) and not under_construction
        else None
    )
    # A branch slug is built on top of the department slug, so it can only
    # exist when the department is known (joining onto None would raise)
    branch_slug = (
        "_".join([department_slug, dataset_values['DS Branch']])
        if pandas.notna(dataset['DS Branch']) and department_slug is not None
        else None
    )

    dataset_metadata = {
        'name': dataset_slug,
        'title': dataset_sanitised['Data Set Alias (Commonly known name)'],
        'notes': dataset_sanitised['Data Set Description'],
        "data_quality": dataset_sanitised['Data Quality'],
        "update_frequency": dataset_values["Update Frequency"],
        "data_access_rights": dataset_values["Data Access Rights"],
        "restricted_reason": dataset_sanitised['Restricted Reason'],
        "data_format": dataset_sanitised['Data File Format'],
        'maintainer': dataset_sanitised['Data Steward (DS)'],
        'dstr_branch': branch_slug,
        'dstr_department': department_slug,
        'owner_org': directorate_slug,
        'data_contact_': dataset_sanitised['Technical Reference'],
        'publisher': dataset_sanitised['Data Custodian'],
        'host_system_id': dataset_values["System / Application Name"],
        'spatial_coverage': "na",
        'temporal_coverage': "false",
        'private': False,
    }

    # Adding contact details if present. `search` (not `match`) is used so an
    # address is found even when the cell starts with a name or other text.
    contact_details = label_sanitise(dataset['DS/TR Contact Information'])
    if contact_details is not None:
        contact_email_match = email_pattern.search(contact_details)
        if contact_email_match:
            dataset_metadata["maintainer_email"] = contact_email_match.group(0)

    # We tried what was in the spreadsheet, now to see what else we may have...
    for email_field, metadata_field in [("Data Steward (DS)", "maintainer_email"),
                                        ("Data Custodian", "publisher_email"),
                                        ("Technical Reference", "data_contact_email")]:
        if dataset_sanitised[email_field] is None:
            continue

        email_lookups = []
        for name in split_pattern.split(dataset_sanitised[email_field]):
            # Try the fragment exactly as split first, then without the
            # whitespace that splitting "A / B" leaves around each name
            lookup_key = name if name in emails_dict else name.strip()
            email = emails_dict.get(lookup_key)
            # Skip names whose email cell was empty in the CSV (non-string value)
            if isinstance(email, str):
                email_lookups.append(email)

        if len(email_lookups) > 0:
            dataset_metadata[metadata_field] = ",".join(email_lookups)

    # From here on the flag tracks metadata problems only; it is deliberately
    # restarted at False as in the original flow.
    # NOTE(review): an unknown directorate is therefore only moved to
    # "under_construction" after CKAN rejects the first POST — confirm intended.
    under_construction = False
    under_construction_set = {"name", "title", "maintainer"}
    fix_dict = {}

    # Removing null values
    null_values = [key for key, val in dataset_metadata.items() if pandas.isna(val)]
    for null_key in null_values:
        del dataset_metadata[null_key]
        if null_key in under_construction_set:
            under_construction = True
        fix_dict[null_key] = "Missing Value"

    # Required values to be set to NA
    for key in [
        "notes",
        "data_quality",
        "data_format",
        'dstr_branch',
        'dstr_department',
        'data_contact_',
        'publisher']:
        if key not in dataset_metadata:
            dataset_metadata[key] = "NA"

    if len(dataset_slug) < 2:
        print(f"data set '{dataset['Data Set Alias (Commonly known name)']}' is too short")
        continue

    header_used = False
    for _attempt in range(MAX_CREATE_ATTEMPTS):
        if under_construction:
            directorate = 'Under Construction'
            directorate_slug = 'under_construction'
            dataset_metadata['owner_org'] = directorate_slug

        if len(fix_dict):
            reason_str = "\n".join([
                "* `{}` - {}".format(field, reason)
                for field, reason in fix_dict.items()
            ])

            # Each problem is written into the notes once only
            fix_dict.clear()

            dataset_metadata["notes"] = (
                dataset_metadata["notes"]
                + ("\n## **Metadata that needs to be fixed**\n" if not header_used else "\n")
                + reason_str
            )
            header_used = True

        resp = http.post(
            api_action_path_template.format('package_create'),
            data=json.dumps(dataset_metadata),
            headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
        )

        if resp.ok:
            break
        if "That URL is already in use." in resp.text:
            # Package already exists from an earlier run; nothing more to do
            break

        # Parse the error body once; a non-JSON body (e.g. a proxy error page)
        # is treated as an unknown error instead of raising
        try:
            error = resp.json()['error']
        except (ValueError, KeyError, TypeError):
            error = {}
        error_type = error.get('__type') if isinstance(error, dict) else None

        if error_type == 'Validation Error':
            # Retry under "under_construction", listing CKAN's complaints in the notes
            under_construction = True
            fix_dict = {
                k: ",".join(map(str, v)) if isinstance(v, (list, tuple)) else str(v)
                for k, v in error.items()
                if k != "__type"
            }
        else:
            # Retrying cannot help with server-side or unrecognised errors
            if error_type == 'Internal Server Error':
                print("**INTERNAL SERVER ERROR**")
            print(i, dataset_slug, resp)
            break
    else:
        print(f"giving up on '{dataset_slug}' after {MAX_CREATE_ATTEMPTS} attempts", resp)

In [None]:
# WARNING: teardown cell. It purges packages created by the loader above,
# identified by the same slug rule (slugged alias, first 100 characters).
# Only the first 10 inventory rows are processed.
http = requests.Session()
dataset_purge_url = api_action_path_template.format('dataset_purge')
for dataset in tqdm(inventory_df.to_dict(orient='records')[:10]):
    dataset_slug = label_to_value(dataset['Data Set Alias (Commonly known name)'])[:100]
    purge_payload = {"id": dataset_slug}

    resp = http.post(
        dataset_purge_url,
        data=purge_payload,
        headers={"X-CKAN-API-Key": ckan_api_key},
    )

    print(dataset_slug, resp)

## Linking

### MPortal

In [None]:
# Proxy URL (credentials embedded) passed as `proxies=` to the MPortal and ODP requests below.
# NOTE(review): username and password are interpolated verbatim; if either can
# contain "@", ":" or "/" it presumably needs percent-encoding first — confirm
# how the values are stored in the secrets file.
city_proxy = f"http://{secrets['proxy']['username']}:{secrets['proxy']['password']}@internet.capetown.gov.za:8080/"

In [None]:
# List the services under MPortal's "Single_Layers" folder and index them by
# the lower-cased last path segment of each service name, keeping the full
# service name as the value (it is needed to build the layer URL later).
mportal_feature_list_request = requests.get(
    "http://mportal.capetown.gov.za/agsint/rest/services/Single_Layers?f=pjson",
    proxies=dict(http=city_proxy, https=city_proxy),
)
mportal_feature_dict = {}
for mportal_service in mportal_feature_list_request.json()['services']:
    service_name = mportal_service['name']
    mportal_feature_dict[service_name.split("/")[-1].lower()] = service_name

In [None]:
# Attach MPortal layers to their packages as "MPortal Link" resources.
# A layer is linked when its key equals the slugged data set alias, or when
# the two strings have a SequenceMatcher ratio of at least 0.7.
http = requests.Session()
mportal_template = "http://mportal.capetown.gov.za/agsint/rest/services/{}/MapServer"
for dataset in tqdm(inventory_df.to_dict(orient='records')[:]):
    dataset_title = label_to_value(dataset['Data Set Alias (Commonly known name)'])

    # Exact hit first; the `< 1.0` bound below keeps it from being listed twice
    matches = [dataset_title] if dataset_title in mportal_feature_dict else []
    matches.extend(
        entry
        for entry in mportal_feature_dict
        if 0.7 <= SequenceMatcher(None, dataset_title, entry).ratio() < 1.0
    )

    if matches:
        print(f'"{dataset_title}" has matches: {",".join(matches)}')

    # Same slug rule as the loader: slugged alias, first 100 characters
    dataset_slug = dataset_title[:100]
    for match in matches:
        layer_name = mportal_feature_dict[match]

        resource_metadata = {
            'package_id': dataset_slug,
            'url': mportal_template.format(layer_name),
            'resource_type': 'api',
            'format': 'MPortal Link',
            'name': f'MPortal Layer "{layer_name.replace("_", " ")}"',
        }

        resp = http.post(
            api_action_path_template.format('resource_create'),
            data=json.dumps(resource_metadata),
            headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
        )

In [None]:
# WARNING: teardown cell. It walks the catalogue in pages of `batch_size`
# packages (at most 10 pages) and deletes every resource whose name contains
# 'MPortal Layer', i.e. the resources created by the MPortal linking cell.
http = requests.Session()

batch_size = 1000
for i in range(10):
    # Separate name for the listing response so it is not overwritten by the
    # delete responses inside the loop
    page_resp = http.post(
        'https://ds3.capetown.gov.za/data-catalogue/api/action/current_package_list_with_resources',
        data=json.dumps({'limit': batch_size, 'offset': i*batch_size}),
        headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
    )
    page = page_resp.json()['result']

    # An empty page means the offset is past the last package; stop paging
    if not page:
        break

    for dataset in tqdm(page):
        for resource in dataset['resources']:
            if 'MPortal Layer' in resource['name']:
                resp = http.post(
                    api_action_path_template.format('resource_delete'),
                    data=json.dumps({"id": resource['id'], "package_id": dataset['id']}),
                    headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
                )

### Open Data Portal

In [None]:
# List the layers of the Open Data Service map server and map each layer name
# to a URL slug (lower-case, spaces to hyphens, "---" collapsed to "-").
# Layers whose minScale or maxScale is 0 are left out.
odp_feature_list_request = requests.get(
    "https://citymaps.capetown.gov.za/agsext1/rest/services/Theme_Based/Open_Data_Service/MapServer/?f=pjson",
    proxies=dict(http=city_proxy, https=city_proxy),
)
odp_feature_dict = {}
for odp_layer in odp_feature_list_request.json()['layers']:
    if odp_layer['minScale'] == 0 or odp_layer['maxScale'] == 0:
        continue
    odp_feature_dict[odp_layer['name']] = odp_layer['name'].lower().replace(" ","-").replace("---","-")

In [None]:
# Attach Open Data Portal data sets to their packages as "ODP Link" resources.
# Matching is done on the sanitised (not slugged) alias against the ODP layer
# names, using a SequenceMatcher ratio of at least 0.7.
http = requests.Session()
# Name kept from the MPortal cell; here it holds the ODP data set URL template
mportal_template = "https://odp-cctegis.opendata.arcgis.com/datasets/{}"
for dataset in tqdm(inventory_df.to_dict(orient='records')[:]):
    dataset_title = label_sanitise(dataset['Data Set Alias (Commonly known name)'])
    if dataset_title is None:
        continue

    matches = []
    for entry in odp_feature_dict:
        if 0.7 <= SequenceMatcher(None, dataset_title, entry).ratio() <= 1.0:
            matches.append(entry)

    if matches:
        print(f'"{dataset_title}" has matches: {", ".join(matches)}')

    for match in matches:
        resource_url = mportal_template.format(odp_feature_dict[match])

        # Same slug rule as the loader: slugged alias, first 100 characters
        dataset_slug = label_to_value(dataset['Data Set Alias (Commonly known name)'])[:100]

        resource_metadata = {
            'package_id': dataset_slug,
            'url': resource_url,
            'resource_type': 'api',
            'format': 'ODP Link',
            'name': f'Open Data Portal data set "{match}"',
        }

        resp = http.post(
            api_action_path_template.format('resource_create'),
            data=json.dumps(resource_metadata),
            headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
        )

In [None]:
# WARNING: teardown cell. It walks the catalogue in pages of `batch_size`
# packages (at most 10 pages) and deletes every resource whose name contains
# 'Open Data Portal data set', i.e. the resources created by the ODP linking cell.
http = requests.Session()

batch_size = 1000
for i in range(10):
    # Separate name for the listing response so it is not overwritten by the
    # delete responses inside the loop
    page_resp = http.post(
        'https://ds3.capetown.gov.za/data-catalogue/api/action/current_package_list_with_resources',
        data=json.dumps({'limit': batch_size, 'offset': i*batch_size}),
        headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
    )
    page = page_resp.json()['result']

    # An empty page means the offset is past the last package; stop paging
    if not page:
        break

    for dataset in tqdm(page):
        for resource in dataset['resources']:
            if 'Open Data Portal data set' in resource['name']:
                resp = http.post(
                    'https://ds3.capetown.gov.za/data-catalogue/api/action/resource_delete',
                    data=json.dumps({"id": resource['id'], "package_id": dataset['id']}),
                    headers={"X-CKAN-API-Key": ckan_api_key, 'Content-Type': 'application/json'},
                )