In [61]:
import pprint
import json
import time
import random

In [91]:
from tqdm.auto import tqdm
import pandas
import requests
from requests_ntlm import HttpNtlmAuth
from bs4 import BeautifulSoup
import numpy

In [63]:
secrets = json.load(open("/home/jovyan/secrets/secrets.json"))

## Reading in Metadata Spreadsheet

In [64]:
city_metadata_df = pandas.read_excel("MetadataFields_Descriptors.xlsx")

In [65]:
city_metadata_df = city_metadata_df[
    ['Category', 'Field Name', 'Description','Acceptable Values']
]

In [66]:
city_metadata_df.Category = (
    city_metadata_df.Category.fillna(method="ffill")
                    .str.split("\n")
                    .apply(lambda x: x[0])
)

## Core Schema

In [94]:
core_filter = city_metadata_df.Category == "Core Fields"

In [118]:
fields_urlified = (
    city_metadata_df.loc[core_filter, "Field Name"]
                    .str.lower()
                    .str.replace(" ", "_")
                    .str.replace(r'\W', "")
)

metadata_df = pandas.DataFrame({
    "field_name": fields_urlified,
    "help_text": city_metadata_df.loc[core_filter, "Description"].str.split("\n").apply(lambda x: x[0]),
    "required": True,
    'label': city_metadata_df.loc[core_filter, "Field Name"].str.strip()
}).set_index('label')

### Overrides

In [119]:
metadata_df

Unnamed: 0_level_0,field_name,help_text,required
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Unique ID,unique_id,Dataset unique ID,True
Data Set Name,data_set_name,A name given to the dataset.,True
Data Set Description (Including purpose),data_set_description_including_purpose,A summary of the dataset.,True
Data Quality,data_quality,Subjective assesment of dataset quality.,True
Update Frequency,update_frequency,How often is data updated in the data set?,True
Data Access Rights,data_access_rights,Indicates the classification of the dataset in...,True
Restricted Reason,restricted_reason,To be completed if the data is indicated as Re...,True
Data Format,data_format,Provides an indication of the interoperable st...,True
Data Steward,data_steward,Indicates the individual who assumes business ...,True
DS/TR Branch,dstr_branch,Indicates the organisational unit (branch) of ...,True


In [120]:
choices_dict = {}

In [121]:
metadata_df.loc["Unique ID", "field_name"] = "name"
metadata_df.loc["Unique ID", "preset"] = "dataset_slug"

In [122]:
metadata_df.loc["Data Set Name", "field_name"] = "name"
metadata_df.loc["Data Set Name", "preset"] = "title"

In [123]:
metadata_df.loc["Data Set Description (Including purpose)", "field_name"] = "data_set_description"
metadata_df.loc["Data Set Description (Including purpose)", "form_snippet"] = "markdown.html"

In [124]:
metadata_df.loc["Update Frequency", "preset"] = "select"
metadata_df.loc["Update Frequency", "choices"] = True
choices_dict["Update Frequency"] = ["Historical", "Event-based"]

In [125]:
# Add reference to Data Access SOP to help with classification
metadata_df.loc["Data Access Rights", "preset"] = "select"
metadata_df.loc["Data Access Rights", "choices"] = True
choices_dict["Data Access Rights"] = ["Open Public", "Internal Open", "Internal Restricted", "Secret"]

In [126]:
metadata_df.loc["Restricted Reason", "required"] = False # ToDo - add validator that inspects the data access rights field

In [127]:
metadata_df.loc["Data Format", "preset"] = "resource_format_autocomplete"
metadata_df.loc["Data Format", "required"] = False

In [128]:
metadata_df.loc["Data Steward", "field_name"] = "maintainer"
metadata_df.loc["Data Steward", "display_property"] = "dc:contributor"

In [129]:
metadata_df.loc["DS/TR Directorate", "field_name"] = "owner_org"
metadata_df.loc["DS/TR Directorate", "preset"] = "dataset_organization"

In [130]:
metadata_df.loc["DS/TR Department", "preset"] = "select"
metadata_df.loc["DS/TR Department", "choices_helper"] = "cct_metadata_get_departments"
metadata_df.loc["DS/TR Department", "validators"] = "cct_metadata_check_department"

In [131]:
metadata_df.loc["DS/TR Branch", "preset"] = "select"
metadata_df.loc["DS/TR Branch", "choices_helper"] = "cct_metadata_get_branches"
metadata_df.loc["DS/TR Branch", "validators"] = "cct_metadata_check_branch"

In [141]:
metadata_df.loc["Data Custodian", "field_name"] = "publisher"
metadata_df.loc["Data Custodian", "display_property"] = "dc:publisher"

In [132]:
metadata_df.loc["Spatial Coverage", "preset"] = "select"
choices_dict["Spatial Coverage"] = ["NA", "<1m²", "<10m²", "<100m²", "<1km²", "<10km²", "<100km²", "<1000km²", "<10000km²"]

In [133]:
metadata_df.loc["Temporal Coverage", "preset"] = "boolean_select"

In [134]:
metadata_df.loc["Temporal Coverage Start", "field_name"] = "temporal_coverage_start"
metadata_df.loc["Temporal Coverage Start", "help_text"] = "Start date of temporal coverage."
metadata_df.loc["Temporal Coverage Start", "preset"] = "date"

In [135]:
metadata_df.loc["Temporal Coverage End", "field_name"] = "temporal_coverage_end"
metadata_df.loc["Temporal Coverage End", "help_text"] = "End date of temporal coverage."
metadata_df.loc["Temporal Coverage End", "preset"] = "date"

## Outputting Schema

In [136]:
metadata_schema = {
  "scheming_version": 1,
  "dataset_type": "dataset",
  "about": "Schema for the City of Cape Town",
  "about_url": "http://github.com/ckan/ckanext-scheming",
    "resource_fields": [
    {
      "field_name": "url",
      "label": "URL",
      "preset": "resource_url_upload"
    },
    {
      "field_name": "name",
      "label": "Name",
      "form_placeholder": "eg. January 2020 Service Requests"
    },
    {
      "field_name": "description",
      "label": "Description",
      "form_snippet": "markdown.html",
      "form_placeholder": "Some useful notes about the data"
    },
    {
      "field_name": "format",
      "label": "Format",
      "preset": "resource_format_autocomplete"
    }
    ]
}

In [142]:
metadata_schema["dataset_fields"] = metadata_df.reset_index().to_dict(orient="records")

In [143]:
for dataset_dict in metadata_schema["dataset_fields"]:
    # Cleaning up nan values
    keys_to_remove = [
        key
        for key, value in dataset_dict.items()
        if pandas.isna(value)
    ]
    
    for key in keys_to_remove:
        del dataset_dict[key]
    
    # Merging in choices
    choices = choices_dict.get(dataset_dict["label"], None)
    if choices is not None:
        dataset_dict["choices"] = [
            {"label": label, "value": label.lower().replace(" ","_")}
            for label in choices
        ]

In [144]:
metadata_schema["dataset_fields"]

[{'label': 'Unique ID',
  'field_name': 'name',
  'help_text': 'Dataset unique ID',
  'required': True,
  'preset': 'dataset_slug'},
 {'label': 'Data Set Name',
  'field_name': 'name',
  'help_text': 'A name given to the dataset.',
  'required': True,
  'preset': 'title'},
 {'label': 'Data Set Description (Including purpose)',
  'field_name': 'data_set_description',
  'help_text': 'A summary of the dataset.',
  'required': True,
  'form_snippet': 'markdown.html'},
 {'label': 'Data Quality',
  'field_name': 'data_quality',
  'help_text': 'Subjective assesment of dataset quality.',
  'required': True},
 {'label': 'Update Frequency',
  'field_name': 'update_frequency',
  'help_text': 'How often is data updated in the data set?',
  'required': True,
  'preset': 'select',
  'choices': [{'label': 'Historical', 'value': 'historical'},
   {'label': 'Event-based', 'value': 'event-based'}]},
 {'label': 'Data Access Rights',
  'field_name': 'data_access_rights',
  'help_text': 'Indicates the clas

In [145]:
with open("cct_metadata.json", "w") as metadata_json_file:
    json.dump(metadata_schema, metadata_json_file)

## Getting list of City Departments

In [5]:
proxies = {
    'http':  f"http://internet.capetown.gov.za:8080",
    'https': f"http://internet.capetown.gov.za:8080",
}

In [6]:
URL = "http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/"

In [7]:
# Reading the page contents
DIRECTORATES_PAGE = "Directorates.aspx"

In [37]:
def setup_session():
    http = requests.Session()
    http.proxies = proxies
    auth = HttpNtlmAuth(f'CAPETOWN\\{secrets["proxy"]["username"]}', secrets["proxy"]["password"])
    http.auth = auth

    headers = {'Proxy-Authorization': requests.auth._basic_auth_str(secrets["proxy"]["username"], secrets["proxy"]["password"])}
    http.headers = headers
    
    return http

In [38]:
def get_divs(link, div_id, session):
    
    divs = []
    tries = 0
    while tries < 2:
        try:            
            resp = session.get(link)
            soup = BeautifulSoup(resp.text, 'html.parser')
            divs = soup.find_all("div", {"id": div_id})
            tries = 2
            
        except Exception as e:
            print(f"failed for '{link}'")
            print(f'failed because {e.__class__}: "{e}"')
            session = setup_session()
            
            tries += 1
            time.sleep(5)
        
    return divs

In [39]:
directorate_divs = get_divs(URL + DIRECTORATES_PAGE, "directorate", setup_session())

directorate_dicts = [
    {
        "name": direc.find('a').contents[0].strip().lower().title(), 
        "link": URL + direc.find('a').get('href'),
    }
    for direc in tqdm(directorate_divs)
]

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): internet.capetown.gov.za:8080
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Directorates.aspx HTTP/1.1" 401 16
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Directorates.aspx HTTP/1.1" 401 0
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Directorates.aspx HTTP/1.1" 200 80021


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [41]:
session = setup_session()
for directorate_dict in tqdm(directorate_dicts):
    if directorate_dict.get("departments", 0) == 0:
        directorate_dict["departments"] = [
            {
                "name": dept.find('a').contents[0].strip().lower().title(),
                "link": URL + dept.find('a').get('href'),
            }
            for dept in get_divs(directorate_dict["link"], "department", session)
        ]

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): internet.capetown.gov.za:8080
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Departments.aspx?Directorate=70000506 HTTP/1.1" 401 16
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Departments.aspx?Directorate=70000506 HTTP/1.1" 401 0
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Departments.aspx?Directorate=70000506 HTTP/1.1" 200 77055
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): internet.capetown.gov.za:8080
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Departments.aspx?Directorate=70007527 HTTP/1.1" 401 16





In [44]:
for directorate_dict in tqdm(directorate_dicts):
    session = setup_session()
    for department_dict in directorate_dict["departments"]:
        if directorate_dict.get("branches", 0) == 0:
            department_dict["branches"] = [
                {
                    "name": dept.find('a').contents[0].strip().lower().title(),
                    "link": URL + dept.find('a').get('href'),
                }
                for dept in get_divs(department_dict["link"], "branch", session)
            ]

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): internet.capetown.gov.za:8080
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Branches.aspx?Directorate=70000506&Department=70011193 HTTP/1.1" 401 16
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Branches.aspx?Directorate=70000506&Department=70011193 HTTP/1.1" 401 0
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Branches.aspx?Directorate=70000506&Department=70011193 HTTP/1.1" 200 70722
DEBUG:urllib3.connectionpool:http://internet.capetown.gov.za:8080 "GET http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/Branches.aspx?Directorate=70000506&Department=70000507 HTTP/1.1" 200 83712
DEBUG:urllib3.connecti




In [45]:
pprint.pprint(directorate_dicts)

[{'departments': [{'branches': [],
                   'link': 'http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/./Branches.aspx?Directorate=70000506&Department=70011193',
                   'name': 'Administration'},
                  {'branches': [{'link': 'http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/./Sections.aspx?Directorate=70000506&Department=70000507&Branch=70010441',
                                 'name': 'Area Central'},
                                {'link': 'http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/./Sections.aspx?Directorate=70000506&Department=70000507&Branch=70010440',
                                 'name': 'Area East'},
                                {'link': 'http://cityweb.capetown.gov.za/en/_layouts/CityDepartmentStructure.ListAndUI/./Sections.aspx?Directorate=70000506&Department=70000507&Branch=70010439',
                                 'name': 'Area North'},
    

In [54]:
for dd in directorate_dicts:
    del dd["link"]
    for dept_d in dd["departments"]:
        del dept_d["link"]
        for b in dept_d["branches"]:
            del b["link"]

In [58]:
json.dump(directorate_dicts, open("city_structure.json","w"))