## Purpose

This script scans the CKAN API of the Humanitarian Data Exchange and writes the metadata for all public, non-archived geodata items to a CSV file in the GeoBTAA Metadata Application Profile. A secondary CSV file for the associated multiple downloads is planned but not yet implemented (see the "Parse the URLs" to-do section).

## Import modules

In [None]:
# Standard libraries
import csv
import urllib.request
import json
import time
import os
import re
import ast
import decimal
import ssl
import sys

# Third-party libraries
import pandas as pd
import numpy as np

# Auto-generate the current date in 'YYYYMMDD' format; it prefixes the CSV
# filename and fills the 'Date Accessioned' column
action_date = time.strftime('%Y%m%d')

## Declare paths and defaults

In [None]:
# Name of the JSON file that holds the harvested metadata
output_filename = "ckan_metadata.json"  # Update with the correct path

# Specify the CKAN portal URL you want to harvest from
portalURL = "https://data.humdata.org"

# Construct the API URL for package search. The portal URL and the API path
# need exactly one '/' between them, whether or not portalURL ends with one.
packageURL = portalURL.rstrip('/') + '/api/3/action/package_search'

# Specify the path for the CSV file (prefixed with the harvest date)
csv_file_path = action_date + "_99-1400.csv"  # Update with the desired path

## Part 1: Download the metadata to your desktop

This cell will scan the API and save the results as a JSON file in the folder set by `desktop_path` (the current working directory if it is left empty). This will take several minutes.

In [None]:
# Specify the CKAN portal URL you want to harvest from
packageURL = "https://data.humdata.org/api/3/action/package_search"

# Specify the number of items per page
items_per_page = 10

# Initialize variables for pagination
start = 0
total_results = 0

# List to store all metadata
all_metadata = []

# NOTE(review): this context disables TLS certificate verification, so the
# portal's identity is not checked. Prefer ssl.create_default_context() if the
# local certificate store works. It is built once here rather than on every
# page request.
context = ssl._create_unverified_context()

# Request metadata page by page until every item has been collected
while True:
    # Construct the API request URL with pagination parameters
    api_request_url = f"{packageURL}?start={start}&rows={items_per_page}"

    # Set up a request with a user-agent
    request = urllib.request.Request(api_request_url, headers={'User-Agent': 'Mozilla/5.0'})

    try:
        # The context manager closes the HTTP response even if reading or
        # decoding fails part-way through
        with urllib.request.urlopen(request, context=context, timeout=30) as response:
            if response.status != 200:
                print(f"Failed to fetch data: HTTP status code {response.status}")
                break
            response_json = json.loads(response.read().decode('utf-8'))
    except urllib.error.URLError as e:
        print(f"Failed to fetch data: {e.reason}")
        break
    except (OSError, ValueError) as e:
        # Read timeouts, dropped connections and malformed JSON: stop paging
        # but keep what was already collected so it can still be saved
        print(f"Failed to fetch data: {e}")
        break

    # Extract metadata from the response
    metadata = response_json['result']['results']
    all_metadata.extend(metadata)

    # Update pagination variables
    start += items_per_page
    total_results = response_json['result']['count']

    # Stop once every item has been requested, or when the server returns an
    # empty page (further requests would not add anything)
    if start >= total_results or not metadata:
        break

# Save the metadata to a local JSON file on your desktop
desktop_path = ""  # Replace with your desktop path
output_filename = "ckan_metadata.json"
output_path = os.path.join(desktop_path, output_filename)

with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(all_metadata, json_file, indent=4)

# Report the number of records actually written, which can be lower than the
# server-side count when the harvest stopped early
print(f"Metadata for {len(all_metadata)} items saved to {output_path}")


## Part 2: Read the JSON into a pandas dataframe

In [None]:
# Load the harvested JSON records into a DataFrame
all_df = pd.read_json(output_filename)

# Keep only rows that are not archived, are flagged as geodata, and are not private
filtered_df = all_df.loc[
    (all_df['archived'] == False)
    & (all_df['has_geodata'] == True)
    & (all_df['private'] == False)
]

# Source field -> output column name; only these fields are carried forward
columns_to_keep_and_rename = dict(
    id="ID",
    title="Alternative Title",
    dataset_source="Publisher",
    license_url="License",
    notes="Description",
    methodology_other="Methodology",
    dataset_date="dataset_date",
    metadata_created="metadata_created",
    name="Identifier",
    solr_additions="Places",
    resources="resources",
    tags="tags",
)

# Subset to the mapped fields, apply the new names, then drop rows whose ID repeats
df = (
    filtered_df[list(columns_to_keep_and_rename)]
    .rename(columns=columns_to_keep_and_rename)
    .drop_duplicates(subset=['ID'])
)

In [None]:
def parse_countries(places):
    """Return the country names found in *places*, joined with a pipe.

    Args:
        places: JSON text (or an already-decoded dict) expected to hold a
            ``countries`` list. Missing values such as ``None`` or NaN are
            accepted.

    Returns:
        The countries joined with ``|``. An empty string is returned when the
        value is missing, is not valid JSON, is not a JSON object, or carries
        no usable ``countries`` entry.
    """
    if isinstance(places, (str, bytes, bytearray)):
        try:
            places = json.loads(places)
        except (json.JSONDecodeError, UnicodeDecodeError):
            return ''  # Unparseable text is treated as "no countries"

    # Anything that is not a JSON object (NaN, None, lists, numbers) has no
    # 'countries' key to read
    if not isinstance(places, dict):
        return ''

    countries = places.get('countries')
    if isinstance(countries, str):
        # A bare string is a single country; joining it would split it into
        # individual characters
        return countries
    if not isinstance(countries, (list, tuple)):
        return ''
    return '|'.join(str(country) for country in countries)

# Derive 'Spatial Coverage' from the 'Places' column (the renamed 'solr_additions' field)
df['Spatial Coverage'] = df['Places'].apply(parse_countries)
# 'Places' is not needed once the countries have been extracted
del df['Places']

In [None]:
def extract_dates(dataset_date, open_end_year=None):
    """Derive the 'Date Range' and 'Temporal Coverage' values from a date string.

    The first and last four-digit numbers in *dataset_date* are taken as the
    start and end years. A value ending in ``TO *]`` is an open-ended range.

    Args:
        dataset_date: The date text to scan. Non-string values (for example
            ``None`` or NaN) are treated as having no date.
        open_end_year: Year to use as the end of an open-ended range. Defaults
            to the current year, so the result does not go stale.

    Returns:
        A ``(date_range, temporal_coverage)`` tuple. Both items hold the same
        text: ``"YYYY"`` when start and end years match, otherwise
        ``"YYYY-YYYY"``. ``(None, None)`` is returned when no year is found.
    """
    if not isinstance(dataset_date, str):
        return None, None

    # Four consecutive digits delimited by word boundaries are read as years
    years = re.findall(r'\b\d{4}\b', dataset_date)
    if not years:
        return None, None

    start_year = years[0]
    if dataset_date.rstrip().endswith('TO *]'):
        # Open-ended range: close it with the requested (or current) year
        end_year = time.strftime('%Y') if open_end_year is None else str(open_end_year)
    else:
        end_year = years[-1]

    span = start_year if start_year == end_year else f"{start_year}-{end_year}"
    return span, span

# Compute a (date range, temporal coverage) pair for every 'dataset_date' value
date_pairs = df['dataset_date'].apply(extract_dates)
# Transpose the per-row pairs into one sequence per output column.
# NOTE(review): with zero rows this unpacking has nothing to unpack and raises.
df['Date Range'], df['Temporal Coverage'] = zip(*date_pairs)
df = df.drop(columns=['dataset_date'])


In [None]:
def extract_year(metadata_created):
    """Return the part of *metadata_created* that precedes its first hyphen.

    When the text contains no hyphen, the whole text is returned.
    """
    leading_part, _, _ = metadata_created.partition('-')
    return leading_part

# Fill 'Date Issued' with the leading (pre-hyphen) part of each 'metadata_created' value
df['Date Issued'] = df['metadata_created'].apply(extract_year)
df = df.drop(columns=['metadata_created'])

In [None]:
def extract_keywords(tags):
    """Join the ``display_name`` of every tag with a pipe, skipping 'geodata'.

    Tags without a ``display_name`` key are ignored; the 'geodata' check is
    case-insensitive.
    """
    labels = []
    for entry in tags:
        if 'display_name' not in entry:
            continue
        label = entry['display_name']
        if label.lower() == 'geodata':
            continue
        labels.append(label)
    return '|'.join(labels)

# Build the pipe-delimited 'Keyword' column from each row's tag list
df['Keyword'] = df['tags'].apply(extract_keywords)
# The raw tag records are not needed once the keywords exist
del df['tags']

In [None]:
# Write the table as it stands to CSV (without the index column); the final
# cell writes to this same path again
df.to_csv(csv_file_path, index=False)

### To do: Parse the URLs

In [None]:
# def extract_shp_geodatabase_resources(resources):
#     # Initialize an empty list to collect the download URLs
#     download_urls = []

#     # Iterate through each resource in the array
#     for resource in resources:
#         # Check if the format is either "SHP" or "Geodatabase"
#         if resource.get('format') in ['SHP', 'Geodatabase']:
#             # Extract the download URL
#             url = resource.get('download_url')
#             if url:
#                 download_urls.append(url)

#     # Join the URLs with a pipe
#     return '|'.join(download_urls)

# # Apply the function to the 'resources' column
# df['SHP_Geodatabase_URLs'] = df['resources'].apply(extract_shp_geodatabase_resources)


### Add default and constructed values

In [None]:
# Values that are identical for every record, keyed by output column
default_values = {
    'Date Accessioned': action_date,
    'Code': "99-1400",
    'Is Part Of': "99-1400",
    'Member Of': "b0153110-e455-4ced-9114-9b13250a7093",
    'Accrual Method': "CKAN",
    'Access Rights': "Public",
    # 'Language': "eng",
    'Provider': "Humanitarian Data Exchange",
}
for column_name, constant in default_values.items():
    df[column_name] = constant

# Per-record URL built from the dataset ID
df['Information'] = "https://data.humdata.org/dataset/" + df['ID'].astype(str)
df['Format'] = "Files"

In [None]:
# Output columns, in the order they should appear in the CSV.
# NOTE(review): reindexing keeps only the names listed here. Columns that exist
# in df but are not listed (e.g. 'Keyword', 'Date Issued', 'Publisher',
# 'License') are dropped, and listed names that df lacks (e.g. 'Title',
# 'Creator', 'Rights') are created empty. Confirm this is intended.
desired_order = [
    'Title', 'Alternative Title', 'Description', 'Language', 'Format',
    'Display Note', 'Creator', 'Provider', 'Resource Class', 'Theme',
    'Temporal Coverage', 'Date Range', 'Spatial Coverage', 'Bounding Box',
    'Member Of', 'Is Part Of',
    'FeatureServer', 'HTML', 'ImageServer', 'Information', 'MapServer',
    'ID', 'Identifier', 'Rights', 'Access Rights',
    'Date Accessioned', 'Code', 'Accrual Method',
    # Add more columns as needed in the desired order
]

# Rearrange the DataFrame columns to follow the list above
df = df.reindex(desired_order, axis='columns')


In [None]:
# Literal text replacements applied to every text column, in this order.
# The double '<br/><br/>' rule must come before the single '<br/>' rule:
# otherwise the single rule consumes both tags and the '|' rule never matches.
text_replacements = [
    ('\n', ' '),
    ('\t', ' '),
    ('<br/><br/>', '|'),
    ('<br/>', ' '),
]


def _clean_text_column(column):
    """Apply the literal replacements to an object-dtype column and strip its ends.

    Columns of any other dtype are returned unchanged.
    """
    if column.dtype != "object":
        return column
    for old, new in text_replacements:
        # regex=False: the patterns are plain text, not regular expressions
        column = column.str.replace(old, new, regex=False)
    return column.str.strip()


# Replace newlines, tabs and HTML line breaks, then trim surrounding whitespace
df = df.apply(_clean_text_column)

In [None]:
# Write the reordered, cleaned DataFrame to the CSV file (index column omitted)
df.to_csv(csv_file_path, index=False)