## Purpose

This script will scan the CKAN API for the Humanitarian Data Exchange and return the metadata for all items as a CSV file in the GeoBTAA Metadata Application Profile. It will also create a secondary CSV file for the associated multiple downloads. 

## Import modules

In [None]:
# Standard libraries
import csv
import urllib.request
import json
import time
import os
import re
import ast
import decimal
import ssl
import sys

# Third-party libraries
import pandas as pd
import numpy as np

# Today's date in local time, formatted as 'YYYY-MM-DD'
action_date = time.strftime('%Y-%m-%d', time.localtime())

## Declare paths and defaults

In [None]:
# Path to the local JSON file that holds the harvested metadata
output_filename = "ckan_metadata.json"  # Update with the correct path

# Specify the CKAN portal URL you want to harvest from
portalURL = "https://data.humdata.org"

# Construct the API URL for package search. The portal URL carries no
# trailing slash, so the separator has to be added here; rstrip() keeps the
# result correct if a trailing slash is ever added to portalURL.
packageURL = portalURL.rstrip('/') + '/api/3/action/package_search'

# Output CSV name: the harvest date followed by "_99-1400.csv"
csv_file_path = f"{action_date}_99-1400.csv"  # Update with the desired path

## Part 1: Download the metadata to your desktop

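This cell scans the API and saves the results as a JSON file (`ckan_metadata.json`) in the folder set by `desktop_path`. The scan takes several minutes. The code is commented out by default; uncomment it when you need to refresh the local copy of the metadata.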

In [None]:
# # Specify the CKAN portal URL you want to harvest from
# packageURL = "https://data.humdata.org/api/3/action/package_search"

# # Specify the number of items per page
# items_per_page = 10

# # Initialize variables for pagination
# start = 0
# total_results = 0

# # List to store all metadata
# all_metadata = []

# # Request metadata in paginated manner
# while True:
#     try:
#         # Construct the API request URL with pagination parameters
#         api_request_url = f"{packageURL}?start={start}&rows={items_per_page}"

#         # Set up a request with a user-agent
#         request = urllib.request.Request(api_request_url, headers={'User-Agent': 'Mozilla/5.0'})
        
#         # Request metadata
#         context = ssl._create_unverified_context()
#         response = urllib.request.urlopen(request, context=context, timeout=30)  # Increased timeout

#         # Check if the response is valid (status code 200)
#         if response.status == 200:
#             response_json = json.loads(response.read().decode('utf-8'))
            
#             # Extract metadata from the response
#             metadata = response_json['result']['results']
#             all_metadata.extend(metadata)
            
#             # Update pagination variables
#             start += items_per_page
#             total_results = response_json['result']['count']
            
#             # Break the loop if we have collected all items
#             if start >= total_results:
#                 break
#         else:
#             print(f"Failed to fetch data: HTTP status code {response.status}")
#             break

#     except urllib.error.URLError as e:
#         print(f"Failed to fetch data: {e.reason}")
#         break

# # Save the metadata to a local JSON file on your desktop
# desktop_path = ""  # Replace with your desktop path
# output_filename = "ckan_metadata.json"
# output_path = os.path.join(desktop_path, output_filename)

# with open(output_path, "w") as json_file:
#     json.dump(all_metadata, json_file, indent=4)

# print(f"Metadata for {total_results} items saved to {output_path}")


## Part 2: Read the JSON into a pandas dataframe

In [None]:
# Load the harvested JSON into a DataFrame
all_df = pd.read_json(output_filename)

# Keep only records that are not archived, are flagged as geodata, and are not private
filtered_df = all_df[
    all_df['archived'].eq(False)
    & all_df['has_geodata'].eq(True)
    & all_df['private'].eq(False)
]


# Source column -> output column name; columns not listed here are discarded
columns_to_keep_and_rename = {
    "id": "ID",
    "title": "Alternative Title",
    "dataset_source": "Publisher",
    "license_url": "License",
    "notes": "Description",
    "methodology_other":"Methodology",
    "dataset_date":"dataset_date",
    "metadata_created":"metadata_created",
    "name": "Identifier",
    "solr_additions": "solr_additions",
    "resources": "resources",
    "tags": "tags"
}

# Select the listed columns, rename them, and keep one row per ID
df = (
    filtered_df.loc[:, list(columns_to_keep_and_rename)]
    .rename(columns=columns_to_keep_and_rename)
    .drop_duplicates(subset=['ID'])
)

## Match the nation place names to bounding boxes from an external file

In [None]:
# Read the nation bounding-box table from the shared data folder
bbox_df = pd.read_csv(filepath_or_buffer='../../data/nation-bbox.csv')

# Normalise a place name so that lookups ignore case and surrounding whitespace
def clean_place_name(name):
    """Return ``name`` converted to a string, trimmed and lower-cased."""
    text = str(name)
    return text.strip().lower()

# Prepare the bounding box DataFrame.
# Missing altLabel values are left as NaN: clean_place_name() would turn them
# into the literal string 'nan', which makes the pd.notna() check below always
# pass and registers a bogus 'nan' place name in the lookup.
bbox_df['Label'] = bbox_df['Label'].apply(clean_place_name)
bbox_df['altLabel'] = bbox_df['altLabel'].apply(
    lambda value: clean_place_name(value) if pd.notna(value) else value
)

# Create a dictionary to map both 'Label' and 'altLabel' to the 'Bounding Box'
bbox_map = {}
for _, row in bbox_df.iterrows():
    bbox_map[row['Label']] = row['Bounding Box']
    if pd.notna(row['altLabel']):
        bbox_map[row['altLabel']] = row['Bounding Box']

# Function to parse countries from JSON
def parse_countries(places):
    """Return the country names found in ``places`` joined with a pipe.

    ``places`` is normally a JSON string holding an object with a
    ``countries`` list; an already-decoded dict is accepted as well.
    An empty string is returned when the value is missing, is not valid
    JSON, or does not have that shape.
    """
    if isinstance(places, dict):
        places_json = places
    else:
        try:
            places_json = json.loads(places)
        except (TypeError, json.JSONDecodeError):
            # TypeError covers non-string input such as None or NaN
            return ''

    # Valid JSON that is not an object (a list, a number, ...) has no countries
    if not isinstance(places_json, dict):
        return ''

    countries = places_json.get('countries') or []
    if isinstance(countries, str):
        # A single bare name instead of a list
        countries = [countries]
    return '|'.join(str(country) for country in countries)

# Look up the bounding box of every place named in a pipe-delimited string
def find_bounding_boxes(place_names):
    """Return the pipe-joined bounding boxes of the places found in ``bbox_map``.

    Places without an entry are skipped; None is returned when nothing matches.
    """
    candidates = (
        bbox_map.get(clean_place_name(part))
        for part in place_names.split('|')
    )
    matched = [box for box in candidates if box]
    if not matched:
        return None
    return '|'.join(matched)

# Pull the country names out of the 'solr_additions' JSON
df['Spatial Coverage'] = df['solr_additions'].map(parse_countries)

# The raw 'solr_additions' column is no longer needed
df = df.drop(columns=['solr_additions'])

# Look up a bounding box for every country listed
df['Bounding Boxes'] = df['Spatial Coverage'].map(find_bounding_boxes)

## Merge multiple bounding boxes into one

In [None]:
def combine_bounding_boxes(bboxes):
    """Merge pipe-separated bounding boxes into one enclosing box.

    Each box is read as 'west,south,east,north'. The result is a string in
    the same order that covers every input box. None is returned when
    ``bboxes`` is missing (None/NaN), empty, or contains no box at all.
    A box with non-numeric or fewer than four values still raises, as before.
    """
    if not isinstance(bboxes, str):
        return None

    # Initialize min and max values with extreme values
    min_lon = min_lat = float('inf')
    max_lon = max_lat = float('-inf')
    found_box = False

    for bbox in bboxes.split('|'):
        if not bbox.strip():
            continue  # skip empty segments such as the ones in 'a||b'
        coords = [float(value) for value in bbox.split(',')]
        min_lon = min(min_lon, coords[0])  # west - minimum longitude
        min_lat = min(min_lat, coords[1])  # south - minimum latitude
        max_lon = max(max_lon, coords[2])  # east - maximum longitude
        max_lat = max(max_lat, coords[3])  # north - maximum latitude
        found_box = True

    # Without this guard an input such as '|' would yield 'inf,inf,-inf,-inf'
    if not found_box:
        return None

    return f"{min_lon},{min_lat},{max_lon},{max_lat}"

# Collapse each record's list of boxes into a single enclosing box
df['Bounding Box'] = df['Bounding Boxes'].map(combine_bounding_boxes)

In [None]:
def extract_dates(dataset_date, open_end_year=None):
    """Derive the Date Range and Temporal Coverage values from ``dataset_date``.

    The first and last four-digit numbers in the string are taken as the
    start and end years. A string ending in 'TO *]' is open-ended; its end
    year is ``open_end_year``, or the current year when that is not given
    (a fixed year would go stale as soon as the calendar moves on).

    Returns a ``(date_range, temporal_coverage)`` tuple. Both values are the
    single year when start and end are equal, otherwise 'start-end'.
    ``(None, None)`` is returned when the value is not a string or holds no year.
    """
    if not isinstance(dataset_date, str):
        return None, None

    # Four consecutive digits delimited by word boundaries are treated as years
    years = re.findall(r'\b\d{4}\b', dataset_date)
    if not years:
        return None, None

    start_year = years[0]
    if dataset_date.endswith('TO *]'):
        end_year = str(open_end_year) if open_end_year is not None else time.strftime('%Y')
    else:
        end_year = years[-1]

    # Both output fields share the same formatting rule
    span = start_year if start_year == end_year else f"{start_year}-{end_year}"
    return span, span

# Apply the function to the 'dataset_date' column and split the resulting
# (date_range, temporal_coverage) pairs into their own columns. Building the
# columns from lists also works when the DataFrame has no rows, where
# unpacking zip(*...) would fail with nothing to unpack.
date_pairs = [extract_dates(value) for value in df['dataset_date']]
df['Date Range'] = [pair[0] for pair in date_pairs]
df['Temporal Coverage'] = [pair[1] for pair in date_pairs]
# df.drop(columns=['dataset_date'], inplace=True)


In [None]:
def extract_date_range(dataset_date, open_end_year=None):
    """Return the Date Range for ``dataset_date``, always as 'yyyy-yyyy'.

    The first and last four-digit numbers in the string are taken as the
    start and end years. A string ending in 'TO *]' is open-ended; its end
    year is ``open_end_year``, or the current year when that is not given
    (a fixed year would go stale as soon as the calendar moves on).

    None is returned when the value is not a string or holds no year.
    """
    if not isinstance(dataset_date, str):
        return None

    # Four consecutive digits delimited by word boundaries are treated as years
    years = re.findall(r'\b\d{4}\b', dataset_date)
    if not years:
        return None

    start_year = years[0]
    if dataset_date.endswith('TO *]'):
        end_year = str(open_end_year) if open_end_year is not None else time.strftime('%Y')
    else:
        end_year = years[-1]

    # Unlike extract_dates-style output, a single year is still written as a range
    return f"{start_year}-{end_year}"

# Fill 'Date Range' with the yyyy-yyyy form derived from 'dataset_date'
df['Date Range'] = df['dataset_date'].map(extract_date_range)
# df.drop(columns=['dataset_date'], inplace=True)

In [None]:
def extract_year(metadata_created):
    """Return the part of ``metadata_created`` that precedes its first hyphen."""
    year, _separator, _rest = metadata_created.partition('-')
    return year

# Take the year of 'metadata_created' as the 'Date Issued' value
df['Date Issued'] = df['metadata_created'].map(extract_year)
# df.drop(columns=['metadata_created'], inplace=True)

In [None]:
def extract_keywords(tags):
    """Return the tags' 'display_name' values joined with a pipe.

    Tags without a 'display_name' are skipped, and so is the 'geodata' tag
    (compared case-insensitively).
    """
    names = []
    for tag in tags:
        if 'display_name' not in tag:
            continue
        label = tag['display_name']
        if label.lower() != 'geodata':
            names.append(label)
    return '|'.join(names)

# Build the pipe-delimited 'Keyword' column, then discard the raw tag lists
df['Keyword'] = df['tags'].map(extract_keywords)
df = df.drop(columns=['tags'])

### To do: Parse the URLs

In [None]:
# def extract_shp_geodatabase_resources(resources):
#     # Initialize an empty list to collect the download URLs
#     download_urls = []

#     # Iterate through each resource in the array
#     for resource in resources:
#         # Check if the format is either "SHP" or "Geodatabase"
#         if resource.get('format') in ['SHP', 'Geodatabase']:
#             # Extract the download URL
#             url = resource.get('download_url')
#             if url:
#                 download_urls.append(url)

#     # Join the URLs with a pipe
#     return '|'.join(download_urls)

# # Apply the function to the 'resources' column
# df['SHP_Geodatabase_URLs'] = df['resources'].apply(extract_shp_geodatabase_resources)


### Add default and constructed values

In [None]:
# Add the constant values and the constructed landing-page URL in one step;
# the columns are appended in the order listed
df = df.assign(**{
    'Date Accessioned': action_date,
    'Code': "99-1400",
    'Is Part Of': "99-1400",
    'Member Of': "b0153110-e455-4ced-9114-9b13250a7093",
    'Accrual Method': "CKAN",
    'Access Rights': "Public",
    'Language': "eng",
    'Provider': "Humanitarian Data Exchange",
    'Information': "https://data.humdata.org/dataset/" + df['ID'].astype(str),
    'Format': "Files",
    'Resource Class': "Datasets",
})

In [None]:
# Define the desired order of columns.
# Reindexing on this list drops every column that is not named here, so the
# computed 'Publisher', 'Date Issued' and 'Keyword' columns are listed
# explicitly; otherwise they never reach the CSV.
desired_order = [
'Title',
'Alternative Title',
'Description',
'Language',
'Format',
'Display Note',
'Creator',
'Publisher',
'Provider',
'Resource Class',
'Theme',
'Keyword',
'Temporal Coverage',
'Date Issued',
'Date Range',
'Spatial Coverage',
'Bounding Box',
'Member Of',
'Is Part Of',
'FeatureServer',
'HTML',
'ImageServer',
'Information',
'MapServer',
'ID',
'Identifier',
'Rights',
'Access Rights',
'Date Accessioned',
'Code',
'Accrual Method',
'License',
'Bounding Boxes'

# Add more columns as needed in the desired order
]

# Put the columns in the desired order; listed columns that do not exist yet
# are created empty and unlisted ones are dropped
df = df.reindex(desired_order, axis='columns')

## Account for US state data

In [None]:
# Read the US state bounding boxes from the shared data folder
state_df = pd.read_csv(filepath_or_buffer='../../data/states.csv')
# Look-up table: state name -> bounding box
state_bbox_map = {
    state: bbox
    for state, bbox in zip(state_df['State'], state_df['Bounding Box'])
}

def update_spatial_coverage_and_bounding_box(row, bbox_lookup=None):
    """Add the US state named in the title to a record's spatial fields.

    Titles that start with "United States (state)" get the state appended to
    'Spatial Coverage' (pipe-separated), and 'Bounding Box' is replaced by
    the state's box when the state is known.

    Parameters:
        row: a mapping or pandas Series with 'Alternative Title' and,
            optionally, 'Spatial Coverage' and 'Bounding Box'. It is
            modified in place.
        bbox_lookup: mapping of state name to bounding box; defaults to the
            module-level ``state_bbox_map``.

    Returns:
        The same ``row`` object, updated when the title matched.
    """
    title = row['Alternative Title']
    if not isinstance(title, str):
        return row  # missing title: nothing to match against

    # Regex to capture state from the title format "United States (state)"
    match = re.match(r'United States \(([^)]+)\)', title)
    if match is None:
        return row

    state = match.group(1)

    # Append to an existing value; an empty or missing value is replaced
    # outright so the result never starts with a stray pipe.
    existing = row.get('Spatial Coverage')
    if isinstance(existing, str) and existing:
        row['Spatial Coverage'] = f"{existing}|{state}"
    else:
        row['Spatial Coverage'] = state

    if bbox_lookup is None:
        bbox_lookup = state_bbox_map

    # Update the Bounding Box if the state is found in the map
    if state in bbox_lookup:
        row['Bounding Box'] = bbox_lookup[state]
    return row

# Run the US state update once per record (row-wise)
df = df.apply(update_spatial_coverage_and_bounding_box, axis='columns')

## Clean up the titles

In [None]:
# Re-read the nation table so the names keep their original capitalisation
bbox_path = os.path.join('../../', 'data', 'nation-bbox.csv')
bbox_df = pd.read_csv(bbox_path)

# Collect every distinct 'Label' and non-missing 'altLabel', in order of first
# appearance, then put the longest names first
nations = pd.concat([bbox_df['Label'], bbox_df['altLabel'].dropna()]).drop_duplicates().tolist()
nations.sort(key=len, reverse=True)

def _tidy_title(text):
    """Strip leading non-word characters, collapse whitespace, capitalise the first letter."""
    text = re.sub(r"^[^\w]+", "", text)
    text = re.sub(r'\s+', ' ', text).strip()
    if text and text[0].islower():
        text = text[0].upper() + text[1:]
    return text


def transform_title(row, nations):
    """Build the display title from a record's 'Alternative Title'.

    A leading "United States (state)" is removed and "[state]" is appended.
    Otherwise the first entry of ``nations`` found in the title (whole word,
    case-insensitive) is removed and "[nation]" is appended. The remaining
    text is tidied before the bracketed place is added, so the brackets are
    never touched by the clean-up and the first letter is capitalised even
    when the place name was followed by punctuation.

    Parameters:
        row: mapping or pandas Series with an 'Alternative Title' entry.
        nations: iterable of place names, checked in the order given.

    Returns:
        The transformed title. If removing the place name leaves no text,
        the tidied original title is returned instead. A non-string title is
        returned unchanged.
    """
    alt_title = row['Alternative Title']
    if not isinstance(alt_title, str):
        return alt_title

    remainder = alt_title
    context = ""

    # First, handle the United States (state) pattern
    us_state_match = re.match(r'United States \(([^)]+)\)', alt_title)
    if us_state_match:
        remainder = re.sub(r'United States \([^)]+\)\s*', '', alt_title)
        context = f"[{us_state_match.group(1)}]"
    else:
        # Next, handle nation names; the first one that occurs wins
        for nation in nations:
            stripped, hits = re.subn(rf"\b{re.escape(nation)}\b", '', alt_title, flags=re.I)
            if hits:
                remainder = stripped
                context = f"[{nation}]"
                break

    remainder = _tidy_title(remainder)
    if not remainder:
        # The title consisted of the place name only; keep its wording
        return _tidy_title(alt_title)

    # Append the context (nation or state) to the end of the title
    return f"{remainder} {context}" if context else remainder

# Compute the display title of every record, passing the nation list along
df['Title'] = df.apply(transform_title, axis=1, args=(nations,))

## Add Theme categories based on keywords in the title

In [None]:
# Read the theme vocabulary from the shared data folder
theme_df = pd.read_csv(filepath_or_buffer='../../data/theme.csv')

# Map every lower-cased keyword, and the lower-cased label itself, to its theme label
theme_map = {}
for label, keyword_field in zip(theme_df['label'], theme_df['keywords']):
    for keyword in [*keyword_field.split('|'), label]:
        theme_map[keyword.lower()] = label

def assign_theme(title, keyword_themes=None):
    """Return the themes whose keywords occur in ``title``, joined with a pipe.

    Matching is a case-insensitive substring test against the lower-cased
    keys of the mapping. Each theme is listed once, in the order its first
    matching keyword appears in the mapping.

    Parameters:
        title: the title to scan.
        keyword_themes: mapping of lower-cased keyword to theme label;
            defaults to the module-level ``theme_map``.

    Returns:
        The pipe-joined theme labels, or None when nothing matches or the
        title is not a string.
    """
    if not isinstance(title, str):
        return None
    if keyword_themes is None:
        keyword_themes = theme_map

    # Lower-case once instead of once per keyword
    lowered_title = title.lower()

    matched_themes = []
    for keyword, theme in keyword_themes.items():
        if keyword in lowered_title and theme not in matched_themes:
            matched_themes.append(theme)

    if matched_themes:
        return '|'.join(matched_themes)
    return None

# Apply the function to the Title column to update the Theme column
df['Theme'] = df['Title'].apply(assign_theme)

In [None]:
# Matches any HTML-style tag; compiled once because it is applied to every cell
_HTML_TAG_PATTERN = re.compile(r'<.*?>')


# Function to remove HTML tags from a string
def remove_html_tags(text):
    """Return ``text`` without HTML tags; non-string values (None, NaN, numbers) pass through unchanged."""
    if not isinstance(text, str):
        return text
    return _HTML_TAG_PATTERN.sub('', text)


# Function to clean up and standardize text data in DataFrame
def clean_text_columns(column):
    """Clean one DataFrame column of text.

    Object and string columns have HTML tags removed, every run of
    whitespace (newlines, carriage returns, tabs, repeated spaces) reduced
    to a single space, and leading/trailing whitespace trimmed. Columns of
    any other dtype are returned unchanged.
    """
    is_text = column.dtype == "object" or pd.api.types.is_string_dtype(column.dtype)
    if not is_text:
        return column

    # Remove HTML tags
    column = column.apply(remove_html_tags)
    # One whitespace regex covers \n, \r and \t as well as repeated spaces;
    # the raw string avoids the invalid '\s' escape of a plain literal.
    return (
        column.str.replace('<br/>', ' ', regex=False)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

# Run the text clean-up over the DataFrame one column at a time
df = df.apply(clean_text_columns, axis=0)

In [None]:
# Save the finished table as CSV, leaving out the row index
df.to_csv(path_or_buf=csv_file_path, index=False)