## Transform a batch of GBL 1.0 JSON files from UW-Madison

**Purpose: This script will read a batch of GBL-1.0 metadata JSON files and transform them into a single CSV.**

This version is specific to UW-Madison records.

Metadata records in the [GeoBlacklight](https://opengeometadata.org/docs/gbl-1.0) or [OpenGeoMetadata](https://opengeometadata.org/docs/ogm-aardvark) standards are frequently shared as batches of JSON files. The entire [OpenGeoMetadata organization](https://github.com/OpenGeoMetadata) contains repositories full of hundreds of thousands of GeoBlacklight JSONs.

In order to ingest these into the BTAA Geoportal, we need to transform them into a CSV.  


## Part 1: Load the modules and JSON files

### Import python modules

In [None]:
import csv
import json
import os
import pandas as pd
import uuid
import datetime
import re

### Declare the paths and file names

First, move a folder of the JSONs into this directory. Files in the folder can be nested.

In [None]:
json_path = r"/Users/majew030/GitHub/OGM/edu.wisc/HeldBy_RML" # path to the folder of JSON files; it is walked recursively when the files are loaded
csv_name = "10d-03" # base name for the output CSV, without the .csv extension (the extension is added when the file is written)

### Load the files into a pandas DataFrame

In [None]:
dataset = []  # one parsed JSON document per file

# Walk the folder recursively and parse every .json file found
for dir_path, _subdirs, files in os.walk(json_path):
    for filename in files:
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(dir_path, filename)
        # Read raw bytes and decode leniently so a stray invalid UTF-8 byte
        # does not abort the batch; the context manager closes each handle
        # as soon as the file has been read.
        with open(file_path, 'rb') as json_file:
            data = json_file.read().decode('utf-8', errors='ignore')
        dataset.append(json.loads(data))

df = pd.DataFrame(dataset)  # convert the list of records into a DataFrame

In [None]:
# Optional sanity check: print each field name found in the loaded records, one per line
for field_name in df.columns:
    print(field_name)

### Transform multivalued fields for CSV template

In [None]:
# Array-valued fields are flattened into one string per record,
# with the individual values separated by pipes '|'
multivalued_fields = (
    'dc_creator_sm',
    'dc_subject_sm',
    'dct_spatial_sm',
    'dct_isPartOf_sm',
    'dct_temporal_sm',
)

for multivalued_field in multivalued_fields:
    df[multivalued_field] = df[multivalued_field].str.join('|')

In [None]:
# Build the Bounding Box from solr_geom, reordering the coordinates from W,E,N,S to W,S,E,N.
# Note that str.strip removes any of the characters in 'ENVELOPE()' from both ends
# of the value, which peels off the wrapper around the four coordinates.
envelope_values = df['solr_geom'].str.strip('ENVELOPE()')
df[['w','e','n','s']] = envelope_values.str.split(',', expand=True)

wsen_columns = df[['w', 's','e','n']]
df['Bounding Box'] = wsen_columns.agg(','.join, axis=1)

In [None]:
### Rename columns to match GeoBTAA Template

# Source field name -> GeoBTAA template column label.
# NOTE(review): dc_publisher_s and dc_publisher_sm both map to 'Publisher'; if a batch
# contains both fields the result would carry two 'Publisher' columns - confirm.
gbl_to_geobtaa_labels = {
    'dc_title_s' : 'Title',
    'dc_description_s' : 'Description',
    'dc_creator_sm' : 'Creator',
    'dct_issued_s' : 'Date Issued',
    'dc_rights_s' : 'Access Rights',
    'dc_format_s' : 'Format',
    'layer_slug_s' : 'ID',
    'layer_id_s' : 'WxS Identifier',
    'dct_provenance_s' : 'Provider',
    'dc_publisher_s' : 'Publisher',
    'dc_publisher_sm' : 'Publisher',
    'dct_temporal_sm' : 'Temporal Coverage',
    'dct_isPartOf_sm' : 'Local Collection',
    'dc_subject_sm': 'Subject',
    'uw_deprioritize_item_b' : 'Child Record',
    'thumbnail_path_ss' : 'B1G Image',
}

df = df.rename(columns=gbl_to_geobtaa_labels)

## Add new fields

In [None]:
# Create Date Range field

def format_temporal_coverage(row):
    """
    Build the "Date Range" value from a row's "Temporal Coverage".

    Temporal Coverage may hold several values separated by pipes, so each
    value is converted on its own and the results are rejoined with pipes:
    - a value that already starts with a yyyy-yyyy range is kept as-is
    - any other value is duplicated into "value-value" (e.g. 2010 -> 2010-2010)

    Returns an empty string when the coverage is missing or blank.
    """
    temporal_coverage = row['Temporal Coverage']

    if not isinstance(temporal_coverage, str):
        if pd.isna(temporal_coverage):
            return ''
        # e.g. a year that was stored as a number in the source JSON
        temporal_coverage = str(temporal_coverage)

    date_ranges = []
    for value in temporal_coverage.split('|'):
        value = value.strip()
        if not value:
            # skip blanks so an empty field does not become a bare "-"
            continue
        if re.match(r'\d{4}-\d{4}', value):
            date_ranges.append(value)  # already a range, no change needed
        else:
            date_ranges.append(f"{value}-{value}")

    return '|'.join(date_ranges)

# Apply the function to each row (axis=1) to create or update the "Date Range" column
df['Date Range'] = df.apply(format_temporal_coverage, axis=1)

In [None]:
# Create Georeferenced field

def check_geotiff(value):
    """
    Return the string "true" when the value is present and contains
    "GeoTIFF" (case-sensitive); return the string "false" otherwise.
    """
    is_geotiff = pd.notna(value) and "GeoTIFF" in value
    return "true" if is_geotiff else "false"

# Create the "Georeferenced" column by running check_geotiff on every "Format" value;
# the result is the string "true" or "false", not a boolean
df["Georeferenced"] = df["Format"].apply(check_geotiff)


In [None]:
# Create Identifier field: the geodata.wisc.edu catalog URL followed by the record's ID

df['Identifier'] = "https://geodata.wisc.edu/catalog/" + df['ID']

In [None]:
# Create Theme field

# Subject term -> Theme term; terms not listed here are carried over unchanged
theme_map = {
    "Farming": "Agriculture",
    "Biota": "Biology",
    "Atmospheric Sciences": "Climate",
    "Geoscientific Information": "Geology",
    "Imagery and Base Maps": "Imagery",
    "Planning and Cadastral": "Property",
    "Utilities and Communication": "Utilities",
}


def map_theme_multivalued(subject):
    """
    Translate a pipe-separated Subject string into Theme terms.

    Each term is stripped of surrounding whitespace and replaced by its
    theme_map entry when one exists; the terms are then rejoined with pipes.
    Non-string or blank input is returned untouched.
    """
    if not isinstance(subject, str):
        return subject
    if not subject.strip():
        return subject

    remapped_terms = []
    for raw_term in subject.split("|"):
        term = raw_term.strip()
        remapped_terms.append(theme_map.get(term, term))
    return "|".join(remapped_terms)

# Derive the Theme column from Subject; Subject itself is left unchanged
df['Theme'] = df['Subject'].apply(map_theme_multivalued)


In [None]:
### Add administrative fields with default values

# Today's date in yyyy-mm-dd format, used as the accession date
today_date = datetime.date.today().isoformat()

# Each entry becomes a column holding the same value in every row
administrative_defaults = {
    'Date Accessioned': today_date,
    'Code': "10",
    'Is Part Of': "10d-03",
    'Member Of': "dc8c18df-7d64-4ff4-a754-d18d0891187d",
    'Accrual Method': "GBL-1.0",
    'Language': "eng",
    'Spatial Coverage': "Wisconsin",
}

for column_label, default_value in administrative_defaults.items():
    df[column_label] = default_value


### Transform values for fields without a straight crosswalk

In [None]:
# Convert Type to Resource Class value: 'Image' becomes 'Imagery';
# every other value (including a missing one) becomes 'Datasets'
df['Resource Class'] = df['dc_type_s'].apply(lambda x: 'Imagery' if x == 'Image' else 'Datasets')


# Convert Geometry Type to Resource Type value by appending ' data'.
# Rows without a geometry type are left empty (NaN) rather than being turned
# into the literal text "nan data" by the string conversion.
geometry_type = df['layer_geom_type_s']
df['Resource Type'] = (geometry_type.astype(str) + ' data').where(geometry_type.notna())

In [None]:
# Add Display Note from uw_notice_s and uw_supplemental_s

def map_display_note(notice, supplemental):
    """
    Combine the notice and supplemental texts into one pipe-separated note.

    - the notice comes first (if it is a nonblank string)
    - the supplemental text comes second, prefixed with "Info: " (if nonblank)

    Both texts are stripped of surrounding whitespace. Returns an empty
    string when neither is usable.
    """
    def _clean(text):
        # anything that is not a string (e.g. NaN) counts as blank
        return text.strip() if isinstance(text, str) else ""

    notice_text = _clean(notice)
    supplemental_text = _clean(supplemental)

    segments = []
    if notice_text:
        segments.append(notice_text)
    if supplemental_text:
        segments.append(f"Info: {supplemental_text}")
    return "|".join(segments)

# Pair each row's notice with its supplemental text and build the combined note
df['Display Note'] = list(
    map(map_display_note, df['uw_notice_s'], df['uw_supplemental_s'])
)

In [None]:
# Update Creator field to FAST format

# County names without the " County" suffix, used to recognise county creators.
# A few names appear in more than one spelling ('Fond du Lac'/'Fond Du Lac',
# 'La Crosse'/'Lacrosse', 'St. Croix'/'St Croix') so that either form is matched.
counties_in_wisconsin = [
    'Adams', 'Ashland', 'Barron', 'Bayfield', 'Brown', 'Buffalo',
    'Burnett', 'Calumet', 'Chippewa', 'Clark', 'Columbia', 'Crawford',
    'Dane', 'Dodge', 'Door', 'Douglas', 'Dunn', 'Eau Claire',
    'Florence', 'Fond du Lac', 'Fond Du Lac', 'Forest', 'Grant', 'Green', 'Green Lake',
    'Iowa', 'Iron', 'Jackson', 'Jefferson', 'Juneau', 'Kenosha',
    'Kewaunee', 'La Crosse', 'Lacrosse', 'Lafayette', 'Langlade', 'Lincoln',
    'Manitowoc', 'Marathon', 'Marinette', 'Marquette', 'Menominee',
    'Milwaukee', 'Monroe', 'Oconto', 'Oneida', 'Outagamie', 'Ozaukee',
    'Pepin', 'Pierce', 'Polk', 'Portage', 'Price', 'Racine',
    'Richland', 'Rock', 'Rusk', 'Sauk', 'Sawyer', 'Shawano',
    'Sheboygan', 'St. Croix', 'St Croix', 'Taylor', 'Trempealeau', 'Vernon', 'Vilas',
    'Walworth', 'Washburn', 'Washington', 'Waukesha', 'Waupaca',
    'Waushara', 'Winnebago', 'Wood'
]


In [None]:
def prepend_wisconsin(name, counties):
    """
    Prepend 'Wisconsin--' to county and city creators.

    `name` may hold several creators separated by pipes; each one is
    checked on its own and the results are rejoined with pipes:
    - a creator ending in " County" whose base name is in `counties`
      gets the prefix
    - a creator starting with "City of " gets the prefix
    - anything else is returned as-is

    Non-string input (e.g. NaN for a record with no creator) is returned
    unchanged.
    """
    # Records without a creator reach this function as NaN/None
    if not isinstance(name, str):
        return name

    def _prefix_one(creator):
        # County check
        if creator.endswith(" County"):
            base = creator[:-len(" County")]
            if base in counties:
                return f"Wisconsin--{creator}"
        # City check
        if creator.startswith("City of "):
            return f"Wisconsin--{creator}"
        return creator

    return "|".join(_prefix_one(creator) for creator in name.split("|"))


# Run every Creator value through prepend_wisconsin using the county list above
df['Creator'] = df['Creator'].apply(
    lambda creator: prepend_wisconsin(creator, counties_in_wisconsin)
)

## Cleanup

In [None]:
def trim_pipes_and_spaces(value):
    """
    Tidy a string cell: drop pipes and spaces from both ends, then squeeze
    any run of two or more whitespace characters down to a single space.
    Values that are not strings are returned untouched.
    """
    if not isinstance(value, str):
        return value

    trimmed = value.strip('| ')
    return re.sub(r'\s{2,}', ' ', trimmed)

# Apply the function to every cell of the DataFrame; cells that are not strings
# come back unchanged
df = df.map(trim_pipes_and_spaces)

In [None]:
# Drop duplicates by ID: rows sharing an ID are reduced to a single row
# (drop_duplicates keeps the first occurrence by default)
df = df.drop_duplicates(subset=['ID'])

### Write the DataFrame to a CSV file with Aardvark labels

In [None]:
# Column labels of the primary CSV, in the order they are written out
desired_order = [
    'Title', 'Description', 'Language', 'Display Note',
    'Creator', 'Publisher', 'Provider',
    'Resource Class', 'Resource Type',
    'Local Collection', 'Theme', 'Subject', 'Keyword',
    'Temporal Coverage', 'Date Issued', 'Date Range',
    'Spatial Coverage', 'Bounding Box', 'Geometry',
    'Member Of', 'Is Part Of',
    'Format', 'WxS Identifier', 'Georeferenced',
    'ID', 'Identifier',
    'Access Rights', 'Child Record',
    'Date Accessioned', 'Code', 'Accrual Method',
    'B1G Image',
]

# reindex keeps only the listed columns, in this order; a label that does not
# exist in df yields an empty column, and df columns not listed are left out
primary_df = df.reindex(columns=desired_order)

In [None]:
# Write the primary records to <csv_name>.csv without the index; missing values become empty cells
primary_csv_filename = f"{csv_name}.csv"
primary_df.to_csv(primary_csv_filename, index=False, na_rep='')

# Distributions

### Split the References into columns and write to a separate CSV

In [None]:
import logging

# Configure logging: messages at INFO level and above go to the file 'processing.log',
# each formatted as "<timestamp> - <level> - <message>"
logging.basicConfig(filename='processing.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def extract_values(row):
    """
    Parse a record's dct_references_s JSON string.

    The value is parsed as-is first. If that fails, it is retried with
    doubled quotes ("") collapsed to single quotes, which covers values
    whose quotes were escaped CSV-style. Parsing the untouched value first
    keeps legitimate empty strings ("") in valid JSON intact.

    Returns the parsed JSON (normally a dict keyed by reference URI), or
    None when the value is missing or cannot be parsed; failures are
    logged together with the record's ID.
    """
    raw_references = row.get('dct_references_s')
    # layer_slug_s is renamed to ID earlier in the notebook; accept either name
    record_id = row.get('ID', row.get('layer_slug_s'))

    if not isinstance(raw_references, str) or not raw_references.strip():
        logging.warning(f'No references found for record with ID: {record_id}')
        return None

    last_error = None
    for candidate in (raw_references, raw_references.replace('""', '"')):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            last_error = e

    # Log the error and the record causing it
    logging.error(f'Error processing record with ID: {record_id}, Error: {str(last_error)}')
    return None

# Parse the references of every row, spread the parsed keys into their own
# columns, and attach those columns to the right of the existing DataFrame
parsed_references = df.apply(extract_values, axis=1)
reference_columns = parsed_references.apply(pd.Series)
df = pd.concat([df, reference_columns], axis=1)

# Reference URI (a key in the parsed references JSON) -> short column name
reference_uri_labels = {
    'http://schema.org/downloadUrl': 'download',
    'http://schema.org/url': 'full_layer_description',
    'http://www.isotc211.org/schemas/2005/gmd/': 'metadata_iso',
    'http://www.opengis.net/cat/csw/csdgm': 'metadata_fgdc',
    'http://www.w3.org/1999/xhtml': 'metadata_html',
    'http://lccn.loc.gov/sh85035852': 'documentation_download',
    'http://iiif.io/api/image': 'iiif_image',
    'http://iiif.io/api/presentation#manifest': 'iiif_manifest',
    'http://www.loc.gov/mods/v3': 'metadata_mods',
    'https://openindexmaps.org': 'open_index_map',
    'http://www.opengis.net/def/serviceType/ogc/wms': 'wms',
    'http://www.opengis.net/def/serviceType/ogc/wfs': 'wfs',
    'urn:x-esri:serviceType:ArcGIS#FeatureLayer': 'arcgis_feature_layer',
    'urn:x-esri:serviceType:ArcGIS#TiledMapLayer': 'arcgis_tiled_map_layer',
    'urn:x-esri:serviceType:ArcGIS#DynamicMapLayer': 'arcgis_dynamic_map_layer',
    'urn:x-esri:serviceType:ArcGIS#ImageMapLayer': 'arcgis_image_map_layer',
}

df = df.rename(columns=reference_uri_labels)

# Define the columns in the DataFrame that correspond to distribution types.
# Only columns named here are written to the distributions CSV.
# NOTE(review): 'documentation_external' is not produced by the reference rename in this
# notebook, and 'full_layer_description' (from http://schema.org/url) is not listed here,
# so those links are not exported - confirm this is intended.
distribution_columns = [
    'download', 'documentation_external', 'metadata_iso', 'metadata_fgdc', 'metadata_html',
    'documentation_download', 'iiif_image', 'iiif_manifest', 'metadata_mods',
    'open_index_map', 'wms', 'wfs', 'arcgis_feature_layer',
    'arcgis_tiled_map_layer', 'arcgis_dynamic_map_layer', 'arcgis_image_map_layer'
]

def is_array_type(value):
    """Report whether the value is a Python list."""
    return isinstance(value, list)


def extract_multiple_downloads(row):
    """
    Turn a row's list of downloads into distribution entries.

    Each dict in the row's "download" list yields one entry carrying the
    row's ID, the dict's "label" and "url" (empty string when absent) and
    the reference type "download". List items that are not dicts are
    skipped. Returns an empty list when "download" is not a list.
    """
    record_id = row["ID"]
    download_entries = row.get("download", None)

    if not is_array_type(download_entries):
        return []

    return [
        {
            "friendlier_id": record_id,
            "label": entry.get("label", ""),
            "reference_type": "download",
            "distribution_url": entry.get("url", ""),
        }
        for entry in download_entries
        if isinstance(entry, dict)
    ]

# Collects one dict per distribution link; each dict becomes a row of the new CSV
distribution_rows = []

# Distribution columns other than "download" that are present in the DataFrame
other_reference_columns = [
    column_name for column_name in distribution_columns
    if column_name != "download" and column_name in df.columns
]

for _, record in df.iterrows():
    record_id = record['ID']
    download_value = record.get('download', None)

    if is_array_type(download_value):
        # Several downloads: one row per entry, labelled from the entry itself
        distribution_rows.extend(extract_multiple_downloads(record))
    elif pd.notnull(download_value):
        # Single download: the record's Format serves as the label
        distribution_rows.append({
            'friendlier_id': record_id,
            'label': record.get('Format', ""),
            'reference_type': 'download',
            'distribution_url': record['download']
        })

    # Every other populated reference becomes its own row with a blank label
    for reference_type in other_reference_columns:
        if pd.notnull(record.get(reference_type, None)):
            distribution_rows.append({
                'friendlier_id': record_id,
                'label': "",
                'reference_type': reference_type,
                'distribution_url': record[reference_type]
            })

# Build a DataFrame from the collected distribution links
distribution_df = pd.DataFrame(distribution_rows)

# Write the distribution links to distributions.csv without the index
distribution_df.to_csv('distributions.csv', index=False)