## Transform a batch of OpenGeoMetadata JSON files

**Purpose: This script will read a batch of GeoBlacklight metadata JSON files and transform them into a single CSV.**

Metadata records in the [GeoBlacklight](https://opengeometadata.org/docs/gbl-1.0) or [OpenGeoMetadata](https://opengeometadata.org/docs/ogm-aardvark) standards are frequently shared as batches of JSON files. The entire [OpenGeoMetadata organization](https://github.com/OpenGeoMetadata) contains repositories full of hundreds of thousands of GeoBlacklight JSONs.

In order to ingest these into the BTAA Geoportal, we need to transform them into a CSV.  


## 1. Import python modules

In [1]:
import csv
import json
import os
import pandas as pd
import uuid

## 2. Declare the paths and file names

Put a folder of the JSONs into this directory. They can be nested.

In [2]:
# Enter the name of the folder that holds the JSON files
json_path = "princeton"
# Enter a name for the output CSV, without the .csv extension
csv_name = "princeton"

## 3. Load the files into a pandas DataFrame

In [3]:
dataset = []  # one parsed JSON record per file

# Walk json_path (including nested folders) and parse every .json file found
for path, _dirs, files in os.walk(json_path):
    for filename in files:
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(path, filename)
        # The context manager closes the file handle even if reading fails
        with open(file_path, 'rb') as json_file:
            # errors='ignore' silently drops bytes that are not valid UTF-8
            data = json_file.read().decode('utf-8', errors='ignore')
        loaded = json.loads(data)
        dataset.append(loaded)

df = pd.DataFrame(dataset)  # convert the list of records into a DataFrame

## 4. Edit the values of various fields

In [5]:
def _join_multivalued(value, sep='|'):
    """Join a list or tuple into a single *sep*-delimited string.

    Any other value (a plain string, NaN, ...) is returned unchanged.
    Series.str.join is deliberately not used: it treats a plain string as a
    sequence of characters, so "1990" would become "1|9|9|0".
    """
    if isinstance(value, (list, tuple)):
        return sep.join(str(item) for item in value)
    return value


# return the first value of a multivalued cell;this removes the []
# df['dc_creator_sm']=df['dc_creator_sm'].str[0]
# df['dc_subject_sm']=df['dc_subject_sm'].str[0]

# Collapse multivalued cells into pipe-delimited strings; this removes the [].
# Temporal Coverage is a mix of single values and lists, so lists are joined
# with a pipe while single values are kept as they are.
for column in ('dc_creator_sm', 'dc_subject_sm', 'dct_spatial_sm', 'dct_temporal_sm'):
    df[column] = df[column].apply(_join_multivalued)

# Split solr_geom coordinates and reorder from WENS to WSEN
# str.strip removes any of the characters in 'ENVELOPE()' from both ends,
# leaving only the comma-separated coordinates
df[['w', 'e','n','s']] = df['solr_geom'].str.strip('ENVELOPE()').str.split(',', expand=True)
df['Bounding Box'] = df[['w', 's','e','n']].agg(','.join, axis=1)

#Convert Data Type to Resource Class value
# df['Resource Class'] = df['dc_type_s'].apply(lambda x: 'Datasets' if x == 'Dataset' else '')

#Convert Geometry Type to Resource Type value
df['Resource Type'] = df['layer_geom_type_s'].astype(str) + ' data'

# Create Date Range field by repeating the temporal value as start and end
df['Date Range'] = df['dct_temporal_sm'].astype(str) +'-' + df['dct_temporal_sm'].astype(str)

## 5. Split the References into separate columns

In [6]:
def extract_values(row):
    """Parse the ``dct_references_s`` JSON string of *row*.

    Args:
        row: A DataFrame row (or any mapping) with a ``dct_references_s``
            entry.

    Returns:
        The decoded JSON value. An already-decoded dict is returned as is,
        and a missing entry (None, NaN or a blank string) yields an empty
        dict so the row still expands to empty reference columns.

    Raises:
        json.JSONDecodeError: If the string cannot be parsed even after
            collapsing doubled quotes.
    """
    references = row['dct_references_s']
    if isinstance(references, dict):
        return references
    if not isinstance(references, str) or not references.strip():
        return {}
    try:
        # Parse as-is first so valid JSON with empty-string values ("") survives
        return json.loads(references)
    except json.JSONDecodeError:
        # Fall back for strings whose quotes were doubled ("" instead of ")
        return json.loads(references.replace('""', '"'))

# Parse the references of every row, turn each parsed result into its own
# set of columns, and attach those columns to the right of the existing ones
parsed_references = df.apply(extract_values, axis=1)
reference_columns = parsed_references.apply(pd.Series)
df = pd.concat([df, reference_columns], axis=1)

# Rename columns based on keys in the JSON
df = df.rename(columns={
    'http://schema.org/downloadUrl': 'Download',
    'http://schema.org/url': 'Information',
    'http://www.isotc211.org/schemas/2005/gmd/': 'ISO19139',
    'http://www.opengis.net/cat/csw/csdgm': 'FGDC',
    'http://www.w3.org/1999/xhtml': 'HTML',
    'http://lccn.loc.gov/sh85035852': 'Documentation',
    'http://iiif.io/api/image': 'IIIF',
    'http://iiif.io/api/presentation#manifest': 'Manifest',
    'http://www.loc.gov/mods/v3': 'MODS',
    'https://openindexmaps.org': 'Index Map',
    'http://www.opengis.net/def/serviceType/ogc/wms': 'WMS',
    'http://www.opengis.net/def/serviceType/ogc/wfs': 'WFS',
    'urn:x-esri:serviceType:ArcGIS#FeatureLayer': 'FeatureServer',
    'urn:x-esri:serviceType:ArcGIS#TiledMapLayer': 'TileServer',
    'urn:x-esri:serviceType:ArcGIS#DynamicMapLayer': 'MapServer',
    'urn:x-esri:serviceType:ArcGIS#ImageMapLayer': 'ImageServer',
    'http://schema.org/DownloadAction': 'Harvard Download'
    # Add more key-value pairs for renaming columns as needed
})

## 6. Remove unnecessary columns

In [7]:
# Source and helper columns that are not wanted in the output CSV
columns_to_drop = [
    'geoblacklight_version',
    'layer_modified_dt',
#     'thumbnail_path_ss',
    'w', 'e', 'n', 's',
    'layer_id_s',
    'solr_year_i',
    'layer_geom_type_s',
    'solr_geom',
    'dct_references_s',
]

# errors='ignore' skips columns that are absent from this batch of records
# instead of raising a KeyError
df = df.drop(columns=columns_to_drop, errors='ignore')

## 7. Rename columns

In [8]:
# Output CSV column names keyed by the source metadata field names
field_labels = {
    'dc_title_s': 'Title',
    'dc_description_s': 'Description',
    'dc_creator_sm': 'Creator',
    'dct_issued_s': 'Date Issued',
    'dc_rights_s': 'Access Rights',
    'dc_format_s': 'Format',
    'layer_slug_s': 'ID',
    'dc_identifier_s': 'Identifier',
    'dc_language_s': 'Language',
    'dct_provenance_s': 'Provider',
    # NOTE(review): both publisher fields map to 'Publisher'; if a batch has
    # both columns the result has two 'Publisher' columns - confirm
    'dc_publisher_s': 'Publisher',
    'dc_publisher_sm': 'Publisher',
    'dc_source_sm': 'Source',
    'dct_spatial_sm': 'Spatial Coverage',
    'dc_subject_sm': 'Subject',
    'dct_temporal_sm': 'Temporal Coverage',
}

df = df.rename(columns=field_labels)


In [None]:
# new_rows = []
# for index, row in df.iterrows():
#     dct_isPartOf_sm = row['dct_isPartOf_sm']
#     if pd.notna(dct_isPartOf_sm):
#         if isinstance(dct_isPartOf_sm, str):
#             title = dct_isPartOf_sm.strip()
#             new_id = str(uuid.uuid4())  # Generating a new UUID as the ID for the collection
#             new_row = {
#                 'Title': title,
#                 'ID': new_id,
#                 'Resource Class': 'Collections'
#             }
#             new_rows.append(new_row)
#         elif isinstance(dct_isPartOf_sm, list):
#             for title in dct_isPartOf_sm:
#                 title = title.strip()
#                 new_id = str(uuid.uuid4())  # Generating a new UUID as the ID for each collection
#                 new_row = {
#                     'Title': title,
#                     'ID': new_id,
#                     'Resource Class': 'Collections'
#                 }
#                 new_rows.append(new_row)

# # Append the new rows to the DataFrame
# # (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement)
# df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

## 8. Write to a CSV file

In [9]:
# Optional: uncomment to discard columns that are empty for every row
# df = df.dropna(axis=1, how='all')

# Write the table without the index column; missing values become empty cells
output_file = "{}.csv".format(csv_name)
df.to_csv(output_file, index=False, na_rep='')