# OIH Master Data Product Generator

## Product concepts

* solr
* WMO WIS2
* state of the ocean report (shacl + s2/h3 needed for this too)
* Duplication checks  (or do in the graph directly)


## About
This notebook tests some approaches for processing the release graphs into a format that is useful for the Solr index.

## ADRs

1)



## Look at

* https://github.com/sybrenjansen/mpire

## Example Solr input at

https://github.com/iodepo/odis-arch/blob/master/graphOps/extraction/solr/solrexample.json


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

## requirements.txt

In [None]:
%%capture
!pip install -q rdflib
!pip install -q shapely
!pip install -q pyld
!pip install -q kglab
!pip install -q minio
!pip install -q objdict
!pip install -q shapely
!pip install -q geopandas
!pip install -q oxrdflib

## imports

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
import pandas as pd
import geopandas as gpd
from shapely import wkt
# import s3fs
import pyarrow.parquet as pq
import shapely
import requests
import os
import re
import json, io
from pyld import jsonld
import kglab
from minio import Minio
import rdflib
from rdflib import ConjunctiveGraph  #  needed for nquads
from urllib.request import urlopen
from dateutil import parser
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from pyproj import Geod
import oxrdflib # https://github.com/oxigraph/oxrdflib

In [None]:
# Check for using GPU, in case you want to ensure your GPU is used
# gc = kglab.get_gpu_count()
# print(gc)

## Definitions

In [None]:
# pop out last element in a quad to make a triple
def popper(input):
    """Turn N-Quads bytes into triple-style text by dropping trailing tokens.

    Every ``http://schema.org`` occurrence is rewritten to
    ``https://schema.org``. A line is kept only when it splits into more
    than three space-separated tokens; its last two tokens are removed
    (for a quad line these are the graph term and the closing ``.``) and
    `` .`` is appended again. All other lines, blank ones included, are
    dropped.

    Args:
        input: Encoded document as ``bytes`` (decoded with the default
            UTF-8 codec).

    Returns:
        The rewritten lines joined with newlines, as ``str``.
    """
    triples = []
    for raw in input.decode().split('\n'):
        tokens = raw.replace("http://schema.org", "https://schema.org").split(' ')
        if len(tokens) <= 3:
            continue
        # Slicing off two tokens at once replaces the two pop() calls.
        triples.append(' '.join(tokens[:-2]) + ' .')
    return '\n'.join(triples)

def contextAlignment(input):
    """Decode a byte payload and switch schema.org IRIs from http to https.

    Args:
        input: Encoded document as ``bytes`` (decoded with the default
            UTF-8 codec).

    Returns:
        The decoded text with every ``http://schema.org`` replaced by
        ``https://schema.org``; the line structure is untouched.
    """
    # The search text contains no newline, so a single pass over the whole
    # document yields exactly what a line-by-line rewrite would.
    return input.decode().replace("http://schema.org", "https://schema.org")

def publicurls(client, bucket, prefix):
    """List presigned GET URLs for every non-empty object under a prefix.

    Args:
        client: Object-store client exposing ``list_objects``,
            ``stat_object`` and ``presigned_get_object`` (a ``Minio``
            client in this notebook).
        bucket: Name of the bucket to list.
        prefix: Key prefix; the listing is recursive.

    Returns:
        A list of URL strings in listing order. Objects whose stat reports
        a size of 0 are skipped.
    """
    # NOTE(review): size > 0 is only a stand-in; the original author was
    # looking for a way to tell whether an object is public.
    listing = client.list_objects(bucket, prefix=prefix, recursive=True)
    return [
        client.presigned_get_object(bucket, item.object_name)
        for item in listing
        if client.stat_object(bucket, item.object_name).size > 0
    ]

def to_wkt(polygon_string):
    """Wrap a comma-separated coordinate list in WKT ``POLYGON`` syntax.

    Each comma-separated item has its internal whitespace collapsed to
    single spaces; the items are then joined with ``", "``. The order of
    the two numbers inside an item is kept as given (no axis swap), and
    the ring is not closed or validated.

    Args:
        polygon_string: Text such as ``"x1 y1,x2 y2,..."``.

    Returns:
        A string of the form ``"POLYGON ((x1 y1, x2 y2, ...))"``.
    """
    vertices = (' '.join(vertex.split()) for vertex in polygon_string.split(','))
    return "POLYGON ((" + ', '.join(vertices) + "))"

def contains_alpha(s):
    """Report whether a value contains at least one alphabetic character.

    Args:
        s: Usually a string. ``None`` and numbers (which includes the
            float NaN used for missing cells) are accepted as well.

    Returns:
        ``True`` if any character of ``s`` is alphabetic, otherwise
        ``False``. ``None`` and numeric values always give ``False``.
    """
    # None previously fell through to any() and raised TypeError.
    if s is None or isinstance(s, (int, float)):
        return False
    return any(c.isalpha() for c in s)


## Load Graph(s)

At this point we have the URLs, and we could either load all of them in a loop or pick one manually and use it.  This section demonstrates loading and working with a single graph.


In [None]:
# No credentials are passed and secure=False, so this is an anonymous client on port 80.
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
# One URL per non-empty object in bucket "public" whose key starts with "graph".
urls = publicurls(client, "public", "graph")
for u in urls:
  print(u)

http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarvessels_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedmarinet

In [None]:
# Load a single N-Quads release graph into an rdflib ConjunctiveGraph.

# u = "http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq"
u = "http://ossapi.oceaninfohub.org/public/graphs/summonedoceanexperts_v1_release.nq"

# The context manager closes the HTTP response once the payload is read;
# previously the connection was left open.
with urlopen(u) as df:
    dg = df.read()

# Rewrite http://schema.org to https://schema.org before parsing.
r = contextAlignment(dg)

g = ConjunctiveGraph()
# g = rdflib.ConjunctiveGraph(store="Oxigraph")
g.parse(data=r, format="nquads")
print(len(g))

901711


In [None]:
# # load all graphs

# g = ConjunctiveGraph()
# for u in urls:
#   print("loading: {}".format(u))

#   df = urlopen(u)
#   dg = df.read()
#   r = contextAlignment(dg)

#   g.parse(data=r, format="nquads")

# print(len(g))


In [None]:
## Convert the rdflib graph to a kglab KnowledgeGraph

# Prefix -> namespace IRI mapping handed to kglab.
namespaces = {
    "sh":   "http://www.w3.org/ns/shacl#" ,
    "schema":   "https://schema.org/" ,
    "geo":      "http://www.opengis.net/ont/geosparql#",
}

# import_graph=g wraps the rdflib graph loaded above; use_gpus=True is requested unconditionally.
kg = kglab.KnowledgeGraph(name = "OIH test", base_uri = "https://oceaninfohub.org/id/", namespaces = namespaces, use_gpus=True, import_graph = g)

## Query Section

In [None]:

# List of URLs
#     "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/baseQuery.rq",
urls = [
    "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/baseQuery.rq",
    "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/sup_geo.rq",
    "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/address_geo.rq",
    "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/sup_temporal.rq",
    "https://raw.githubusercontent.com/iodepo/odis-in/master/SPARQL/searchOIH/dataset.rq",
]

# Download each SPARQL query and expose its text as a module-level variable
# named after the file (e.g. "sup_geo.rq" -> sup_georq).
# NOTE: a failed download only prints a message; the matching variable is
# then never created and a later reference to it raises NameError.
for url in urls:
    try:
        # Without a timeout requests can block forever on a stalled server.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to download URL {url}. Status code: {response.status_code}")
        continue

    # Extract the file name from the URL and change ".rq" to "rq"
    file_name = url.split("/")[-1].replace(".rq", "rq")
    content = response.text

    # Create a variable with the modified name and store the content
    globals()[file_name] = content

In [None]:
print(address_georq)

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <https://schema.org/>
PREFIX geosparql: <http://www.opengis.net/ont/geosparql#>

SELECT (?s as ?id) ?type ?address ?addressCountry ?country
WHERE {
    ?s rdf:type ?type .
    FILTER (?type IN (schema:ResearchProject, schema:Project, schema:Organization,
                         schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
                         schema:CourseInstance, schema:Event, schema:Vehicle )
    )
    FILTER (isIRI(?s))
    OPTIONAL {
        ?s schema:nationality ?nat .
        ?nat a schema:Country .
        ?nat schema:name ?country .
    }
    OPTIONAL {
        ?s schema:spatialCoverage ?sc .
        ?sc a schema:Place .
        ?sc schema:geo ?geo .
        OPTIONAL {
            ?geo schema:address ?address .
        }
        OPTIONAL {
            ?geo schema:addr

In [None]:


# # this expects -- spatialCoverage -> Place -- geo --> GeoShape
# # could prefix asWKT with hasGeometry
# geoq = """
# PREFIX schema: <https://schema.org/>
# PREFIX geosparql: <http://www.opengis.net/ont/geosparql#>

# SELECT (?s as ?id) ?type ?name ?geotype ?geompred ?geom ?lat ?long
# WHERE
# {
#     ?s rdf:type ?type .
#     FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
#     schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
#     schema:CourseInstance, schema:Event, schema:Vehicle ) )
#     ?s schema:spatialCoverage ?sc .
#     ?sc a schema:Place .
#     OPTIONAL { ?sc schema:name ?name } .
#     OPTIONAL {
#       ?sc schema:latitude ?lat .
#       ?sc schema:longitude ?long .
#     }
#     OPTIONAL {
#       ?sc schema:geo ?geo .
#       ?geo a ?geotype .
#       ?geo ?geompred ?geom .
#       FILTER(!isIRI(?geom))
#     }
#     OPTIONAL {
#       ?gs geosparql:asWKT ?wkt
#     }
# }

# """

# timeq = """
# PREFIX schema: <https://schema.org/>

# SELECT (?s as ?id) ?type ?time ?temporalCoverage ?dateModified ?datePublished
# WHERE
# {
#     ?s rdf:type ?type .
#     FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization,
#     schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
#     schema:CourseInstance, schema:Event, schema:Vehicle ) )

#     OPTIONAL { ?s schema:temporalCoverage ?temporalCoverage }
#     OPTIONAL { ?s schema:dataModified   ?dataModified }
#     OPTIONAL { ?s schema:datePublished   ?datePublished }


# }
# """



# datasetq = """
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
# prefix prov: <http://www.w3.org/ns/prov#>
# PREFIX schema: <https://schema.org/>

# SELECT DISTINCT (?s as ?id) ?type ?name ?headline ?url ?description ?sameAs ?license ?citation ?keywords ?includedInDataCatalog ?distribution ?region ?provider ?publisher ?creator
# WHERE {
#     graph ?g {
#         BIND(schema:Dataset AS ?type)
#         ?s rdf:type ?type .

#         OPTIONAL { ?s schema:name ?name . }
#         OPTIONAL { ?s schema:headline ?headline . }
#         OPTIONAL { ?s schema:url ?url . }
#         OPTIONAL { ?s schema:description ?description . }

#         OPTIONAL { ?s schema:sameAs ?sameAs . }
#         OPTIONAL { ?s schema:license ?license . }
#         OPTIONAL { ?s schema:citation ?citation . }
#         OPTIONAL { ?s schema:keywords ?keywords . }
#         OPTIONAL { ?s schema:includedInDataCatalog ?includedInDataCatalog . }
#         OPTIONAL { ?s schema:distribution ?distribution . }
#         OPTIONAL { ?s schema:region ?region . }
#         OPTIONAL { ?s schema:provider ?provider . }
#         OPTIONAL { ?s schema:publisher ?publisher .}
#         OPTIONAL { ?s schema:creator ?creator . }
#         }
# }
# """


## Notes

* many of the above can be arrays, we need to note this in the shacl cardinality
* also the Pandas dataframes need to roll these up into comma-separated items, i.e., Python lists via aggregate and join.   These will then serialize out to Solr JSON correctly


In [None]:
##would be nice to do a type count SPARQL here as a sanity check...
# counts = """
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>


# SELECT  ( COUNT( DISTINCT ?s) as ?count) ?type
# WHERE
#  {
#      graph ?g {
#          ?s rdf:type ?type .
#      }
# }
# GROUP BY ?type
# ORDER BY DESC(?count)
# """

# cdf = kg.query_as_df(counts)

# cdf.head(10)



## Loop on Queries

NOTE: do the queries need isIRI on the subject to avoid resources without a top-level ID?

In [None]:
# Query texts; these names are created dynamically by the download cell via globals().
qlist = [baseQueryrq, datasetrq,  sup_georq, sup_temporalrq, address_georq]

# m1 = pd.merge(pdf, geodf, on='id', how='outer')
# mf = pd.DataFrame()
# Run every query against the knowledge graph and keep only non-empty result frames.
# NOTE(review): a query that returns 0 rows is dropped here, so its columns
# will be absent from the merged frame built from dfl — confirm downstream
# cells tolerate that.
dfl = []
for q in qlist:
  df = kg.query_as_df(q)
  print(len(df))  # row count per query, as a sanity check
  if len(df) > 0:
    dfl.append(df)

209367
0
0
50944


In [None]:
# Columns shared by every query result and used as the join key.
common_column = ["id", "type"]  # Replace with the actual common column name

# Initialize a merged DataFrame with the first DataFrame
# (raises IndexError when dfl is empty, i.e. when every query returned 0 rows)
merged_df = dfl[0]

# Iterate through the remaining DataFrames and merge them into the merged_df
# NOTE(review): how='inner' keeps only (id, type) pairs present in every
# frame; the older commented-out code in this notebook merged with
# how='outer' — confirm which is intended.
for df in dfl[1:]:
    merged_df = pd.merge(merged_df, df, on=common_column, how='inner')

In [None]:
merged_df['id'].nunique()

50944

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67447 entries, 0 to 67446
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           67447 non-null  object
 1   type         67447 non-null  object
 2   name         49112 non-null  object
 3   url          59578 non-null  object
 4   description  22507 non-null  object
dtypes: object(5)
memory usage: 2.6+ MB


## Post processing

### GeoSpatial

In [None]:
# Keep a geometry value only when contains_alpha() finds no letters in it; anything with letters becomes NaN.
merged_df['filteredgeom'] = merged_df['geom'].apply(lambda x: np.nan if contains_alpha(x) else x)

### Regions
Incorporate Jeff's regions.py which needs

* address (Org, Person, Course?)
* name (THING, in all)
* spatialFeature (WKT geom column)
* countryOfLastProcessing (vehicle only)

In [None]:
import re

def normalize(s):
    """Reduce a name or address to a set of lower-case comparison tokens.

    For strings: lower-case the text, drop every ``(...)`` and ``[...]``
    group, remove the stand-alone stop words "and", "the" and "of", strip
    trailing periods and split on whitespace.

    Args:
        s: Text to normalise. Lists, tuples and sets are converted to a
            set unchanged; any other value (``None``, NaN, numbers) is
            treated as "no text".

    Returns:
        A ``set`` of tokens; empty when there is nothing to tokenise.
    """
    if isinstance(s, str):
        s = s.lower()
        # Non-greedy groups: "a (x) b (y)" keeps "b". The greedy ".*" form
        # erased everything between the first "(" and the last ")".
        s = re.sub(r"\([^)]*\)", "", s)
        s = re.sub(r"\[[^\]]*\]", "", s)
        # Word boundaries keep the stop words from being cut out of longer
        # words ("netherlands" used to become "nerls").
        s = re.sub(r"\b(?:and|the|of)\b", "", s)
        s = s.rstrip('.')
        return set(s.split(None))
    if isinstance(s, (list, tuple, set, frozenset)):
        return set(s)
    # NaN / None / numbers are not iterable; set(s) used to raise TypeError.
    return set()

In [None]:
# from . import datashaping
import shapely.geometry
import os, json

import shapely.wkt
import shapely.geometry
from urllib.request import urlopen

# Region polygons: load the GeoJSON features and attach a shapely geometry
# to each one under the extra 'shape' key (read by feature()).
# GeoJSON is UTF-8, so the encoding is pinned instead of using the locale default.
with open('/content/regions-clipped.geojson', 'r', encoding='utf-8') as f:
    geo_regions = json.load(f)['features']
for r in geo_regions:
    r['shape'] = shapely.geometry.shape(r['geometry'])

# leverage the UNSD API "GeoArea" JSON endpoint, instead of locally-stored CSV
#  see https://unstats.un.org/SDGAPI/swagger/
unsdGeoareaEndpoint = "https://unstats.un.org/SDGAPI/v1/sdg/GeoArea/Tree"
# The context manager closes the HTTP response after the body is read.
with urlopen(unsdGeoareaEndpoint) as response:
    unsdDataJSON = json.loads(response.read())

# use the "World (total) by continental regions" branch
continentalRegions = unsdDataJSON[1]
continentalRegionsChildren = continentalRegions['children']

# parse the JSON from the API call
# countries_dict_with_regions: lower-cased country name -> [region, sub-region]
# country_map_list: (normalized token set, lower-cased country name) pairs
countries_dict_with_regions = {}
country_map_list = []

for list_regions in continentalRegionsChildren:
    regionName = list_regions['geoAreaName']
    # 'children' may be None at any level; `or []` turns that into "nothing to visit".
    for list_subregions in list_regions['children'] or []:
        subRegionName = list_subregions['geoAreaName']
        for list_intermediate_regions in list_subregions['children'] or []:
            if list_intermediate_regions['type'] == 'Region':
                # Intermediate region: its children are the countries.
                country_nodes = list_intermediate_regions['children'] or []
            else:
                # The node itself is already a country.
                country_nodes = [list_intermediate_regions]
            for country_node in country_nodes:
                countryName = country_node['geoAreaName'].lower()
                countries_dict_with_regions[countryName] = [regionName, subRegionName]
                country_map_list.append((normalize(countryName), countryName))


def _regions_for_text(text):
    """Return [region, sub-region] of the most specific country named in *text*.

    A country matches when all of its normalized tokens occur in the
    normalized text. Among several matches the one with the most tokens
    wins, so "South Sudan" is not resolved through the shorter "sudan";
    equally specific matches keep the previous "last one wins" order.
    Countries whose name normalizes to no tokens are ignored, because an
    empty set is a subset of every text.
    """
    tokens = normalize(text)
    best_parts = None
    best_regions = []
    for parts, country in country_map_list:
        if not parts or not parts <= tokens:
            continue
        if country not in countries_dict_with_regions:
            continue
        if best_parts is None or len(parts) >= len(best_parts):
            best_parts = parts
            best_regions = countries_dict_with_regions[country]
    # Copy so callers cannot mutate the shared lookup table.
    return list(best_regions)


def address(address):
    """Look up the regions for a country mentioned in an address.

    Args:
        address: Address text (missing values are tolerated only as far
            as ``normalize`` tolerates them).

    Returns:
        ``[region, sub-region]`` for the matched country, or ``[]``.
    """
    return _regions_for_text(address)


def name(n):
    """Look up the regions for a country mentioned in a resource name.

    Args:
        n: Name/title text.

    Returns:
        ``[region, sub-region]`` for the matched country, or ``[]``.
    """
    return _regions_for_text(n)


def countryLastProcessing(countryOfLastProcessing):
    """Look up the regions for a "country of last processing" value.

    Args:
        countryOfLastProcessing: Country text.

    Returns:
        ``[region, sub-region]`` for the matched country, or ``[]``.
    """
    return _regions_for_text(countryOfLastProcessing)


def feature(feature):
    """List the names of all regions whose polygon intersects a WKT geometry.

    Args:
        feature: Geometry as WKT text.

    Returns:
        The ``properties.name`` value of every entry in ``geo_regions``
        whose ``shape`` intersects the parsed geometry. An empty list is
        returned (after printing "Invalid WKT") when parsing or the
        intersection test fails.
    """
    try:
        the_geom = shapely.wkt.loads(feature)
        return [r['properties']['name'] for r in geo_regions if r['shape'].intersects(the_geom)]
    except Exception:
        # Best effort on purpose, but no longer a bare `except:` — that one
        # also swallowed KeyboardInterrupt and SystemExit.
        print("Invalid WKT")
        return list()

FileNotFoundError: ignored

In [None]:
import pandas as pd

# Sample inputs for the region helpers: three titles that mention a country plus one WKT polygon.
data1 = ["Marine Science Country Profiles : Kenya",
        "The fisheries of Barbados and some of their problems",
        "Fiji : Where's the data?", 'POLYGON ((-95.5 19.5,-95.5 31.5,-73.5 31.5,-73.5 19.5,-95.5 19.5))']

# Same titles without the polygon; not used in this cell.
data2 = ["Marine Science Country Profiles : Kenya",
        "The fisheries of Barbados and some of their problems",
        "Fiji : Where's the data?"]

# Single-column frame ('name') built from data1.
dftest = pd.DataFrame({'name': data1})
dftest.head()

In [None]:
# merged_df['region'] = merged_df['name'].apply(lambda x: x + [item for item in name(x) if item not in x] if x else x)
# Run all four region helpers on the same 'name' column of the test frame.
# Falsy cells (e.g. '' or None) are passed through unchanged instead of being looked up.
dftest['nregion'] = dftest['name'].apply(lambda x: name(x)  if x else x)
dftest['aregion'] = dftest['name'].apply(lambda x: address(x)  if x else x)
dftest['cregion'] = dftest['name'].apply(lambda x: countryLastProcessing(x)  if x else x)
dftest['fregion'] = dftest['name'].apply(lambda x: feature(x)  if x else x)

dftest.head()

In [None]:
def g(df):
    """Merge the four per-source region columns into one ``region`` column.

    NOTE: this function reuses the name ``g``, which earlier in the
    notebook refers to the rdflib graph; after this cell runs, ``g`` is
    the function.

    Args:
        df: DataFrame with the columns ``nregion``, ``aregion``,
            ``cregion`` and ``fregion``. It is modified in place.

    Returns:
        The same DataFrame, with a new ``region`` column holding the
        de-duplicated union of the four lists (element order is not
        defined) and the four source columns removed.
    """
    sources = ['nregion', 'aregion', 'cregion', 'fregion']

    def _as_list(cell):
        # Cells that are not list-like (NaN, None, '') contribute nothing;
        # they used to break the list concatenation.
        return list(cell) if isinstance(cell, (list, tuple, set)) else []

    # Iterating the columns by label avoids the positional x[0]..x[3]
    # lookups on a label-indexed row, which pandas 2.x deprecates.
    df['region'] = [
        list({item for cell in cells for item in _as_list(cell)})
        for cells in zip(*(df[col] for col in sources))
    ]
    for col in sources:
        del df[col]
    return df

In [None]:
dftest = g(dftest.copy())

dftest.head()

In [None]:
def make_pairs(ll):
  """Group a flat coordinate sequence into consecutive 2-tuples.

  Args:
    ll: Indexable sequence such as ``[a0, b0, a1, b1, ...]``.

  Returns:
    A list ``[(a0, b0), (a1, b1), ...]``; empty for an empty input.

  Raises:
    IndexError: If ``ll`` holds an odd number of items.
  """
  return [(ll[start], ll[start + 1]) for start in range(0, len(ll), 2)]

In [None]:
def gj(geom, value):
  """Build a shapely geometry from a coordinate string and derive one value.

  The whitespace-separated numbers are paired up in the order given:
  one pair becomes a Point, two pairs a box (first pair = min corner,
  second pair = max corner) and three or more pairs a Polygon.
  NOTE(review): each pair is handed to shapely as (x, y) without any
  axis swap — confirm the source data is not in "lat lon" order.

  Args:
    geom: Coordinate text, e.g. ``"x0 y0 x1 y1"``.
    value: Which result to produce: ``"centroid"`` (shapely Point),
      ``"length"`` (``geom.length`` as computed by shapely), ``"area"``
      (absolute geodesic area from pyproj on the WGS84 ellipsoid),
      ``"wkt"`` or ``"geojson"`` (text).

  Returns:
    The requested value, or ``None`` when ``geom`` holds fewer than two
    numbers, an odd count of numbers, a token that is not a number, or
    when ``value`` is not one of the names above.
  """
  try:
    numbers = [float(token) for token in geom.split()]
  except ValueError:
    # A non-numeric token (e.g. the text "None") used to abort the whole
    # DataFrame.apply; treat it like any other unusable input.
    return None
  # An odd count cannot be paired and would raise IndexError in make_pairs.
  if len(numbers) < 2 or len(numbers) % 2:
    return None

  cp = make_pairs(numbers)

  if len(cp) == 1:
    shape = shapely.Point(cp[0])
  elif len(cp) == 2:
    shape = shapely.box(cp[0][0], cp[0][1], cp[1][0], cp[1][1])
  else:
    shape = shapely.Polygon(cp)

  if value == "centroid":
    return shape.centroid
  elif value == "length":
    return shape.length
  elif value == "area":
    geod = Geod(ellps="WGS84")
    # geometry_area_perimeter returns (area, perimeter); the area is signed.
    return abs(geod.geometry_area_perimeter(shape)[0])
  elif value == "wkt":
    return shapely.to_wkt(shape)
  elif value == "geojson":
    return shapely.to_geojson(shape)
  else:
    return None

In [None]:
# merged_df['dt_endDate'] = merged_df['temporalCoverage'].apply(lambda x: re.split("/", x)[1] if "/" in x else np.nan)

# Derive five geometry products from the filtered coordinate strings.
# str() is applied first, so a missing value reaches gj() as text (NaN -> "nan").
# Each line calls gj() once per row, i.e. the coordinates are parsed five times.
merged_df['centroid'] = merged_df['filteredgeom'].apply(lambda x: gj(str(x), "centroid"))
merged_df['length'] = merged_df['filteredgeom'].apply(lambda x: gj(str(x), "length"))
merged_df['area'] = merged_df['filteredgeom'].apply(lambda x: gj(str(x), "area"))
merged_df['wkt'] = merged_df['filteredgeom'].apply(lambda x: gj(str(x), "wkt"))
merged_df['geojson'] = merged_df['filteredgeom'].apply(lambda x: gj(str(x), "geojson"))

In [None]:
# merged_df[].head()
print(merged_df["wkt"][1])

### Temporal

In [None]:
def _year_or_nan(date_text):
    """Return the year of a date string containing "-", or NaN if unusable."""
    if "-" not in str(date_text):
        return np.nan
    try:
        return parser.parse(date_text).year
    except (ValueError, OverflowError, TypeError):
        # dateutil raises ParserError (a ValueError) or OverflowError on
        # malformed dates; one bad record must not abort the whole column.
        return np.nan


# Split "start/end" intervals into their two halves and extract the years.
merged_df['temporalCoverage'] = merged_df['temporalCoverage'].astype('str')  # fine to make str since we don't use in the solr JSON
merged_df['dt_startDate'] = merged_df['temporalCoverage'].apply(lambda x: x.split("/")[0] if "/" in x else np.nan)
merged_df['dt_endDate'] = merged_df['temporalCoverage'].apply(lambda x: x.split("/")[1] if "/" in x else np.nan)
merged_df['n_startYear'] = merged_df['dt_startDate'].apply(_year_or_nan)
merged_df['n_endYear'] = merged_df['dt_endDate'].apply(_year_or_nan)

In [None]:
merged_df.info()

In [None]:
merged_df.head()

In [None]:
# transforms needed for aggregation
# Every keyword cell is cast to str before the per-id roll-up, which joins
# them as text; missing values are stringified too (NaN becomes "nan").
merged_df['keywords'] = merged_df['keywords'].astype(str)  #  why is this needed?


In [None]:
def _join_text(values):
    """Join a group's non-null values with ', '; NaN when the group has none.

    A plain ``', '.join`` raises TypeError as soon as a group contains a
    missing value (a float NaN), which name/description/url do.
    """
    present = [str(v) for v in values.dropna()]
    return ', '.join(present) if present else np.nan


# One row per id: multi-valued text columns are rolled up into a
# comma-separated string, every other column keeps its first value.
mf = merged_df.groupby('id').agg({'keywords': _join_text,
                                        'type': 'first',
                                        'name': _join_text,
                                        'description': _join_text,
                                        'url': _join_text,
                                        'geotype':'first',
                                        'geompred':'first',
                                        'geom':'first',
                                        'temporalCoverage': 'first',
                                        'datePublished': 'first',
                                        'license': 'first',
                                        'creator': 'first',
                                        'includedInDataCatalog': 'first',
                                        'distribution': 'first',
                                        'publisher': 'first',
                                        'filteredgeom': 'first',
                                        'dt_startDate': 'first',
                                        'dt_endDate': 'first',
                                        'n_startYear': 'first',
                                        'n_endYear': 'first'}).reset_index()


In [None]:
mf.info()

## Output JSON for Solr

Example Records

https://github.com/iodepo/odis-arch/blob/master/graphOps/extraction/solr/solrexample.json

for

https://catalogue.cioos.ca/dataset/ff0232d8-34bd-4456-be28-20d4f8b2937c.jsonld




In [None]:
# Persist the aggregated table in two formats, written to the current working directory.
mf.to_parquet('solr_set.parquet')
mf.to_csv('solr_set.csv')  # default to_csv settings, so the row index is written as the first column

In [None]:
output_directory = 'json_output'

In [None]:
# Create the output directory if needed; exist_ok avoids the separate
# exists() check and the race between checking and creating.
os.makedirs(output_directory, exist_ok=True)

In [None]:
def remove_brackets(string):
  """Strip one enclosing ``<...>`` pair from a string.

  Args:
    string: Usually an IRI rendered as ``"<http://...>"``. Values that
      are not strings (numbers, NaN, ``None``) are accepted as well.

  Returns:
    The text without its first and last character when it starts with
    ``<`` and ends with ``>``; otherwise the input unchanged. Non-string
    input is always returned unchanged.
  """
  # Covers int/float as before and additionally None, which used to raise
  # AttributeError on .startswith().
  if not isinstance(string, str):
    return string
  if string.startswith('<') and string.endswith('>'):
    return string[1:-1]
  return string

In [None]:
from objdict import ObjDict


def _has_value(value):
    """Return True unless *value* is None or a float NaN (missing-cell markers)."""
    return value is not None and not (isinstance(value, float) and value != value)


# Single-valued columns copied into a one-element array when present:
# (Solr field, source column, optional clean-up function).
_ARRAY_FIELDS = (
    ('txt_url', 'url', None),
    ('txt_license', 'license', None),
    ('txt_creator', 'creator', None),
    ('txt_includedInDataCatalog', 'includedInDataCatalog', remove_brackets),
    ('txt_distribution', 'distribution', None),
    ('txt_publisher', 'publisher', remove_brackets),
    # temporal
    ('dt_startDate', 'dt_startDate', None),
    ('dt_endDate', 'dt_endDate', None),
    ('n_startYear', 'n_startYear', None),
    ('n_endYear', 'n_endYear', None),
)

# Write one JSON document per aggregated row. Missing cells are skipped for
# every field (the same rule the keywords/name guards already applied), so
# no NaN/None value ends up in the output.
for index, row in mf.iterrows():
    data = ObjDict()

    # not in arrays
    data.id = remove_brackets(row['id'])
    data.type = row['type']

    if isinstance(row['keywords'], str):
        data.txt_keywords = [x.strip() for x in row['keywords'].split(',')]
    if _has_value(row['name']):
        data.txt_name = row['name']
    if _has_value(row['description']):
        data.description = row['description']

    for field, column, clean in _ARRAY_FIELDS:
        cell = row[column]
        if not _has_value(cell):
            continue
        setattr(data, field, [clean(cell) if clean else cell])

    # # geo
    # if row["filteredgeom"] != np.nan:
    #      data.geotype = [row['geompred']]
    #      data.geom = [ row["filteredgeom"]]
        #  data.geojson_point = [ row["filteredgeom"]]
        #  data.geojson_simple = [ row["filteredgeom"]]
        #  data.geojson_geom = [ row["filteredgeom"]]
        #  data.geom_area = [ row["filteredgeom"]]
        #  data.geom_length = [ row["filteredgeom"]]

    # write
    json_string = data.dumps(indent=4)

    # Define the filename based on the row index or a unique identifier from your data
    filename = os.path.join(output_directory, f'row_{index}.json')

    # Write the JSON string to the file
    with open(filename, 'w') as json_file:
        json_file.write(json_string)

In [None]:
### Core Items
## takes around 3 minutes for core query
# pdfstep1 = kg.query_as_df(coreq)
# pdfstep1['keywords'] = pdfstep1['keywords'].astype(str)  #  why is this needed?
# pdfstep1 = pdfstep1.astype(str)
# pdfstep1.info()
## need to role up the keywords into one cell
# pdf = pdfstep1.groupby('id').agg({'keywords': ', '.join, 'type': 'first', 'name': ', '.join, 'description': ', '.join, 'url': ', '.join}).reset_index()
# geodf_grouped = geodf.groupby('id').agg({'geotype': 'first', 'geompred': 'first', 'geom': 'first', 'filteredgeom': 'first'}).reset_index()
### Spatial Sections

# geodf = kg.query_as_df(geoq)
## remove geojson entries detected via contains_alpha and turn them into "" or np.nan
# geodf['filteredgeom'] = geodf['geom'].apply(lambda x: np.nan if contains_alpha(x) else x)
### Temporal Sections
## takes < 1 minute for temporal query
# timedf = kg.query_as_df(timeq)
# timedf['temporalCoverage'] = timedf['temporalCoverage'].astype('str')  # fine to make str since we don't use in the solr JSON
# timedf['dt_startDate'] = timedf['temporalCoverage'].apply(lambda x: re.split("/", x)[0] if "/" in x else np.nan)
# timedf['dt_endDate'] = timedf['temporalCoverage'].apply(lambda x: re.split("/", x)[1] if "/" in x else np.nan)
# timedf['n_startYear'] = timedf['dt_startDate'].apply(lambda x: parser.parse(x).year if "-" in str(x) else np.nan)
# timedf['n_endYear'] = timedf['dt_endDate'].apply(lambda x: parser.parse(x).year if "-" in str(x) else np.nan)
# timedf.info()
### Document section
# docdf = kg.query_as_df(docq)
# docdf.info()
### Merge results together
# mf = pd.merge(pdf, geodf, on='id', how='outer')
# mf = pd.merge(mf, timedf, on='id', how='outer')
# mf = pd.merge(mf, docdf, on='id', how='outer')
## this was just an example to show we could remove rows with a specific NaN in a column
# mf = mf.loc[mf['dt_startDate'].notnull()]
# mf['id'].nunique()
# mf.info()
# mf.head(5)