In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from itertools import chain, starmap
import json
import pandas as pd
import weaviate
from queries import search_datasets_query, browse_datasets_query

# Remove every pandas display cap so long and wide frames render in full
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = None

In [2]:
# Load variables from a .env file, if one can be found, so WEAVIATE_ENDPOINT
# may be supplied there; variables already present in the environment are
# not overridden (python-dotenv default).
load_dotenv(find_dotenv())

# Connect to the Weaviate instance; raises KeyError if WEAVIATE_ENDPOINT is unset.
client = weaviate.Client(os.environ["WEAVIATE_ENDPOINT"])

In [3]:
def flatten_json(dictionary, sep='.'):
    """Flatten a nested JSON-like dictionary into a single-level dictionary.

    Nested keys are joined with ``sep`` and list elements are addressed by
    their position, e.g. ``{'a': {'b': [1, 2]}}`` becomes
    ``{'a.b.0': 1, 'a.b.1': 2}``. For a list of dictionaries, call this on
    each item before converting to a pandas DataFrame.

    Args:
        dictionary: The dictionary to flatten. It is not modified; a new
            dictionary is always returned.
        sep: String placed between the parts of a flattened key.

    Returns:
        A new dict in which no value is a dict or a list.

    Note:
        Empty dicts and empty lists produce no entries, so their keys do
        not appear in the result.
    """

    def unpack(parent_key, parent_value):
        """Yield (key, value) pairs with exactly one level of nesting removed."""
        if isinstance(parent_value, dict):
            for key, value in parent_value.items():
                # str() keeps non-string keys (e.g. ints) from raising TypeError
                yield str(parent_key) + sep + str(key), value
        elif isinstance(parent_value, list):
            for index, value in enumerate(parent_value):
                yield str(parent_key) + sep + str(index), value
        else:
            # Atomic value: pass through unchanged
            yield parent_key, parent_value

    # Always run at least one pass (so the caller gets a fresh dict), then
    # keep unpacking until no value is a dict or list.
    while True:
        dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
        if not any(isinstance(value, (dict, list)) for value in dictionary.values()):
            break

    return dictionary

## Semantic Search

In [7]:
def run_semantic_search_query(concepts, limit=1000, distance=0.8, search_datasets_query=search_datasets_query):
    """Run the semantic-search query against Weaviate and return the matching datasets.

    Args:
        concepts: Comma-separated string of search concepts, e.g. ``"k2"`` or
            ``"k2, kepler"``. Whitespace around each concept is ignored.
        limit: Value substituted for the template's ``limit`` field
            (presumably the maximum number of results — confirm against
            the template in ``queries``).
        distance: Value substituted for the template's ``distance`` field
            (presumably the maximum vector distance — confirm against the
            template in ``queries``).
        search_datasets_query: Query template with ``concepts``, ``limit``
            and ``distance`` format fields. Defaults to the template
            imported from ``queries``.

    Returns:
        The value found under ``['data']['Get']['Dataset']`` in the raw response.

    Raises:
        KeyError: If the response does not contain that path.
    """
    # Strip each concept so "k2, kepler" does not send " kepler" to the search
    concept_list = [concept.strip() for concept in concepts.split(",")]

    # json.dumps renders the list as a quoted, escaped array literal for the query text
    query = search_datasets_query.format(
        concepts=json.dumps(concept_list),
        limit=str(limit),
        distance=str(distance),
    )

    return client.query.raw(query)['data']['Get']['Dataset']

In [9]:
# search_datasets_query = search_datasets_query.format(
#     concepts=json.dumps("k2".split(",")), 
#     limit=str(1000),
#     distance=str(0.8)
#     )

# res = client.query.raw(search_datasets_query)#['data']['Get']['Dataset']

In [10]:
# Semantic search for the concept "k2", using the default limit and distance
data = run_semantic_search_query("k2")

In [11]:
# Display the raw search results as returned by run_semantic_search_query
data

[{'_additional': {'distance': 0.44141692},
  'description': 'The K2 mission observed 100 square degrees for 80 days each across 20 different pointings along the ecliptic, collecting high-precision photometry for a selection of targets within each field. The mission began when the original Kepler mission ended due to loss of the second reaction wheel in 2011. More information about the K2 mission is available at [MAST](https://archive.stsci.edu/k2/).\n',
  'documentation': 'http://astroquery.readthedocs.io/en/latest/mast/mast.html',
  'hasPublication': None,
  'hasResource': [{'arn': 'arn:aws:s3:::stpubdata/k2',
    'description': 'K2 Mission data files',
    'region': 'us-east-1',
    'requesterPays': False,
    'type': 'S3 Bucket'},
   {'arn': 'arn:aws:sns:us-east-1:879230861493:stpubdata',
    'description': 'Notifications for new data',
    'region': 'us-east-1',
    'requesterPays': None,
    'type': 'SNS Topic'}],
  'hasToolOrApplication': None,
  'hasTutorial': None,
  'managedBy

In [13]:
# Flatten each search result so nested fields become separator-joined keys
flat_data = [flatten_json(item) for item in data]

In [14]:
# Build a DataFrame with one row per flattened search result
df = pd.DataFrame(flat_data)

In [15]:
# Preview the first rows of the flattened results
df.head()

Unnamed: 0,_additional.distance,description,documentation,hasPublication,hasResource.0.arn,hasResource.0.description,hasResource.0.region,hasResource.0.requesterPays,hasResource.0.type,hasResource.1.arn,hasResource.1.description,hasResource.1.region,hasResource.1.requesterPays,hasResource.1.type,hasToolOrApplication,hasTutorial,managedBy.0.name,name,tags,hasPublication.0.authorName,hasPublication.0.title,hasPublication.0.url,hasResource.2.arn,hasResource.2.description,hasResource.2.region,hasResource.2.requesterPays,hasResource.2.type,hasResource.3.arn,hasResource.3.description,hasResource.3.region,hasResource.3.requesterPays,hasResource.3.type,hasToolOrApplication.0.authorName,hasToolOrApplication.0.title,hasToolOrApplication.0.url,hasToolOrApplication.1.authorName,hasToolOrApplication.1.title,hasToolOrApplication.1.url,hasToolOrApplication.2.authorName,hasToolOrApplication.2.title,hasToolOrApplication.2.url,hasToolOrApplication.3.authorName,hasToolOrApplication.3.title,hasToolOrApplication.3.url,hasToolOrApplication.4.authorName,hasToolOrApplication.4.title,hasToolOrApplication.4.url,hasToolOrApplication.5.authorName,hasToolOrApplication.5.title,hasToolOrApplication.5.url,hasTutorial.0.services,hasTutorial.0.title,hasTutorial.0.url,hasTutorial.1.services,hasTutorial.1.title,hasTutorial.1.url,hasTutorial.2.services,hasTutorial.2.title,hasTutorial.2.url,hasToolOrApplication.6.authorName,hasToolOrApplication.6.title,hasToolOrApplication.6.url,hasPublication.1.authorName,hasPublication.1.title,hasPublication.1.url,hasPublication.2.authorName,hasPublication.2.title,hasPublication.2.url,hasTutorial.3.services,hasTutorial.3.title,hasTutorial.3.url,hasTutorial.4.services,hasTutorial.4.title,hasTutorial.4.url,hasTutorial.5.services,hasTutorial.5.title,hasTutorial.5.url,hasTutorial.6.services,hasTutorial.6.title,hasTutorial.6.url,hasTutorial.7.services,hasTutorial.7.title,hasTutorial.7.url,hasTutorial.8.services,hasTutorial.8.title,hasTutorial.8.url,hasPublication.3.authorN
ame,hasPublication.3.title,hasPublication.3.url
0,0.441417,"The K2 mission observed 100 square degrees for 80 days each across 20 different pointings along the ecliptic, collecting high-precision photometry for a selection of targets within each field. The mission began when the original Kepler mission ended due to loss of the second reaction wheel in 2011. More information about the K2 mission is available at [MAST](https://archive.stsci.edu/k2/).\n",http://astroquery.readthedocs.io/en/latest/mast/mast.html,,arn:aws:s3:::stpubdata/k2,K2 Mission data files,us-east-1,False,S3 Bucket,arn:aws:sns:us-east-1:879230861493:stpubdata,Notifications for new data,us-east-1,,SNS Topic,,,[Space Telescope Science Institute](http://www.stsci.edu/),K2 Mission Data,"astronomy,aws-pds",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0.550664,"The Geo-KOMPSAT-2A (GK2A) is the new generation geostationary meteorological satellite (located in 128.2°E) of the Korea Meteorological Administration (KMA). The main mission of the GK2A is to observe the atmospheric phenomena over the Asia-Pacific region. The Advance Meteorological Imager (AMI) on GK2A scan the Earth full disk every 10 minutes and the Korean Peninsula area every 2 minutes with a high spatial resolution of 4 visible channels and 12 infrared channels. In addition, the AMI has an ability of flexible target area scanning useful for monitoring severe weather events such as typhoon and volcanic eruption and so on. And for space weather mission, the Korea Space wEather Monitor (KSEM) on the GK2A observes the space environment with the particle detector, magnetometer and charging monitor. For questions regarding GK2A imagery specifications, visit the GK2A site at https://nmsc.kma.go.kr/enhome/html/base/cmm/selectPage.do?page=satellite.gk2a.intro. To view the GK2A Fact Sheet please visit https://nmsc.kma.go.kr/enhome/html/base/cmm/selectPage.do?page=satellite.gk2a.fact.\n<br/>\n<br/>\nNOAA provides access to GK2A data on AWS in coordination with the Korean Meteorlogical Agency.\n<br/>\n",https://nmsc.kma.go.kr/enhome/html/base/cmm/selectPage.do?page=satellite.gk2a.fact,,arn:aws:s3:::noaa-gk2a-pds,GK2A Imagery,us-east-1,,S3 Bucket,arn:aws:sns:us-east-1:709902155096:NewGK2AObject,"New data notifications for GK2A, only Lambda and SQS protocols allowed",us-east-1,,SNS Topic,,,[NOAA](http://www.noaa.gov/),Korean Meteorlogical Agency (KMA) GK-2A Satellite Data,"aws-pds,agriculture,geospatial,weather,earth observation,meteorological,disaster response,satellite imagery",KMA,GK2A Full Fact Sheet,https://nmsc.kma.go.kr/enhome/html/base/cmm/selectPage.do?page=satellite.gk2a.fact,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0.749331,"The Sentinel-2 mission is part of the European Union Copernicus programme for Earth observations. Sentinel-2 consists of twin satellites, Sentinel-2A (launched 23 June 2015) and Sentinel-2B (launched 7 March 2017). The two satellites have the same orbit, but 180° apart for optimal coverage and data delivery. Their combined data is used in the Digital Earth Africa Sentinel-2 product.\nTogether, they cover all Earth’s land surfaces, large islands, inland and coastal waters every 3-5 days.\nSentinel-2 data is tiered by level of pre-processing. Level-0, Level-1A and Level-1B data contain raw data from the satellites, with little to no pre-processing. Level-1C data is surface reflectance measured at the top of the atmosphere. This is processed using the Sen2Cor algorithm to give Level-2A, the bottom-of-atmosphere reflectance (Obregón et al, 2019). Level-2A data is the most ideal for research activities as it allows further analysis without applying additional atmospheric corrections.\nThe Digital Earth Africa Sentinel-2 dataset contains Level-2A data of the African continent. 
Digital Earth Africa does not host any lower-level Sentinel-2 data.\nNote that this data is a subset of the Sentinel-2 COGs dataset.\n",https://docs.digitalearthafrica.org/en/latest/data_specs/Sentinel-2_Level-2A_specs.html,,arn:aws:s3:::deafrica-sentinel-2,Sentinel-2 scenes and metadata,af-south-1,False,S3 Bucket,arn:aws:s3:::deafrica-sentinel-2-inventory,[S3 Inventory](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-inventory.html#storage-inventory-contents),af-south-1,,S3 Bucket,,,[Digital Earth Africa](https://www.digitalearthafrica.org/),Digital Earth Africa Sentinel-2 Level-2A,"aws-pds,agriculture,earth observation,satellite imagery,geospatial,natural resource,disaster response,deafrica,stac,cog",Dr Fang Yuan,Introduction to DE Africa,https://youtu.be/Wkf7N6O9jJQ,arn:aws:sns:af-south-1:543785577597:deafrica-sentinel-2-scene-topic,"New scene notifications, can subscribe with [Lambda](https://aws.amazon.com/lambda/) or [SQS](https://aws.amazon.com/sqs/). Message contains entire STAC record for each new Item.",af-south-1,,SNS Topic,arn:aws:sns:af-south-1:543785577597:deafrica-sentinel-2-topic,"Bucket creation event notification, can subscribe with [Lambda](https://aws.amazon.com/lambda/) or [SQS](https://aws.amazon.com/sqs/). 
Message sent by deafrica-sentinel-2 s3 bucket all object create events.",af-south-1,,SNS Topic,Digital Earth Africa Contributors,Digital Earth Africa Explorer,https://explorer.digitalearth.africa/products/s2_l2a/extents,Digital Earth Africa Contributors,Digital Earth Africa web services,https://ows.digitalearth.africa,Digital Earth Africa Contributors,Digital Earth Africa Map,https://maps.digitalearth.africa/,Digital Earth Africa Contributors,Digital Earth Africa Sandbox,https://sandbox.digitalearth.africa/,Digital Earth Africa Contributors,Digital Earth Africa Notebook Repo,https://github.com/digitalearthafrica/deafrica-sandbox-notebooks,Digital Earth Africa Contributors,Digital Earth Africa Geoportal,https://www.africageoportal.com/pages/digital-earth-africa,,Use Sentinel-2 data in the Open Data Cube,https://github.com/opendatacube/cube-in-a-box,,Digital Earth Africa Training,http://learn.digitalearthafrica.org/,,Downloading and streaming data using STAC metadata,https://docs.digitalearthafrica.org/en/latest/sandbox/notebooks/Frequently_used_code/Downloading_data_with_STAC.html,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0.760129,Japanese Tokenizer Dictionaries for use with MeCab.,"This dataset includes dictionaries for tokenization and morphological\nanalysis of Japanese for use with MeCab. This includes NINJAL's UniDic, a\nmodified smaller version of UniDic for situations that require it, and the\nlegacy IPADic dictionary.\n",,arn:aws:s3:::cotonoha-dic,Dictionary Files,ap-northeast-1,,S3 Bucket,,,,,,,,Cotonoha,Japanese Tokenizer Dictionaries,"aws-pds,natural language processing,csv,japanese",Paul O'Leary McCann,How to Tokenize Japanese in Python,https://www.dampfkraft.com/nlp/how-to-tokenize-japanese.html,,,,,,,,,,,Paul O'Leary McCann,unidic-py,https://github.com/polm/unidic-py,,,,,,,,,,,,,,,,"[""SageMaker""]",Fugashi Word Count Tutorial,https://github.com/polm/fugashi-sagemaker-demo/blob/master/fugashi%20wordcount.ipynb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0.763885,The ALOS/PALSAR annual mosaic is a global 25 m resolution dataset that combines data from many images captured by JAXA’s PALSAR and PALSAR-2 sensors on ALOS-1 and ALOS-2 satellites respectively. This product contains radar measurement in L-band and in HH and HV polarizations. It has a spatial resolution of 25 m and is available annually for 2007 to 2010 (ALOS/PALSAR) and 2015 to 2020 (ALOS-2/PALSAR-2).\nThe JERS annual mosaic is generated from images acquired by the SAR sensor on the Japanese Earth Resources Satellite-1 (JERS-1) satellite. This product contains radar measurement in L-band and HH polarization. It has a spatial resolution of 25 m and is available for 1996.\nThis mosaic data is part of a global dataset provided by the Japan Aerospace Exploration Agency (JAXA) Earth Observation Research Center.\n,https://docs.digitalearthafrica.org/en/latest/data_specs/ALOS_PALSAR_annual_mosaic_specs.html,,arn:aws:s3:::deafrica-input-datasets/alos_palsar_mosaic,ALOS PALSAR ALOS-2 PALSAR-2 data,af-south-1,False,S3 Bucket,arn:aws:s3:::deafrica-input-datasets-inventory,[S3 Inventory](https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-inventory.html#storage-inventory-contents),af-south-1,,S3 Bucket,,,[Digital Earth Africa](https://www.digitalearthafrica.org/),"Digital Earth Africa ALOS PALSAR, ALOS-2 PALSAR-2 and JERS-1","aws-pds,agriculture,earth observation,satellite imagery,geospatial,natural resource,disaster response,synthetic aperture radar,deafrica,stac,cog",Dr Fang Yuan,Introduction to DE Africa,https://youtu.be/Wkf7N6O9jJQ,arn:aws:sns:af-south-1:543785577597:deafrica-input-datasets-topic,"Bucket creation event notification, can subscribe with [Lambda](https://aws.amazon.com/lambda/) or [SQS](https://aws.amazon.com/sqs/). 
Message sent contain all object creation events.",af-south-1,,SNS Topic,,,,,,Digital Earth Africa Contributors,Digital Earth Africa Explorer (ALOS PALSAR and ALOS-2 PALSAR-2),https://explorer.digitalearth.africa/products/alos_palsar_mosaic,Digital Earth Africa Contributors,Digital Earth Africa Explorer (JERS),https://explorer.digitalearth.africa/products/jers_sar_mosaic,Digital Earth Africa Contributors,Digital Earth Africa web services,https://ows.digitalearth.africa,Digital Earth Africa Contributors,Digital Earth Africa Map,https://maps.digitalearth.africa/,Digital Earth Africa Contributors,Digital Earth Africa Sandbox,https://sandbox.digitalearth.africa/,Digital Earth Africa Contributors,Digital Earth Africa Notebook Repo,https://github.com/digitalearthafrica/deafrica-sandbox-notebooks,,Digital Earth Africa Training,http://learn.digitalearthafrica.org/,,,,,,,Digital Earth Africa Contributors,Digital Earth Africa Geoportal,https://www.africageoportal.com/pages/digital-earth-africa,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Browse Datasets

In [10]:
# Show the browse query text. format() is called with no arguments —
# presumably only to resolve escaped braces in the template; TODO confirm.
print(browse_datasets_query.format())


{
    Get {
        Dataset (
            limit: 1000
        ) {
            name
            description
            documentation
            tags
            managedBy {
                ... on Publisher {
                    name
                }
            }
            hasResource {
                ... on Resource {
                    arn
                    region
                    description
                    type
                    requesterPays
                }
            }
            hasTutorial {
                ... on Tutorial {
                    title
                    url
                    services
                }
            }
            hasPublication {
                ... on Publication {
                    title
                    url
                    authorName
                }
            }
            hasToolOrApplication {
                ... on ToolOrApplication {
                    title
                    url
                    a

In [4]:
# `res` is not assigned by any earlier cell, so fetch the complete,
# unprocessed browse response here; this lets the cell run on a fresh kernel.
res = client.query.raw(browse_datasets_query.format())
res

{'data': {'Get': {'Dataset': [{'description': 'The Virginia Coastal Resilience Master Plan builds on the 2020 Virginia Coastal Resilience Master Planning Framework, which outlined the goals and principles of the Commonwealth’s statewide coastal resilience strategy. Recognizing the urgent challenge flooding already poses, the Commonwealth developed Phase One of the Master Plan on an accelerated timeline and focused this first assessment on the impacts of tidal and storm surge coastal flooding on coastal Virginia.  The Master Plan leveraged the combined efforts of more than two thousand stakeholders, subject matter experts, and government personnel. We centered the development of this plan around three core components:\n<br/>\n<br/>\nA Technical Study compiled essential data, research, processes, products, and resilience efforts in the Coastal Resilience Database, which forms much of basis of this plan and the Coastal Resilience Web Explorer;\n<br/>\n<br/>\nA Technical Advisory Committee

In [5]:
def run_browse_datasets_query():
    """Run the browse query against Weaviate and return its dataset entries.

    Returns:
        The value found under ``['data']['Get']['Dataset']`` in the raw response.
    """
    query = browse_datasets_query.format()
    response = client.query.raw(query)
    return response['data']['Get']['Dataset']

In [6]:
# Fetch the dataset entries returned by the browse query
datasets = run_browse_datasets_query()

In [7]:
# Sort the dataset entries in place, ordered by each entry's 'name' value
datasets.sort(key=lambda x: x['name'])

In [8]:
# Number of dataset entries returned by the browse query
len(datasets)

453