In [3]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
import os
from pathlib import Path
from itertools import chain, starmap
import json
import yaml
import pandas as pd

# Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [5]:
def flatten_json(dictionary, sep='.'):
    """Flatten a nested json file. For a list of dictionaries, use this
    inside a for loop before converting to pandas DataFrame."""

    def unpack(parent_key, parent_value):
        """Unpack one level of nesting in json file"""
        # Unpack one level only!!!
        
        if isinstance(parent_value, dict):
            for key, value in parent_value.items():
                temp1 = parent_key + sep + key
                yield temp1, value
        elif isinstance(parent_value, list):
            i = 0 
            for value in parent_value:
                temp2 = parent_key + sep +str(i) 
                i += 1
                yield temp2, value
        else:
            yield parent_key, parent_value    


    # Keep iterating until the termination condition is satisfied
    while True:
        # Keep unpacking the json file until all values are atomic elements (not dictionary or list)
        dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
        # Terminate condition: not any value in the json file is dictionary or list
        if not any(isinstance(value, dict) for value in dictionary.values()) and \
           not any(isinstance(value, list) for value in dictionary.values()):
            break

    return dictionary

In [6]:
input_dir = Path('../___data/awslabs/open-data-registry/datasets')

In [7]:
yaml_files = input_dir.glob('*.yaml')

datasets = []
for file in yaml_files:
    dataset = yaml.safe_load(file.read_text())
    # dataset = flatten_json(dataset, sep="_")
    datasets.append(dataset)

In [8]:
# save as json
output_dir = Path('../___data/output')
with open(output_dir / 'aws-odr.json', 'w') as f:
    json.dump(datasets, f, indent=4)

## Exploratory Data Analysis

In [9]:
df = pd.DataFrame(datasets)

In [10]:
df.head(5)

Unnamed: 0,Name,Description,Documentation,Contact,UpdateFrequency,ManagedBy,Collabs,Tags,License,Resources,DataAtWork,Deprecated,DeprecatedNotice,Citation
0,Open City Model (OCM),Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n,https://github.com/opencitymodel/opencitymodel,https://github.com/opencitymodel/opencitymodel#contact,Quarterly,BuildZero,{'ASDI': {'Tags': ['infrastructure']}},"[aws-pds, events, cities, geospatial]",https://github.com/opencitymodel/opencitymodel#license,"[{'Description': 'Project data files', 'ARN': 'arn:aws:s3:::opencitymodel', 'Region': 'us-east-1', 'Type': 'S3 Bucket'}]","{'Tutorials': [{'Title': 'Using Open City Model with the 3dCityDB', 'URL': 'https://github.com/opencitymodel/opencitymodel/blob/master/examples/3dCityDB-to-GoogleEarth.md', 'AuthorName': 'Allen Gilliland', 'AuthorURL': 'https://github.com/agilliland'}, {'Title': 'Running queries on Open City Model using AWS Athena', 'URL': 'https://github.com/opencitymodel/opencitymodel/blob/master/examples/Query-OpenCityModel-using-AWS-Athena.md', 'AuthorName': 'Allen Gilliland', 'AuthorURL': 'https://github.com/agilliland', 'Services': ['Athena']}, {'Title': 'Investigating environmental characteristics of US cities using publicly available ASDI datasets', 'URL': 'https://github.com/aws-samples/aws-asdi-cities-smsl-notebook', 'NotebookURL': 'https://github.com/aws-samples/aws-asdi-cities-smsl-notebook/blob/main/ASDI_Cities_Demo.ipynb', 'AuthorName': 'Darren Ko', 'Services': ['SageMaker Studio Lab']}], 'Tools & Applications': None, 'Publications': None}",,,
1,NOAA Continuously Operating Reference Stations (CORS) Network (NCN),"The [NOAA Continuously Operating Reference Stations (CORS) Network (NCN)](https://geodesy.noaa.gov/CORS/), managed by NOAA/National Geodetic Survey ([NGS](https://geodesy.noaa.gov/)), provide Global Navigation Satellite System (GNSS) data, supporting three dimensional positioning, meteorology, space weather, and geophysical applications throughout the United States. The NCN is a multi-purpose, multi-agency cooperative endeavor, combining the efforts of hundreds of government, academic, and private organizations. The stations are independently owned and operated. Each agency shares their GNSS/GPS carrier phase and code range measurements and station metadata with NGS, which are analyzed and distributed free of charge.\nNGS provides access to all NCN data collected since 9 February (040) 1994.\n- #### Access to NCN Data and Products\n - [NOAA-NCN on AWS](https://noaa-cors-pds.s3.amazonaws.com/index.html)\n - [NGS server: https://geodesy.noaa.gov/corsdata/](https://geodesy.noaa.gov/corsdata/)\n - [NGS's customized data request service (UFCORS)](https://geodesy.noaa.gov/UFCORS/)\n - [NGS Anonymous ftp://geodesy.noaa.gov/cors/ - This service is going away on August 02, 2021!](ftp://geodesy.noaa.gov/cors/)\n- #### NCN Data and Products\n - **RINEX**: The GPS/GNSS data collected at NCN stations are made available to the public by NGS in Receiver INdependent EXchange (RINEX) format. Most data are available within 1 hour (60 minutes) from when they were recorded at the remote site, and a few sites have a delay of 24 hours (1440 minutes).<br/>RINEX data can be found at: *rinex/`YYYY`/`DDD`/`ssss`/*\n - **Station logs**: \n - Station log files contain all the historical equipment (receiver/antenna) used at that station, approximate location, owner and operating agency, etc..<br/>Station log files can be found at: *station_log/`ssss`.log.txt*\n - Historical and current equipment information of all NCN stations, except those that are considered IGS stations.<br/>These data can be found at: *station_log/cumulative.station.info.cors*\n - **Published Coordinates and Velocities**: NAD83 and ITRF coordinates and velocities of each NCN station. All published coordinates and velocities are given for the Antenna Reference Point (ARP).<br/>Published coordinate and velocity files can be found at: *coord/coord_`YY`/*<br/>In July 2019, NGS published [MYCS2](https://geodesy.noaa.gov/CORS/news/mycs2/mycs2.shtml)!\n - **Time-series Plots**:\n - *Short-term* plots show the repeatability of a site for the last 90-days with respect to the current published position, corrected for the effect of the published velocity. These plots are updated daily.<br/>Short-term plots can be found at: */Plots/`ssss`_`YY`.short.png*\n - *Long-term* plots show the show weekly residual positions with respect to the current published coordinates from our stacked solution. Newer sites may not have a long-term plot if they were added after our Multi-year Solution Processing campaign.<br/>Long-term plots can be found at: */Plots/Longterm/`ssss`_`YY`.long.png*\n - **Daily Broadcast Ephemeris**:\n - Daily GPS Broadcast ephemeris can be found at: *rinex/`YYYY`/`DDD`/brdc`DDD`0.`YY`n.gz*\n - Daily GLONASS-only Broadcast ephemeris can be found at: *rinex/`YYYY`/`DDD`/brdc`DDD`0.`YY`g.gz*\n - **Daily final, rapid, and hourly ultra-rapid GNSS Orbit** can be found at: \n - Daily final and rapid GNSS Orbit can be found at: *rinex/`YYYY`/`DDD`/`AAAWWWWD`.sp3.gz*\n - Hourly ultra-rapid GNSS Orbit can be found at: *rinex/`YYYY`/`DDD`/`AAAWWWWD`_`HH`.sp3.gz*\n - In which:\n - `YYYY`: 4-digit year\n - `YY`: The last 2-digit of year\n - `DDD`: 3-digit day of year [001,002,..366]\n - `D`: day of week [Sun=0, Mon=1,..,Fri=6]\n - `ssss`: 4-char station ID\n - `h`: 1-char hour of day (a=00, b=01, c=02,..,x=23)\n - `HH`: 2-digit hour of day (00,01,02,..,23)\n - `WWWW`: 4-digit GPS week number\n - `AAA`: 3-char analysis center name/type of solution, such as:\n - igs: IGS final solution combination\n - igl: IGS final solution combination (GLONASS-only)\n - igr: IGS rapid solution combination\n - igu: IGS ultra-rapid solution combination\n \n","For more information, visit [NCN Data and Products](https://geodesy.noaa.gov/CORS/data.shtml)","- For general inquiries about NCN data and products, email ✉ ngs.cors at noaa.gov\n- For any questions regarding data delivery or any general questions regarding the NOAA Open Data Dissemination (NODD) Program, email the NODD Team at nodd@noaa.gov.\n- We also seek to identify case studies on how NOAA data is being used and will be featuring those stories in joint publications and in upcoming events. If you are interested in seeing your story highlighted, please share it with the NODD team by emailing nodd@noaa.gov\n","Most data are available within 1 hour from when they were recorded at the remote site, and a few sites have a delay of 24 hours.",[NOAA](http://www.noaa.gov/),{'ASDI': {'Tags': ['elevation']}},"[aws-pds, broadcast ephemeris, Continuously Operating Reference Station (CORS), earth observation, geospatial, GPS, GNSS, mapping, NOAA CORS Network (NCN), post-processing, RINEX, survey]",There are no restrictions on the use of this data.,"[{'Description': '[NCN Data and Products](https://geodesy.noaa.gov/CORS/data.shtml)', 'ARN': 'arn:aws:s3:::noaa-cors-pds', 'Region': 'us-east-1', 'Type': 'S3 Bucket', 'Explore': ['[Browse NOAA-NCN Bucket](https://noaa-cors-pds.s3.amazonaws.com/index.html)']}]",,,,
2,"1940 Census Population Schedules, Enumeration District Maps, and Enumeration District Descriptions","The 1940 Census population schedules were created by the Bureau of the Census in an attempt to enumerate every person living in the United States on April 1, 1940, although some persons were missed. The 1940 census population schedules were digitized by the National Archives and Records Administration (NARA) and released publicly on April 2, 2012.\nThe 1940 Census enumeration district maps contain maps of counties, cities, and other minor civil divisions that show enumeration districts, census tracts, and related boundaries and numbers used for each census. The coverage is nation wide and includes territorial areas.\nThe 1940 Census enumeration district descriptions contain written descriptions of census districts, subdivisions, and enumeration districts.\n",https://www.archives.gov/developer/1940-census,public.dataset.program@nara.gov,Not updated,National Archives and Records Administration (NARA),{'ASDI': {'Tags': ['socioeconomic']}},"[nara, census, archives, 1940 census, demography, aws-pds]",US Government work,"[{'Description': '1940 Census', 'ARN': 'arn:aws:s3:::nara-1940-census', 'Region': 'us-east-2', 'Type': 'S3 Bucket'}]","{'Tutorials': [{'Title': '1940 Census on the AWS Registry of Open Data', 'URL': 'https://www.archives.gov/developer/1940-census', 'AuthorName': 'National Archives and Records Administration'}], 'Tools & Applications': [{'Title': 'National Archives 1940 Census', 'URL': 'https://1940census.archives.gov', 'AuthorName': 'National Archives and Records Administration', 'AuthorURL': 'https://www.archives.gov'}]}",,,
3,NOAA Global Hydro Estimator (GHE),"Global Hydro-Estimator provides a global\nmosaic imagery of rainfall estimates from\nmulti-geostationary satellites, which\ncurrently includes GOES-16, GOES-15,\nMeteosat-8, Meteosat-11 and Himawari-8.\nThe GHE products include: Instantaneous\nrain rate, 1 hour, 3 hour, 6 hour, 24 hour\nand also multi-day rainfall accumulation.\n",https://www.ospo.noaa.gov/Products/atmosphere/ghe/index.html,"For questions regarding product content or\nquality, visit https://www.ospo.noaa.gov/Products/atmosphere/ghe/index.html.\n<br />\nFor any questions regarding data delivery or any general questions regarding the NOAA Open Data Dissemination (NODD) Program, email the NODD Team at nodd@noaa.gov.\n<br /> We also seek to identify case studies on how NOAA data is being used and will be featuring those stories in joint publications and in upcoming events. If you are interested in seeing your story highlighted, please share it with the NODD team by emailing nodd@noaa.gov\n",15 minute-instantaneous,[NOAA](http://www.noaa.gov/),{'ASDI': {'Tags': ['water']}},"[aws-pds, agriculture, meteorological, sustainability, water, weather]",There are no restrictions on the use of this data.,"[{'Description': 'Project data files', 'ARN': 'arn:aws:s3:::noaa-ghe-pds', 'Region': 'us-east-1', 'Type': 'S3 Bucket', 'Explore': ['[Browse Bucket](https://noaa-ghe-pds.s3.amazonaws.com/index.html)']}, {'Description': 'New data notifications for GHE, only Lambda and SQS protocols allowed', 'ARN': 'arn:aws:sns:us-east-1:123901341784:NewGHEObject', 'Region': 'us-east-1', 'Type': 'SNS Topic'}]","{'Tutorials': None, 'Tools & Applications': None, 'Publications': None}",,,
4,USGS 3DEP LiDAR Point Clouds,"The goal of the [USGS 3D Elevation Program ](https://www.usgs.gov/core-science-systems/ngp/3dep) (3DEP) is to collect elevation data in the form of light detection and ranging (LiDAR) data over the conterminous United States, Hawaii, and the U.S. territories, with data acquired over an 8-year period. This dataset provides two realizations of the 3DEP point cloud data. The first resource is a public access organization provided in [Entwine Point Tiles](https://entwine.io/entwine-point-tile.html) format, which a lossless, full-density, streamable octree based on [LASzip](https://laszip.org) (LAZ) encoding. The second resource is a [Requester Pays](https://docs.aws.amazon.com/AmazonS3/latest/dev/RequesterPaysBuckets.html) of the original, Raw LAZ (Compressed LAS) 1.4 3DEP format, and more complete in coverage, as sources with incomplete or missing CRS, will not have an ETP tile generated. Resource names in both buckets correspond to the USGS project names.",https://github.com/hobu/usgs-lidar/,https://github.com/hobu/usgs-lidar,Periodically,"[Hobu, Inc.](https://hobu.co)",{'ASDI': {'Tags': ['elevation']}},"[aws-pds, agriculture, elevation, disaster response, geospatial, lidar, sustainability, stac]",US Government Public Domain https://www.usgs.gov/faqs/what-are-terms-uselicensing-map-services-and-data-national-map,"[{'Description': 'Public access Entwine Point Tiles of most resources from the ``arn:aws:s3:::usgs-lidar`` bucket.', 'ARN': 'arn:aws:s3:::usgs-lidar-public', 'Region': 'us-west-2', 'Type': 'S3 Bucket', 'Explore': ['[STAC Catalog](https://usgs-lidar-stac.s3-us-west-2.amazonaws.com/ept/catalog.json)']}, {'Description': 'A [Requester Pays](https://docs.aws.amazon.com/AmazonS3/latest/dev/RequesterPaysBuckets.html) Bucket of Raw LAZ 1.4 3DEP data. Data in this bucket is more complete in coverage than the EPT bucket, but it is not a complete 3DEP mirror. Some resources in this bucket also have incomplete and missing coordinate system information, which is why they might not be mirrored into the EPT bucket.', 'ARN': 'arn:aws:s3:::usgs-lidar', 'Region': 'us-west-2', 'Type': 'S3 Bucket', 'RequesterPays': True}]","{'Tutorials': [{'Title': 'Using Lambda Layers with USGS 3DEP LiDAR Point Clouds', 'URL': 'https://github.com/hobu/usgs-lidar/tree/master/lambda', 'AuthorName': 'Howard Butler', 'AuthorURL': 'https://twitter.com/howardbutler', 'Services': ['Lambda']}, {'Title': 'Extracting buildings and roads from AWS Open Data using Amazon SageMaker', 'URL': 'https://aws.amazon.com/blogs/machine-learning/extracting-buildings-and-roads-from-aws-open-data-using-amazon-sagemaker/', 'AuthorName': 'Yunzhi Shi, Tianyu Zhang, and Xin Chen', 'Services': ['SageMaker']}, {'Title': 'WebGL Visualization of USGS 3DEP Lidar Point Clouds with Potree and Plasio.js', 'URL': 'https://usgs.entwine.io/', 'AuthorName': 'Connor Manning', 'AuthorURL': 'https://twitter.com/csmannin'}], 'Tools & Applications': [{'Title': 'OpenTopography access to 3DEP lidar point cloud data', 'URL': 'https://portal.opentopography.org/datasets', 'AuthorName': 'OpenTopography', 'AuthorURL': 'https://opentopography.org/'}, {'Title': 'Facebook Line of Sight Check', 'URL': 'https://www.facebook.com/isptoolbox/line-of-sight-check/', 'AuthorName': 'Facebook', 'AuthorURL': 'https://www.facebook.com/isptoolbox/'}, {'Title': 'Equator - View, Process, and Download USGS 3DEP LiDAR data in-browser', 'URL': 'https://equatorstudios.com/lidar-viewer/', 'AuthorName': 'Equator Studios', 'AuthorURL': 'https://equatorstudios.com'}], 'Publications': [{'Title': 'USGS 3DEP Lidar Point Cloud Now Available as Amazon Public Dataset', 'URL': 'https://www.usgs.gov/news/usgs-3dep-lidar-point-cloud-now-available-amazon-public-dataset', 'AuthorName': 'Department of the Interior, U.S. Geological Survey', 'AuthorURL': 'https://www.usgs.gov'}, {'Title': 'Statewide USGS 3DEP Lidar Topographic Differencing Applied to Indiana, USA', 'URL': 'https://www.mdpi.com/2072-4292/14/4/847/htm', 'AuthorName': 'Chelsea Phipps Scott, Matthew Beckley, Minh Phan, Emily Zawacki, Christopher Crosby, Viswanath Nandigam, and Ramon Arrowsmith'}]}",,,


In [9]:
# # get unique entries in the DataAtWork column
# data_at_work = df[df["DataAtWork"].isnull()==0]["DataAtWork"].to_list()
# set(chain.from_iterable([ list(item.keys()) for item in data_at_work ]))

In [1]:
# weaviate schema draft
{
    # class
    "Dataset": [
        # properties
        "name",
        "description",
        "documentation",
        "managedBy[Publisher]",
        "updateFrequency",
        "hasTag[Tag]",
        "license",
        "hasResource[Resource]",
        "hasTutorial[Tutorial]",
        "hasPublication[Publication]",
        "hasToolOrApplication[ToolOrApplication]",
    ],
    "Publisher": [
        "name",
        "contact"        
    ],
    "Tag": [
        "name"
    ],
    "Resource": [
        "description",
        "arn",
        "region",
        "type"
    ],
    "Tutorial": [
        "title",
        "url",
        "authorName",
        "authorUrl",
        "services"
    ],
    "Publication": [
        "url",
        "authorName",
        "authorUrl"
    ],
    "ToolOrApplication": [
        "url",
        "authorName",
        "authorUrl"
    ]
}

{'Dataset': ['name',
  'description',
  'documentation',
  'managedBy[Publisher]',
  'updateFrequency',
  'hasTag[Tag]',
  'license',
  'hasResource[Resource]',
  'hasTutorial[Tutorial]',
  'hasPublication[Publication]',
  'hasToolOrApplication[ToolOrApplication]'],
 'Publisher': ['name', 'contact'],
 'Tag': ['name'],
 'Resource': ['description', 'arn', 'region', 'type'],
 'Tutorial': ['title', 'url', 'authorName', 'authorUrl', 'services'],
 'Publication': ['url', 'authorName', 'authorUrl'],
 'ToolOrApplication': ['url', 'authorName', 'authorUrl']}

In [11]:
# dataset_class_df = df[["Name", "Description"]]
# dataset_class_df.columns = ["name", "description"]
# dataset_class = dataset_class_df.to_dict(orient="records")

In [12]:
# path = "../___data/classes/dataset-class.json"
# with open(path, "w") as f:
#     json.dump(dataset_class, f, indent=4)

In [13]:
datasets[:3]

[{'Name': 'Open City Model (OCM)',
  'Description': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n',
  'Documentation': 'https://github.com/opencitymodel/opencitymodel',
  'Contact': 'https://github.com/opencitymodel/opencitymodel#contact',
  'UpdateFrequency': 'Quarterly',
  'ManagedBy': 'BuildZero',
  'Collabs': {'ASDI': {'Tags': ['infrastructure']}},
  'Tags': ['aws-pds', 'events', 'cities', 'geospatial'],
  'License': 'https://github.com/opencitymodel/opencitymodel#license',
  'Resources': [{'Description': 'Project data files',
    'ARN': 'arn:aws:s3:::opencitymodel',
    'Region': 'us-east-1',
    'Type': 'S3 Bucket'}],
  'DataAtWork': {'Tutorials': [{'Title': 'Using Open City Model with the 3dCityDB',
     'URL': 'https://github.com/opencitymodel/opencitymodel/blob/master/examples/3dCi