In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [28]:
import os
from pathlib import Path
from itertools import chain, starmap
import json
import yaml
import pandas as pd

# Pandas display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [21]:
def flatten_json(dictionary, sep='.'):
    """Flatten a nested json file. For a list of dictionaries, use this
    inside a for loop before converting to pandas DataFrame."""

    def unpack(parent_key, parent_value):
        """Unpack one level of nesting in json file"""
        # Unpack one level only!!!
        
        if isinstance(parent_value, dict):
            for key, value in parent_value.items():
                temp1 = parent_key + sep + key
                yield temp1, value
        elif isinstance(parent_value, list):
            i = 0 
            for value in parent_value:
                temp2 = parent_key + sep +str(i) 
                i += 1
                yield temp2, value
        else:
            yield parent_key, parent_value    


    # Keep iterating until the termination condition is satisfied
    while True:
        # Keep unpacking the json file until all values are atomic elements (not dictionary or list)
        dictionary = dict(chain.from_iterable(starmap(unpack, dictionary.items())))
        # Terminate condition: not any value in the json file is dictionary or list
        if not any(isinstance(value, dict) for value in dictionary.values()) and \
           not any(isinstance(value, list) for value in dictionary.values()):
            break

    return dictionary

In [22]:
data_dir = Path('../___data/awslabs/open-data-registry/datasets')

In [30]:
yaml_files = data_dir.glob('*.yaml')

datasets = []
for file in yaml_files:
    dataset = yaml.safe_load(file.read_text())
    # dataset = flatten_json(dataset, sep="_")
    datasets.append(dataset)



In [31]:
df = pd.DataFrame(datasets)

In [48]:
df.head(1)

Unnamed: 0,Name,Description,Documentation,Contact,UpdateFrequency,ManagedBy,Collabs,Tags,License,Resources,DataAtWork,Deprecated,DeprecatedNotice,Citation
0,Open City Model (OCM),Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n,https://github.com/opencitymodel/opencitymodel,https://github.com/opencitymodel/opencitymodel#contact,Quarterly,BuildZero,{'ASDI': {'Tags': ['infrastructure']}},"[aws-pds, events, cities, geospatial]",https://github.com/opencitymodel/opencitymodel#license,"[{'Description': 'Project data files', 'ARN': 'arn:aws:s3:::opencitymodel', 'Region': 'us-east-1', 'Type': 'S3 Bucket'}]","{'Tutorials': [{'Title': 'Using Open City Model with the 3dCityDB', 'URL': 'https://github.com/opencitymodel/opencitymodel/blob/master/examples/3dCityDB-to-GoogleEarth.md', 'AuthorName': 'Allen Gilliland', 'AuthorURL': 'https://github.com/agilliland'}, {'Title': 'Running queries on Open City Model using AWS Athena', 'URL': 'https://github.com/opencitymodel/opencitymodel/blob/master/examples/Query-OpenCityModel-using-AWS-Athena.md', 'AuthorName': 'Allen Gilliland', 'AuthorURL': 'https://github.com/agilliland', 'Services': ['Athena']}, {'Title': 'Investigating environmental characteristics of US cities using publicly available ASDI datasets', 'URL': 'https://github.com/aws-samples/aws-asdi-cities-smsl-notebook', 'NotebookURL': 'https://github.com/aws-samples/aws-asdi-cities-smsl-notebook/blob/main/ASDI_Cities_Demo.ipynb', 'AuthorName': 'Darren Ko', 'Services': ['SageMaker Studio Lab']}], 'Tools & Applications': None, 'Publications': None}",,,


In [47]:
# get unique entries in the DataAtWork column
data_at_work = df[df["DataAtWork"].isnull()==0]["DataAtWork"].to_list()
set(chain.from_iterable([ list(item.keys()) for item in data_at_work ]))

{'Publications', 'Tools & Applications', 'Tutorials'}

In [51]:
# weaviate schema draft
{
    # class
    "Dataset": [
        # properties
        "name",
        "description",
        "documentation",
        "managedBy[Publisher]",
        "updateFrequency",
        "hasTag[Tag]",
        "license",
        "hasResource[Resource]",
        "hasTutorial[Tutorial]",
        "hasPublication[Publication]",
        "hasToolOrApplication[ToolOrApplication]",
    ],
    "Publisher": [
        "name",
        "contact"        
    ],
    "Tag": [
        "name"
    ],
    "Resource": [
        "description",
        "arn",
        "region",
        "type"
    ],
    "Tutorial": [
        "title",
        "url",
        "authorName",
        "authorUrl",
        "services"
    ],
    "Publication": [
        "url",
        "authorName",
        "authorUrl"
    ],
    "ToolOrApplication": [
        "url",
        "authorName",
        "authorUrl"
    ]
}

{'Dataset': ['name',
  'description',
  'documentation',
  'managedBy[Publisher]',
  'updateFrequency',
  'hasTag[Tag]',
  'license',
  'hasResource[Resource]',
  'hasTutorial[Tutorial]',
  'hasPublication[Publication]',
  'hasToolOrApplication[ToolOrApplication]'],
 'Publisher': ['name', 'contact'],
 'Tag': ['name'],
 'Resource': ['description', 'arn', 'region', 'type'],
 'Tutorial': [''],
 'Publication': [''],
 'ToolOrApplication': ['']}

In [72]:
dataset_class_df = df[["Name", "Description"]]
dataset_class_df.columns = ["name", "description"]
dataset_class = dataset_class_df.to_dict(orient="records")

In [73]:
path = "../___data/classes/dataset-class.json"
with open(path, "w") as f:
    json.dump(dataset_class, f, indent=4)