In [2]:
from uuid import uuid4
import json
import jmespath
import weaviate

In [3]:
# open json file from root folder
with open('../___schema.json') as f:
    schema_obj = json.load(f)

In [4]:
classes = [ { k:v for k, v in clss.items() if k != "properties" } for clss in schema_obj['classes'] ]
classes

[{'class': 'Dataset', 'description': 'AWS Open Registry Dataset'},
 {'class': 'Publisher',
  'description': 'Organization that publishes the dataset'},
 {'class': 'Tag', 'description': 'Related topics'},
 {'class': 'Resource', 'description': 'Dataset resources'},
 {'class': 'Tutorial',
  'description': 'Tutorials for how to analyze the dataset'},
 {'class': 'Publication',
  'description': 'Publications that mention the dataset'},
 {'class': 'ToolOrApplication',
  'description': 'Tools or applications that use the dataset'}]

In [5]:
# create client; takes some time to start up
host = "52.3.229.64"
uri = f"http://{host}:8080"
print(uri)
client = weaviate.Client(uri)

http://52.3.229.64:8080


ConnectTimeout: HTTPConnectionPool(host='52.3.229.64', port=8080): Max retries exceeded with url: /v1/.well-known/openid-configuration (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x110f83310>, 'Connection to 52.3.229.64 timed out. (connect timeout=10)'))

In [None]:
full_schema_obj = {
    "classes": [ { k:v for k, v in clss.items() if k != "properties" } for clss in schema_obj['classes'] ]
}

In [None]:
# # delete existing schema
# client.schema.delete_all()

client.schema.create(full_schema_obj)

## Loading Database

### Get the data

Every JSON value in a record has these possible datatypes:
* primitive &rarr; 'field'
* object &rarr; 'field.sub-field'
* array of primitives &rarr; 'field[]'
* array of objects &rarr; 'field[].sub-field'

The mapping for a primitive object is easy:
"to_field": "field"

```python
is_primitive = type(field) in [str, int, float, bool]
is_object = type(field) == dict # named collection of objects, no arrays
is_primitive_array = type(field) == list and len(field) > 0 and type(field[0]) in [str, int, float, bool] # collection of primitives
is_object_array = type(field) == list and len(field) > 0 and type(field[0]) == dict # collection of objects
```

Weaviate schema for class objects:
```python
{
    "field": "value"
}
```

### Utility Functions

In [6]:
# ready
def from_object_or_primitive(class_name, data, map):
    return [ { 
        "id": str(uuid4()),
        "class": class_name,
        "data": {
        k: jmespath.search(v, data) for k, v in map.items() 
    }
} ]

# ready
def from_primitive_array(class_name, data, map):
    return [ { 
        "id": str(uuid4()), 
        "class": class_name,
        "data": { k: i }
    } for k, v in map.items() for i in jmespath.search(v, data) ]

# works, maybe ready
def from_object_array(class_name, data, key, map):
    return [ {
        "id": str(uuid4()), 
        "class": class_name, 
        "data": {
            k: jmespath.search(v, item) for k, v in map.items() 
        }
    } for item in data[key] ]


In [7]:
with open('../___data/output/aws-odr.json') as f:
    data_objs = json.load(f)

In [9]:
data = data_objs[2]
data

{'Name': '1940 Census Population Schedules, Enumeration District Maps, and Enumeration District Descriptions',
 'Description': 'The 1940 Census population schedules were created by the Bureau of the Census in an attempt to enumerate every person living in the United States on April 1, 1940, although some persons were missed. The 1940 census population schedules were digitized by the National Archives and Records Administration (NARA) and released publicly on April 2, 2012.\nThe 1940 Census enumeration district maps contain maps of counties, cities, and other minor civil divisions that show enumeration districts, census tracts, and related boundaries and numbers used for each census. The coverage is nation wide and includes territorial areas.\nThe 1940 Census enumeration district descriptions contain written descriptions of census districts, subdivisions, and enumeration districts.\n',
 'Documentation': 'https://www.archives.gov/developer/1940-census',
 'Contact': 'public.dataset.progra

In [11]:
expr = 'map(&{title: @.Title, url: @.URL, authorName: @.Author.Name, authorUrl: @.Author.URL}, DataAtWork.Tutorials)'
print(jmespath.search(expr, data))

[{'title': '1940 Census on the AWS Registry of Open Data', 'url': 'https://www.archives.gov/developer/1940-census', 'authorName': None, 'authorUrl': None}]


In [197]:
# values
expr = "[{name: Name, desc: Description}]"
print(jmespath.search(expr, data))

# list of values
expr = "map(&{name: @}, Tags)"
print(jmespath.search(expr, data))

# list of objects
expr = "map(&{arn: @.ARN, region: @.region}, Resources)"
print(jmespath.search(expr, data))

[{'name': 'Open City Model (OCM)', 'desc': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n'}]
[{'name': 'aws-pds'}, {'name': 'events'}, {'name': 'cities'}, {'name': 'geospatial'}]
[{'arn': 'arn:aws:s3:::opencitymodel', 'region': None}]


In [218]:
# mapping
{
    # option 1: rename object fields (works for object or object array)
    "class1_name": {
        "object_property": "json_field_value",
    },
    # option 2: keep field names (only works for values object)
    "class2_name": "json_field_array"
}

# weaviate object
{
    "id": "<uuid>",
    "class": "class1_name",
    "data": {
        "property": "value"
    }
}

{'id': '<uuid>', 'class': 'class1_name', 'data': {'property': 'value'}}

In [None]:
mappings = {
    "Dataset": {
        "name" : "Name",
        "description" : "Description",
        "documentation" : "Documentation",
        "updateFrequency" : "UpdateFrequency",
        "license" : "License"
    }
}

In [222]:
expr_map = {
    "values": "[{name: Name, desc: Description}]",
    "list_of_values": "map(&{name: @}, Tags)",
    "list_of_objects": "map(&{arn: @.ARN, region: @.region}, Resources)"
}

In [223]:
def build_weaviate_object(class_name, expr, data):
    return [{
        "id": str(uuid4()),
        "class": class_name,
        "data": item 
    } for item in jmespath.search(expr, data)]

In [224]:
for class_name, expr in zip(["Dataset", "Tag", "Resource"], expr_map.values()):
    print(build_weaviate_object(class_name, expr, data), "\n")

[{'id': '5c972002-f671-4daa-9800-77399df518ef', 'class': 'Dataset', 'data': {'name': 'Open City Model (OCM)', 'desc': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n'}}] 

[{'id': '96c51555-f0a2-417c-8dc1-c3e1b3ee43ef', 'class': 'Tag', 'data': {'name': 'aws-pds'}}, {'id': '899ddf7b-85a4-4254-ac4a-b76ef575d59f', 'class': 'Tag', 'data': {'name': 'events'}}, {'id': 'db380e40-df47-4989-a1fa-cca2c0b4a40e', 'class': 'Tag', 'data': {'name': 'cities'}}, {'id': '4364e4a6-01e2-49c3-997c-2b8b2740a3f4', 'class': 'Tag', 'data': {'name': 'geospatial'}}] 

[{'id': '10d1655a-d1d3-44e2-baae-be28fbe3d416', 'class': 'Resource', 'data': {'arn': 'arn:aws:s3:::opencitymodel', 'region': None}}] 



In [185]:
import jmespath

data = {'Tags': ['aws-pds', 'events', 'cities', 'geospatial']}

expression = "map(&{name: @}, Tags)"
result = jmespath.search(expression, data)

print(result)


[{'name': 'aws-pds'}, {'name': 'events'}, {'name': 'cities'}, {'name': 'geospatial'}]


In [164]:
# {"Tags": [ {"name": item } for item in  jmespath.search("Tags[]", data)]}
{ "Tags": jmespath.search("Tags[]", data)}

{'Tags': ['aws-pds', 'events', 'cities', 'geospatial']}

In [136]:
# mapping structure:
# "output_field_name": "jmespath_expression"

payload = {}

# dataset
dataset_map = {
    "name" : "Name",
    "description" : "Description",
    "documentation" : "Documentation",
    "updateFrequency" : "UpdateFrequency",
    "license" : "License"
}

payload["Dataset"] = from_object_or_primitive("Dataset", data, dataset_map)

# publisher
publisher_map = {
    "name" : "Name",
    "contact": "Contact",
}
payload["Publisher"] = from_object_or_primitive("Publisher", data, publisher_map)

# resources
resources_object_array_map = {
    "arn": "ARN",
    "type": "Type",
    "region": "Region",
    "description": "Description",
}
payload["Resource"] = from_object_array("Resource", data, "Resources", resources_object_array_map)

# tags
tags_prim_array_map = {
    "name": "Tags"
}
payload["Tag"] = from_primitive_array("Tag", data, tags_prim_array_map)


In [131]:
# # works, maybe ready
# [ {
#     "id": str(uuid4()), 
#     "class": "Resource", 
#     "data": {
#         k: jmespath.search(v, item) for k, v in resources_object_array_map.items() 
#     }
# } for item in data["Resources"] ]

[{'id': '880f2d16-16d6-4ca2-8616-494b7426f788',
  'class': 'Resource',
  'data': {'arn': 'arn:aws:s3:::opencitymodel',
   'type': 'S3 Bucket',
   'region': 'us-east-1',
   'description': 'Project data files'}}]

In [132]:
# # works, maybe ready
# def from_object_array(class_name, data, key, map):
#     return [ {
#         "id": str(uuid4()), 
#         "class": class_name, 
#         "data": {
#             k: jmespath.search(v, item) for k, v in map.items() 
#         }
#     } for item in data[key] ]

In [137]:
payload

{'Dataset': [{'id': '7a106eef-10d9-44ec-840c-8475951ac6a6',
   'class': 'Dataset',
   'data': {'name': 'Open City Model (OCM)',
    'description': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n',
    'documentation': 'https://github.com/opencitymodel/opencitymodel',
    'updateFrequency': 'Quarterly',
    'license': 'https://github.com/opencitymodel/opencitymodel#license'}}],
 'Publisher': [{'id': '1a66b2d5-dd6b-4044-8cef-873be68588af',
   'class': 'Publisher',
   'data': {'name': 'Open City Model (OCM)',
    'contact': 'https://github.com/opencitymodel/opencitymodel#contact'}}],
 'Resource': [{'id': 'b46b9134-06ba-48c6-b8ec-d4ec6b572da4',
   'class': 'Resource',
   'data': {'arn': 'arn:aws:s3:::opencitymodel',
    'type': 'S3 Bucket',
    'region': 'us-east-1',
    'description': 'Project d

In [141]:
[item for key in payload for item in payload[key]]

[{'id': '7a106eef-10d9-44ec-840c-8475951ac6a6',
  'class': 'Dataset',
  'data': {'name': 'Open City Model (OCM)',
   'description': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n',
   'documentation': 'https://github.com/opencitymodel/opencitymodel',
   'updateFrequency': 'Quarterly',
   'license': 'https://github.com/opencitymodel/opencitymodel#license'}},
 {'id': '1a66b2d5-dd6b-4044-8cef-873be68588af',
  'class': 'Publisher',
  'data': {'name': 'Open City Model (OCM)',
   'contact': 'https://github.com/opencitymodel/opencitymodel#contact'}},
 {'id': 'b46b9134-06ba-48c6-b8ec-d4ec6b572da4',
  'class': 'Resource',
  'data': {'arn': 'arn:aws:s3:::opencitymodel',
   'type': 'S3 Bucket',
   'region': 'us-east-1',
   'description': 'Project data files'}},
 {'id': 'd67c4e95-0e1a-47f1-aa87-c3edf0f5

In [139]:
payload_ext

['D',
 'a',
 't',
 'a',
 's',
 'e',
 't',
 'P',
 'u',
 'b',
 'l',
 'i',
 's',
 'h',
 'e',
 'r',
 'R',
 'e',
 's',
 'o',
 'u',
 'r',
 'c',
 'e',
 'T',
 'a',
 'g']

In [32]:
resources_array_map = {
    "arn": "Arn",
    "type": "Type",
    "region": "Region",
}

dataatwork_tutorials = {
    "title": "Title",
    "url": "Url",
    "authorName": "AuthorName",
    "authorUrl": "AuthorUrl",
    "services": "Services",
}

dataatwork_tools_apps = {
    "title": "Title",
    "url": "Url",
    "authorName": "AuthorName",
    "authorUrl": "AuthorUrl"
}

In [35]:
weaviate_object_from_json(data, dataset_map)

{'name': 'Open City Model (OCM)',
 'description': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n',
 'documentation': 'https://github.com/opencitymodel/opencitymodel',
 'updateFrequency': 'Quarterly',
 'license': 'https://github.com/opencitymodel/opencitymodel#license'}

In [34]:
weaviate_object_from_json(data, publisher_map)

{'name': 'Open City Model (OCM)',
 'contact': 'https://github.com/opencitymodel/opencitymodel#contact'}

In [33]:
for resource in data['Resources']:
    weaviate_object_from_json(resource, resources_array_map)

In [41]:
for tutorial in data['DataAtWork']['Tutorials']:
    print(weaviate_object_from_json(tutorial, dataatwork_tutorials))

{'title': 'Using Open City Model with the 3dCityDB', 'url': None, 'authorName': 'Allen Gilliland', 'authorUrl': None}
{'title': 'Running queries on Open City Model using AWS Athena', 'url': None, 'authorName': 'Allen Gilliland', 'authorUrl': None}
{'title': 'Investigating environmental characteristics of US cities using publicly available ASDI datasets', 'url': None, 'authorName': 'Darren Ko', 'authorUrl': None}


### Load

In [13]:
# configure the batch settings
client.batch.configure(
  batch_size=100,
  dynamic=False,
  timeout_retries=3,
  callback=weaviate.util.check_batch_result,
)

<weaviate.batch.crud_batch.Batch at 0x10e837a90>

In [14]:
# load data for that class
with open('../___data/classes/dataset-class.json') as f:
    datasets = json.load(f)

In [15]:
# add data to weaviate
with client.batch as batch:
    for dataset in datasets:
        print(dataset)
        batch.add_data_object(dataset, class_name='Dataset')

{'name': 'Open City Model (OCM)', 'description': 'Open City Model is an initiative to provide cityGML data for all the buildings in the United States.\nBy using other open datasets in conjunction with our own code and algorithms it is our goal to provide 3D geometries for every US building.\n'}
{'name': 'NOAA Continuously Operating Reference Stations (CORS) Network (NCN)', 'description': "The [NOAA Continuously Operating Reference Stations (CORS) Network (NCN)](https://geodesy.noaa.gov/CORS/), managed by NOAA/National Geodetic Survey ([NGS](https://geodesy.noaa.gov/)), provide Global Navigation Satellite System (GNSS) data, supporting three dimensional positioning, meteorology, space weather, and geophysical applications throughout the United States. The NCN is a multi-purpose, multi-agency cooperative endeavor, combining the efforts of hundreds of government, academic, and private organizations. The stations are independently owned and operated. Each agency shares their GNSS/GPS carri

In [51]:
# query
result = client.query.get("Dataset", ["name", "description"]).do()

In [52]:
len(result)

1

In [60]:
result['data']['Get']['Dataset'][0]

{'description': "Imagery acquired\nby the China-Brazil Earth Resources Satellite (CBERS), 4 and 4A.\nThe\nimage files are recorded and processed by Instituto Nacional de Pesquisas\nEspaciais (INPE) and are converted to Cloud Optimized Geotiff\nformat in order to optimize its use for cloud based applications.\nContains all CBERS-4 MUX, AWFI, PAN5M and\nPAN10M scenes acquired since\nthe start of the satellite mission and is daily updated with\nnew scenes.\nCBERS-4 PAN5M and PAN10M starting from 05-2022 are temporarily not ingested\ndue to an error in the bands identification on INPE's catalog.\nCBERS-4A MUX Level 4 (Orthorectified) scenes are being\ningested starting from 04-13-2021. CBERS-4A WFI Level 4 (Orthorectified)\nscenes are being ingested starting from 10-12-2022.\n",
 'name': 'CBERS on AWS'}

In [61]:
# query = """
# {
#     Get {
#         Dataset {
#             name,
#             description
#         }
#     }
# }
# """

In [111]:
# query = """
# {
#     Get {
#         Dataset (
#             nearText: {
#                 concepts: ["whales"]
#                 certainty: 0.5
#                 moveAwayFrom: {
#                     concepts: ["satellites", "European Space Agency", "Sentinel"]
#                     force: 0.45
#                 }
#                 moveTo: {
#                 concepts: ["oceans", "marine", "NOAA", "water"],
#                 force: 0.85
#                 }
#             }
#         ) {
#             name
#             description
#             _additional {
#                 certainty # only works if distance==cosine
#                 distance  # always works
#             }
#         }
#     }
# }
# """

In [112]:
# genomics datasets
query = """
{
    Get {
        Dataset (
            nearText: {
                concepts: ["genomics", "metabolism", "proteins", "bioinformatics", "metagenomics"]
                certainty: 0.5
            }
        ) {
            name
            description
            _additional {
                certainty
                distance 
            }
        }
    }
}
"""

In [119]:
# legal datasets
query = """
{
    Get {
        Dataset (
            nearText: {
                concepts: ["legal", "law", "court", "judicial", "judiciary", "finance", "corporate", "business"]
                certainty: 0.6
            }
        ) {
            name
            description
            _additional {
                certainty
            }
        }
    }
}
"""

In [120]:
result = client.query.raw(query)

In [121]:
len(result['data']['Get']['Dataset'])
# result

4

In [122]:
# result['data']['Get']['Dataset'][:3]

In [123]:
for result in result['data']['Get']['Dataset'][:25]:
    print(f'Name: {result["name"]}')
    print(f'Description: {result["description"]}')
    print(f'Certainty: {result["_additional"]["certainty"]}')
    print("\n\n")

Name: Amazon-PQA
Description: Amazon product questions and their answers, along with the public product information.
Certainty: 0.6178012490272522



Name: Legal Entity Identifier (LEI) and Legal Entity Reference Data (LE-RD)
Description: The Legal Entity Identifier (LEI) is a 20-character, alpha-numeric code based on the ISO 17442 standard developed by the International Organization for Standardization (ISO). It connects to key reference information that enables clear and unique identification of legal entities participating in financial transactions. Each LEI contains information about an entity’s ownership structure and thus answers the questions of 'who is who’ and ‘who owns whom’. Simply put, the publicly available LEI data pool can be regarded as a global directory, which greatly enhances transparency in the global marketplace. The Financial Stability Board (FSB) has reiterated that global LEI adoption underpins “multiple financial stability objectives” such as improved risk mana