# OIH Graph Pre-Processor - Graph to network viz



## requirements.txt

In [None]:
%%capture
!pip install -q rdflib
!pip install -q shapely
!pip install -q pyld
!pip install -q kglab
!pip install -q minio
!pip install -q objdict
!pip install -q shapely
!pip install -q geopandas
!pip install -q oxrdflib

## imports

In [1]:
import os
import warnings
from urllib.request import urlopen

warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning

import kglab
from minio import Minio
import rdflib


## Definitions

In [2]:
# drop the trailing tokens of each quad line so that it reads as a triple
def popper(input):
    """Rewrite N-Quads bytes as text with the last two tokens of each line removed.

    The input is decoded and processed line by line. Each line has
    ``http://schema.org`` rewritten to ``https://schema.org`` and is then
    split on single spaces. Lines with three or fewer tokens are dropped;
    for all other lines the final two tokens are discarded and `` .`` is
    appended. (Presumably the two discarded tokens are the graph label and
    the terminating dot of a quad -- the code does not check this.)

    Args:
        input: bytes holding the serialized graph.

    Returns:
        str: the kept lines joined with newlines.
    """
    triples = []

    for raw_line in input.decode().split('\n'):
        tokens = raw_line.replace("http://schema.org", "https://schema.org").split(' ')

        # too short to carry a fourth term: skip the line entirely
        if len(tokens) <= 3:
            continue

        triples.append(' '.join(tokens[:-2]) + ' .')

    return '\n'.join(triples)

def contextAlignment(input):
    """Decode graph bytes and align schema.org IRIs on the https scheme.

    Every occurrence of ``http://schema.org`` in the decoded text is
    replaced with ``https://schema.org``; nothing else is altered, so line
    breaks are preserved as-is.

    Args:
        input: bytes holding the serialized graph.

    Returns:
        str: the decoded text with the replacement applied.
    """
    # The pattern contains no newline, so one pass over the whole text gives
    # the same result as replacing line by line and re-joining.
    decoded = input.decode()
    return decoded.replace("http://schema.org", "https://schema.org")

def publicurls(client, bucket, prefix):
    """Collect presigned GET URLs for the non-empty objects under a prefix.

    Objects are listed recursively, each one is stat-ed, and a presigned
    GET URL is generated for every object whose reported size is greater
    than zero.

    Args:
        client: object client exposing ``list_objects``, ``stat_object`` and
            ``presigned_get_object`` (a Minio client in this notebook).
        bucket: name of the bucket to scan.
        prefix: key prefix limiting the listing.

    Returns:
        list: the presigned URLs, in listing order.
    """
    found = []

    for entry in client.list_objects(bucket, prefix=prefix, recursive=True):
        name = entry.object_name

        # Only the size is checked here. Open question from the original
        # author: how can we tell whether an object is actually public?
        if client.stat_object(bucket, name).size > 0:
            found.append(client.presigned_get_object(bucket, name))

    return found

def to_wkt(polygon_string):
    """Format a comma-separated coordinate string as a WKT POLYGON.

    The input is split on commas; inside each piece, runs of whitespace are
    collapsed to single spaces. Coordinate order is kept as given (an
    earlier variant that reversed each pair is not applied).

    Args:
        polygon_string: text such as ``"x1 y1,x2 y2,..."``.

    Returns:
        str: ``POLYGON ((...))`` with the pieces separated by ``", "``.
    """
    points = []
    for pair in polygon_string.split(','):
        coords = pair.split()
        points.append(' '.join(coords))

    ring = ', '.join(points)
    return f"POLYGON (({ring}))"

def contains_alpha(s):
    """Report whether a value contains at least one alphabetic character.

    Args:
        s: an int/float (always reported as False) or an iterable of
            characters such as a str.

    Returns:
        bool: True if any character of ``s`` is alphabetic, otherwise False.
    """
    if not isinstance(s, (int, float)):
        for ch in s:
            if ch.isalpha():
                return True
    return False


## Load Graph(s)

At this point we have the URLs, and we could either loop over them and load them all, or pull one out manually and use it. This section demonstrates loading and working with one.


In [3]:
# Connect to the MinIO server with access-key credentials (not anonymous access).
# Endpoint and credentials can be overridden through environment variables so
# real secrets never have to be written into the notebook; the fallbacks are
# the values this notebook has always used for its local dev server.
client = Minio(
    os.environ.get("MINIO_ENDPOINT", "0.0.0.0:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    secure=False,  # plain HTTP
)

# List presigned URLs for every non-empty object under the "graphs" prefix.
urls = publicurls(client, "devbucket", "graphs")
for u in urls:
    print(u)

http://0.0.0.0:9000/devbucket/graphs/archive/obis/prov__obis_2023-12-04-16-12-47_release.nq?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20231204%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231204T181929Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=3ca1f076c3c351d09aaf5c017f2a0aa90d42b0482fefd91e8d4838171e0177bd
http://0.0.0.0:9000/devbucket/graphs/archive/obis/summoned__obis_2023-12-04-16-07-16_release.nq?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20231204%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231204T181929Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=efe0f71f83d8667cbcc4b95ab15630645a672ff34c4cd330b223ea1afde82570
http://0.0.0.0:9000/devbucket/graphs/latest/obis_prov.nq?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20231204%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231204T181929Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=0c738f9dd93bae0425b2ef206fa8168d078e7e6cc1b

In [4]:
# Load a single N-Quads graph into an rdflib ConjunctiveGraph.

# Presigned URL picked by hand. Its X-Amz-Expires=604800 parameter means it stops
# working 7 days after X-Amz-Date; regenerate it with publicurls() when it is stale.
u = "http://0.0.0.0:9000/devbucket/graphs/latest/obis_release.nq?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20231204%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231204T172625Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=75f3f14abfd3618a0dcd737d9d112fee580c8de92e398ea99f6b857e8b11a91a"

# Read the whole body and close the HTTP response as soon as we are done with it.
with urlopen(u) as response:
    dg = response.read()

# Rewrite http://schema.org IRIs to https://schema.org before parsing.
r = contextAlignment(dg)

g = rdflib.ConjunctiveGraph()
g.parse(data=r, format="nquads")
print(len(g))

176871


In [5]:
## Wrap the rdflib graph in a kglab KnowledgeGraph

# Prefix -> IRI bindings handed to kglab. "schemawrong" binds the http:// form
# of schema.org next to the https:// form bound as "schema".
namespaces = dict(
    sh="http://www.w3.org/ns/shacl#",
    schema="https://schema.org/",
    schemawrong="http://schema.org/",
    geo="http://www.opengis.net/ont/geosparql#",
)

kg = kglab.KnowledgeGraph(
    name="OIH test",
    base_uri="https://gleanerio.org/id/",
    namespaces=namespaces,
    use_gpus=True,
    import_graph=g,
)

## Query Section

## Notes

* many of the above can be arrays; we need to note this in the SHACL cardinality
* the Pandas dataframes also need to roll these up into comma-separated items, i.e., Python lists via aggregate and join. These will then serialize out to Solr JSON correctly


In [6]:
# TODO: it would be nice to run a type-count SPARQL query here as a sanity check.
#
# Edge query for the network visualisation: inside each named graph, select the
# distinct (source, predicate, target) links where both endpoints are typed as one
# of schema:Person, schema:Organization, schema:Dataset, schema:Course or
# schema:Document. Note that ?type holds the connecting predicate, while the
# rdf:type of each endpoint is returned in ?sType / ?tType.
q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <https://schema.org/>

SELECT DISTINCT ?source ?type ?target ?sType ?tType
WHERE {
  graph ?g {
    ?source a ?sType .
    ?target a ?tType .
    ?source ?type ?target .
    FILTER((?sType) IN (schema:Person, schema:Organization, schema:Dataset, schema:Course, schema:Document))
    FILTER((?tType) IN (schema:Person, schema:Organization, schema:Dataset, schema:Course, schema:Document))
  }
}

"""

# Run the query against the kglab graph; the result is used as a DataFrame below.
df = kg.query_as_df(q)


In [None]:
# Summarise the query result (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first ten edges.
df.head(10)

In [None]:
# Write the edge list to vizSet.csv in the current working directory.
# NOTE(review): with pandas' defaults, to_csv also writes the DataFrame index as an
# unnamed first column -- confirm the downstream consumer expects that.
df.to_csv('vizSet.csv')

## look at the graphml approach in the sparql notebook
