# MDP Via Duck

A demonstrator notebook for DuckDB operations against Parquet files. The goal is to develop and performance-test SQL queries that run directly against Parquet.

In [None]:
!pip install -q minio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m412.4 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import duckdb
import pandas as pd
import numpy as np
import os
import spacy
import tqdm
from minio import Minio

In [None]:
def publicurls(client, bucket, prefix):
    """Return a download URL for every non-empty object under a bucket prefix.

    Args:
        client: MinIO-style client providing ``list_objects``,
            ``stat_object`` and ``presigned_get_object``.
        bucket: Name of the bucket to walk.
        prefix: Key prefix to list; the listing is recursive.

    Returns:
        A list of URLs, in listing order, one per object whose size is > 0.
    """
    urls = []
    for obj in client.list_objects(bucket, prefix=prefix, recursive=True):
        # Use the size reported by the listing when it is present; only fall
        # back to a per-object stat request when the entry does not carry one.
        size = getattr(obj, "size", None)
        if size is None:
            size = client.stat_object(bucket, obj.object_name).size

        # NOTE: size > 0 only filters out empty objects. It does not establish
        # that the object is publicly readable (open question: how to tell?).
        if size > 0:
            urls.append(client.presigned_get_object(bucket, obj.object_name))

    return urls

# Connect to the object store on port 80 over plain HTTP (secure=False).
# No credentials are passed, so the client is anonymous.
client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
# Collect a URL for each non-empty object under the "assets" prefix of the "public" bucket.
urls = publicurls(client, "public", "assets")
# Print one URL per line.
for u in urls:
  print(u)

http://ossapi.oceaninfohub.org/public/assets/OIHGraph_25032023.parquet
http://ossapi.oceaninfohub.org/public/assets/africaioc.parquet
http://ossapi.oceaninfohub.org/public/assets/cioos.parquet
http://ossapi.oceaninfohub.org/public/assets/combined.parquet
http://ossapi.oceaninfohub.org/public/assets/edmerp.parquet
http://ossapi.oceaninfohub.org/public/assets/edmo.parquet
http://ossapi.oceaninfohub.org/public/assets/emodnet.parquet
http://ossapi.oceaninfohub.org/public/assets/inanodc.parquet
http://ossapi.oceaninfohub.org/public/assets/invemardocuments.parquet
http://ossapi.oceaninfohub.org/public/assets/invemarexperts.parquet
http://ossapi.oceaninfohub.org/public/assets/invemarinstitutions.parquet
http://ossapi.oceaninfohub.org/public/assets/invemartraining.parquet
http://ossapi.oceaninfohub.org/public/assets/invemarvessels.parquet
http://ossapi.oceaninfohub.org/public/assets/marinetraining.parquet
http://ossapi.oceaninfohub.org/public/assets/obis.parquet
http://ossapi.oceaninfohub.org/

In [None]:
## Load the combined graph and the CIOOS graph into DuckDB tables
urlCombined = "http://ossapi.oceaninfohub.org/public/assets/combined.parquet"
urlCIOOS = "http://ossapi.oceaninfohub.org/public/assets/cioos.parquet"

# Install the httpfs extension, which DuckDB uses to read the http:// Parquet URLs above.
duckdb.install_extension("httpfs")

# Instantiate the DuckDB connection
# (no database path is given; DuckDB's default is an in-memory database).
# The table `data` holds every column of the combined Parquet file plus `idx`,
# a sequential row number added by row_number() OVER ().
con = duckdb.connect()
con.execute("CREATE TABLE data AS SELECT  row_number() OVER () AS idx, * FROM read_parquet('{}')".format(urlCombined))  # load from url

# Second, independent connection holding the CIOOS file. Its table is also named
# `data`, so the same queries can be run against either `con` or `con2`.
# This is the cell's last expression, so its return value is what the notebook displays.
con2 = duckdb.connect()
con2.execute("CREATE TABLE data AS SELECT  row_number() OVER () AS idx, * FROM read_parquet('{}')".format(urlCIOOS))  # load from url


# Alternatives kept for reference: load from a local copy of the Parquet file instead of the URL.
# NOTE: the first variant creates `my_table` (without `idx`), not the `data` table the other cells query.
# con.execute("CREATE TABLE my_table AS SELECT * FROM read_parquet('/content/drive/MyDrive/Data/combined.parquet')") # load from local parquet
# con.execute("CREATE TABLE data AS SELECT row_number() OVER () AS idx, * FROM '/content/drive/MyDrive/Data/combined.parquet';")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x7a65a807f870>

In [None]:
# Show the schema of the `data` table on `con` (the combined graph) as a DataFrame.
r = con.execute("DESCRIBE SELECT * FROM data").fetchdf()
print(r)

         column_name column_type null   key default extra
0                idx      BIGINT  YES  None    None  None
1                  s     VARCHAR  YES  None    None  None
2               type     VARCHAR  YES  None    None  None
3               name     VARCHAR  YES  None    None  None
4           keywords     VARCHAR  YES  None    None  None
5                url     VARCHAR  YES  None    None  None
6               desc     VARCHAR  YES  None    None  None
7            provder     VARCHAR  YES  None    None  None
8  __index_level_0__      BIGINT  YES  None    None  None


In [None]:
# Show the schema of the `data` table on `con2` (the CIOOS graph) as a DataFrame.
r = con2.execute("DESCRIBE SELECT * FROM data").fetchdf()
print(r)

              column_name column_type null   key default extra
0                     idx      BIGINT  YES  None    None  None
1                      id     VARCHAR  YES  None    None  None
2                keywords     VARCHAR  YES  None    None  None
3                    type     VARCHAR  YES  None    None  None
4                    name     VARCHAR  YES  None    None  None
5             description     VARCHAR  YES  None    None  None
6                     url     VARCHAR  YES  None    None  None
7                 geotype     VARCHAR  YES  None    None  None
8                geompred     VARCHAR  YES  None    None  None
9                    geom     VARCHAR  YES  None    None  None
10       temporalCoverage     VARCHAR  YES  None    None  None
11          datePublished     VARCHAR  YES  None    None  None
12                license     VARCHAR  YES  None    None  None
13                creator     VARCHAR  YES  None    None  None
14  includedInDataCatalog     VARCHAR  YES  None    Non

In [None]:
# List the distinct provider names present in the combined graph.
# NOTE: `provder` (sic) is the column's actual name in the table schema, so the
# spelling must be kept in queries.
r = con.execute("SELECT DISTINCT provder FROM data").fetchdf()

print(r)

                provder
0              aquadocs
1                 cioos
2                edmerp
3                  edmo
4          oceanexperts
5       invemartraining
6        invemarvessels
7                  obps
8               emodnet
9               inanodc
10     invemardocuments
11                  pdh
12            africaioc
13       invemarexperts
14  invemarinstitutions
15       marinetraining
16                 obis


In [None]:
# Count rows per (provider, type) pair in the combined graph.
# GROUP BY already yields exactly one row per (provder, type), so no DISTINCT is needed.
# NOTE: `provder` (sic) is the column's actual name in the table schema.
r = con.execute("SELECT provder, type, COUNT(*) AS count FROM data GROUP BY provder, type").fetchdf()


print(r)

                provder                      type   count
0                 cioos        schemawrong:Person    1372
1               inanodc            schmea:Dataset     235
2          oceanexperts             schmea:Course     491
3             africaioc       schmea:Organization      52
4      invemardocuments       schmea:CreativeWork   18647
5   invemarinstitutions       schmea:Organization     269
6        invemarvessels            schmea:Vehicle      85
7        marinetraining     schmea:CourseInstance     520
8                  obis            schmea:Dataset   20120
9                  obps       schmea:CreativeWork    8414
10             aquadocs       schmea:Organization   15198
11                cioos       schemawrong:Dataset   74251
12               edmerp    schmea:ResearchProject    3432
13                 edmo       schmea:Organization    4757
14     invemardocuments       schmea:Organization    3623
15     invemardocuments             schmea:Person   13351
16         oce

In [None]:
# Count rows per (provider, type) pair, largest groups first, with one arbitrary
# subject (`s`) from each group as a sample.
# GROUP BY already yields exactly one row per (provder, type), so no DISTINCT is needed.
# NOTE: `provder` (sic) is the column's actual name in the table schema.
r = con.execute("SELECT provder, type, ANY_VALUE(s), COUNT(*) AS count FROM data GROUP BY provder, type ORDER BY count DESC").fetchdf()

print(r)

                provder                      type  \
0              aquadocs       schmea:CreativeWork   
1              aquadocs             schmea:Person   
2                 cioos       schemawrong:Dataset   
3          oceanexperts             schmea:Person   
4                   pdh       schemawrong:Dataset   
5          oceanexperts              schmea:Event   
6                  obis            schmea:Dataset   
7      invemardocuments       schmea:CreativeWork   
8              aquadocs       schmea:Organization   
9                edmerp       schmea:Organization   
10     invemardocuments             schmea:Person   
11                 obps       schmea:CreativeWork   
12                 obps             schmea:Person   
13         oceanexperts       schmea:Organization   
14                 edmo       schmea:Organization   
15     invemardocuments       schmea:Organization   
16                  pdh  schemawrong:Organization   
17               edmerp    schmea:ResearchProj