# Environment

In [5]:
!pip3 install mlcroissant
!pip3 install rdflib
!pip3 install pandas
import mlcroissant as mlc
from rdflib import Graph, Namespace, URIRef, Literal
import json
import pandas as pd



# Load HuggingFace RDF

In [2]:
# Build an empty graph, then read the Turtle file from its URL into it.
# Default source: the smaller dataset.
g = Graph()
g.parse(
    'https://github.com/david4096/huggingface-rdf/releases/download/data-release/huggingface.ttl?download=true',
    format='ttl',
)
# Alternative source: the larger dataset (run this line instead of the block above).
# g = Graph().parse('https://huggingface.co/datasets/david4096/huggingface-ttl/resolve/main/huggingface-30k.ttl?download=true', format='ttl')


# Queries

Query to count how often each property (predicate) is used by each dataset; the result has one row per dataset/predicate pair.

In [3]:
# For every schema.org Dataset in the graph, count the triples it has per
# predicate. Each result row carries: dataset_name, predicate, predicate_count.
#
# Every non-aggregated variable that is projected (?dataset_name, ?predicate)
# is also listed in GROUP BY. SPARQL 1.1 requires this; rdflib tolerates the
# omission, but stricter engines reject such a query. ?dataset stays in the
# grouping key so that two distinct datasets sharing a name are not merged.
query = """
SELECT ?dataset_name ?predicate (count(?o) as ?predicate_count)
WHERE {
    ?dataset <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Dataset> .
    ?dataset <https://schema.org/name> ?dataset_name .
    ?dataset ?predicate ?o .
}
GROUP BY ?dataset ?dataset_name ?predicate
"""

# `qres` is consumed by the export cell further down.
qres = g.query(query)
# Uncomment to inspect the rows inline:
# for row in qres:
#     print(f"Dataset: {row.dataset_name} Predicate: {row.predicate} Count: {row.predicate_count}")

# Export data

Export to TSV
(Note that this is currently broken since it will skip one triple.)

In [4]:
import csv

# Write the query result to a tab-separated file, header row first.
#
# The csv module is used instead of manual string concatenation because it:
#   * emits no trailing tab (which would read back as an extra, empty column),
#   * quotes values that themselves contain tabs, quotes or line breaks,
#   * writes unbound (None) values as empty fields instead of raising TypeError.
# The header is taken from the result's projected variables, so it is written
# even when the query returns no rows.
# UTF-8 is requested explicitly so the file does not depend on the platform's
# default encoding; newline="" hands line-ending control to the csv writer.
#
# NOTE(review): the markdown note above this cell says the export skips one
# triple. This loop writes every result row, so that note may be outdated --
# confirm by comparing len(qres) with the number of data lines in the file.
with open("query_results.tsv", "w", encoding="utf-8", newline="") as outfile:
    writer = csv.writer(outfile, delimiter="\t", lineterminator="\n")
    writer.writerow(str(var) for var in qres.vars)
    writer.writerows(qres)

In [6]:
# Read the exported query results back in; the file is tab-delimited.
df = pd.read_csv("query_results.tsv", delimiter="\t")

In [8]:
# Number of exported rows in which each predicate appears.
predicate_counts = df["predicate"].value_counts()
print(predicate_counts)

predicate
http://www.w3.org/1999/02/22-rdf-syntax-ns#type    2428
http://purl.org/dc/terms/conformsTo                2428
https://schema.org/alternateName                   2428
https://schema.org/creator                         2428
https://schema.org/description                     2428
https://schema.org/distribution                    2428
https://schema.org/keywords                        2428
https://schema.org/name                            2428
https://schema.org/url                             2428
http://mlcommons.org/croissant/recordSet           2208
https://schema.org/sameAs                           713
https://schema.org/license                          695
https://schema.org/identifier                         5
Name: count, dtype: int64


# Archive

In [None]:
# Archive: print every triple held in the graph `g`, one per line.
query = """
SELECT ?s ?p ?o
WHERE {
    ?s ?p ?o.
}"""

qres = g.query(query)
# Each result row unpacks in projection order: subject, predicate, object.
for s, p, o in qres:
    print(f"s: {s} p: {p} o: {o}")

s: Nbdc01e1f61894a2eb3e05f2c1a291007 p: https://schema.org/name o: cnn_dailymail
s: Ncf5b9749d0cd437fa3912f78260fda1c p: https://schema.org/keywords o: 🇺🇸 Region: US
s: Ncf5b9749d0cd437fa3912f78260fda1c p: https://schema.org/distribution o: http://example.org/parquet-files-for-config-fashion_mnist
s: Nadd3c5e7564c4dfdbab84ff8fa8c8b8e p: http://mlcommons.org/croissant/fileSet o: http://example.org/parquet-files-for-config-3.0.0
s: N68e999326798480da3808edb733cf5d3 p: http://mlcommons.org/croissant/column o: id
s: http://example.org/3.0.0 p: https://schema.org/name o: 3.0.0
s: http://example.org/2.0.0 p: http://mlcommons.org/croissant/field o: http://example.org/2.0.0/id
s: http://example.org/parquet-files-for-config-2.0.0 p: https://schema.org/encodingFormat o: application/x-parquet
s: http://example.org/1.0.0/highlights p: http://mlcommons.org/croissant/source o: Ne8899dbfbdd54a5f95352a6b893cc559
s: Nbdc01e1f61894a2eb3e05f2c1a291007 p: https://schema.org/keywords o: Datasets
s: Ncf5b97

In [None]:
# Archive: total number of triples per predicate across the whole graph `g`.
# The aggregate is bound to ?predicate_count, the same alias the main
# per-dataset query uses, instead of a throwaway name.
query = """
SELECT ?p (count(?p) as ?predicate_count)
WHERE {
    ?s ?p ?o.
}
GROUP BY ?p
"""

qres = g.query(query)
for row in qres:
    print(f"Predicate: {row.p} Count: {row.predicate_count}")

Predicate: https://schema.org/keywords Count: 17
Predicate: https://schema.org/contentUrl Count: 1
Predicate: https://schema.org/distribution Count: 2
Predicate: http://mlcommons.org/croissant/fileSet Count: 2
Predicate: http://mlcommons.org/croissant/column Count: 2
Predicate: https://schema.org/name Count: 7
Predicate: https://schema.org/creator Count: 1
Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type Count: 7
Predicate: https://schema.org/description Count: 6
Predicate: http://mlcommons.org/croissant/jsonPath Count: 1
Predicate: https://schema.org/sameAs Count: 1
Predicate: https://schema.org/url Count: 2
Predicate: http://mlcommons.org/croissant/source Count: 2
Predicate: https://schema.org/encodingFormat Count: 2
Predicate: https://schema.org/containedIn Count: 1
Predicate: http://mlcommons.org/croissant/recordSet Count: 1
Predicate: http://mlcommons.org/croissant/extract Count: 2
Predicate: http://mlcommons.org/croissant/field Count: 2
Predicate: https://schema.org/lic

In [None]:
# Archive: print every (predicate, object) pair attached to each node typed
# as a schema.org Dataset in the graph `g`.
query = """
SELECT ?dataset ?p ?o
WHERE {
    ?dataset <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Dataset> .
    ?dataset ?p ?o .
}
"""

qres = g.query(query)
# Each result row unpacks in projection order: dataset, predicate, object.
for dataset, p, o in qres:
    print(f"Dataset {dataset} Predicate: {p} Object: {o}")

Dataset N6c06665412894f8594788860913d6223 Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type Object: https://schema.org/Dataset
Dataset N6c06665412894f8594788860913d6223 Predicate: https://schema.org/name Object: fashion_mnist
Dataset N6c06665412894f8594788860913d6223 Predicate: https://schema.org/description Object: 
	
		
	
	
		Dataset Card for FashionMNIST
	


	
		
	
	
		Dataset Summary
	

Fashion-MNIST is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and… See the full description on the dataset page: https://huggingface.co/datasets/zalando-datasets/fashion_mnist.
Dataset N6c06665412894f8594788860913d6223 Predicate: http://purl

# Generate ttl

In [None]:
# Croissant metadata endpoints to convert, and the base IRI given to the
# JSON-LD parser.
urls = [
    "https://huggingface.co/api/datasets/fashion_mnist/croissant",
    "https://huggingface.co/api/datasets/abisee/cnn_dailymail/croissant",
]
baseurl = "http://example.org/"

# `g` is rebound to a fresh, empty graph here, replacing whatever graph was
# previously stored under that name.
g = Graph()
base_iri = URIRef(baseurl)

for croissant_url in urls:
    # Load the dataset's metadata via mlcroissant, re-encode it as a JSON
    # string, and merge it into the graph as JSON-LD.
    croissant_json = json.dumps(mlc.Dataset(croissant_url).metadata.to_json())
    g.parse(data=croissant_json, format='json-ld', base=base_iri)

# Show the combined graph serialized as Turtle.
turtle_text = g.serialize(format='ttl')
print(turtle_text)

  -  [Metadata(fashion_mnist)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(fashion_mnist)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(fashion_mnist)] Property "https://schema.org/version" is recommended, but does not exist.
  -  [Metadata(cnn_dailymail)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.
  -  [Metadata(cnn_dailymail)] Property "https://schema.org/datePublished" is recommended, but does not exist.
  -  [Metadata(cnn_dailymail)] Property "https://schema.org/version" is recommended, but does not exist.


@prefix cr: <http://mlcommons.org/croissant/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix schema: <https://schema.org/> .

<http://example.org/1.0.0> a cr:RecordSet ;
    cr:field <http://example.org/1.0.0/article>,
        <http://example.org/1.0.0/highlights>,
        <http://example.org/1.0.0/id> ;
    schema:description """abisee/cnn_dailymail - '1.0.0' subset

Additional information:
- 3 splits: train, validation, test"""@en ;
    schema:name "1.0.0"@en .

<http://example.org/1.0.0/article> a cr:Field ;
    cr:dataType schema:Text ;
    cr:source [ cr:extract [ cr:column "article"@en ] ;
            cr:fileSet <http://example.org/parquet-files-for-config-1.0.0> ] ;
    schema:description "Column 'article' from the Hugging Face parquet file."@en ;
    schema:name "1.0.0/article"@en .

<http://example.org/1.0.0/highlights> a cr:Field ;
    cr:dataType schema:Text ;
    cr:source [ cr:extract [ cr:column "highlights"@en ] ;
            cr:fileSet <http://example.org/parq