# Environment

In [5]:
!pip3 install mlcroissant
!pip3 install rdflib
!pip3 install pandas
import csv
import json

import mlcroissant as mlc
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal



# Load HuggingFace RDF

In [2]:
# Turtle dump to load. The smaller release asset is active; swap in the
# commented URL below to work with the larger dump instead.
ttl_source = 'https://github.com/david4096/huggingface-rdf/releases/download/data-release/huggingface.ttl?download=true'
# Larger dataset
# ttl_source = 'https://huggingface.co/datasets/david4096/huggingface-ttl/resolve/main/huggingface-30k.ttl?download=true'

# Build an empty graph, then populate it from the remote Turtle file.
g = Graph()
g.parse(ttl_source, format='ttl')


# Queries

Query to get property counts by dataset

In [3]:
# For every resource typed as schema.org Dataset, count how many objects each
# of its predicates has. One result row is produced per (dataset, predicate).
#
# ?dataset_name is listed in GROUP BY because SPARQL 1.1 only allows grouping
# keys and aggregates in the projection; leaving it out works in lenient
# engines but is rejected by strict ones. ?dataset stays a grouping key so two
# datasets that happen to share a name are still counted separately.
query = """
SELECT ?dataset_name ?predicate (count(?o) as ?predicate_count)
WHERE {
    ?dataset <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://schema.org/Dataset> .
    ?dataset <https://schema.org/name> ?dataset_name .
    ?dataset ?predicate ?o .
}
GROUP BY ?dataset ?dataset_name ?predicate
"""

# Run the query against the graph loaded above; `qres` is consumed by the
# export cell further down.
qres = g.query(query)
# Uncomment to inspect the rows directly:
# for row in qres:
#     print(f"Dataset: {row.dataset_name} Predicate: {row.predicate} Count: {row.predicate_count}")

# Export data

Export to TSV

In [4]:
# Dump the SPARQL result set to a tab-separated file.
#
# csv.writer is used instead of manual string concatenation so that:
#   * no trailing tab is emitted (a trailing tab adds an empty extra field to
#     every line, which shows up as a spurious unnamed column when read back),
#   * values containing tabs, quotes or newlines are quoted rather than
#     corrupting the row layout,
#   * unbound (None) values and typed literals are written via str() instead
#     of relying on `term + "\t"`.
# newline="" is what the csv module requires of the file object; the encoding
# is pinned so the output does not depend on the platform default.
with open("query_results.tsv", "w", newline="", encoding="utf-8") as outfile:
  writer = csv.writer(outfile, delimiter="\t", lineterminator="\n")
  # The header comes from the SELECT variables, so it is written even when
  # the query returned no rows.
  writer.writerow([str(var) for var in qres.vars])
  for row in qres:
    writer.writerow(["" if value is None else str(value) for value in row])

In [6]:
# Read the exported tab-separated query results back into a DataFrame.
tsv_path = "query_results.tsv"
df = pd.read_csv(tsv_path, delimiter="\t")

In [8]:
# Tally how many result rows (one per dataset/predicate group) mention each
# predicate, then print the tally as plain text.
predicate_counts = df['predicate'].value_counts()
print(predicate_counts)

predicate
http://www.w3.org/1999/02/22-rdf-syntax-ns#type    2428
http://purl.org/dc/terms/conformsTo                2428
https://schema.org/alternateName                   2428
https://schema.org/creator                         2428
https://schema.org/description                     2428
https://schema.org/distribution                    2428
https://schema.org/keywords                        2428
https://schema.org/name                            2428
https://schema.org/url                             2428
http://mlcommons.org/croissant/recordSet           2208
https://schema.org/sameAs                           713
https://schema.org/license                          695
https://schema.org/identifier                         5
Name: count, dtype: int64


# Archive

# Generate Turtle (TTL) from Croissant metadata

In [None]:
# Croissant metadata endpoints whose JSON-LD is merged into a single graph.
urls = [
    "https://huggingface.co/api/datasets/fashion_mnist/croissant",
    "https://huggingface.co/api/datasets/abisee/cnn_dailymail/croissant",
]
# Base IRI handed to the JSON-LD parser.
baseurl = "http://example.org/"
# NOTE: this rebinds `g`, replacing the graph loaded earlier in the notebook.
g = Graph()

for croissant_url in urls:
  # Load the dataset's Croissant metadata and serialise it to a JSON string,
  # which is the form the parser's `data=` argument takes here.
  croissant_doc = json.dumps(mlc.Dataset(croissant_url).metadata.to_json())
  g.parse(data=croissant_doc, format='json-ld', base=URIRef(baseurl))

# Render the merged graph as Turtle and show it.
turtle_text = g.serialize(format='ttl')
print(turtle_text)