In [None]:
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys
import os
# Make the working directory and the SchemaExamples folder importable.
# Both are inserted at index 1, so the later insert ("SchemaExamples")
# ends up ahead of the working directory on the search path.
sys.path.insert(1, os.getcwd())
sys.path.insert(1, "SchemaExamples")

import logging

# INFO-level root logging plus a module-level logger for this notebook.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

from schemaexamples import Example, SchemaExamples

"""
Load examples from file
"""
import glob

# Glob patterns naming the example files to load.
globpatterns = ["/path/to/example"]

# Expand every pattern and flatten the matches into a single list.
files = [match for pattern in globpatterns for match in glob.glob(pattern)]

print("Loading %d files" % len(files))
SchemaExamples.loadExamplesFiles(files)

In [None]:
# Materialise every loaded example into a list so it can be iterated repeatedly.
examples = list(SchemaExamples.allExamples())

In [None]:
import pandas as pd
# Parallel lists, one entry per example; None marks a format the example lacks.
jsonld, microdata, rdfa, original_html = [], [], [], []
# Number of examples that actually provide each format.
count_jsonld = count_microdata = count_rdfa = 0

for example in examples:
    # JSON-LD
    if not example.hasJsonld():
        jsonld.append(None)
    else:
        jsonld.append(example.getJsonld())
        count_jsonld += 1

    # Microdata
    if not example.hasMicrodata():
        microdata.append(None)
    else:
        microdata.append(example.getMicrodata())
        count_microdata += 1

    # RDFa
    if not example.hasRdfa():
        rdfa.append(None)
    else:
        rdfa.append(example.getRdfa())
        count_rdfa += 1

    # The source HTML is recorded for every example.
    original_html.append(example.original_html)

In [None]:
# Report how many examples carry each markup format.
format_counts = (count_jsonld, count_microdata, count_rdfa)
print("jsonld: %d, microdata: %d, rdfa: %d" % format_counts)

# Pair each example's original HTML with its JSON-LD and export the pairs to CSV
# (no index column is written).
html_jsonld_columns = {
    'original_html': original_html,
    'jsonld': jsonld,
}
df = pd.DataFrame(html_jsonld_columns)
df.to_csv('schema.csv', index=False)

In [None]:
from bs4 import BeautifulSoup
import requests

# Fetch the schema.org full hierarchy page. A timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops us
# from silently parsing an HTTP error page.
response = requests.get('https://schema.org/docs/full.html', timeout=30)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find all anchor tags within the tree component
tree_links = soup.select('.dttTree a')

# Extract the href values, skipping anchors that carry no href attribute
# (indexing them with ['href'] would raise KeyError).
href_values = [link['href'] for link in tree_links if link.has_attr('href')]

# Independent copy of the collected hrefs, used by the following cells.
hrefs = list(href_values)

In [None]:
# Number of hrefs collected from the schema.org page; as the cell's last
# expression, the value is shown as the cell output.
len(hrefs)

In [None]:
import pandas as pd

# Reload the exported HTML / JSON-LD pairs from schema.csv (the file was
# written with index=False, so there is no index column to skip).
df = pd.read_csv('schema.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
import pandas as pd
from datasets import Dataset


def build_training_data(df):
    """Build instruction-tuning prompts from HTML / JSON-LD pairs.

    Args:
        df: DataFrame with an ``original_html`` column (the raw markup used
            as the instruction input) and a ``jsonld`` column (the expected
            JSON-LD response).

    Returns:
        dict: ``{"text": [...]}`` with one prompt string per complete row,
        in the row order of ``df``.

    Rows where either column is missing (None/NaN) are skipped. Examples
    without JSON-LD are stored as None and come back from the CSV round-trip
    as NaN; formatting those would yield samples whose response is the
    literal text "nan".
    """
    # Keep only rows that have both an input and a target.
    complete = df.dropna(subset=["original_html", "jsonld"])

    texts = [
        f"<s>[INST] Convert the raw data to ld+json format.\n{inst} [/INST] {response}"
        for inst, response in zip(complete["original_html"], complete["jsonld"])
    ]

    return {"text": texts}

# Turn the prompt texts into a Hugging Face dataset and persist the full set.
data = build_training_data(df)
dataset = Dataset.from_dict(data)
dataset.save_to_disk('./schema_data/')

# Hold out 10% of the samples for evaluation. The fixed seed makes the
# shuffle, and therefore the train/test membership, identical on every run.
SPLIT_SEED = 42
splits = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Select the splits by name rather than relying on the mapping's value order.
train_dataset, test_dataset = splits["train"], splits["test"]
train_dataset.save_to_disk('./schema_data_train/')
test_dataset.save_to_disk('./schema_data_test/')

In [None]:
from datasets import Dataset
# Reload the training split saved under ./schema_data_train/ and print it.
train_dataset = Dataset.load_from_disk('./schema_data_train/')
print(train_dataset)
# Reload the test split saved under ./schema_data_test/ and print it.
test_dataset = Dataset.load_from_disk('./schema_data_test/')
print(test_dataset)