In [2]:
import gzip
import json

import pandas as pd
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS
from UtilityFunctions.dictionary_functions import flatten_dictionary
from UtilityFunctions.get_data_path import get_path
from UtilityFunctions.string_functions import split_words, turn_words_singular, split_words_inc_slash
from UtilityFunctions.wikidata_functions import wikidata_query, retrieve_wikidata_claims, category_query, min_qid, get_all_wikidata_claims, compare_qids, categories_dict_singular
from UtilityFunctions.schema_functions import get_schema_predicate, get_schema_type, get_class_mappings
from UtilityFunctions.get_uri import get_uri

In [12]:
# Read class_mappings.csv (path resolved by get_path) into a DataFrame.
# The displayed frame has the columns YelpCategory and SchemaType.
class_mappings = pd.read_csv(filepath_or_buffer=get_path("class_mappings.csv"))

In [13]:
# Show the mapping table as the cell's rich output (pandas elides the middle rows).
class_mappings

Unnamed: 0,YelpCategory,SchemaType
0,Airport,Airport
1,Osteopath,Osteopathic
2,Repair,AutoRepair
3,RadioStation,RadioStation
4,HardwareStore,HardwareStore
...,...,...
213,Hostel,Hostel
214,EmploymentLaw,EmploymentAgency
215,Supplement,DietarySupplement
216,Trust,UKTrust


In [6]:
from shexer.shaper import Shaper
from shexer.consts import NT, SHEXC, SHACL_TURTLE

# Toy shexer run: extract ShEx shapes for the classes below from a small
# inline N-Triples graph and write them to `output_file`.

# Shapes are extracted for instances of these classes.
target_classes = ["http://example.org/Person", "http://example.org/Gender"]

# Namespace IRI -> prefix handed to the Shaper.
namespaces_dict = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://example.org/": "ex",
    "http://weso.es/shapes/": "",
    "http://www.w3.org/2001/XMLSchema#": "xsd",
}

# Inline N-Triples content (passed as `raw_graph`, i.e. not read from disk).
raw_graph = """
<http://example.org/sarah> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
<http://example.org/sarah> <http://example.org/age> "30"^^<http://www.w3.org/2001/XMLSchema#int> .
<http://example.org/sarah> <http://example.org/name> "Sarah" .
<http://example.org/sarah> <http://example.org/gender> <http://example.org/Female> .
<http://example.org/sarah> <http://example.org/occupation> <http://example.org/Doctor> .
<http://example.org/sarah> <http://example.org/brother> <http://example.org/Jim> .

<http://example.org/jim> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
<http://example.org/jim> <http://example.org/age> "28"^^<http://www.w3.org/2001/XMLSchema#int> .
<http://example.org/jim> <http://example.org/name> "Jimbo".
<http://example.org/jim> <http://example.org/surname> "Mendes".
<http://example.org/jim> <http://example.org/gender> <http://example.org/Male> .

<http://example.org/Male> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Male> <http://www.w3.org/2000/01/rdf-schema#label> "Male" .
<http://example.org/Female> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Female> <http://www.w3.org/2000/01/rdf-schema#label> "Female" .
<http://example.org/Other> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Other> <http://www.w3.org/2000/01/rdf-schema#label> "Other gender" .
"""

# Not referenced anywhere else in this cell; the graph comes from `raw_graph`.
input_nt_file = "target_graph.nt"

# File the extracted shapes are written to.
output_file = "shaper_example.shex"

shaper = Shaper(
    target_classes=target_classes,
    raw_graph=raw_graph,
    input_format=NT,
    namespaces_dict=namespaces_dict,  # Default: no prefixes
    instantiation_property="http://www.w3.org/1999/02/22-rdf-syntax-ns#type",  # Default rdf:type
)

shaper.shex_graph(output_file=output_file, acceptance_threshold=0.1)

print("Done!")


Done!


In [16]:
from shexer.shaper import Shaper
from shexer.consts import NT, SHEXC, SHACL_TURTLE

# Extract ShEx shapes for selected classes from the Yelp business N-Triples dump.

# Shapes are extracted for instances of these classes.
target_classes = [
    "https://example.org/SchemaClass",
    "https://example.org/YelpCategory",
    "https://example.org/ExampleClass",
    "https://schema.org/Restaurant"
]


# Path of the N-Triples file to mine.
input_nt_file = "/home/ubuntu/vol1/virtuoso/import/yelp_business.nt"

# A path on disk must be passed as `graph_file_input`. `raw_graph` expects the
# RDF content itself as a string, so handing it a path would make shexer try to
# parse the path text as N-Triples instead of reading the file.
shaper = Shaper(target_classes=target_classes,
                graph_file_input=input_nt_file,
                input_format=NT,
                instantiation_property="http://www.w3.org/1999/02/22-rdf-syntax-ns#type")  # Default rdf:type

# NOTE(review): same file name the toy example cell writes to, so this run
# overwrites that result -- confirm whether a separate file is wanted.
output_file = "shaper_example.shex"

shaper.shex_graph(output_file=output_file,
                  acceptance_threshold=0.1,)

print("Done!")


KeyboardInterrupt: 

In [5]:
from Code.UtilityFunctions.run_query import run_query

# SPARQL: select every distinct class that some subject is typed with (`?s a ?type`).
query = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  
SELECT DISTINCT ?type
WHERE {
  ?s a ?type.
}"""

# The call is the cell's last expression, so its return value is the cell output.
run_query(
    query=query,
    as_dataframe=True,
    do_print=False,
    include_types=False,
)

Unnamed: 0,type.value
0,http://www.openlinksw.com/schemas/virtrdf#QuadMapFormat
1,http://www.openlinksw.com/schemas/virtrdf#QuadStorage
2,http://www.openlinksw.com/schemas/virtrdf#array-of-QuadMapFormat
3,http://www.openlinksw.com/schemas/virtrdf#QuadMap
4,http://www.openlinksw.com/schemas/virtrdf#QuadMapValue
5,http://www.openlinksw.com/schemas/virtrdf#array-of-QuadMapColumn
6,http://www.openlinksw.com/schemas/virtrdf#QuadMapColumn
7,http://www.openlinksw.com/schemas/virtrdf#array-of-QuadMapATable
8,http://www.openlinksw.com/schemas/virtrdf#QuadMapATable
9,http://www.openlinksw.com/schemas/virtrdf#QuadMapFText


In [42]:
# Re-read the mapping table and index it as {YelpCategory: SchemaType}.
# If a YelpCategory value repeats, the last row wins.
class_mappings = pd.read_csv(filepath_or_buffer=get_path("class_mappings.csv"))
class_mappings_dict = dict(zip(class_mappings['YelpCategory'], class_mappings['SchemaType']))
class_mappings_dict

{'Airport': 'Airport',
 'Osteopath': 'Osteopathic',
 'Repair': 'AutoRepair',
 'RadioStation': 'RadioStation',
 'HardwareStore': 'HardwareStore',
 'SportsClub': 'SportsClub',
 'DryCleaning': 'DryCleaningOrLaundry',
 'Hotel': 'Hotel',
 'RealEstate': 'RealEstateAgent',
 'Recording': 'MusicRecording',
 'Computer': 'ComputerStore',
 'LocalService': 'LegalService',
 'RvRepair': 'AutoRepair',
 'AmusementPark': 'AmusementPark',
 'BuddhistTemple': 'BuddhistTemple',
 'Throat': 'Throat',
 'Service': 'Service',
 'WaterStore': 'ComputerStore',
 'Home': 'Abdomen',
 'PhysicalTherapy': 'PhysicalTherapy',
 'Midwife': 'Midwifery',
 'Kosher': 'KosherDiet',
 'Pool': 'School',
 'Food': 'WPFooter',
 'Casino': 'Casino',
 'Mosque': 'Mosque',
 'PostOffice': 'PostOffice',
 'Gastroenterologist': 'Gastroenterologic',
 'Florist': 'Florist',
 'AdultEntertainment': 'AdultEntertainment',
 'Vegan': 'VeganDiet',
 'Wholesaler': 'Wholesale',
 'Dentist': 'Dentist',
 'Podiatrist': 'Podiatric',
 'Fence': 'Audience',
 'Gate'

In [32]:
# Load the compound-category sheet; the two columns are named explicitly here.
split_categories_df = pd.read_excel(
    get_path("split_categories.xlsx"),
    names=["category", "split_category"],
)

# Map each compound category to the list of its parts; the parts are stored
# in the sheet as a single ', '-separated string.
split_categories_dict = {
    category: joined_parts.split(', ')
    for category, joined_parts in zip(split_categories_df['category'],
                                      split_categories_df['split_category'])
}
split_categories_dict

{'Beauty & Spas': ['Beauty', 'Spas'],
 'Health & Medical': ['Health', 'Medical'],
 'Event Planning & Services': ['Event Planning', 'Event Services'],
 'Coffee & Tea': ['Coffee', 'Tea'],
 'Breakfast & Brunch': ['Breakfast', 'Brunch'],
 'Hotels & Travel': ['Hotels', 'Travels'],
 'Home & Garden': ['Home', 'Garden'],
 'Arts & Entertainment': ['Arts', 'Entertainment'],
 'Fitness & Instruction': ['Fitness', 'Instruction'],
 'Ice Cream & Frozen Yogurt': ['Ice Cream', 'Frozen Yogurt'],
 'Venues & Event Spaces': ['Venues', 'Event Spaces'],
 'Wine & Spirits': ['Wine', 'Spirits'],
 'Flowers & Gifts': ['Flowers', 'Gifts'],
 'Auto Parts & Supplies': ['Auto Parts', 'Auto Supplies'],
 'Juice Bars & Smoothies': ['Juice Bars', 'Smoothies'],
 'Cosmetics & Beauty Supply': ['Cosmetics', 'Beauty Supplies'],
 'Arts & Crafts': ['Arts', 'Crafts'],
 'Public Services & Government': ['Public Services', 'Government'],
 'Heating & Air Conditioning/HVAC': ['Heating', 'Air Conditioning', 'HVAC'],
 'IT Services & Com

In [37]:
# Print the list of parts for every compound category, one list per line.
for subcategory, category_parts in split_categories_dict.items():
    print(category_parts)

['Beauty', 'Spas']
['Health', 'Medical']
['Event Planning', 'Event Services']
['Coffee', 'Tea']
['Breakfast', 'Brunch']
['Hotels', 'Travels']
['Home', 'Garden']
['Arts', 'Entertainment']
['Fitness', 'Instruction']
['Ice Cream', 'Frozen Yogurt']
['Venues', 'Event Spaces']
['Wine', 'Spirits']
['Flowers', 'Gifts']
['Auto Parts', 'Auto Supplies']
['Juice Bars', 'Smoothies']
['Cosmetics', 'Beauty Supplies']
['Arts', 'Crafts']
['Public Services', 'Government']
['Heating', 'Air Conditioning', 'HVAC']
['IT Services', 'Computer Repiar']
['Dry Cleaning', 'Laundry']
['Party Planning', 'Event Planning']
['Eyewear', 'Opticians']
['Music', 'Videos']
['Nurseries', 'Gardening']
['Sewing', 'Alterations']
['Appliances', 'Repair']
['Kitchen', 'Bath']
['Vintage', 'Consignment']
['Banks', 'Credit Unions']
['Fruits', 'Veggies']
['Photography Stores', 'Photography Services']
['Child Care', 'Day Care']
['Junk Removal', 'Hauling']
['Windshield Installation', 'Windshield Repair']
['Chocolatiers', 'Shops']
['Lig