In [20]:
import gzip
import json
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])

import pandas as pd
from rdflib import Namespace, Graph, URIRef, Literal, BNode

from rdflib.namespace import RDFS
from Code.UtilityFunctions.dictionary_functions import flatten_dictionary
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import split_words, turn_words_singular, split_words_inc_slash
from Code.UtilityFunctions.wikidata_functions import wikidata_query, retrieve_wikidata_claims, category_query, min_qid, get_all_wikidata_claims, compare_qids, categories_dict_singular
from Code.UtilityFunctions.schema_functions import get_schema_predicate, get_schema_type, get_class_mappings
from Code.UtilityFunctions.get_uri import get_uri
from collections import Counter

In [3]:
# Load the YelpCategory -> SchemaType mapping table; the file location is resolved by get_path.
class_mappings = pd.read_csv(filepath_or_buffer=get_path("class_mappings.csv"))

In [7]:
from shexer.consts import NT, SHEXC, SHACL_TURTLE
from shexer.shaper import Shaper

# File the Shaper reads its graph from, and file the ShEx result is written to.
input_nt_file = "target_graph.nt"
output_file = "shaper_example.shex"

# Class URIs handed to the Shaper as target_classes.
target_classes = ["http://example.org/Person", "http://example.org/Gender"]

# Namespace URI -> prefix.
namespaces_dict = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://example.org/": "ex",
    "http://weso.es/shapes/": "",
    "http://www.w3.org/2001/XMLSchema#": "xsd",
}

# NOTE(review): raw_graph is defined here but never passed to Shaper below, which
# reads input_nt_file instead. Confirm whether this sample graph is still needed.
raw_graph = """
<http://example.org/sarah> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
<http://example.org/sarah> <http://example.org/age> "30"^^<http://www.w3.org/2001/XMLSchema#int> .
<http://example.org/sarah> <http://example.org/name> "Sarah" .
<http://example.org/sarah> <http://example.org/gender> <http://example.org/Female> .
<http://example.org/sarah> <http://example.org/occupation> <http://example.org/Doctor> .
<http://example.org/sarah> <http://example.org/brother> <http://example.org/Jim> .

<http://example.org/jim> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Person> .
<http://example.org/jim> <http://example.org/age> "28"^^<http://www.w3.org/2001/XMLSchema#int> .
<http://example.org/jim> <http://example.org/name> "Jimbo".
<http://example.org/jim> <http://example.org/surname> "Mendes".
<http://example.org/jim> <http://example.org/gender> <http://example.org/Male> .

<http://example.org/Male> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Male> <http://www.w3.org/2000/01/rdf-schema#label> "Male" .
<http://example.org/Female> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Female> <http://www.w3.org/2000/01/rdf-schema#label> "Female" .
<http://example.org/Other> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://example.org/Gender> .
<http://example.org/Other> <http://www.w3.org/2000/01/rdf-schema#label> "Other gender" .
"""

shaper = Shaper(
    target_classes=target_classes,
    graph_file_input=input_nt_file,
    input_format=NT,
    namespaces_dict=namespaces_dict,  # Default: no prefixes
    instantiation_property="http://www.w3.org/1999/02/22-rdf-syntax-ns#type",  # Default rdf:type
)

# Write the shapes as ShEx to output_file, with the acceptance threshold set to 0.
shaper.shex_graph(output_file=output_file, acceptance_threshold=0.0)

print("Done!")

Done!


In [56]:
qid = 'Q'


def get_qid_label(qid):
    """Return the English rdfs:label of a Wikidata entity.

    The label is looked up with a SPARQL query that is passed to
    ``wikidata_query``.

    Args:
        qid: Wikidata identifier without the ``wd:`` prefix, e.g. ``'Q31'``.

    Returns:
        The first ``label.value`` entry of the query result, or the string
        ``"not found"`` when the lookup raises (for instance because the
        result holds no rows or the request fails).
    """
    query = f"""PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
                PREFIX wd: <http://www.wikidata.org/entity/> 
                SELECT  *
                WHERE {{
                        wd:{qid} rdfs:label ?label .
                        FILTER (langMatches( lang(?label), "EN" ) )
                    }}
                LIMIT 1"""
    try:
        return wikidata_query(query)['label.value'][0]
    # Best-effort lookup: every ordinary failure maps to the sentinel. Catching
    # Exception instead of using a bare except lets KeyboardInterrupt and
    # SystemExit propagate, so a long row-wise lookup can still be stopped.
    except Exception:
        return "not found"


get_qid_label(qid)

'Q not found'

In [64]:
# Toy frame: the same entity three times, each paired with a QID to be labelled.
# NOTE(review): the key 'sublassOf' (sic) is the spelling the lookup cell reads back.
df = pd.DataFrame(
    data={
        'qid': ['Q31'] * 3,
        'sublassOf': ['Q34', 'Q32', 'Q33'],
    }
)

In [65]:
# Look up a label for every QID in the 'sublassOf' column (one get_qid_label call per row).
df['subclassOf_label'] = df['sublassOf'].map(get_qid_label)

In [66]:
# Display the frame, which now carries the 'subclassOf_label' column.
df

Unnamed: 0,qid,sublassOf,subclassOf_label
0,Q31,Q34,Sweden
1,Q31,Q32,Luxembourg
2,Q31,Q33,Finland


In [80]:
# QIDs used as the left-hand side of the merge demo further down.
df_coffee = pd.DataFrame(['Q31', 'Q8486', 'Q32'], columns=['qid'])

In [81]:
# Fetch every P279 value of one entity together with its English label.
qid = 'Q8486'
query = f"""SELECT ?item ?itemLabel 
            WHERE 
                {{
                wd:{qid} wdt:P279 ?item .
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
                }}"""

# Take an explicit copy: the column subset is modified below, and writing into a
# selection of another frame can trigger pandas' SettingWithCopyWarning.
df = wikidata_query(query)[['item.value', 'itemLabel.value']].copy()

# Strip the entity URI prefix so only the bare QID remains. The prefix length (31)
# is derived from the prefix itself rather than hard-coded.
df['item.value'] = df['item.value'].str[len('http://www.wikidata.org/entity/'):]

# Tag each row with the queried QID so the result can be joined on 'qid'.
df['qid'] = qid
df

Unnamed: 0,item.value,itemLabel.value,qid
0,Q40050,drink,Q8486
1,Q147538,soft drink,Q8486
2,Q473666,colonial goods,Q8486
3,Q1365365,stimulant foodstuff,Q8486
4,Q2647467,non-alcoholic beverage,Q8486
5,Q19359564,hot beverage,Q8486
6,Q37756327,coffee drink,Q8486


In [82]:
# Left join: keep every QID of df_coffee; QIDs without rows in df get NaN columns.
pd.merge(df_coffee, df, how='left', on='qid')

Unnamed: 0,qid,item.value,itemLabel.value
0,Q31,,
1,Q8486,Q40050,drink
2,Q8486,Q147538,soft drink
3,Q8486,Q473666,colonial goods
4,Q8486,Q1365365,stimulant foodstuff
5,Q8486,Q2647467,non-alcoholic beverage
6,Q8486,Q19359564,hot beverage
7,Q8486,Q37756327,coffee drink
8,Q32,,


In [23]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

# Join every business's ', '-separated category string and split it back into one
# flat list. Duplicates are kept so they can be counted below.
categories = biz['categories'].str.cat(sep=', ').split(sep=', ')
_categories_dict_singular = categories_dict_singular(categories)

# One row per distinct category with its number of occurrences, most frequent first.
category_occurences = pd.DataFrame(
    list(Counter(categories).items()),
    columns=['category', 'occurences'],
).sort_values(by='occurences', ascending=False)

# Maps the split categories to the original categories (one row per split value).
category_occurences['split_category'] = category_occurences['category'].map(_categories_dict_singular)
category_occurences = category_occurences.explode('split_category')

# Title-case and drop spaces, e.g. 'home service' -> 'HomeService'. The .str accessor
# leaves a missing split_category as NaN, where a plain str-method lambda would raise.
category_occurences['split_category'] = (
    category_occurences['split_category']
    .str.title()
    .str.replace(' ', '', regex=False)
)

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
category_occurences = category_occurences.merge(
    class_mapping,
    left_on='category',
    right_on='YelpCategory',
    how='left',
)

In [34]:
# Display the per-category counts joined with the existing YelpCategory -> SchemaType mappings.
category_occurences

Unnamed: 0,category,occurences,split_category,YelpCategory,SchemaType
0,Restaurants,52268,Restaurant,Restaurants,['Restaurant']
1,Food,27781,Food,,
2,Shopping,24395,Shopping,Shopping,['Retail']
3,Home Services,14356,HomeService,Home Services,['Service']
4,Beauty & Spas,14292,Beauty,Beauty & Spas,['DaySpa']
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,Beach Bars,['Beach']
1423,DUI Schools,1,DuiSchool,,
1424,Patent Law,1,PatentLaw,,
1425,Housing Cooperatives,1,HousingCooperative,,


In [26]:
# Display the mapping table loaded from class_mappings.csv.
class_mapping

Unnamed: 0,YelpCategory,SchemaType
0,Accountants,['AccountingService']
1,Addiction Medicine,['Drug']
2,Adult Entertainment,['AdultEntertainment']
3,Airlines,['Airline']
4,Airport Lounges,['Airport']
...,...,...
379,Wine Bars,['Winery']
380,Wine Tasting Room,['Winery']
381,Wine Tours,['Winery']
382,Wineries,['Winery']
