In [260]:
# https://github.com/maxlath/wikibase-dump-filter
# https://github.com/maxlath/wikibase-dump-filter/blob/main/docs/cli.md
# extracter instance of (P31) city (Q515) eller state in US (Q35657) og som er i country (P17) USA (Q30) eller Mexico (Q96) eller Canada(Q16)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q515,Q35657&P17:Q30,Q96,Q16' --omit sitelinks > city_state_america.ndjson

# extracter instance of (P31) business (Q4830453)
# led efter has subsidiary (P355)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q4830453' --omit sitelinks > business.ndjson

In [261]:
# https://linuxhint.com/bash_head_tail_command/
#!head -n 1 simplified_dump.ndjson > simpl.ndjson

In [262]:
# https://stedolan.github.io/jq/tutorial/
# https://www.linode.com/docs/guides/using-jq-to-process-json-on-the-command-line/
#!jq '.' simpl.ndjson > simpl.json

In [263]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
from math import ceil
import numpy as np
import pandas as pd
import requests
from SPARQLWrapper import JSON, SPARQLWrapper

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.wikidata_query_tools import (
    retrieve_wikidata_claims,
    wikidata_query,
)

In [264]:
query_all_us_states = """
SELECT ?state
WHERE {
  ?state wdt:P31 wd:Q35657 .
}
LIMIT 10
"""
wikidata_query(sparql_query=query_all_us_states)

Unnamed: 0,state.type,state.value
0,uri,http://www.wikidata.org/entity/Q99
1,uri,http://www.wikidata.org/entity/Q173
2,uri,http://www.wikidata.org/entity/Q724
3,uri,http://www.wikidata.org/entity/Q759
4,uri,http://www.wikidata.org/entity/Q771
5,uri,http://www.wikidata.org/entity/Q779
6,uri,http://www.wikidata.org/entity/Q782
7,uri,http://www.wikidata.org/entity/Q797
8,uri,http://www.wikidata.org/entity/Q812
9,uri,http://www.wikidata.org/entity/Q816


In [265]:
def category_query(category: str):
    return f"""SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{
    ?item ?label "{category}"@en.
    ?article schema:about ?item .
    ?article schema:inLanguage "en" .
    ?article schema:isPartOf <https://en.wikipedia.org/>.
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}}}"""

In [266]:
wiki_qid_of_category = wikidata_query(category_query(category="restaurant"))[
    ["item.value", "itemLabel.value", "itemDescription.value"]
].head()

In [267]:
def min_qid(df_qid):
    # Getting the minimum value of the QID number and the itemLabel
    index = df_qid['item.value'].apply(lambda x: int(x.split("/")[-1].replace("Q", ""))).idxmin()
    df = df_qid.loc[index][['item.value','itemLabel.value']]
    return df[0][31:], df[1]
min_qid(df_qid=wiki_qid_of_category)

('Q11707', 'restaurant')

In [325]:
test_query = wikidata_query(category_query(category="garden"))
test_query["item.value"][0][31:], test_query["itemLabel.value"][0]

('Q909444', 'Sentō Imperial Palace')

In [328]:
df = (wikidata_query(category_query(category="garden"))[['itemLabel.value', 'item.value']])
df

Unnamed: 0,itemLabel.value,item.value
0,Sentō Imperial Palace,http://www.wikidata.org/entity/Q909444
1,garden,http://www.wikidata.org/entity/Q1107656
2,Soswaewon,http://www.wikidata.org/entity/Q493693
3,Parco del Valentino,http://www.wikidata.org/entity/Q1061804
4,Classical Gardens of Suzhou,http://www.wikidata.org/entity/Q1144337
5,McGovern Centennial Gardens,http://www.wikidata.org/entity/Q21196493
6,Schedel Arboretum and Gardens,http://www.wikidata.org/entity/Q7431032
7,Gardens of the French Renaissance,http://www.wikidata.org/entity/Q5522491
8,Désert de Retz,http://www.wikidata.org/entity/Q1270698
9,Weymouth Peace Garden,http://www.wikidata.org/entity/Q28003099


In [326]:
min_qid(wikidata_query(category_query(category="garden")))

('Q493693', 'Soswaewon')

In [271]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

# schema_types = pd.read_csv(get_path("schemaorg-current-https-types.csv"))

In [272]:
categories_unique = list(set(biz["categories"].str.cat(sep=", ").split(sep=", ")))
categories = list(biz["categories"].str.cat(sep=", ").split(sep=", "))

In [273]:
from collections import Counter

category_occurences = pd.DataFrame(list(dict(Counter(categories)).items()), columns=["category", "occurences"]).sort_values(by="occurences", ascending=False)

In [274]:
display(category_occurences)

Unnamed: 0,category,occurences
17,Restaurants,52268
18,Food,27781
12,Shopping,24395
135,Home Services,14356
107,Beauty & Spas,14292
...,...,...
1261,Beach Bars,1
1267,DUI Schools,1
1270,Patent Law,1
1274,Housing Cooperatives,1


In [275]:
word = "Undersea/Hyperbaric Medicine"


def split_word_inc_slash(word):
    # Splitting the words that have a slash in them, and turning them into two words
    word_space = word.split(" ")
    word_space
    new_wordlist_a = []
    new_wordlist_b = []
    for i in word_space:
        i = i.lower()
        if "/" not in i:
            new_wordlist_a.append(i)
            new_wordlist_b.append(i)
        else:
            slash_split = i.split("/")
            new_wordlist_a.append(slash_split[0])
            new_wordlist_b.append(slash_split[1])
    new_word_a = " ".join(new_wordlist_a)
    new_word_b = " ".join(new_wordlist_b)
    return [new_word_a, new_word_b]


split_word_inc_slash(word)

['undersea medicine', 'hyperbaric medicine']

In [276]:
# transforms Yelp category words to words that is succesfull in finding the QID on wikidata
# TODO: fix & split issues
# TODO: fix & split has to be done first, and / split after
import re
import inflect
p = inflect.engine()

categories_dict = {}
for word in categories_unique:
    if '&' in word and '/' in word:
        word_list = re.split('&|/',word.lower())
        print(word_list)
        categories_dict[word] = word_list
    elif '&' in word:
        word_list = list(filter(None, word.lower().split(sep=' & ')))
        categories_dict[word] = word_list
    elif '/' in word:
        categories_dict[word] = split_word_inc_slash(word)
    else:
        categories_dict[word] = [word.lower()]

categories_dict_singular = {}
for key, value in categories_dict.items():
    new_value = []
    for word in value:
        if p.singular_noun(word) is False:
            word = word
        else:
            word = p.singular_noun(word)
        new_value.append(word)
    categories_dict_singular[key] = new_value

['heating ', ' air conditioning', 'hvac']


In [277]:
categories_dict_singular['Heating & Air Conditioning/HVAC']

['heating ', ' air conditioning', 'hvac']

In [278]:
category_occurences["split_category"] = category_occurences["category"].map(
    categories_dict_singular
)
category_occurences = category_occurences.explode("split_category")

In [279]:
category_occurences[category_occurences["category"].str.contains("/")][
    ["category", "occurences"]
].drop_duplicates()

Unnamed: 0,category,occurences
436,Heating & Air Conditioning/HVAC,1199
189,Cajun/Creole,923
610,Water Heater Installation/Repair,557
396,Community Service/Non-Profit,503
363,Blow Dry/Out Services,456
331,Tapas/Small Plates,440
177,Masonry/Concrete,363
218,Door Sales/Installation,277
348,Bike Repair/Maintenance,273
2,Naturopathic/Holistic,221


In [280]:
category_occurences[category_occurences["category"].str.contains("&")][
    ["category", "occurences"]
].drop_duplicates().head(30)

Unnamed: 0,category,occurences
107,Beauty & Spas,14292
4,Health & Medical,11890
90,Event Planning & Services,9895
20,Coffee & Tea,6703
43,Breakfast & Brunch,6239
58,Hotels & Travel,5857
14,Home & Garden,5799
87,Arts & Entertainment,5434
130,Fitness & Instruction,3293
27,Ice Cream & Frozen Yogurt,2657


In [392]:
category_occurences[category_occurences["category"].str.contains("air")]

Unnamed: 0,category,occurences,split_category,qid,qid_label
223,Auto Repair,5433,auto repair,,
105,Hair Salons,5046,hair salon,,
116,Hair Removal,3239,hair removal,Q625145,depilation
281,Hair Stylists,1459,hair stylist,Q55187,hairdresser
96,IT Services & Computer Repair,1189,it service,,
96,IT Services & Computer Repair,1189,computer repair,,
365,Men's Hair Salons,828,men's hair salon,,
152,Appliances & Repair,800,appliance,Q212920,home appliance
152,Appliances & Repair,800,repair,,
464,Transmission Repair,639,transmission repair,,


In [282]:
category_qid = {}
category_qid2 = {}
for cat in category_occurences["split_category"].to_list():
    try:
        wikidata_query_cat_query = wikidata_query(category_query(category=cat)) # Querys wikidata for the QID of the category
        category_qid[cat] = (wikidata_query_cat_query["item.value"][0][31:],wikidata_query_cat_query["itemLabel.value"][0]) # Adds QID and label of the first result of the query
        category_qid2[cat] = min_qid(wikidata_query_cat_query) # Adds QID and label with min_qid function
    except:
        pass


In [283]:
from pprint import pprint
from deepdiff import DeepDiff
# compares the two dictionaries and returns the differences in old value and new value for every key
ddiff = DeepDiff(category_qid, category_qid2, verbose_level=1) 

In [284]:
def compare_qids(new_value: str, old_value: str):
    # check if the new qid is an instance of old qid
    return f"""SELECT ?s 
                WHERE {{?s wdt:P31 wd:{old_value} . 
                        VALUES ?s {{wd:{new_value}}} .
                }}"""

update_qid_dict = {}
for key, value in ddiff['values_changed'].items():
    # check if the new qid is an instance of old qid, then update with old qid if true
    if wikidata_query(compare_qids(new_value=value['new_value'][0], old_value=value['old_value'][0])).empty is False:
        print(f"Updating {key} from {value['old_value']} to {value['new_value']}")
        update_qid_dict[key[6:-2]] = value['old_value']
category_qid2.update(update_qid_dict) # update the qid dict with the new qids, updated values: {'airline': 'Q46970', 'boat tour': 'Q25040412', 'magazine': 'Q41298'}

In [387]:
category_qid2

{'restaurant': ('Q11707', 'restaurant'),
 'food': ('Q2095', 'food'),
 'shopping': ('Q830036', 'shopping'),
 'beauty': ('Q7242', 'beauty'),
 'spa': ('Q1296555', 'Thermes Szent Lukács'),
 'nightlife': ('Q1594437', 'nightlife'),
 'health': ('Q12147', 'health'),
 'medical': ('Q11190', 'medicine'),
 'bar': ('Q860', 'foobar'),
 'automotive': ('Q1850259', 'Renault Rodeo'),
 'event planning': ('Q2040532', 'event planning'),
 'service': ('Q44127', 'server'),
 'sandwich': ('Q28803', 'sandwich'),
 'pizza': ('Q177', 'pizza'),
 'coffee': ('Q8486', 'coffee'),
 'tea': ('Q6097', 'tea'),
 'fast food': ('Q81799', 'fast food'),
 'breakfast': ('Q80973', 'breakfast'),
 'brunch': ('Q734263', 'brunch'),
 'hotel': ('Q27686', 'hotel'),
 'travel': ('Q61509', 'travel'),
 'home': ('Q7743', 'home'),
 'garden': ('Q493693', 'Soswaewon'),
 'fashion': ('Q12684', 'fashion'),
 'burger': ('Q6663', 'hamburger'),
 'art': ('Q735', 'art'),
 'entertainment': ('Q173799', 'entertainment'),
 'nail salon': ('Q8007048', 'nail salo

In [315]:
category_occurences['qid'] = category_occurences['split_category'].map(category_qid2)
category_occurences[['qid','qid_label']] = pd.DataFrame(category_occurences['qid'].tolist(),index=category_occurences.index)

In [317]:
category_qid_list = category_occurences["qid"].tolist()
category_qid_list = [i for i in category_qid_list if i is not np.nan]

In [318]:
item_list = category_qid_list

item_list_len = len(item_list)
# The limit is set to meet the requirements of the wikibase API wbgetentities (max 50)
# Ceil makes sure that the each subset from item_list is no longer than 50
limit = ceil(item_list_len / 50)

# Seperates the item_list to a nested_list with max 50 items in each list
piped_list = [item_list[pipe::limit] for pipe in range(limit)]

category_wikidata = {}
for i in piped_list:
    category_wikidata.update(retrieve_wikidata_claims(i))

In [319]:
category_triple = {}
for key, values in category_wikidata.items():
    for value in values:
        for obj in value:
            if obj["mainsnak"]["property"] == "P279":
                data_value = obj["mainsnak"]["datavalue"]["value"]["id"]
                category_triple[key] = category_triple.get(key, []) + [data_value]
category_triple

{'Q11707': ['Q62602544', 'Q1431026', 'Q41176', 'Q41958'],
 'Q6097': ['Q61951',
  'Q19359564',
  'Q1365365',
  'Q11090700',
  'Q2647467',
  'Q473666'],
 'Q39201': ['Q622852'],
 'Q1294114': ['Q999981'],
 'Q188507': ['Q699405', 'Q1497384'],
 'Q837171': ['Q7406919', 'Q25351891', 'Q2424752', 'Q806750', 'Q815823'],
 'Q1996635': ['Q5422651', 'Q213441', 'Q108290424'],
 'Q124946': ['Q48803', 'Q47728'],
 'Q181055': ['Q746549', 'Q2957687', 'Q28803', 'Q81799', 'Q7427595'],
 'Q211578': ['Q3632343', 'Q2207288', 'Q17200001'],
 'Q1318959': ['Q213441'],
 'Q1971625': ['Q206615'],
 'Q364005': ['Q13226383', 'Q121359', 'Q178706'],
 'Q22657': ['Q181790', 'Q34669510'],
 'Q1880871': ['Q18643213'],
 'Q1501': ['Q1076486', 'Q12004466'],
 'Q317309': ['Q1403186', 'Q19829125', 'Q12147'],
 'Q1592332': ['Q1374250'],
 'Q8341': ['Q373342'],
 'Q11033': ['Q340169', 'Q17537576', 'Q121182', 'Q24229398'],
 'Q339836': ['Q3897491'],
 'Q39809': ['Q8187769', 'Q42240'],
 'Q3095365': ['Q148571'],
 'Q1410837': ['Q37038'],
 'Q84684

In [320]:
wiki_subclasses = pd.DataFrame(
    list(category_triple.items()), columns=["category_qid", "subclassOf"]
).explode("subclassOf")

In [321]:
wiki_subclasses

Unnamed: 0,category_qid,subclassOf
0,Q11707,Q62602544
0,Q11707,Q1431026
0,Q11707,Q41176
0,Q11707,Q41958
1,Q6097,Q61951
...,...,...
584,Q895060,Q1368898
584,Q895060,Q15855160
584,Q895060,Q877517
585,Q942297,Q362200


In [381]:
df = category_occurences.merge(
    wiki_subclasses, left_on="qid", right_on="category_qid", how="left"
)

In [402]:
schema_mappings = pd.read_csv(get_path("class_mappings.csv"))
df = df.merge(schema_mappings, left_on="category", right_on="YelpCategory", how="left")

In [403]:
df

Unnamed: 0,category,occurences,split_category,qid,qid_label,category_qid,subclassOf,YelpCategory,SchemaType
0,Restaurants,52268,restaurant,Q11707,restaurant,Q11707,Q62602544,,
1,Restaurants,52268,restaurant,Q11707,restaurant,Q11707,Q1431026,,
2,Restaurants,52268,restaurant,Q11707,restaurant,Q11707,Q41176,,
3,Restaurants,52268,restaurant,Q11707,restaurant,Q11707,Q41958,,
4,Food,27781,food,Q2095,food,Q2095,Q1194058,Food,WPFooter
...,...,...,...,...,...,...,...,...,...
1974,Housing Cooperatives,1,housing cooperative,Q562166,housing cooperative,Q562166,Q4539,,
1975,Housing Cooperatives,1,housing cooperative,Q562166,housing cooperative,Q562166,Q4830453,,
1976,Housing Cooperatives,1,housing cooperative,Q562166,housing cooperative,Q562166,Q699405,,
1977,Trade Fairs,1,trade fair,Q57305,trade fair,Q57305,Q288514,,


In [393]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS
import urllib.parse
schema = Namespace("https://schema.org/")
example = Namespace("https://example.org/")
wiki = Namespace("https://www.wikidata.org/entity/")

# triple_file = gzip.open(filename=f"/home/ubuntu/vol1/virtuoso/import/yelp_business.nt.gz", mode="at",encoding="utf-8")

G = Graph()
for i in df.itertuples():
    if i.qid is not np.nan and i.SchemaType is not np.nan:
        G.add(
            (
                URIRef(schema[i.SchemaType]),
                URIRef(schema["sameAs"]),
                URIRef(wiki[i.qid]),
            )
        )
    else:
        if i.qid is not np.nan:
            G.add(
                (
                    URIRef(wiki[i.qid]),
                    URIRef(RDFS["label"]),
                    URIRef(example[urllib.parse.quote("_".join(i.qid_label.split(" ")))])
                )
            )
            if "&" in i.category:
                G.add(
                    (
                        URIRef(example["_".join(i.category.split(" "))]),
                        URIRef(example["superclassOf"]),
                        URIRef(wiki[i.qid])
                    )
                )
            else:
                G.add(
                    (
                        URIRef(example["_".join(i.category.split(" "))]),
                        URIRef(schema["sameAs"]),
                        URIRef(wiki[i.qid])
                    )
                )
    if i.subclassOf is not None:
        G.add((wiki[i.category_qid], wiki["P279"], wiki[i.subclassOf]))

nt = G.serialize(destination="categories.nt", format="nt")


# triple_file.write(G.serialize(format="nt"))



'coffee and tea' superclassOf 'qid of coffee'
'coffee and tea' superclassOf 'qid of tea'

'Restaurants' schema:sameAs 'qid of restaurant'

'qid of coffee' wiki:subclassOf(P279) 'qid'
'qid of tea' wiki:subclassOf(P279) 'qid'

schema:'restaurant' schema:sameAs wiki:'restaurant'