In [1]:
# https://github.com/maxlath/wikibase-dump-filter
# https://github.com/maxlath/wikibase-dump-filter/blob/main/docs/cli.md
# Extract entities that are an instance of (P31) city (Q515) or state in US (Q35657) and whose country (P17) is USA (Q30), Mexico (Q96) or Canada (Q16)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q515,Q35657&P17:Q30,Q96,Q16' --omit sitelinks > city_state_america.ndjson

# Extract entities that are an instance of (P31) business (Q4830453)
# Look for has subsidiary (P355)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q4830453' --omit sitelinks > business.ndjson

In [2]:
# https://linuxhint.com/bash_head_tail_command/
#!head -n 1 simplified_dump.ndjson > simpl.ndjson

In [3]:
# https://stedolan.github.io/jq/tutorial/
# https://www.linode.com/docs/guides/using-jq-to-process-json-on-the-command-line/
#!jq '.' simpl.ndjson > simpl.json

In [4]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
from math import ceil
import numpy as np
import pandas as pd
import requests
from SPARQLWrapper import JSON, SPARQLWrapper

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.wikidata_query_tools import (
    retrieve_wikidata_claims,
    wikidata_query,
)

In [5]:
# Smoke test for the SPARQL helper: ask for ten items that are an
# instance of (P31) Q35657 and display whatever wikidata_query returns.
query_all_us_states = """
SELECT ?state
WHERE {
  ?state wdt:P31 wd:Q35657 .
}
LIMIT 10
"""
wikidata_query(sparql_query=query_all_us_states)

Unnamed: 0,state.type,state.value
0,uri,http://www.wikidata.org/entity/Q99
1,uri,http://www.wikidata.org/entity/Q173
2,uri,http://www.wikidata.org/entity/Q724
3,uri,http://www.wikidata.org/entity/Q759
4,uri,http://www.wikidata.org/entity/Q771
5,uri,http://www.wikidata.org/entity/Q779
6,uri,http://www.wikidata.org/entity/Q782
7,uri,http://www.wikidata.org/entity/Q797
8,uri,http://www.wikidata.org/entity/Q812
9,uri,http://www.wikidata.org/entity/Q816


In [6]:
def category_query(category: str):
    """Build a SPARQL query that looks up items by an exact English string.

    The query matches every item that has ``category`` (tagged ``@en``) as the
    object of any predicate and that is the subject of an English Wikipedia
    article, and selects the item together with its English label and
    description.

    Args:
        category: Text to match, e.g. ``"ear"``.

    Returns:
        The SPARQL query as a string.
    """
    # The text ends up inside a double-quoted SPARQL literal, so a raw
    # backslash or double quote would terminate or corrupt the literal.
    # Backslashes are escaped first so the quote escapes are not doubled.
    escaped = str(category).replace("\\", "\\\\").replace('"', '\\"')
    return f"""SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{
    ?item ?label "{escaped}"@en .
    ?article schema:about ?item .
    ?article schema:inLanguage "en" .
    ?article schema:isPartOf <https://en.wikipedia.org/>.
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}}}"""

In [7]:
# Look up "ear" and keep the first few hits, restricted to the entity URI,
# its label and its description.
wiki_qid_of_category = (
    wikidata_query(category_query(category="ear"))
    .loc[:, ["item.value", "itemLabel.value", "itemDescription.value"]]
    .head()
)
wiki_qid_of_category

Unnamed: 0,item.value,itemLabel.value,itemDescription.value
0,http://www.wikidata.org/entity/Q7362,ear,organ that detects sound; organ of hearing and...
1,http://www.wikidata.org/entity/Q147398,ear,grain-bearing tip part of the stem of a cereal...


In [8]:
def min_qid(df_qid):
    """Return the hit with the numerically smallest QID.

    Args:
        df_qid: DataFrame with an ``item.value`` column holding entity URIs
            that end in ``/Q<number>`` and an ``itemLabel.value`` column.

    Returns:
        Tuple ``(qid, label)`` for the row whose QID number is lowest; on a
        tie the first such row wins.

    Raises:
        ValueError: If ``df_qid`` has no rows, or a URI does not end in a
            Q-number.
    """
    # Take the last path segment instead of slicing a fixed-length prefix so
    # both http:// and https:// entity URIs yield a clean QID.
    qids = [uri.rsplit("/", 1)[-1] for uri in df_qid["item.value"]]
    numbers = [int(qid.replace("Q", "")) for qid in qids]
    # Work positionally: label-based lookup misbehaves when the frame's index
    # contains duplicate labels.
    position = min(range(len(numbers)), key=numbers.__getitem__)
    return qids[position], df_qid["itemLabel.value"].iloc[position]
min_qid(df_qid=wiki_qid_of_category)

('Q7362', 'ear')

In [9]:
# Baseline for comparison with min_qid: take the first row returned for
# "garden". The [31:] slice drops the 31-character
# "http://www.wikidata.org/entity/" prefix, leaving the bare QID.
test_query = wikidata_query(category_query(category="garden"))
test_query["item.value"][0][31:], test_query["itemLabel.value"][0]

('Q909444', 'Sentō Imperial Palace')

In [10]:
df = (wikidata_query(category_query(category="garden"))[['itemLabel.value', 'item.value']])
df

Unnamed: 0,itemLabel.value,item.value
0,Sentō Imperial Palace,http://www.wikidata.org/entity/Q909444
1,garden,http://www.wikidata.org/entity/Q1107656
2,Soswaewon,http://www.wikidata.org/entity/Q493693
3,Parco del Valentino,http://www.wikidata.org/entity/Q1061804
4,Classical Gardens of Suzhou,http://www.wikidata.org/entity/Q1144337
5,McGovern Centennial Gardens,http://www.wikidata.org/entity/Q21196493
6,Schedel Arboretum and Gardens,http://www.wikidata.org/entity/Q7431032
7,Gardens of the French Renaissance,http://www.wikidata.org/entity/Q5522491
8,Désert de Retz,http://www.wikidata.org/entity/Q1270698
9,Weymouth Peace Garden,http://www.wikidata.org/entity/Q28003099


In [11]:
min_qid(wikidata_query(category_query(category="garden")))

('Q493693', 'Soswaewon')

In [12]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

# schema_types = pd.read_csv(get_path("schemaorg-current-https-types.csv"))

In [13]:
# Every business carries a ", "-separated category string. Join them all and
# split once to get one flat list with a category entry per business it is
# attached to, then derive the distinct names from that same list instead of
# joining and splitting the whole column a second time.
categories = biz["categories"].str.cat(sep=", ").split(sep=", ")
categories_unique = list(set(categories))

In [14]:
from collections import Counter

# Count how often each category appears and list the most frequent first.
category_occurences = (
    pd.DataFrame(
        [(name, count) for name, count in Counter(categories).items()],
        columns=["category", "occurences"],
    )
    .sort_values(by="occurences", ascending=False)
)

In [15]:
display(category_occurences)

Unnamed: 0,category,occurences
17,Restaurants,52268
18,Food,27781
12,Shopping,24395
135,Home Services,14356
107,Beauty & Spas,14292
...,...,...
1261,Beach Bars,1
1267,DUI Schools,1
1270,Patent Law,1
1274,Housing Cooperatives,1


In [16]:
# Load the spreadsheet of manual category splits: the first column is the
# category name, the 'Unnamed: 1' column holds its parts joined by ", ".
cat_string_man_handle_dict = {
    yelp_category: parts.split(', ')
    for yelp_category, parts in pd.read_excel(
        get_path("split_categories.xlsx"), sheet_name="Sheet1", index_col=0
    )['Unnamed: 1'].items()
}

In [17]:
cat_string_man_handle_dict

{'Beauty & Spas': ['Beauty', 'Spas'],
 'Health & Medical': ['Health', 'Medical'],
 'Event Planning & Services': ['Event Planning', 'Event Services'],
 'Coffee & Tea': ['Coffee', 'Tea'],
 'Breakfast & Brunch': ['Breakfast', 'Brunch'],
 'Hotels & Travel': ['Hotels', 'Travels'],
 'Home & Garden': ['Home', 'Garden'],
 'Arts & Entertainment': ['Arts', 'Entertainment'],
 'Fitness & Instruction': ['Fitness', 'Instruction'],
 'Ice Cream & Frozen Yogurt': ['Ice Cream', 'Frozen Yogurt'],
 'Venues & Event Spaces': ['Venues', 'Event Spaces'],
 'Wine & Spirits': ['Wine', 'Spirits'],
 'Flowers & Gifts': ['Flowers', 'Gifts'],
 'Auto Parts & Supplies': ['Auto Parts', 'Auto Supplies'],
 'Juice Bars & Smoothies': ['Juice Bars', 'Smoothies'],
 'Cosmetics & Beauty Supply': ['Cosmetics', 'Beauty Supplies'],
 'Arts & Crafts': ['Arts', 'Crafts'],
 'Public Services & Government': ['Public Services', 'Government'],
 'Heating & Air Conditioning/HVAC': ['Heating', 'Air Conditioning', 'HVAC'],
 'IT Services & Com

In [18]:
from Code.UtilityFunctions.string_functions import turn_words_singular

# Start with every category mapping to itself as a one-element list, then let
# the manual splits override the entries they cover.
categories_dict = {
    **{name: [name] for name in categories_unique},
    **cat_string_man_handle_dict,
}

In [19]:
categories_dict_singular = turn_words_singular(categories_dict)

In [20]:
categories_dict_singular['Heating & Air Conditioning/HVAC']

['heating', 'air conditioning', 'hvac']

In [21]:
# Attach the list of split names from categories_dict_singular to each
# category, then give every split name its own row (the other columns are
# repeated on each of those rows).
# NOTE(review): this cell is not idempotent. Running it again on the already
# exploded frame maps and explodes a second time, multiplying the rows of
# multi-part categories; rebuild category_occurences before re-running.
category_occurences["split_category"] = category_occurences["category"].map(
    categories_dict_singular
)
category_occurences = category_occurences.explode("split_category")

In [22]:
category_occurences[category_occurences["category"].str.contains("/")][
    ["category", "occurences"]
].drop_duplicates()

Unnamed: 0,category,occurences
436,Heating & Air Conditioning/HVAC,1199
189,Cajun/Creole,923
610,Water Heater Installation/Repair,557
396,Community Service/Non-Profit,503
363,Blow Dry/Out Services,456
331,Tapas/Small Plates,440
177,Masonry/Concrete,363
218,Door Sales/Installation,277
348,Bike Repair/Maintenance,273
2,Naturopathic/Holistic,221


In [23]:
# Distinct category names that contain an ampersand.
andlist = (
    category_occurences.loc[
        category_occurences["category"].str.contains("&"), "category"
    ]
    .drop_duplicates()
    .to_list()
)

In [24]:
category_occurences[category_occurences["category"].str.contains("air")]

Unnamed: 0,category,occurences,split_category
223,Auto Repair,5433,auto repair
105,Hair Salons,5046,hair salon
116,Hair Removal,3239,hair removal
281,Hair Stylists,1459,hair stylist
96,IT Services & Computer Repair,1189,it service
96,IT Services & Computer Repair,1189,computer repiar
365,Men's Hair Salons,828,men's hair salon
152,Appliances & Repair,800,appliance
152,Appliances & Repair,800,repair
464,Transmission Repair,639,transmission repair


In [25]:
# Resolve every split category to a Wikidata item in two ways so the
# strategies can be compared afterwards:
#   category_qid  - (QID, label) of the first row the query returns
#   category_qid2 - (QID, label) picked by min_qid (lowest QID number)
# Lookups that fail are recorded in category_qid_failed instead of being
# silently dropped, so they can be inspected after the loop.
category_qid = {}
category_qid2 = {}
category_qid_failed = {}
# The exploded frame repeats split names (e.g. shared parts of two
# categories); each distinct name only needs one round trip.
for cat in category_occurences["split_category"].dropna().unique():
    try:
        wikidata_query_cat_query = wikidata_query(category_query(category=cat))
        # The last URI path segment is the bare QID.
        first_hit = (
            wikidata_query_cat_query["item.value"][0].rsplit("/", 1)[-1],
            wikidata_query_cat_query["itemLabel.value"][0],
        )
        lowest_hit = min_qid(wikidata_query_cat_query)
    except Exception as error:
        # Best effort: one failing category must not abort the whole run.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        category_qid_failed[cat] = repr(error)
        continue
    # Store both results together so the two dicts always share their keys.
    category_qid[cat] = first_hit
    category_qid2[cat] = lowest_hit


In [26]:
from pprint import pprint
from deepdiff import DeepDiff
# Shallow copies of both lookup tables; the values are the (QID, label) tuples.
category_qid_only_qid = dict(category_qid)
category_qid2_only_qid = dict(category_qid2)
# Diff the two tables: for every key whose value differs, DeepDiff reports the
# old value (first hit) and the new value (min_qid pick).
ddiff = DeepDiff(category_qid_only_qid, category_qid2_only_qid, verbose_level=1)

In [27]:
def compare_qids(new_value: str, old_value: str):
    """Build a SPARQL query testing whether one item is an instance of another.

    The query binds ``?s`` to ``wd:<new_value>`` and requires
    ``?s wdt:P31 wd:<old_value>``, so it returns a row only when the new item
    is an instance of the old one.

    Args:
        new_value: QID of the candidate instance.
        old_value: QID of the candidate class.

    Returns:
        The SPARQL query as a string.
    """
    query_lines = (
        "SELECT ?s ",
        f"                WHERE {{?s wdt:P31 wd:{old_value} . ",
        f"                        VALUES ?s {{wd:{new_value}}} .",
        "                }",
    )
    return "\n".join(query_lines)

# Where the two strategies disagree, fall back to the first-hit item whenever
# the min_qid item turns out to be an instance of it.
update_qid_dict = {}
# DeepDiff leaves out the 'values_changed' section when nothing differs, so
# default to an empty mapping instead of raising KeyError.
for key, value in ddiff.get('values_changed', {}).items():
    # Paths look like root['airline'][0]; index 0 is the QID slot of the
    # (QID, label) tuple, index 1 (the label) is ignored.
    if key.endswith("[0]"):
        # check if the new qid is an instance of old qid, then update with old qid if true
        if not wikidata_query(compare_qids(new_value=value['new_value'], old_value=value['old_value'])).empty:
            print(f"Updating {key} from {value['new_value']} to {value['old_value']}")
            # key[6:-5] strips the leading "root['" and the trailing "'][0]".
            update_qid_dict[key[6:-5]] = category_qid[key[6:-5]]
category_qid2.update(update_qid_dict) # update the qid dict with the new qids, updated values: {'airline': 'Q46970', 'boat tour': 'Q25040412', 'magazine': 'Q41298'}

Updating root['airline'][0] from Q8782 to Q46970
Updating root['boat tour'][0] from Q981249 to Q25040412
Updating root['magazine'][0] from Q25738 to Q41298


In [28]:
category_qid2['restaurant']

('Q11707', 'restaurant')

In [29]:
# Look up the (QID, label) tuple for every split category; names that are
# missing from category_qid2 come out as NaN.
category_occurences['qid'] = category_occurences['split_category'].map(category_qid2)
# Unpack the tuple column into two columns: 'qid' is overwritten with the
# bare QID and 'qid_label' receives the label.
category_occurences[['qid','qid_label']] = pd.DataFrame(category_occurences['qid'].tolist(),index=category_occurences.index)

In [30]:
# QIDs to fetch claims for. dropna() removes the unresolved rows by value
# (an `is not np.nan` identity test only recognises the np.nan singleton),
# and drop_duplicates() keeps each QID once, in first-seen order, so the
# same entity is not requested repeatedly.
category_qid_list = (
    category_occurences["qid"].dropna().drop_duplicates().tolist()
)

In [31]:
# The wikibase API wbgetentities accepts at most 50 ids per request, so the
# QIDs are fetched in batches.
item_list = category_qid_list
item_list_len = len(item_list)

# Number of batches; rounding up guarantees no batch exceeds 50 items.
limit = ceil(item_list_len / 50)

# Batch k holds every `limit`-th item starting at offset k, i.e. the items
# are dealt out round-robin rather than cut into contiguous runs.
piped_list = [item_list[offset::limit] for offset in range(limit)]

# Merge the claims returned for each batch into one dictionary.
category_wikidata = {}
for i in piped_list:
    claims_for_batch = retrieve_wikidata_claims(i)
    category_wikidata.update(claims_for_batch)

In [32]:
# Collect, per category QID, the targets of its "subclass of" (P279) claims.
category_triple = {}
for key, values in category_wikidata.items():
    for value in values:
        for obj in value:
            mainsnak = obj["mainsnak"]
            if mainsnak["property"] != "P279":
                continue
            # Snaks of type "novalue"/"somevalue" have no "datavalue" entry;
            # skip them instead of failing with KeyError.
            datavalue = mainsnak.get("datavalue")
            if datavalue is None:
                continue
            # Append in place rather than rebuilding the list on every hit.
            category_triple.setdefault(key, []).append(datavalue["value"]["id"])
category_triple

{'Q11707': ['Q62602544', 'Q1431026', 'Q41176', 'Q41958'],
 'Q81799': ['Q746549', 'Q8195619'],
 'Q683595': ['Q684740'],
 'Q14092': ['Q641226', 'Q1076486'],
 'Q55187': ['Q27943370', 'Q33394058', 'Q108289644'],
 'Q148958': ['Q1183543', 'Q987767'],
 'Q858012': ['Q5307737'],
 'Q152095': ['Q987767', 'Q2024731'],
 'Q171495': ['Q1762457', 'Q1183543'],
 'Q5526883': ['Q212198', 'Q11707'],
 'Q11442': ['Q595658', 'Q233040', 'Q768186'],
 'Q43183': ['Q15809678', 'Q837171'],
 'Q11424': ['Q2431196', 'Q10301427', 'Q4502142', 'Q20937557'],
 'Q35054': ['Q294422'],
 'Q10611118': ['Q24455304', 'Q4830453', 'Q63922515'],
 'Q11419': ['Q11417', 'Q212434'],
 'Q638': ['Q184485', 'Q173799', 'Q11461', 'Q114088986'],
 'Q7075': ['Q1030034', 'Q13226383', 'Q2668072'],
 'Q329737': ['Q2588761', 'Q108286217'],
 'Q339836': ['Q3897491'],
 'Q39809': ['Q8187769', 'Q42240'],
 'Q3095365': ['Q148571'],
 'Q1410837': ['Q37038'],
 'Q208386': ['Q7708485'],
 'Q1065252': ['Q2460422'],
 'Q792635': ['Q12898216', 'Q349'],
 'Q14350': ['Q

In [33]:
# One row per (category QID, superclass QID) pair: build a frame whose second
# column holds the list of superclasses, then unfold that list into rows.
wiki_subclasses = pd.DataFrame.from_records(
    list(category_triple.items()),
    columns=["category_qid", "subclassOf"],
).explode("subclassOf")

In [34]:
wiki_subclasses

Unnamed: 0,category_qid,subclassOf
0,Q11707,Q62602544
0,Q11707,Q1431026
0,Q11707,Q41176
0,Q11707,Q41958
1,Q81799,Q746549
...,...,...
571,Q772298,Q16605260
571,Q772298,Q699405
571,Q772298,Q13226383
572,Q57305,Q288514


In [35]:
# Attach the superclass QIDs to every category row; rows without a matching
# QID are kept and get missing values (left join).
df = pd.merge(
    category_occurences,
    wiki_subclasses,
    how="left",
    left_on="qid",
    right_on="category_qid",
)

In [36]:
schema_mappings = pd.read_csv(get_path("class_mappings.csv"))
# Rewrite the split names as title-cased words without spaces so they can be
# joined against the YelpCategory column of the mapping file.
df['split_category'] = [
    name.title().replace(' ', '') for name in df['split_category']
]
df1 = pd.merge(
    df,
    schema_mappings,
    how="left",
    left_on="split_category",
    right_on="YelpCategory",
)


In [37]:
df1.loc[df1['qid'] == 'Q103597']

Unnamed: 0,category,occurences,split_category,qid,qid_label,category_qid,subclassOf,YelpCategory,SchemaType
884,Surgeons,123,Surgeon,Q103597,Georg Hartog Gerson,,,,


In [38]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS
import urllib.parse
schema = Namespace("https://schema.org/")
example = Namespace("https://example.org/")
# NOTE(review): the SPARQL results shown earlier in this notebook use
# "http://www.wikidata.org/entity/..." IRIs; this https prefix yields different
# IRIs for the same entities. Confirm which form consumers expect.
wiki = Namespace("https://www.wikidata.org/entity/")
# triple_file = gzip.open(filename=f"/home/ubuntu/vol1/virtuoso/import/yelp_business.nt.gz", mode="at",encoding="utf-8")

# Build the category graph: subclass links between Wikidata items, a label and
# a class marker for each resolved item, and a sameAs link from the schema.org
# type (or, without one, from an example.org category) to the item.
G = Graph()
for i in df1.itertuples():
    # pd.notna tests missing values by value. The previous `is not np.nan`
    # identity checks only recognise the np.nan singleton and would treat a
    # NaN coming from a float-typed column as a real value.
    if pd.notna(i.subclassOf):
        G.add((wiki[i.qid], wiki["P279"], wiki[i.subclassOf]))
    if pd.notna(i.qid):
        G.add((wiki[i.qid], RDFS["label"], Literal(i.qid_label)))
        # NOTE(review): rdfs:Class is used in predicate position here; if the
        # intent is "this item is a WikiCategory", rdf:type may be what is
        # wanted. Left unchanged so existing consumers keep working.
        G.add((wiki[i.qid], RDFS["Class"], example['WikiCategory']))
        if pd.notna(i.SchemaType):
            G.add((schema[i.SchemaType], schema["sameAs"], wiki[i.qid]))
        else:
            G.add((example[i.split_category], schema["sameAs"], wiki[i.qid]))

# Write the graph to categories.nt in N-Triples format.
nt = G.serialize(destination="categories.nt", format="nt")


# triple_file.write(G.serialize(format="nt"))



'coffee and tea' superclassOf 'qid of coffee'
'coffee and tea' superclassOf 'qid of tea'

'Restaurants' schema:sameAs 'qid of restaurant'

'qid of coffee' wiki:subclassOf(P279) 'qid'
'qid of tea' wiki:subclassOf(P279) 'qid'

schema:'restaurant' schema:sameAs wiki:'restaurant'