In [1]:
# https://github.com/maxlath/wikibase-dump-filter
# https://github.com/maxlath/wikibase-dump-filter/blob/main/docs/cli.md
# extracter instance of (P31) city (Q515) eller state in US (Q35657) og som er i country (P17) USA (Q30) eller Mexico (Q96) eller Canada(Q16)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q515,Q35657&P17:Q30,Q96,Q16' --omit sitelinks > city_state_america.ndjson

# extracter instance of (P31) business (Q4830453)
# led efter has subsidiary (P355)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q4830453' --omit sitelinks > business.ndjson


In [2]:
# https://linuxhint.com/bash_head_tail_command/
#!head -n 1 simplified_dump.ndjson > simpl.ndjson

In [3]:
# https://stedolan.github.io/jq/tutorial/
# https://www.linode.com/docs/guides/using-jq-to-process-json-on-the-command-line/
#!jq '.' simpl.ndjson > simpl.json

In [4]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from UtilityFunctions.get_data_path import get_path
import sys
import requests
from math import ceil
import numpy as np

In [5]:
def wikidata_query(sparql_query: str):
    # From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
    user_agent = "Yelp knowledge graph mapping/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results_df = pd.json_normalize(results['results']['bindings'])
    return results_df
query_all_us_states = """
SELECT ?state
WHERE {
  ?state wdt:P31 wd:Q35657 .
}
"""
wikidata_query(sparql_query=query_all_us_states)

Unnamed: 0,state.type,state.value
0,uri,http://www.wikidata.org/entity/Q99
1,uri,http://www.wikidata.org/entity/Q173
2,uri,http://www.wikidata.org/entity/Q724
3,uri,http://www.wikidata.org/entity/Q759
4,uri,http://www.wikidata.org/entity/Q771
5,uri,http://www.wikidata.org/entity/Q779
6,uri,http://www.wikidata.org/entity/Q782
7,uri,http://www.wikidata.org/entity/Q797
8,uri,http://www.wikidata.org/entity/Q812
9,uri,http://www.wikidata.org/entity/Q816


In [6]:
def wikidata_query_categories(category: str):
    # From https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats
    user_agent = "Yelp knowledge graph mapping/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
    sparql.setQuery(f"""
                    SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{  
                    ?item ?label "{category}"@en.  
                    ?article schema:about ?item .
                    ?article schema:inLanguage "en" .
                    ?article schema:isPartOf <https://en.wikipedia.org/>. 
                    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}    
                    }}
                    """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results_df = pd.json_normalize(results['results']['bindings'])
    return results_df

In [58]:
wikidata_query_categories('Shopping')[['item.value', 'itemLabel.value', 'itemDescription.value']].head()

Unnamed: 0,item.value,itemLabel.value,itemDescription.value
0,http://www.wikidata.org/entity/Q17054429,Shopping,1998 novel by Gavin Kramer
1,http://www.wikidata.org/entity/Q250995,Shopping,1994 film by Paul W. S. Anderson
2,http://www.wikidata.org/entity/Q2279849,Shopping,Wikimedia disambiguation page
3,http://www.wikidata.org/entity/Q10667142,Shopping,"shopping mall in Luleå, Sweden"
4,http://www.wikidata.org/entity/Q28224756,Shopping,British post-punk trio based in London and Gla...


In [8]:
wikidata_query_categories('restaurant')['item.value'][0][31:]

'Q263063'

In [9]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

schema = pd.read_csv(get_path("schemaorg-current-https-types.csv"))

In [10]:
schema[["label", "subTypeOf"]]

Unnamed: 0,label,subTypeOf
0,3DModel,https://schema.org/MediaObject
1,AMRadioChannel,https://schema.org/RadioChannel
2,APIReference,https://schema.org/TechArticle
3,Abdomen,https://schema.org/PhysicalExam
4,AboutPage,https://schema.org/WebPage
...,...,...
1348,WritePermission,https://schema.org/DigitalDocumentPermissionType
1349,XPathType,https://schema.org/Text
1350,XRay,https://schema.org/MedicalImagingTechnique
1351,ZoneBoardingPolicy,https://schema.org/BoardingPolicyType


In [11]:
categories_unique = list(set(biz['categories'].str.cat(sep=', ').split(sep=', ')))
categories = list(biz['categories'].str.cat(sep=', ').split(sep=', '))

In [12]:
from collections import Counter
category_occurences = pd.DataFrame.from_dict(dict(Counter(categories)), orient='index', columns=['occurences'])
category_occurences.sort_values(by='occurences', ascending=False, inplace=True)
category_occurences.reset_index(inplace=True)
category_occurences.rename(columns = {'index':'category'}, inplace=True)

In [13]:
category_occurences

Unnamed: 0,category,occurences
0,Restaurants,52268
1,Food,27781
2,Shopping,24395
3,Home Services,14356
4,Beauty & Spas,14292
...,...,...
1306,Beach Bars,1
1307,DUI Schools,1
1308,Patent Law,1
1309,Housing Cooperatives,1


In [61]:
word = 'Undersea/Hyperbaric Medicine'

def split_word_inc_slash(word):
    # Splitting the words that have a slash in them, and turning them into two words
    word_space = word.split(' ')
    word_space
    new_wordlist_a = []
    new_wordlist_b = []
    for i in word_space:
        i = i.lower()
        if '/' not in i:
            new_wordlist_a.append(i)
            new_wordlist_b.append(i)
        else:
            slash_split = i.split('/')
            new_wordlist_a.append(slash_split[0])
            new_wordlist_b.append(slash_split[1])
    new_word_a = ' '.join(new_wordlist_a)
    new_word_b = ' '.join(new_wordlist_b)
    return [new_word_a, new_word_b]

split_word_inc_slash(word)

['undersea medicine', 'hyperbaric medicine']

In [15]:
# transforms Yelp category words to words that is succesfull in finding the QID on wikidata
# TODO: fix & split issues
# TODO: fix & split has to be done first, and / split after
import inflect
p = inflect.engine()

categories_dict = {}
for word in categories_unique:
    if '&' in word:
        word_list = list(filter(None, word.lower().split(sep=' & ')))
        categories_dict[word] = word_list
    elif '/' in word:
        categories_dict[word] = split_word_inc_slash(word)
    else:
        categories_dict[word] = [word.lower()]
    
categories_dict
categories_dict_singular = {}
for key, value in categories_dict.items():
    new_value = []
    for word in value:
        if p.singular_noun(word) is False:
            word = word
        else:
            word = p.singular_noun(word)
        new_value.append(word)
    categories_dict_singular[key] = new_value

In [16]:
categories_dict_singular

{'Kitchen Supplies': ['kitchen supply'],
 'Mobile Home Dealers': ['mobile home dealer'],
 'Food Trucks': ['food truck'],
 'Watch Repair': ['watch repair'],
 'High Fidelity Audio Equipment': ['high fidelity audio equipment'],
 'Keys & Locksmiths': ['key', 'locksmith'],
 'Waterproofing': ['waterproofing'],
 'Hearing Aid Providers': ['hearing aid provider'],
 'Safe Stores': ['safe store'],
 'Karaoke Rental': ['karaoke rental'],
 'Ethnic Food': ['ethnic food'],
 'Italian': ['italian'],
 'Stucco Services': ['stucco service'],
 'Cultural Center': ['cultural center'],
 'Cheese Shops': ['cheese shop'],
 'Internal Medicine': ['internal medicine'],
 'Himalayan/Nepalese': ['himalayan', 'nepalese'],
 'Bars': ['bar'],
 'Game Truck Rental': ['game truck rental'],
 'Thai': ['thai'],
 'Dumpster Rental': ['dumpster rental'],
 'Contract Law': ['contract law'],
 'Ethiopian': ['ethiopian'],
 'Brewing Supplies': ['brewing supply'],
 'Car Dealers': ['car dealer'],
 'Horseback Riding': ['horseback riding'],


In [17]:
category_occurences['splitted_category'] = category_occurences['category'].map(categories_dict_singular)
category_occurences = category_occurences.explode('splitted_category')

In [122]:
class_mapping = pd.read_csv('/home/ubuntu/DVML-P7/Code/UtilityFiles/class_mappings.csv')
category_occurences = category_occurences.merge(class_mapping, left_on='category', right_on='YelpCategory', how='left')

In [123]:
category_occurences

Unnamed: 0,category,occurences,splitted_category,qid,YelpCategory,SchemaType
0,Restaurants,52268,restaurant,Q263063,Restaurants,Restaurant
1,Food,27781,food,Q894875,,
2,Shopping,24395,shopping,Q830036,Shopping,ShoppingCenter
3,Home Services,14356,home service,,,
4,Beauty & Spas,14292,beauty,Q7242,,
...,...,...,...,...,...,...
1424,Beach Bars,1,beach bar,,,
1425,DUI Schools,1,dui school,,,
1426,Patent Law,1,patent law,,,
1427,Housing Cooperatives,1,housing cooperative,Q17063278,,


In [18]:
category_qid = {}
for i in category_occurences['splitted_category'].to_list():
    try:
        cat = i.lower()
        cat_qid = wikidata_query_categories(cat)['item.value'][0][31:]
        category_qid[cat] = cat_qid
    except:
        pass

In [19]:
wikidata_query_categories('cat')['item.value'][0][31:]

'Q146'

In [23]:
category_occurences['qid'] = category_occurences['splitted_category'].map(category_qid)

In [24]:
category_occurences

Unnamed: 0,category,occurences,splitted_category,qid
0,Restaurants,52268,restaurant,Q263063
1,Food,27781,food,Q894875
2,Shopping,24395,shopping,Q830036
3,Home Services,14356,home service,
4,Beauty & Spas,14292,beauty,Q7242
...,...,...,...,...
1306,Beach Bars,1,beach bar,
1307,DUI Schools,1,dui school,
1308,Patent Law,1,patent law,
1309,Housing Cooperatives,1,housing cooperative,Q17063278


In [25]:
category_qid_list = category_occurences['qid'].tolist()
category_qid_list = [i for i in category_qid_list if i is not np.nan]


In [26]:
len(category_qid_list)

728

In [66]:
def retrieve_wikidata_claims(item_list):
    """
    Sends a request to the Wikidata API and transform the data from JSON into a dictionary to
    extract the claims each property has.
    :param item_list: A list with up to 50 wikidata items written with Q-code
    :return: A nested list, with all the properties each item has
    """
    # Creates the query by seperating each item with "|"
    item_list_query = ""
    for item in range(len(item_list)):
        if item == (len(item_list) - 1):
            item_list_query += item_list[item]
        else:
            item_list_query += item_list[item] + "%7C"

    # The string with API wbgetentities to find multiple items in an optimal format
    URL = f"https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids={item_list_query}&props=claims&languages=en&formatversion=2"

    # Opens a HTMl session and gets the DATA from the API
    with requests.Session() as S:
        DATA = dict(S.post(url=URL, headers={"user-agent": "magic browser", "Content-Type": "application/json"}).json())
    # Appends the properties of each item to a nested list
        
    nested_dict = {}
    for entity in DATA["entities"]:
        try:
            nested_dict[entity] = list(DATA["entities"][entity]["claims"].values())
        except:
            pass

    return nested_dict

In [28]:
item_list = category_qid_list

item_list_len = len(item_list)
# The limit is set to meet the requirements of the wikibase API wbgetentities (max 50)
# Ceil makes sure that the each subset from item_list is no longer than 50
limit = ceil(item_list_len / 50)

# Seperates the item_list to a nested_list with max 50 items in each list
piped_list = [item_list[pipe::limit] for pipe in range(limit)]

category_wikidata = {}
for i in piped_list:
    category_wikidata.update(retrieve_wikidata_claims(i))


In [29]:
category_triple = {}
for key, values in category_wikidata.items():
    # category_triple_temp_list = []
    # category_triple_temp = {}
    for value in values:
        for obj in value:
            if obj['mainsnak']['property'] == 'P279':
                data_value = obj['mainsnak']['datavalue']['value']['id']
                category_triple[key] = category_triple.get(key, []) + [data_value]
category_triple

{'Q6097': ['Q61951',
  'Q19359564',
  'Q1365365',
  'Q11090700',
  'Q2647467',
  'Q473666'],
 'Q39201': ['Q622852'],
 'Q1294114': ['Q999981'],
 'Q188507': ['Q699405', 'Q1497384'],
 'Q837171': ['Q7406919', 'Q25351891', 'Q2424752', 'Q806750', 'Q815823'],
 'Q124946': ['Q48803'],
 'Q181055': ['Q746549', 'Q2957687', 'Q28803', 'Q81799', 'Q7427595'],
 'Q211578': ['Q3632343', 'Q2207288', 'Q17200001'],
 'Q1318959': ['Q213441'],
 'Q15843013': ['Q309162'],
 'Q364005': ['Q13226383', 'Q121359', 'Q178706'],
 'Q22657': ['Q181790', 'Q34669510'],
 'Q10273457': ['Q8205328', 'Q2424752'],
 'Q1592332': ['Q1374250'],
 'Q8341': ['Q373342'],
 'Q11033': ['Q340169', 'Q17537576', 'Q121182', 'Q24229398'],
 'Q339836': ['Q3897491'],
 'Q1900657': ['Q43845', 'Q10355417'],
 'Q3095365': ['Q148571'],
 'Q1410837': ['Q37038'],
 'Q4358444': ['Q772630', 'Q2920963', 'Q10514020', 'Q18697490'],
 'Q212105': ['Q8434', 'Q627208'],
 'Q964401': ['Q1347367'],
 'Q41298': ['Q1002697', 'Q340169', 'Q39725049', 'Q11033', 'Q1261026'],
 'Q

Unnamed: 0,category_qis,subclassOf
0,Q6097,"[Q61951, Q19359564, Q1365365, Q11090700, Q2647..."
1,Q39201,[Q622852]
2,Q1294114,[Q999981]
3,Q188507,"[Q699405, Q1497384]"
4,Q837171,"[Q7406919, Q25351891, Q2424752, Q806750, Q815823]"
...,...,...
507,Q3636009,[Q1551574]
508,Q1428011,"[Q1493881, Q28640]"
509,Q217108,[Q206989]
510,Q7291,"[Q2426135, Q212434]"


In [81]:
wiki_subclasses = pd.DataFrame(list(category_triple.items()), columns=['category_qid', 'subclassOf']).explode('subclassOf')

In [82]:
wiki_subclasses

Unnamed: 0,category_qid,subclassOf
0,Q6097,Q61951
0,Q6097,Q19359564
0,Q6097,Q1365365
0,Q6097,Q11090700
0,Q6097,Q2647467
...,...,...
510,Q7291,Q2426135
510,Q7291,Q212434
511,Q895060,Q1368898
511,Q895060,Q15855160


In [112]:

df = category_occurences.merge(wiki_subclasses, left_on='qid', right_on='category_qid', how='left')

In [113]:
len(df.subclassOf.dropna())

1010

In [127]:
df

Unnamed: 0,category,occurences,splitted_category,qid,Yelp Category,Schema.org Type,category_qid,subclassOf
0,Restaurants,52268,restaurant,Q263063,Restaurants,Restaurant,,
1,Food,27781,food,Q894875,,,,
2,Shopping,24395,shopping,Q830036,Shopping,ShoppingCenter,Q830036,Q451967
3,Home Services,14356,home service,,,,,
4,Beauty & Spas,14292,beauty,Q7242,,,Q7242,Q367293
...,...,...,...,...,...,...,...,...
1897,Beach Bars,1,beach bar,,,,,
1898,DUI Schools,1,dui school,,,,,
1899,Patent Law,1,patent law,,,,,
1900,Housing Cooperatives,1,housing cooperative,Q17063278,,,,


In [87]:
df.query('splitted_category == "tattoo"')

Unnamed: 0,category,occurences,splitted_category,qid,category_qid,subclassOf
351,Tattoo,609,tattoo,Q72941682,Q72941682,Q890057
352,Tattoo,609,tattoo,Q72941682,Q72941682,Q838948
353,Tattoo,609,tattoo,Q72941682,Q72941682,Q478798


In [126]:
for i in df.itertuples():
    # print("_".join(i.category.split(' ')))
    # print("_".join(i.splitted_category.split(' ')))

restaurant
food
shopping
home service
beauty
spa
nightlife
health
health
health
medical
medical
local service
bar
automotive
event planning
service
service
sandwich
sandwich
american (traditional)
active life
pizza
pizza
pizza
pizza
coffee
coffee
coffee
coffee
coffee
coffee
coffee
tea
tea
tea
tea
tea
tea
fast food
breakfast
brunch
brunch
american (new)
hotel
hotel
travel
travel
home
home
garden
fashion
fashion
fashion
burger
art
art
art
entertainment
entertainment
auto repair
hair salon
nail salon
mexican
italian
specialty food
doctor
pet
real estate
seafood
seafood
seafood
fitnes
instruction
instruction
instruction
professional service
hair removal
dessert
chinese
bakery
bakery
bakery
bakery
bakery
bakery
grocery
salad
salad
hotel
hotel
chicken wing
cafe
cafe
ice cream
ice cream
ice cream
frozen yogurt
caterer
pet service
dentist
skin care
venue
event space
tire
tire
wine
spirit
spirit
spirit
beer
beer
deli
deli
oil change station
waxing
contractor
women's clothing
massage
massage
spo

In [49]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS

schema = Namespace("https://schema.org/")
example = Namespace("https://example.org/")
wiki = Namespace("https://www.wikidata.org/entity/")

# triple_file = gzip.open(filename=f"/home/ubuntu/vol1/virtuoso/import/yelp_business.nt.gz", mode="at",encoding="utf-8")

G = Graph()
for i in df.itertuples():
    G.add((URIRef(example["_".join(i.category.split(' '))]), URIRef(schema + 'sameAs'), URIRef(example["_".join(i.splitted_category.split(' '))])))
    if i.subclassOf is not None:
        G.add((wiki[i.category_qid], wiki['P279'], wiki[i.subclassOf]))

nt = G.serialize(destination='.nt')


# triple_file.write(G.serialize(format="nt"))

'coffee and tea' superclassOf 'qid of coffee'
'coffee and tea' superclassOf 'qid of tea'

'qid of coffee' wiki:subclassOf(P279) 'qid'
'qid of tea' wiki:subclassOf(P279) 'qid'

schema:'restaurant' sameAs wiki:'restaurant'

In [50]:
nt

"<https://example.org/Candy_Stores> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/candy_store> .\n<https://www.wikidata.org/entity/Q855691> <https://www.wikidata.org/entity/P279> <https://www.wikidata.org/entity/Q11460> .\n<https://example.org/Zoos> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/zoo> .\n<https://example.org/Water_Stores> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/water_store> .\n<https://www.wikidata.org/entity/Q6586445> <https://www.wikidata.org/entity/P279> <https://www.wikidata.org/entity/Q57660343> .\n<https://example.org/Lingerie> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/lingerie> .\n<https://example.org/Waterproofing> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/waterproofing> .\n<https://example.org/Gelato> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <https://example.org/gelato> .\n<https://www.wikidata.org/entity/Q7979077> <