In [137]:
# https://github.com/maxlath/wikibase-dump-filter
# https://github.com/maxlath/wikibase-dump-filter/blob/main/docs/cli.md
# Extract entities that are an instance of (P31) city (Q515) or state in the US (Q35657) and whose country (P17) is USA (Q30), Mexico (Q96) or Canada (Q16)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q515,Q35657&P17:Q30,Q96,Q16' --omit sitelinks > city_state_america.ndjson

# Extract entities that are an instance of (P31) business (Q4830453)
# Look for has subsidiary (P355)
#!cat latest-all.json.gz | gzip -d | wikibase-dump-filter --languages en --claim 'P31:Q4830453' --omit sitelinks > business.ndjson


In [138]:
# https://linuxhint.com/bash_head_tail_command/
#!head -n 1 simplified_dump.ndjson > simpl.ndjson

In [139]:
# https://stedolan.github.io/jq/tutorial/
# https://www.linode.com/docs/guides/using-jq-to-process-json-on-the-command-line/
#!jq '.' simpl.ndjson > simpl.json

In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.wikidata_query_tools import wikidata_query, retrieve_wikidata_claims
import sys
import requests
from math import ceil
import numpy as np

In [5]:
# Smoke test of the SPARQL helper: ask for up to 10 items that are an
# instance of (P31) Q35657 (the "state in US" class used in the dump filter above).
query_all_us_states = """
SELECT ?state
WHERE {
  ?state wdt:P31 wd:Q35657 .
}
LIMIT 10
"""
# The result of the call is the cell output.
wikidata_query(sparql_query=query_all_us_states)

Unnamed: 0,state.type,state.value
0,uri,http://www.wikidata.org/entity/Q99
1,uri,http://www.wikidata.org/entity/Q173
2,uri,http://www.wikidata.org/entity/Q724
3,uri,http://www.wikidata.org/entity/Q759
4,uri,http://www.wikidata.org/entity/Q771
5,uri,http://www.wikidata.org/entity/Q779
6,uri,http://www.wikidata.org/entity/Q782
7,uri,http://www.wikidata.org/entity/Q797
8,uri,http://www.wikidata.org/entity/Q812
9,uri,http://www.wikidata.org/entity/Q816


In [237]:
def category_query(category: str) -> str:
    """Build a SPARQL query that looks up Wikidata items by an English literal.

    The query selects distinct items (with label and description) that have
    any predicate pointing at the English-tagged literal ``category`` and that
    are the subject of an article belonging to the English Wikipedia.

    Args:
        category: Text to match exactly, e.g. ``"restaurant"``.

    Returns:
        The SPARQL query as a string.
    """
    # Escape the characters that would otherwise terminate or corrupt the
    # quoted SPARQL literal. Backslash goes first so that the escapes added
    # afterwards are not escaped a second time.
    escaped = (
        category.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    return f"""SELECT distinct ?item ?itemLabel ?itemDescription WHERE{{
    ?item ?label "{escaped}"@en.
    ?article schema:about ?item .
    ?article schema:inLanguage "en" .
    ?article schema:isPartOf <https://en.wikipedia.org/>.
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}}}"""

In [239]:
# Preview the first matches for the literal "Shopping", keeping only the
# item URI, label and description columns.
wikidata_query(category_query(category="Shopping"))[['item.value', 'itemLabel.value', 'itemDescription.value']].head()

Unnamed: 0,item.value,itemLabel.value,itemDescription.value
0,http://www.wikidata.org/entity/Q17054429,Shopping,1998 novel by Gavin Kramer
1,http://www.wikidata.org/entity/Q250995,Shopping,1994 film by Paul W. S. Anderson
2,http://www.wikidata.org/entity/Q2279849,Shopping,Wikimedia disambiguation page
3,http://www.wikidata.org/entity/Q10667142,Shopping,"shopping mall in Luleå, Sweden"
4,http://www.wikidata.org/entity/Q28224756,Shopping,British post-punk trio based in London and Gla...


In [241]:
# Take the first hit for "restaurant" and keep only the QID, i.e. the part of
# the entity URI after the last '/'. Splitting avoids depending on the exact
# length of the URI prefix.
wikidata_query(category_query(category="restaurant"))['item.value'][0].rsplit('/', 1)[-1]

'Q263063'

In [252]:
# Load the Yelp business dataset; the file is read as line-delimited JSON
# (one record per line). The location is resolved by the project helper get_path.
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

# schema_types = pd.read_csv(get_path("schemaorg-current-https-types.csv"))

In [147]:
# Flatten the comma-separated 'categories' column into one list with an entry
# for every category mention.
categories = biz['categories'].str.cat(sep=', ').split(sep=', ')
# Distinct category names, derived from the list above instead of
# concatenating and splitting the whole column a second time.
categories_unique = list(set(categories))

In [284]:
from collections import Counter

# Tally how often each category is mentioned and list the most frequent first.
category_counts = Counter(categories)
category_occurences = pd.DataFrame(
    list(category_counts.items()),
    columns=['category', 'occurences'],
).sort_values(by='occurences', ascending=False)

In [285]:
# Show the category frequency table (sorted by descending count in the previous cell).
display(category_occurences)

Unnamed: 0,category,occurences
17,Restaurants,52268
18,Food,27781
12,Shopping,24395
135,Home Services,14356
107,Beauty & Spas,14292
...,...,...
1261,Beach Bars,1
1267,DUI Schools,1
1270,Patent Law,1
1274,Housing Cooperatives,1


In [286]:
word = 'Undersea/Hyperbaric Medicine'

def split_word_inc_slash(word):
    """Expand a phrase containing slash-separated alternatives into variants.

    The phrase is lowercased and split on single spaces. A token such as
    ``'undersea/hyperbaric'`` offers several alternatives; variant ``k`` of
    the result uses the ``k``-th alternative of every token. Tokens without a
    slash are repeated in every variant, and a token with fewer alternatives
    than the longest one contributes its last alternative to the remaining
    variants.

    Args:
        word: Phrase to expand, e.g. ``'Undersea/Hyperbaric Medicine'``.

    Returns:
        A list of lowercase phrases. It always holds at least two entries,
        so a phrase without any slash yields the same phrase twice.
    """
    token_options = [token.split('/') for token in word.lower().split(' ')]
    # At least two variants are always produced; more are added when a token
    # offers more than two alternatives, so none of them is dropped.
    variant_count = max(2, max(len(options) for options in token_options))
    return [
        ' '.join(options[min(idx, len(options) - 1)] for options in token_options)
        for idx in range(variant_count)
    ]

split_word_inc_slash(word)

['undersea medicine', 'hyperbaric medicine']

In [287]:
# Transform Yelp category names into lowercase, singular search terms that
# are more likely to be found as an English literal on Wikidata.
import inflect
p = inflect.engine()

# Map every original category to its list of search terms. The name is split
# on ' & ' first and each resulting part is then split on '/', so a category
# that contains both separators is fully decomposed.
categories_dict = {}
for word in categories_unique:
    terms = []
    # filter(None, ...) discards empty parts
    for part in filter(None, word.lower().split(sep=' & ')):
        if '/' in part:
            terms.extend(split_word_inc_slash(part))
        else:
            terms.append(part)
    categories_dict[word] = terms

# Same mapping with every term converted to its singular form.
categories_dict_singular = {}
for key, value in categories_dict.items():
    new_value = []
    for term in value:
        # A False result from singular_noun means no singular form was
        # produced; the term is then kept unchanged.
        singular = p.singular_noun(term)
        new_value.append(term if singular is False else singular)
    categories_dict_singular[key] = new_value

In [313]:
# Inspect the mapping from original Yelp category to its list of singular search terms.
categories_dict_singular

{'Kitchen Supplies': ['kitchen supply'],
 'Mobile Home Dealers': ['mobile home dealer'],
 'Food Trucks': ['food truck'],
 'Watch Repair': ['watch repair'],
 'High Fidelity Audio Equipment': ['high fidelity audio equipment'],
 'Keys & Locksmiths': ['key', 'locksmith'],
 'Waterproofing': ['waterproofing'],
 'Hearing Aid Providers': ['hearing aid provider'],
 'Safe Stores': ['safe store'],
 'Karaoke Rental': ['karaoke rental'],
 'Ethnic Food': ['ethnic food'],
 'Italian': ['italian'],
 'Stucco Services': ['stucco service'],
 'Cultural Center': ['cultural center'],
 'Cheese Shops': ['cheese shop'],
 'Internal Medicine': ['internal medicine'],
 'Himalayan/Nepalese': ['himalayan', 'nepalese'],
 'Bars': ['bar'],
 'Game Truck Rental': ['game truck rental'],
 'Thai': ['thai'],
 'Dumpster Rental': ['dumpster rental'],
 'Contract Law': ['contract law'],
 'Ethiopian': ['ethiopian'],
 'Brewing Supplies': ['brewing supply'],
 'Car Dealers': ['car dealer'],
 'Horseback Riding': ['horseback riding'],


In [289]:
# Attach the list of search terms to every category, then give each term its
# own row (a category with several terms is repeated once per term).
category_occurences = category_occurences.assign(
    splitted_category=category_occurences['category'].map(categories_dict_singular)
).explode('splitted_category')

In [290]:
# NOTE(review): absolute, machine-specific path; consider resolving it
# relative to the project instead.
class_mapping_path = '/home/ubuntu/DVML-P7/Code/UtilityFiles/class_mappings.csv'
class_mapping = pd.read_csv(class_mapping_path)

# Left join keeps every category row; the mapping columns stay empty where
# the category has no entry in the mapping file.
category_occurences = category_occurences.merge(
    class_mapping,
    how='left',
    left_on='category',
    right_on='YelpCategory',
)

In [335]:
# List the categories whose name contains '&' together with their counts;
# drop_duplicates collapses the repeated rows of the same category.
category_occurences[category_occurences['category'].str.contains('&')][['category', 'occurences']].drop_duplicates()

Unnamed: 0,category,occurences
4,Beauty & Spas,14292
7,Health & Medical,11890
12,Event Planning & Services,9895
18,Coffee & Tea,6703
21,Breakfast & Brunch,6239
...,...,...
1268,Coffee & Tea Supplies,5
1324,Jails & Prisons,3
1330,IP & Internet Law,3
1332,Sheds & Outdoor Storage,3


In [292]:
# Resolve every search term to a Wikidata QID by taking the first hit of the
# literal lookup. Terms for which the lookup fails are left out of
# category_qid and recorded in category_qid_failures for inspection.
category_qid = {}
category_qid_failures = {}
# dropna() skips rows without a term; unique() avoids querying the same term
# once per category that shares it.
for term in category_occurences['splitted_category'].dropna().unique():
    cat = term.lower()
    if cat in category_qid:
        continue
    try:
        first_item_uri = wikidata_query(category_query(category=cat))['item.value'][0]
    except Exception as error:
        # Best effort: a failed or empty lookup must not stop the loop.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        category_qid_failures[cat] = error
        continue
    # The QID is the last path segment of the entity URI.
    category_qid[cat] = first_item_uri.rsplit('/', 1)[-1]

In [301]:
# Add the resolved QID for every search term; terms missing from category_qid get NaN.
category_occurences['qid'] = category_occurences['splitted_category'].map(category_qid)

In [302]:
# Inspect the category table after the QID column has been added.
category_occurences

Unnamed: 0,category,occurences,splitted_category,YelpCategory,SchemaType,qid
0,Restaurants,52268,restaurant,Restaurants,Restaurant,Q263063
1,Food,27781,food,,,Q894875
2,Shopping,24395,shopping,Shopping,ShoppingCenter,Q830036
3,Home Services,14356,home service,,,
4,Beauty & Spas,14292,beauty,,,Q7242
...,...,...,...,...,...,...
1424,Beach Bars,1,beach bar,,,
1425,DUI Schools,1,dui school,,,
1426,Patent Law,1,patent law,,,
1427,Housing Cooperatives,1,housing cooperative,,,Q17063278


In [303]:
# Distinct QIDs that were resolved. dropna() removes every missing value (an
# identity test against np.nan only catches that one singleton object), and
# unique() prevents the same entity from being requested more than once.
category_qid_list = category_occurences['qid'].dropna().unique().tolist()

In [306]:
item_list = category_qid_list
item_list_len = len(item_list)

# The wikibase API wbgetentities accepts at most 50 ids per request, so this
# is the number of requests needed to keep every batch within that limit.
limit = ceil(item_list_len / 50)

# Deal the items into `limit` batches: batch k takes every `limit`-th item
# starting at index k, so no batch holds more than 50 items.
piped_list = []
for pipe in range(limit):
    piped_list.append(item_list[pipe::limit])

# Fetch the claims batch by batch and merge them into one mapping.
category_wikidata = {}
for batch in piped_list:
    category_wikidata.update(retrieve_wikidata_claims(batch))


In [307]:
# Collect, for every retrieved item, the ids it is a "subclass of" (P279).
category_triple = {}
for key, values in category_wikidata.items():
    for value in values:
        for obj in value:
            mainsnak = obj['mainsnak']
            if mainsnak['property'] != 'P279':
                continue
            # A snak of type 'novalue' / 'somevalue' has no 'datavalue'
            # entry; skip it instead of failing with a KeyError.
            datavalue = mainsnak.get('datavalue')
            if datavalue is None:
                continue
            # Append in place instead of rebuilding the list on every hit.
            category_triple.setdefault(key, []).append(datavalue['value']['id'])
category_triple

{'Q6097': ['Q61951',
  'Q19359564',
  'Q1365365',
  'Q11090700',
  'Q2647467',
  'Q473666'],
 'Q39201': ['Q622852'],
 'Q1294114': ['Q999981'],
 'Q188507': ['Q699405', 'Q1497384'],
 'Q837171': ['Q7406919', 'Q25351891', 'Q2424752', 'Q806750', 'Q815823'],
 'Q124946': ['Q48803'],
 'Q181055': ['Q746549', 'Q2957687', 'Q28803', 'Q81799', 'Q7427595'],
 'Q211578': ['Q3632343', 'Q2207288', 'Q17200001'],
 'Q1318959': ['Q213441'],
 'Q15843013': ['Q309162'],
 'Q364005': ['Q13226383', 'Q121359', 'Q178706'],
 'Q22657': ['Q181790', 'Q34669510'],
 'Q10273457': ['Q8205328', 'Q2424752'],
 'Q1592332': ['Q1374250'],
 'Q8341': ['Q373342'],
 'Q11033': ['Q340169', 'Q17537576', 'Q121182', 'Q24229398'],
 'Q339836': ['Q3897491'],
 'Q1900657': ['Q43845', 'Q10355417'],
 'Q3095365': ['Q148571'],
 'Q1410837': ['Q37038'],
 'Q4358444': ['Q772630', 'Q2920963', 'Q10514020', 'Q18697490'],
 'Q212105': ['Q8434', 'Q627208'],
 'Q964401': ['Q1347367'],
 'Q41298': ['Q1002697', 'Q340169', 'Q39725049', 'Q11033', 'Q1261026'],
 'Q

In [308]:
# One row per (category QID, superclass QID) pair: build a frame holding the
# list of superclasses per QID, then explode that list column.
subclass_rows = list(category_triple.items())
wiki_subclasses = pd.DataFrame(
    subclass_rows,
    columns=['category_qid', 'subclassOf'],
).explode('subclassOf')

In [309]:
# Inspect the exploded (category_qid, subclassOf) table.
wiki_subclasses

Unnamed: 0,category_qid,subclassOf
0,Q6097,Q61951
0,Q6097,Q19359564
0,Q6097,Q1365365
0,Q6097,Q11090700
0,Q6097,Q2647467
...,...,...
512,Q7291,Q2426135
512,Q7291,Q212434
513,Q895060,Q1368898
513,Q895060,Q15855160


In [310]:

# Left join the subclass pairs onto the category table by QID. Every category
# row is kept; a row is repeated once per superclass, and category_qid /
# subclassOf stay empty where no P279 value was collected.
df = category_occurences.merge(wiki_subclasses, left_on='qid', right_on='category_qid', how='left')

In [311]:
# Number of rows in the merged table that have a subclassOf value.
len(df.subclassOf.dropna())

1012

In [312]:
# Inspect the merged category / subclass table.
df

Unnamed: 0,category,occurences,splitted_category,YelpCategory,SchemaType,qid,category_qid,subclassOf
0,Restaurants,52268,restaurant,Restaurants,Restaurant,Q263063,,
1,Food,27781,food,,,Q894875,,
2,Shopping,24395,shopping,Shopping,ShoppingCenter,Q830036,Q830036,Q451967
3,Home Services,14356,home service,,,,,
4,Beauty & Spas,14292,beauty,,,Q7242,Q7242,Q367293
...,...,...,...,...,...,...,...,...
1897,Beach Bars,1,beach bar,,,,,
1898,DUI Schools,1,dui school,,,,,
1899,Patent Law,1,patent law,,,,,
1900,Housing Cooperatives,1,housing cooperative,,,Q17063278,,


In [181]:
from rdflib import Namespace, Graph, URIRef, Literal, BNode
from rdflib.namespace import RDFS

schema = Namespace("https://schema.org/")
example = Namespace("https://example.org/")
# NOTE(review): Wikidata publishes entity IRIs with the http:// scheme and
# uses a separate namespace for direct-claim predicates such as P279. Confirm
# whether this https entity namespace is intended before relying on the
# triples for joins against Wikidata data.
wiki = Namespace("https://www.wikidata.org/entity/")

# triple_file = gzip.open(filename=f"/home/ubuntu/vol1/virtuoso/import/yelp_business.nt.gz", mode="at",encoding="utf-8")

# Build the category graph:
#   * schema.org type  --schema:sameAs-->      Wikidata item (a SchemaType is mapped)
#   * Yelp category    --example:superclassOf-> Wikidata item (category name contains '&')
#   * Yelp category    --schema:sameAs-->      Wikidata item (all other categories)
#   * Wikidata item    --P279-->               Wikidata superclass
G = Graph()
for i in df.itertuples():
    # pd.notna catches every kind of missing value. Rows without a QID have
    # nothing to link to, so no sameAs / superclassOf triple is written.
    if pd.notna(i.qid):
        if pd.notna(i.SchemaType):
            G.add((URIRef(schema[i.SchemaType]), URIRef(schema['sameAs']), URIRef(wiki[i.qid])))
        else:
            category_uri = URIRef(example["_".join(i.category.split(' '))])
            if '&' in i.category:
                G.add((category_uri, URIRef(example['superclassOf']), URIRef(wiki[i.qid])))
            else:
                G.add((category_uri, URIRef(schema['sameAs']), URIRef(wiki[i.qid])))
    # Missing subclass values are NaN, not None, so test with pd.notna;
    # otherwise a meaningless P279 triple is attempted for every such row.
    if pd.notna(i.category_qid) and pd.notna(i.subclassOf):
        G.add((wiki[i.category_qid], wiki['P279'], wiki[i.subclassOf]))

# Write the graph to categories.nt in N-Triples format.
nt = G.serialize(destination='categories.nt', format='nt')


# triple_file.write(G.serialize(format="nt"))



'coffee and tea' superclassOf 'qid of coffee'
'coffee and tea' superclassOf 'qid of tea'

'Restaurants' schema:sameAs 'qid of restaurant'

'qid of coffee' wiki:subclassOf(P279) 'qid'
'qid of tea' wiki:subclassOf(P279) 'qid'

schema:'restaurant' schema:sameAs wiki:'restaurant'