In [1]:
import ast
import datetime
import gzip
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS

# Extend the import path with the prefix of sys.path[0] that ends at 'DVML-P7'
# (presumably the project root) so the local `Code` package below resolves.
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])

from Code.UtilityFunctions.wikidata_functions import wikidata_query, get_subclass_of_wikientity, category_query, min_qid, categories_dict_singular
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import space_words_lower
from Code.UtilityFunctions.run_query import run_query

In [2]:
# Load the Yelp business records and flatten every business's comma-separated
# category string into one long list of individual category names.
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
categories = biz['categories'].str.cat(sep=', ').split(sep=', ')

# One row per distinct category with its occurrence count, most frequent first.
category_occurences = pd.DataFrame(
    list(Counter(categories).items()),
    columns=['category', 'occurences'],
).sort_values(by='occurences', ascending=False)

# Maps the split categories to the original categories; a category that maps
# to several split categories gets one row per split category.
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular(categories))
category_occurences = category_occurences.explode('split_category')

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
# Each SchemaType cell is parsed as a Python literal (presumably a list) and its
# first element is kept. literal_eval only accepts literals, so text coming from
# the CSV can never execute code the way eval() would.
class_mapping['SchemaType'] = class_mapping['SchemaType'].apply(lambda x: ast.literal_eval(x)[0])

# Turn "home service" style names into "HomeService"; the .str accessor leaves
# missing values as NaN instead of raising.
category_occurences['split_category'] = (
    category_occurences['split_category']
    .str.title()
    .str.replace(' ', '', regex=False)
)
category_occurences = category_occurences.merge(class_mapping,
                                            left_on='category',
                                            right_on='YelpCategory',
                                            how='left').drop(columns=['YelpCategory'])
# Prefer the mapped SchemaType and fall back to the split Yelp category.
category_occurences['schema_or_yelp_category'] = category_occurences['SchemaType'].fillna(category_occurences['split_category'])



In [18]:
# Display the category table for inspection.
category_occurences

Unnamed: 0,category,occurences,split_category,SchemaType,schema_or_yelp_category
0,Restaurants,52268,Restaurant,Restaurant,Restaurant
1,Food,27781,Food,,Food
2,Shopping,24395,Shopping,Retail,Retail
3,Home Services,14356,HomeService,Service,Service
4,Beauty & Spas,14292,Beauty,DaySpa,DaySpa
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,Beach,Beach
1423,DUI Schools,1,DuiSchool,,DuiSchool
1424,Patent Law,1,PatentLaw,,PatentLaw
1425,Housing Cooperatives,1,HousingCooperative,,HousingCooperative


In [9]:
# Worked example for a single category: normalise the name with the project
# helper, then look it up on Wikidata.
cat = space_words_lower('restaurants')
wikidata_cat_query = wikidata_query(category_query(
category=cat))  # Queries Wikidata for the QID of the category

# Show the raw query result.
wikidata_cat_query

Unnamed: 0,item.type,item.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value,itemDescription.xml:lang,itemDescription.type,itemDescription.value
0,uri,http://www.wikidata.org/entity/Q11707,en,literal,restaurant,en,literal,single establishment which prepares and serves...


In [17]:
# Reduce the query result to a single match via the project helper `min_qid`
# and display it; the result is indexed as qid[0] for the QID further down.
qid = min_qid(wikidata_cat_query)
qid

('Q11707', 'restaurant')

In [18]:
# Ask Wikidata for the P31 values of the selected entity (bound to ?instanceOf).
# qid[0] is the QID part of the value produced by min_qid.
query = f"""
SELECT ?instanceOf WHERE {{
  wd:{qid[0]} wdt:P31 ?instanceOf
}}
"""
wikidata_query(query)

Unnamed: 0,instanceOf.type,instanceOf.value
0,uri,http://www.wikidata.org/entity/Q63922515


In [53]:
def instance_of_query(qid: "str | tuple"):
    """Look up the P31 values of a Wikidata entity together with their labels.

    Args:
        qid: Either a plain QID string such as ``'Q11707'`` or a sequence whose
            first element is the QID (the form ``min_qid`` returns in this
            notebook, e.g. ``('Q11707', 'restaurant')``).

    Returns:
        The ``instanceOf.value`` and ``instanceOfLabel.value`` columns of the
        ``wikidata_query`` result.
    """
    # Indexing a plain string with [0] would yield just 'Q', so only unpack
    # when a (QID, label) style sequence was passed.
    entity_id = qid if isinstance(qid, str) else qid[0]
    query = f"""
  SELECT DISTINCT ?instanceOf ?instanceOfLabel
  WHERE {{
    VALUES ?item {{wd:{entity_id}}}
    ?item wdt:P31 ?instanceOf .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
"""
    instance_of = wikidata_query(query)[['instanceOf.value', 'instanceOfLabel.value']]
    return instance_of

# Try the helper on the example entity selected earlier and display the result.
instance_of_query(qid)

Unnamed: 0,instanceOf.value,instanceOfLabel.value
0,http://www.wikidata.org/entity/Q63922515,hyperlocal manufacturing


In [54]:
# For every category, print each P31 value of its selected Wikidata entity that
# also appears among the entities returned by the category search itself.
for row in category_occurences.itertuples():
    try:
        cat = space_words_lower(row.schema_or_yelp_category)
        wikidata_cat_query = wikidata_query(category_query(category=cat))
        qid = min_qid(wikidata_cat_query)
        # Keep only the trailing QID of each entity URI. Columns are selected
        # by name rather than by position.
        instance_of_list = instance_of_query(qid)['instanceOf.value'].str.split('/').str[-1].tolist()
        wikidata_cat_list = wikidata_cat_query['item.value'].str.split('/').str[-1].tolist()
        for i in instance_of_list:
            if i in wikidata_cat_list:
                print(i)
    except Exception:
        # Best effort: skip categories whose lookup fails. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt stop the long loop.
        continue


In [25]:
# Query Wikidata for the QID of each category, keyed by the normalised name.
category_qid2 = {}
# Many rows share the same schema/Yelp category; query each distinct value once
# instead of repeating the same network lookup.
for category in category_occurences['schema_or_yelp_category'].unique():
    try:
        cat = space_words_lower(category)
        wikidata_cat_query = wikidata_query(category_query(category=cat))
        category_qid2[cat] = min_qid(wikidata_cat_query)
    except Exception:
        # Best effort: categories without a usable match are left out. Unlike a
        # bare `except:`, this still lets KeyboardInterrupt stop the loop.
        continue
category_qid2