In [1]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
import ast
import datetime
import gzip
import os
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
from deepdiff import DeepDiff
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS

from Code.UtilityFunctions.wikidata_functions import wikidata_query, get_subclass_of_wikientity, category_query, min_qid, get_all_wikidata_claims, compare_qids, categories_dict_singular, get_qid_label
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import space_words_lower

In [2]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
# Concatenate the comma-separated category strings of all businesses and split
# them back into one flat list of category names (str.cat skips missing values
# by default).
categories = biz['categories'].str.cat(sep=', ').split(sep=', ')

# One row per distinct Yelp category with its number of occurrences, most
# frequent first.
category_occurences = pd.DataFrame(list(Counter(categories).items()),
                                   columns=['category', 'occurences']
                                   ).sort_values(by='occurences', ascending=False)
# Map every original category to its split categories and give each split
# category its own row.
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular(categories))
category_occurences = category_occurences.explode('split_category')

# Attach the schema type of the Yelp categories that already have a mapping.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
# The SchemaType column stores a Python list literal as text; parse it with
# ast.literal_eval (never eval, which would execute arbitrary code found in the
# CSV) and keep the first entry.
class_mapping['SchemaType'] = class_mapping['SchemaType'].apply(lambda x: ast.literal_eval(x)[0])
# Title-case the split categories and strip the spaces. The .str accessor
# propagates missing values instead of raising on them.
category_occurences['split_category'] = (category_occurences['split_category']
                                         .str.title()
                                         .str.replace(' ', '', regex=False))
category_occurences = category_occurences.merge(class_mapping,
                                            left_on='category',
                                            right_on='YelpCategory',
                                            how='left').drop(columns=['YelpCategory'])
# Prefer the mapped schema type; fall back to the split Yelp category.
category_occurences['schema_or_yelp_category'] = category_occurences['SchemaType'].fillna(category_occurences['split_category'])



In [3]:
# Display the category table assembled in the previous cell.
category_occurences

Unnamed: 0,category,occurences,split_category,SchemaType,schema_or_yelp_category
0,Restaurants,52268,Restaurant,Restaurant,Restaurant
1,Food,27781,Food,,Food
2,Shopping,24395,Shopping,Retail,Retail
3,Home Services,14356,HomeService,Service,Service
4,Beauty & Spas,14292,Beauty,DaySpa,DaySpa
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,Beach,Beach
1423,DUI Schools,1,DuiSchool,,DuiSchool
1424,Patent Law,1,PatentLaw,,PatentLaw
1425,Housing Cooperatives,1,HousingCooperative,,HousingCooperative


In [13]:
# Query Wikidata for the QID of a single example category.
cat = space_words_lower('restaurants')
wikidata_cat_query = wikidata_query(category_query(category=cat))

wikidata_cat_query

Unnamed: 0,item.type,item.value,itemLabel.xml:lang,itemLabel.type,itemLabel.value,itemDescription.xml:lang,itemDescription.type,itemDescription.value
0,uri,http://www.wikidata.org/entity/Q11707,en,literal,restaurant,en,literal,single establishment which prepares and serves...


In [14]:
def filter_potential_qids(df_of_items: pd.DataFrame):
    """Re-query Wikidata for the candidate items that have a P279 statement.

    The QID of every candidate is taken from the last path segment of its
    ``item.value`` URI. All QIDs are passed to a single SPARQL query as a
    ``VALUES`` list, and only items matching ``?item wdt:P279 ?subclass`` are
    selected.

    Args:
        df_of_items: Frame with an ``item.value`` column of Wikidata entity
            URIs, such as the result of a category query.

    Returns:
        Whatever ``wikidata_query`` returns for the generated query.
    """
    qids = [uri.split('/')[-1] for uri in df_of_items['item.value']]
    values_clause = "wd:{}".format(" wd:".join(qids))
    query = f"""
  SELECT DISTINCT ?item ?itemLabel 
  WHERE {{
    VALUES ?item {{{values_clause}}}
    ?item wdt:P279 ?subclass .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
  """
    return wikidata_query(query)

# Restrict the candidates to items with a P279 statement, then reduce them to a
# single result with min_qid (presumably the lowest-numbered QID — confirm
# against the helper's implementation).
min_qid(filter_potential_qids(wikidata_cat_query))



('Q11707', 'restaurant')

In [15]:
# Query Wikidata for the QID of the split categories
category_qid2 = {}
for cat in category_occurences.itertuples():
    try:
        cat = space_words_lower(cat.schema_or_yelp_category)
        wikidata_cat_query = wikidata_query(category_query(category=cat))
        category_qid2[cat] = min_qid(filter_potential_qids(wikidata_cat_query))
    except:
        pass
category_qid2

{'restaurant': ('Q11707', 'restaurant'),
 'food': ('Q2095', 'food'),
 'retail': ('Q126793', 'retail'),
 'service': ('Q44127', 'server'),
 'night club': ('Q622425', 'nightclub'),
 'physician': ('Q39631', 'physician'),
 'event': ('Q1349920', 'event'),
 'sandwich': ('Q28803', 'sandwich'),
 'pizza': ('Q177', 'pizza'),
 'coffee': ('Q8486', 'coffee'),
 'tea': ('Q6097', 'tea'),
 'fast food restaurant': ('Q1751429', 'fast food restaurant'),
 'bed and breakfast': ('Q367914', 'bed and breakfast'),
 'hotel': ('Q27686', 'hotel'),
 'house': ('Q3947', 'house'),
 'fashion': ('Q12684', 'fashion'),
 'burger': ('Q6663', 'hamburger'),
 'nail salon': ('Q8007048', 'nail salon'),
 'pet': ('Q39201', 'pet'),
 'real estate agent': ('Q519076', 'real estate agent'),
 'seafood': ('Q192935', 'seafood'),
 'instruction': ('Q216200', 'legal norm'),
 'professional service': ('Q811501', 'construction management'),
 'hair removal': ('Q625145', 'depilation'),
 'dessert': ('Q12502', 'charlotte'),
 'bakery': ('Q274393', 'b