In [1]:
from collections import Counter

import pandas as pd
import numpy as np

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.wikidata_functions import categories_dict_singular

In [2]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
categories = list(biz['categories'].str.cat(sep=', ').split(sep=', '))
categories_dict_singular = categories_dict_singular(categories)

category_occurences = pd.DataFrame(list(dict(Counter(categories)).items()),
                                   columns=['category', 'occurences'
                                            ]).sort_values(by='occurences',
                                                           ascending=False)
# Maps the split categories to the original categories
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular)
category_occurences = category_occurences.explode('split_category')

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
category_occurences['split_category'] = category_occurences['split_category'].apply(lambda x: x.title().replace(' ', ''))
category_occurences = category_occurences.merge(class_mapping,
                                                left_on='split_category',
                                                right_on='YelpCategory',
                                                how='left')

In [3]:
category_occurences

Unnamed: 0,category,occurences,split_category,YelpCategory,SchemaType
0,Restaurants,52268,Restaurant,,
1,Food,27781,Food,,
2,Shopping,24395,Shopping,Shopping,['Retail']
3,Home Services,14356,HomeService,,
4,Beauty & Spas,14292,Beauty,,
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,,
1423,DUI Schools,1,DuiSchool,,
1424,Patent Law,1,PatentLaw,,
1425,Housing Cooperatives,1,HousingCooperative,,


In [4]:
non_mapped_df = category_occurences[(category_occurences["YelpCategory"].isna()) | (category_occurences["SchemaType"].isna())]
non_mapped_df['category'] = non_mapped_df['category'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_mapped_df['category'] = non_mapped_df['category'].str.lower()


In [15]:
non_mapped = list(non_mapped_df['category'].apply(lambda x: x.title()).unique())
non_mapped

['Restaurants',
 'Food',
 'Home Services',
 'Beauty & Spas',
 'Health & Medical',
 'Local Services',
 'Bars',
 'Event Planning & Services',
 'Sandwiches',
 'American (Traditional)',
 'Active Life',
 'Pizza',
 'Coffee & Tea',
 'Fast Food',
 'Breakfast & Brunch',
 'American (New)',
 'Hotels & Travel',
 'Home & Garden',
 'Fashion',
 'Burgers',
 'Arts & Entertainment',
 'Auto Repair',
 'Hair Salons',
 'Nail Salons',
 'Mexican',
 'Italian',
 'Specialty Food',
 'Doctors',
 'Pets',
 'Real Estate',
 'Seafood',
 'Fitness & Instruction',
 'Professional Services',
 'Hair Removal',
 'Desserts',
 'Bakeries',
 'Salad',
 'Hotels',
 'Chicken Wings',
 'Cafes',
 'Ice Cream & Frozen Yogurt',
 'Caterers',
 'Pet Services',
 'Dentists',
 'Skin Care',
 'Venues & Event Spaces',
 'Tires',
 'Wine & Spirits',
 'Delis',
 'Oil Change Stations',
 'Waxing',
 'Contractors',
 "Women'S Clothing",
 'Massage',
 'Sports Bars',
 'Day Spas',
 'General Dentistry',
 'Flowers & Gifts',
 'Auto Parts & Supplies',
 'Apartments',


In [7]:
len(non_mapped)

1278

In [18]:
# Writes to .ttl file. It needs to be empty first
with open(file="../yelp_categories.ttl", mode="a") as file:
    file.write("@prefix ontology_uri: <https://purl.archive.org/purl/yelp/ontology#> \n @prefix category_uri: <https://purl.archive.org/purl/yelp/business_categories#> \n @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n")
    for category in non_mapped:
        print(category)
        file.write(f'\n category_uri:{category} a ontology_uri:datasetCategory ; \n rdfs:label "{category} is a category in Yelp used to describe a business." . \n')

Restaurants
Food
Home Services
Beauty & Spas
Health & Medical
Local Services
Bars
Event Planning & Services
Sandwiches
American (Traditional)
Active Life
Pizza
Coffee & Tea
Fast Food
Breakfast & Brunch
American (New)
Hotels & Travel
Home & Garden
Fashion
Burgers
Arts & Entertainment
Auto Repair
Hair Salons
Nail Salons
Mexican
Italian
Specialty Food
Doctors
Pets
Real Estate
Seafood
Fitness & Instruction
Professional Services
Hair Removal
Desserts
Bakeries
Salad
Hotels
Chicken Wings
Cafes
Ice Cream & Frozen Yogurt
Caterers
Pet Services
Dentists
Skin Care
Venues & Event Spaces
Tires
Wine & Spirits
Delis
Oil Change Stations
Waxing
Contractors
Women'S Clothing
Massage
Sports Bars
Day Spas
General Dentistry
Flowers & Gifts
Auto Parts & Supplies
Apartments
Convenience Stores
Home Decor
Gyms
Japanese
Pubs
Cocktail Bars
Sushi Bars
Barbeque
Juice Bars & Smoothies
Barbers
Car Dealers
Sporting Goods
Accessories
Drugstores
Cosmetic Dentists
Local Flavor
Furniture Stores
Pet Groomers
Asian Fusion
Co

In [3]:
from Code.UtilityFunctions.run_query import run_query

query = """
SELECT DISTINCT ?p
WHERE {
    ?s ?p ?o .
    FILTER regex(?p, "^https://purl")
}
"""

predicates = run_query(query, as_dataframe=True)

In [8]:
pd.options.display.max_rows = 1000

In [9]:
predicates

Unnamed: 0,p.value
0,https://purl.archive.org/purl/yelp/ontology#compliment_count
1,https://purl.archive.org/purl/yelp/ontology#haswednesday
2,https://purl.archive.org/purl/yelp/ontology#AcceptsInsurance
3,https://purl.archive.org/purl/yelp/ontology#AgesAllowed
4,https://purl.archive.org/purl/yelp/ontology#Alcohol
5,https://purl.archive.org/purl/yelp/ontology#BYOB
6,https://purl.archive.org/purl/yelp/ontology#BYOBCorkage
7,https://purl.archive.org/purl/yelp/ontology#BikeParking
8,https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsBitcoin
9,https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsCreditCards


In [12]:
for predicate in predicates['p.value']:
    print(predicate)

https://purl.archive.org/purl/yelp/ontology#compliment_count
https://purl.archive.org/purl/yelp/ontology#haswednesday
https://purl.archive.org/purl/yelp/ontology#AcceptsInsurance
https://purl.archive.org/purl/yelp/ontology#AgesAllowed
https://purl.archive.org/purl/yelp/ontology#Alcohol
https://purl.archive.org/purl/yelp/ontology#BYOB
https://purl.archive.org/purl/yelp/ontology#BYOBCorkage
https://purl.archive.org/purl/yelp/ontology#BikeParking
https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsBitcoin
https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsCreditCards
https://purl.archive.org/purl/yelp/ontology#ByAppointmentOnly
https://purl.archive.org/purl/yelp/ontology#Caters
https://purl.archive.org/purl/yelp/ontology#CoatCheck
https://purl.archive.org/purl/yelp/ontology#Corkage
https://purl.archive.org/purl/yelp/ontology#DogsAllowed
https://purl.archive.org/purl/yelp/ontology#DriveThru
https://purl.archive.org/purl/yelp/ontology#GoodForDancing
https://purl.archive.org/p

In [14]:
# Writes to .ttl file. It needs to be empty first
with open(file="../yelp_ontology.ttl", mode="a") as file:
    for predicate in predicates['p.value']:
        print(predicate)
        file.write(f'\n <{predicate}> \n a owl:ToDO ; \n rdfs:comment "Specifies something ..."@en ; \n rdfs:domain yelpont:X ; \n rdfs:label "{predicate}"@en ; \n rdfs:range yelpont:X . \n')

https://purl.archive.org/purl/yelp/ontology#compliment_count
https://purl.archive.org/purl/yelp/ontology#haswednesday
https://purl.archive.org/purl/yelp/ontology#AcceptsInsurance
https://purl.archive.org/purl/yelp/ontology#AgesAllowed
https://purl.archive.org/purl/yelp/ontology#Alcohol
https://purl.archive.org/purl/yelp/ontology#BYOB
https://purl.archive.org/purl/yelp/ontology#BYOBCorkage
https://purl.archive.org/purl/yelp/ontology#BikeParking
https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsBitcoin
https://purl.archive.org/purl/yelp/ontology#BusinessAcceptsCreditCards
https://purl.archive.org/purl/yelp/ontology#ByAppointmentOnly
https://purl.archive.org/purl/yelp/ontology#Caters
https://purl.archive.org/purl/yelp/ontology#CoatCheck
https://purl.archive.org/purl/yelp/ontology#Corkage
https://purl.archive.org/purl/yelp/ontology#DogsAllowed
https://purl.archive.org/purl/yelp/ontology#DriveThru
https://purl.archive.org/purl/yelp/ontology#GoodForDancing
https://purl.archive.org/p

In [15]:
from Code.UtilityFunctions.run_query import run_query

query = """
SELECT DISTINCT ?s ?p ?o
WHERE {
    ?s ?p ?o .
    FILTER regex(?p, "AcceptsInsurance")
}
"""

result = run_query(query, as_dataframe=True)
result

Empty resultset
