In [3]:
from collections import Counter

import pandas as pd
import numpy as np

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.wikidata_functions import categories_dict_singular
from Code.UtilityFunctions.run_query import run_query

In [None]:
pd.options.display.max_rows = 100000

### Create category file

In [15]:
query = """
SELECT DISTINCT ?o
WHERE {
    ?s ?p ?o .
    FILTER regex(?o, "^https://purl.archive.org/purl/yelp/business_categories")
}
"""

categories = run_query(query, as_dataframe=True)
categories

Unnamed: 0,o.value
0,https://purl.archive.org/purl/yelp/business_ca...
1,https://purl.archive.org/purl/yelp/business_ca...
2,https://purl.archive.org/purl/yelp/business_ca...
3,https://purl.archive.org/purl/yelp/business_ca...
4,https://purl.archive.org/purl/yelp/business_ca...
...,...
1306,https://purl.archive.org/purl/yelp/business_ca...
1307,https://purl.archive.org/purl/yelp/business_ca...
1308,https://purl.archive.org/purl/yelp/business_ca...
1309,https://purl.archive.org/purl/yelp/business_ca...


In [None]:
# APPENDS to the .ttl file
with open(file="yelp_categories.ttl", mode="a") as file:
    file.write("@prefix yelpont: <https://purl.archive.org/purl/yelp/ontology#> \n@prefix yelpcat: <https://purl.archive.org/purl/yelp/business_categories#> \n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> \n")
    for category in categories['o.value']:
        file.write(f'\nyelpcat:{category} a yelpont:YelpCategory ; \n    rdfs:label "{category} is a category in Yelp used to describe a business." . \n')

### Template for Yelp ontology file

In [None]:
query = """
SELECT DISTINCT ?o
WHERE {
    ?s ?p ?o .
    FILTER regex(?o, "^https://purl")
}
"""

predicates = run_query(query, as_dataframe=True)

In [None]:
# APPENDS to the .ttl file
with open(file="yelp_ontology.ttl", mode="a") as file:
    for predicate in predicates['p.value']:
        file.write(f'\n <{predicate}> \n a owl:ToDO ; \n rdfs:comment "Specifies something ..."@en ; \n rdfs:domain yelpont:X ; \n rdfs:label "{predicate}"@en ; \n rdfs:range yelpont:X . \n')

### Extra

In [3]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
categories = list(biz['categories'].str.cat(sep=', ').split(sep=', '))
categories_dict_singular = categories_dict_singular(categories)

category_occurences = pd.DataFrame(list(dict(Counter(categories)).items()),
                                   columns=['category', 'occurences'
                                            ]).sort_values(by='occurences',
                                                           ascending=False)
# Maps the split categories to the original categories
category_occurences['split_category'] = category_occurences['category'].map(categories_dict_singular)
category_occurences = category_occurences.explode('split_category')

# Maps the yelp categories that are already mapped to a schemaType to the original category.
class_mapping = pd.read_csv(get_path('class_mappings.csv'))
category_occurences['split_category'] = category_occurences['split_category'].apply(lambda x: x.title().replace(' ', ''))
category_occurences = category_occurences.merge(class_mapping,
                                                left_on='split_category',
                                                right_on='YelpCategory',
                                                how='left')

In [4]:
category_occurences

Unnamed: 0,category,occurences,split_category,YelpCategory,SchemaType
0,Restaurants,52268,Restaurant,,
1,Food,27781,Food,,
2,Shopping,24395,Shopping,Shopping,['Retail']
3,Home Services,14356,HomeService,,
4,Beauty & Spas,14292,Beauty,,
...,...,...,...,...,...
1422,Beach Bars,1,BeachBar,,
1423,DUI Schools,1,DuiSchool,,
1424,Patent Law,1,PatentLaw,,
1425,Housing Cooperatives,1,HousingCooperative,,


In [5]:
non_mapped_df = category_occurences[(category_occurences["YelpCategory"].isna()) | (category_occurences["SchemaType"].isna())]
non_mapped_df['category'] = non_mapped_df['category'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_mapped_df['category'] = non_mapped_df['category'].str.lower()


In [8]:
non_mapped = list(non_mapped_df['category'].apply(lambda x: x.replace(" ", "_").title()).unique())
non_mapped

['Restaurants',
 'Food',
 'Home_Services',
 'Beauty_&_Spas',
 'Health_&_Medical',
 'Local_Services',
 'Bars',
 'Event_Planning_&_Services',
 'Sandwiches',
 'American_(Traditional)',
 'Active_Life',
 'Pizza',
 'Coffee_&_Tea',
 'Fast_Food',
 'Breakfast_&_Brunch',
 'American_(New)',
 'Hotels_&_Travel',
 'Home_&_Garden',
 'Fashion',
 'Burgers',
 'Arts_&_Entertainment',
 'Auto_Repair',
 'Hair_Salons',
 'Nail_Salons',
 'Mexican',
 'Italian',
 'Specialty_Food',
 'Doctors',
 'Pets',
 'Real_Estate',
 'Seafood',
 'Fitness_&_Instruction',
 'Professional_Services',
 'Hair_Removal',
 'Desserts',
 'Bakeries',
 'Salad',
 'Hotels',
 'Chicken_Wings',
 'Cafes',
 'Ice_Cream_&_Frozen_Yogurt',
 'Caterers',
 'Pet_Services',
 'Dentists',
 'Skin_Care',
 'Venues_&_Event_Spaces',
 'Tires',
 'Wine_&_Spirits',
 'Delis',
 'Oil_Change_Stations',
 'Waxing',
 'Contractors',
 "Women'S_Clothing",
 'Massage',
 'Sports_Bars',
 'Day_Spas',
 'General_Dentistry',
 'Flowers_&_Gifts',
 'Auto_Parts_&_Supplies',
 'Apartments',


In [7]:
len(non_mapped)

1278