### Tests different threshold configurations when mapping Yelp categories to schema.org types.

In [2]:
from Code.UtilityFunctions.schema_functions import get_class_mappings
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import str_split, turn_words_singular, split_words, split_words_inc_slash
import pandas as pd

In [2]:
substring_thresholds = [0.1, 0.3, 0.50, 0.70, 0.90]
ratio_thresholds = [0.25, 0.50, 0.75]

mapping_dict = dict()

for i in substring_thresholds:
    for j in ratio_thresholds:
        mappings = get_class_mappings(substring_threshold=i, ratio_threshold=j)
        print(f"For thresholds {i, j}: a total of {len(mappings)} are found.")

        for key, value in mappings.items():
            mapping_dict.setdefault(key, list()).append(value + f" [{i}, {j}]")

print("Done")

For thresholds (0.1, 0.25): a total of 1408 are found.
For thresholds (0.1, 0.5): a total of 1408 are found.
For thresholds (0.1, 0.75): a total of 1406 are found.
For thresholds (0.3, 0.25): a total of 1358 are found.
For thresholds (0.3, 0.5): a total of 1350 are found.
For thresholds (0.3, 0.75): a total of 1287 are found.
For thresholds (0.5, 0.25): a total of 889 are found.
For thresholds (0.5, 0.5): a total of 839 are found.
For thresholds (0.5, 0.75): a total of 644 are found.
For thresholds (0.7, 0.25): a total of 322 are found.
For thresholds (0.7, 0.5): a total of 268 are found.
For thresholds (0.7, 0.75): a total of 173 are found.
For thresholds (0.9, 0.25): a total of 190 are found.
For thresholds (0.9, 0.5): a total of 154 are found.
For thresholds (0.9, 0.75): a total of 95 are found.
Done


In [3]:
mapping_dict

{'PhotographyClass': ['AnatomicalSystem [0.1, 0.25]',
  'AnatomicalSystem [0.1, 0.5]',
  'AnatomicalSystem [0.1, 0.75]',
  'PhotographAction [0.3, 0.25]',
  'PhotographAction [0.3, 0.5]',
  'PhotographAction [0.3, 0.75]',
  'PhotographAction [0.5, 0.25]',
  'PhotographAction [0.5, 0.5]',
  'PhotographAction [0.5, 0.75]'],
 'DoorSale': ['AmpStory [0.1, 0.25]',
  'AmpStory [0.1, 0.5]',
  'AmpStory [0.1, 0.75]',
  'FloorPlan [0.3, 0.25]',
  'FloorPlan [0.3, 0.5]',
  'FloorPlan [0.3, 0.75]',
  'HairSalon [0.5, 0.25]',
  'HairSalon [0.5, 0.5]',
  'HairSalon [0.5, 0.75]'],
 'DoorInstallation': ['ActionStatusType [0.1, 0.25]',
  'ActionStatusType [0.1, 0.5]',
  'ActionStatusType [0.1, 0.75]',
  'CompilationAlbum [0.3, 0.25]',
  'CompilationAlbum [0.3, 0.5]',
  'CompilationAlbum [0.3, 0.75]'],
 'PartySupply': ['ApplyAction [0.1, 0.25]',
  'ApplyAction [0.1, 0.5]',
  'ApplyAction [0.1, 0.75]',
  'ApplyAction [0.3, 0.25]',
  'ApplyAction [0.3, 0.5]',
  'ApplyAction [0.3, 0.75]',
  'HowToSupply [

In [6]:
class_mappings_csv = pd.read_csv(get_path("class_mappings.csv"))

In [11]:
mappings_list = list(class_mappings_csv['YelpCategory'])
mappings_list

['Airport',
 'Osteopath',
 'Repair',
 'RadioStation',
 'HardwareStore',
 'SportsClub',
 'DryCleaning',
 'Hotel',
 'RealEstate',
 'Recording',
 'Computer',
 'LocalService',
 'RvRepair',
 'AmusementPark',
 'BuddhistTemple',
 'Throat',
 'Service',
 'WaterStore',
 'Home',
 'PhysicalTherapy',
 'Midwife',
 'Kosher',
 'Pool',
 'Food',
 'Casino',
 'Mosque',
 'PostOffice',
 'Gastroenterologist',
 'Florist',
 'AdultEntertainment',
 'Vegan',
 'Wholesaler',
 'Dentist',
 'Podiatrist',
 'Fence',
 'Gate',
 'MiddleSchool',
 'HighSchool',
 'Electronic',
 'Vocational',
 'Wine',
 'Optician',
 'DaySpa',
 'HairSalon',
 'Aquarium',
 'MobilePhone',
 'Music',
 'Video',
 'MotorcycleDealer',
 'HomeOrganization',
 'ConvenienceStore',
 'Diner',
 'ShoppingCenter',
 'Tire',
 'MusicVenue',
 'BusStation',
 'ComedyClub',
 'Supply',
 'Neurologist',
 'PrintingService',
 'Library',
 'Grocery',
 'Veterinarian',
 'Musician',
 'OccupationalTherapy',
 'TrainStation',
 'Book',
 'CreditUnion',
 'Prop',
 'OfficeEquipment',
 'Ch

In [12]:
all_configurations_only_true_mapped = list(map(mapping_dict.get, mappings_list))
all_configurations_only_true_mapped

[['3DModel [0.1, 0.25]',
  '3DModel [0.1, 0.5]',
  '3DModel [0.1, 0.75]',
  'Airline [0.3, 0.25]',
  'Airline [0.3, 0.5]',
  'Airline [0.3, 0.75]',
  'Airport [0.5, 0.25]',
  'Airport [0.5, 0.5]',
  'Airport [0.5, 0.75]',
  'Airport [0.7, 0.25]',
  'Airport [0.7, 0.5]',
  'Airport [0.7, 0.75]',
  'Airport [0.9, 0.25]',
  'Airport [0.9, 0.5]',
  'Airport [0.9, 0.75]'],
 ['AboutPage [0.1, 0.25]',
  'AboutPage [0.1, 0.5]',
  'AboutPage [0.1, 0.75]',
  'Hackathon [0.3, 0.25]',
  'Hackathon [0.3, 0.5]',
  'Hackathon [0.3, 0.75]',
  'Homeopathic [0.5, 0.25]',
  'Homeopathic [0.5, 0.5]',
  'Homeopathic [0.5, 0.75]',
  'Osteopathic [0.7, 0.25]',
  'Osteopathic [0.7, 0.5]',
  'Osteopathic [0.7, 0.75]',
  'Osteopathic [0.9, 0.25]',
  'Osteopathic [0.9, 0.5]',
  'Osteopathic [0.9, 0.75]'],
 ['Action [0.1, 0.25]',
  'Action [0.1, 0.5]',
  'Action [0.1, 0.75]',
  'DaySpa [0.3, 0.25]',
  'DaySpa [0.3, 0.5]',
  'DaySpa [0.3, 0.75]',
  'Report [0.5, 0.25]',
  'Report [0.5, 0.5]',
  'Report [0.5, 0.75]

### Only high thresholds

In [24]:
substring_thresholds = [0.70, 0.90]
ratio_thresholds = [0.50, 0.75]

mapping_dict_high_thresholds = dict()

for i in substring_thresholds:
    for j in ratio_thresholds:
        mappings = get_class_mappings(substring_threshold=i, ratio_threshold=j)
        print(f"For thresholds {i, j}: a total of {len(mappings)} are found.")

        for key, value in mappings.items():
            mapping_dict_high_thresholds.setdefault(key, list()).append(value + f" [{i}, {j}]")

print("Done")

For thresholds (0.7, 0.5): a total of 268 are found.
For thresholds (0.7, 0.75): a total of 173 are found.
For thresholds (0.9, 0.5): a total of 154 are found.
For thresholds (0.9, 0.75): a total of 95 are found.
Done


In [27]:
mapping_dict_high_thresholds

{'Bowling': ['BowlingAlley [0.7, 0.5]', 'BowlingAlley [0.9, 0.5]'],
 'DisabilityLaw': ['DisabilitySupport [0.7, 0.5]',
  'DisabilitySupport [0.7, 0.75]'],
 'Apartment': ['Apartment [0.7, 0.5]',
  'Apartment [0.7, 0.75]',
  'Apartment [0.9, 0.5]',
  'Apartment [0.9, 0.75]'],
 'PetPhotography': ['PhotographAction [0.7, 0.5]',
  'PhotographAction [0.7, 0.75]'],
 'RealEstateAgent': ['RealEstateAgent [0.7, 0.5]',
  'RealEstateAgent [0.7, 0.75]',
  'RealEstateAgent [0.9, 0.5]',
  'RealEstateAgent [0.9, 0.75]'],
 'Accountant': ['BankAccount [0.7, 0.5]', 'BankAccount [0.7, 0.75]'],
 'Optometrist': ['Optometric [0.7, 0.5]', 'Optometric [0.7, 0.75]'],
 'Parking': ['ParkingMap [0.7, 0.5]', 'ParkingMap [0.9, 0.5]'],
 'EmploymentLaw': ['EmploymentAgency [0.7, 0.5]',
  'EmploymentAgency [0.7, 0.75]'],
 'Video': ['VideoGame [0.7, 0.5]', 'VideoGame [0.9, 0.5]'],
 'Home': ['Abdomen [0.7, 0.5]'],
 'Hotel': ['Hotel [0.7, 0.5]',
  'Hotel [0.7, 0.75]',
  'Hotel [0.9, 0.5]',
  'Hotel [0.9, 0.75]'],
 'Travel

### Total length of categories

In [7]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
biz["categories"] = biz["categories"].apply(str_split)

# Iterate over categories in sublists ('If sublist' checks if the sublist is None) and insert them into a large set.
categories = list({category for sublist in biz["categories"].tolist() if sublist for category in sublist})
categories = split_words(categories, split_words_inc_slash)  # Split categories with & and /
categories = turn_words_singular(categories)  # Turn the categories singular

categories = [category.title().replace(" ", "") for sublist in categories.values() for category in sublist]  # Unpack the nested lists in dict values

print(len(categories), categories)

1430 ['GeneralDentistry', 'WaldorfSchool', 'GiftShop', 'CarAuction', 'Bowling', 'DisabilityLaw', 'DiamondBuyer', 'CucinaCampana', 'Roman', 'MusicalInstrumentService', 'SolarPanelCleaning', 'PanAsian', 'Apartment', 'SouvenirShop', 'AddictionMedicine', 'KidsHairSalon', 'PatioCovering', 'PetPhotography', 'BeerHall', 'Hungarian', 'RealEstateAgent', 'Magician', 'HotAirBalloon', 'Accountant', 'AutoGlassService', 'SportsMedicine', 'Kiteboarding', 'PatentLaw', 'Polish', 'Investing', 'Falafel', 'Climbing', 'Optometrist', 'Bookstore', 'Pop-UpRestaurant', 'Korean', 'Parking', 'PickYourOwnFarm', 'BoatCharter', 'PoleDancingClass', 'CarWindowTinting', 'Southern', 'DiyAutoShop', 'Armenian', 'KnittingSupply', 'Flower', 'Gift', 'Poke', 'Guamanian', 'EmploymentLaw', 'Video', 'VideoGameRental', 'PlacentaEncapsulation', 'Prosthetic', 'AutoSecurity', 'PartyBikeRental', 'HealthRetreat', 'Russian', 'Persian', 'Iranian', 'ClubCrawl', 'BeachEquipmentRental', 'Lahmacun', 'Home', 'RentalInsurance', 'Tubing', 'Ha

### Looking into class hierarchies

In [15]:
class_mapping_dict = get_class_mappings(substring_threshold=0.70, ratio_threshold=0.5)

In [16]:
#class_mapping_dict.pop('Gate')
class_mapping_dict.values()

dict_values(['Place', 'Course', 'OfficialLegalValue', 'Electrician', 'VeganDiet', 'Urologic', 'AutoBodyShop', 'Distillery', 'Midwifery', 'School', 'Museum', 'Abdomen', 'GardenStore', 'Rating', 'PlasticSurgery', 'HealthInsurancePlan', 'WPFooter', 'MobilePhoneStore', 'Florist', 'PostOffice', 'Hospital', 'VeterinaryCare', 'Flexibility', 'StrengthTraining', 'ClothingStore', 'TelevisionStation', 'NailSalon', 'OccupationalTherapy', 'SkiResort', 'Physician', 'MovieRentalStore', 'RealEstateAgent', 'Bakery', 'ShoppingCenter', 'LegalService', 'BankAccount', 'DepartmentStore', 'Skin', 'HairSalon', 'ChildCare', 'PrimaryCare', 'Optician', 'Car', 'BusStation', 'WPHeader', 'Wholesale', 'Psychiatric', 'ArtGallery', 'Painting', 'AnimalShelter', 'Playground', 'Audience', 'Date', 'MotorcycleRepair', 'MotorcycleDealer', 'Church', 'ComedyClub', 'Oncologic', 'PawnShop', 'AutoRepair', 'MedicalCode', 'MiddleSchool', 'HighSchool', 'VideoGameSeries', 'DisabilitySupport', 'HowToSupply', 'AutoDealer', 'Osteopathi

In [22]:
from networkx import DiGraph
from networkx.algorithms.traversal.depth_first_search import dfs_tree

schema_df = pd.read_csv(get_path("schemaorg-current-https-types.csv"))[["id", "subTypeOf"]]
schema_df = schema_df.apply(
    lambda x: x.str.split(', ').explode())  # Some types have multiple supertypes, so we explode those rows.

supertypes_dict = dict()

graph = DiGraph()
graph.add_edges_from(list(zip(schema_df["id"], schema_df["subTypeOf"])))  # Here we add EVERY row to the graph

# We do a depth first search on the constructed graph starting at each type in the input dictionary.
for _class in class_mapping_dict.values():
    supertypes = dfs_tree(graph, "https://schema.org/" + _class)
    edges = supertypes.edges()  # edges is a list of lists
    for edge in edges:
        supertypes_dict.setdefault(edge[0], set()).add(edge[1])

supertypes_df = pd.DataFrame(list(supertypes_dict.items()), columns=['type', 'superType'])
supertypes_df = supertypes_df.explode("superType")

supertypes_df.dropna() # To drop rows where the superType is NaN for example for Thing.

Place
Course
OfficialLegalValue
Electrician
VeganDiet
Urologic
AutoBodyShop
Distillery
Midwifery
School
Museum
Abdomen
GardenStore
Rating
PlasticSurgery
HealthInsurancePlan
WPFooter
MobilePhoneStore
Florist
PostOffice
Hospital
VeterinaryCare
Flexibility
StrengthTraining
ClothingStore
TelevisionStation
NailSalon
OccupationalTherapy
SkiResort
Physician
MovieRentalStore
RealEstateAgent
Bakery
ShoppingCenter
LegalService
BankAccount
DepartmentStore
Skin
HairSalon
ChildCare
PrimaryCare
Optician
Car
BusStation
WPHeader
Wholesale
Psychiatric
ArtGallery
Painting
AnimalShelter
Playground
Audience
Date
MotorcycleRepair
MotorcycleDealer
Church
ComedyClub
Oncologic
PawnShop
AutoRepair
MedicalCode
MiddleSchool
HighSchool
VideoGameSeries
DisabilitySupport
HowToSupply
AutoDealer
Osteopathic
HealthCare
PetStore
EducationEvent
BowlingAlley
Hostel
PathologyTest
IceCreamShop
Hospital
BusStation
TravelAgency
BoardingPolicyType
Resort
Neurologic
AutoRepair
RecyclingCenter
Enumeration
AccountingService
Driv

Unnamed: 0,type,superType
0,https://schema.org/Place,https://schema.org/Thing
2,https://schema.org/Course,https://schema.org/CreativeWork
2,https://schema.org/Course,https://schema.org/LearningResource
3,https://schema.org/CreativeWork,https://schema.org/Thing
4,https://schema.org/OfficialLegalValue,https://schema.org/LegalValueLevel
...,...,...
283,https://schema.org/Obstetric,https://schema.org/MedicalSpecialty
283,https://schema.org/Obstetric,https://schema.org/MedicalBusiness
284,https://schema.org/Gynecologic,https://schema.org/MedicalSpecialty
284,https://schema.org/Gynecologic,https://schema.org/MedicalBusiness
