In [1]:
import pandas as pd
import spacy

from UtilityFunctions.get_data_path import get_path

In [16]:
def long_com_substring(st1, st2):
    """
    :param st1: The string we want to check for.
    :param st2: The string we check longest substring (st1) in.
    :return: Returns the length of the longest substring
    """

    ans = 0
    for a in range(len(st1)):
        for b in range(len(st2)):
            k = 0
            while (a + k) < len(st1) and (b + k) < len(st2) and st1[a + k] == st2[b + k]:
                k = k + 1
            ans = max(ans, k)

    return ans

In [17]:
def get_classes(entity: str):
    """
    :param entity: The RDF entity we want to check if it has a possible type in schema.org
    :return: The type to add as a class to the entity.
    """


    possible_classes = dict()
    entity_length = len(entity)


    for _type in list(schema['label']): # schema_classes['label'] is all types in schema.org
        if long_com_substring(entity, _type) >= entity_length * 0.9:
            # If the longest common substring between the entity and schema.org types is similar with 90 %,
            # we add the type as key and the ratio between the two strings as value.
            ratio = entity_length / len(_type)
            possible_classes[_type] = ratio

    if possible_classes:  # An empty dict will return False
        best_pos_class = max(possible_classes, key=possible_classes.get)  # Get the schema.org type with highest ratio
        best_pos_class_superclass = schema[schema['label'] == best_pos_class]['subTypeOf'] # Checks if the best_pos_class has a super type
        if best_pos_class_superclass:  # If we have a value here
            return schema + best_pos_class, schema + best_pos_class_superclass
        else:
            return schema + best_pos_class, None  # Return the highest ratio key as the entities type.
    else:
        return schema + 'LocalBusiness', None

In [25]:
def str_split(string):
    if isinstance(string, str):
        return string.split(", ")
    else:
        return string


def get_class_mapping(file):
    biz = pd.read_json(file, lines=True)#["categories"]
    schema = pd.read_csv(get_path("schemaorg-current-https-types.csv"))[["label", "subTypeOf"]]

    biz["categories"] = biz["categories"].apply(str_split)
    schema["subTypeOf"] = schema["subTypeOf"].str.replace("https://schema.org/", "", regex=False)

    categories = list({num for sublist in biz["categories"].tolist() if sublist for num in sublist})

    category_mapping = dict()

    for category in categories:
        category_length = len(category)
        possible_classes = dict()

        for schema_type in schema["label"]:
            if long_com_substring(category, schema_type) >= category_length * 0.90:
                ratio = category_length / len(schema_type)
                if ratio >= 0.25:
                    possible_classes[schema_type] = ratio

        if possible_classes:  # An empty dict will return False
            best_pos_class = max(possible_classes, key=possible_classes.get)  # Get the schema.org type with highest ratio
            best_pos_class_superclass = schema[schema['label'] == best_pos_class]['subTypeOf'].values[0] # Checks if the best_pos_class has a super type
            if isinstance(best_pos_class_superclass, float):
                best_pos_class_superclass = None

            category_mapping[category] = [best_pos_class, best_pos_class_superclass]

    return category_mapping  # Dict -> category: [mapping, subtype]

In [26]:
with open(file=get_path("yelp_academic_dataset_business.json"), mode="r") as file:
    res = get_class_mapping(file)

res

{'Electricians': ['Electrician', 'HomeAndConstructionBusiness'],
 'Preschools': ['Preschool', 'EducationalOrganization'],
 'Used': ['UsedCondition', 'OfferItemCondition'],
 'Experiences': ['PatientExperienceHealthAspect', 'HealthAspectEnumeration'],
 'Campgrounds': ['Campground', 'CivicStructure, LodgingBusiness'],
 'Playgrounds': ['Playground', 'CivicStructure'],
 'Synagogues': ['Synagogue', 'PlaceOfWorship'],
 'Halal': ['HalalDiet', 'RestrictedDiet'],
 'Vegetarian': ['VegetarianDiet', 'RestrictedDiet'],
 'Pharmacy': ['Pharmacy', 'MedicalBusiness, MedicalOrganization'],
 'Chinese': ['TraditionalChinese', 'MedicineSystem'],
 'Jewelry': ['JewelryStore', 'Store'],
 'Vegan': ['VeganDiet', 'RestrictedDiet'],
 'Insurance': ['InsuranceAgency', 'FinancialService'],
 'Kosher': ['KosherDiet', 'RestrictedDiet'],
 'Golf': ['GolfCourse', 'SportsActivityLocation'],
 'Shopping': ['ShoppingCenter', 'LocalBusiness'],
 'Apartments': ['Apartment', 'Accommodation'],
 'Automotive': ['AutomotiveBusiness', 

In [22]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

schema = pd.read_csv(get_path("schemaorg-current-https-types.csv"))

In [23]:
schema

Unnamed: 0,id,label,comment,subTypeOf,enumerationtype,equivalentClass,properties,subTypes,supersedes,supersededBy,isPartOf
0,https://schema.org/3DModel,3DModel,"A 3D model represents some kind of 3D content,...",https://schema.org/MediaObject,,,"https://schema.org/about, https://schema.org/a...",,,,
1,https://schema.org/AMRadioChannel,AMRadioChannel,A radio channel that uses AM.,https://schema.org/RadioChannel,,,"https://schema.org/additionalType, https://sch...",,,,
2,https://schema.org/APIReference,APIReference,Reference documentation for application progra...,https://schema.org/TechArticle,,,"https://schema.org/about, https://schema.org/a...",,,,
3,https://schema.org/Abdomen,Abdomen,Abdomen clinical examination.,https://schema.org/PhysicalExam,https://schema.org/PhysicalExam,,,,,,
4,https://schema.org/AboutPage,AboutPage,Web page type: About page.,https://schema.org/WebPage,,,"https://schema.org/about, https://schema.org/a...",,,,
...,...,...,...,...,...,...,...,...,...,...,...
1348,https://schema.org/WritePermission,WritePermission,Permission to write or edit the document.,https://schema.org/DigitalDocumentPermissionType,https://schema.org/DigitalDocumentPermissionType,,,,,,
1349,https://schema.org/XPathType,XPathType,Text representing an XPath (typically but not ...,https://schema.org/Text,,,,,,,
1350,https://schema.org/XRay,XRay,X-ray imaging.,https://schema.org/MedicalImagingTechnique,https://schema.org/MedicalImagingTechnique,,,,,,
1351,https://schema.org/ZoneBoardingPolicy,ZoneBoardingPolicy,The airline boards by zones of the plane.,https://schema.org/BoardingPolicyType,https://schema.org/BoardingPolicyType,,,,,,


In [24]:
schema[schema['label'] == "Pharmacy"]['subTypeOf']

894    https://schema.org/MedicalBusiness, https://sc...
Name: subTypeOf, dtype: object

In [56]:
type(schema[schema['label'] == "Date"]['subTypeOf'].values[0])

float

In [14]:
biz

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"[Doctors, Traditional Chinese Medicine, Naturo...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"[Shipping Centers, Local Services, Notaries, M...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","[Department Stores, Shopping, Fashion, Home & ...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","[Restaurants, Food, Bubble Tea, Coffee & Tea, ...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","[Brewpubs, Breweries, Food]","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","[Nail Salons, Beauty & Spas]","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","[Pets, Nurseries & Gardening, Pet Stores, Hobb...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","[Shopping, Jewelry, Piercing, Toy Stores, Beau...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","[Fitness/Exercise Equipment, Eyewear & Opticia...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [31]:
def str_split(string):
    if isinstance(string, str):
        return string.split(", ")
    else:
        return string

biz["categories"] = biz["categories"].apply(str_split)

categories = list({num for sublist in biz["categories"].tolist() if sublist for num in sublist})

In [32]:
categories

['Donburi',
 'Party Characters',
 'Eyebrow Services',
 'Horse Equipment Shops',
 'Poutineries',
 'Tacos',
 'Sports Psychologists',
 'Life Insurance',
 'Hot Air Balloons',
 'DJs',
 'Holiday Decorating Services',
 'Guitar Stores',
 'Pediatric Dentists',
 'Waffles',
 'Musicians',
 'Food Trucks',
 'Boat Parts & Supplies',
 'Beaches',
 'Gelato',
 'Turkish',
 'Grilling Equipment',
 'Arabic',
 'Cycling Classes',
 'Wildlife Control',
 'Farming Equipment',
 'Fuel Docks',
 'Car Rental',
 'Live/Raw Food',
 'Home Automation',
 'Car Wash',
 'Medical Law',
 'Excavation Services',
 'Waxing',
 'Herbs & Spices',
 'Cardiologists',
 'DIY Auto Shop',
 'Interlock Systems',
 'Drive-In Theater',
 'Playgrounds',
 'Condominiums',
 'Wills',
 'Workers Compensation Law',
 'Coffee & Tea Supplies',
 'Environmental Testing',
 'Gay Bars',
 "Men's Clothing",
 'Walk-in Clinics',
 'Employment Law',
 'Pita',
 'Haitian',
 'Performing Arts',
 'Lahmacun',
 'Observatories',
 'Magicians',
 'Duplication Services',
 'RV Dealers

In [13]:
schema["label"]

0                  3DModel
1           AMRadioChannel
2             APIReference
3                  Abdomen
4                AboutPage
               ...        
1348       WritePermission
1349             XPathType
1350                  XRay
1351    ZoneBoardingPolicy
1352                   Zoo
Name: label, Length: 1353, dtype: object

In [15]:
biz["categories"][3]

['Restaurants', 'Food', 'Bubble Tea', 'Coffee & Tea', 'Bakeries']

In [32]:
sim_df = pd.DataFrame(data={"label": schema["label"]})

nlp = spacy.load('en_core_web_md')

def compare_str(string1, string2):
    str1 = nlp(string1)
    str2 = nlp(string2)

    return str1.similarity(str2)

In [33]:
for i in biz["categories"][3]:
    sim_df[i] = schema["label"].apply(compare_str, string2=i)

  return str1.similarity(str2)
  return str1.similarity(str2)
  return str1.similarity(str2)
  return str1.similarity(str2)
  return str1.similarity(str2)


In [42]:
nlp("Baker").similarity(nlp("Baker"))

1.0

In [34]:
sim_df

Unnamed: 0,label,Restaurants,Food,Bubble Tea,Coffee & Tea,Bakeries
0,3DModel,0.000000,0.000000,0.000000,0.000000,0.000000
1,AMRadioChannel,0.000000,0.000000,0.000000,0.000000,0.000000
2,APIReference,0.000000,0.000000,0.000000,0.000000,0.000000
3,Abdomen,0.030459,-0.017765,0.030372,-0.136703,0.030459
4,AboutPage,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
1348,WritePermission,0.000000,0.000000,0.000000,0.000000,0.000000
1349,XPathType,0.000000,0.000000,0.000000,0.000000,0.000000
1350,XRay,0.072472,-0.008574,0.182335,0.058196,0.072472
1351,ZoneBoardingPolicy,0.000000,0.000000,0.000000,0.000000,0.000000


In [52]:
import difflib
difflib.get_close_matches(biz["categories"][3][4], schema["label"])

['BookSeries', 'Series', 'Bakery']

In [51]:
biz["categories"][3][3]

'Coffee & Tea'