### Tests different threshold configurations when mapping Yelp categories to schema.org types.

In [1]:
import pandas as pd
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.schema_functions import get_class_mappings
from Code.UtilityFunctions.schema_functions import long_com_substring

In [2]:
# Threshold grid to sweep: every substring threshold is combined with every
# ratio threshold, substring threshold varying slowest.
substring_thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
ratio_thresholds = [0.25, 0.50, 0.75]
threshold_pairs = [(i, j) for i in substring_thresholds for j in ratio_thresholds]

# Collects, per key returned by get_class_mappings, every value it was mapped
# to, each one tagged with the "[substring, ratio]" pair that produced it.
mapping_dict = {}

for i, j in threshold_pairs:
    mappings = get_class_mappings(substring_threshold=i, ratio_threshold=j)
    print(f"For thresholds {i, j}: a total of {len(mappings)} are found.")

    # The tag is identical for every mapping found under this threshold pair.
    tag = f" [{i}, {j}]"
    for category, schema_type in mappings.items():
        mapping_dict.setdefault(category, []).append(schema_type + tag)

For thresholds (0.1, 0.25): a total of 1311 are found.
For thresholds (0.1, 0.5): a total of 1311 are found.
For thresholds (0.1, 0.75): a total of 1310 are found.
For thresholds (0.3, 0.25): a total of 1130 are found.
For thresholds (0.3, 0.5): a total of 1108 are found.
For thresholds (0.3, 0.75): a total of 1033 are found.
For thresholds (0.5, 0.25): a total of 481 are found.
For thresholds (0.5, 0.5): a total of 448 are found.
For thresholds (0.5, 0.75): a total of 339 are found.
For thresholds (0.7, 0.25): a total of 113 are found.
For thresholds (0.7, 0.5): a total of 91 are found.
For thresholds (0.7, 0.75): a total of 53 are found.
For thresholds (0.9, 0.25): a total of 36 are found.
For thresholds (0.9, 0.5): a total of 27 are found.
For thresholds (0.9, 0.75): a total of 11 are found.


In [3]:
# Show the accumulated mappings: each key lists every mapped value together
# with the "[substring, ratio]" threshold pair under which it was found.
mapping_dict

{'Town Car Service': ['AnatomicalSystem [0.1, 0.25]',
  'AnatomicalSystem [0.1, 0.5]',
  'AnatomicalSystem [0.1, 0.75]',
  'BroadcastService [0.3, 0.25]',
  'BroadcastService [0.3, 0.5]',
  'BroadcastService [0.3, 0.75]'],
 'Officiants': ['Anesthesia [0.1, 0.25]',
  'Anesthesia [0.1, 0.5]',
  'Anesthesia [0.1, 0.75]',
  'PostOffice [0.3, 0.25]',
  'PostOffice [0.3, 0.5]',
  'PostOffice [0.3, 0.75]',
  'PostOffice [0.5, 0.25]',
  'PostOffice [0.5, 0.5]',
  'PostOffice [0.5, 0.75]',
  'OfficialLegalValue [0.7, 0.25]',
  'OfficialLegalValue [0.7, 0.5]'],
 'Architectural Tours': ['ActiveNotRecruiting [0.1, 0.25]',
  'ActiveNotRecruiting [0.1, 0.5]',
  'ActiveNotRecruiting [0.1, 0.75]'],
 'Vegan': ['Atlas [0.1, 0.25]',
  'Atlas [0.1, 0.5]',
  'Atlas [0.1, 0.75]',
  'Brand [0.3, 0.25]',
  'Brand [0.3, 0.5]',
  'Brand [0.3, 0.75]',
  'VeganDiet [0.5, 0.25]',
  'VeganDiet [0.5, 0.5]',
  'VeganDiet [0.7, 0.25]',
  'VeganDiet [0.7, 0.5]',
  'VeganDiet [0.9, 0.25]',
  'VeganDiet [0.9, 0.5]'],
 'P

In [4]:
# Narrowed sweep: only the stricter substring and ratio thresholds are tried,
# substring threshold varying slowest.
substring_thresholds = [0.7, 0.9]
ratio_thresholds = [0.50, 0.75]
threshold_pairs = [(i, j) for i in substring_thresholds for j in ratio_thresholds]

# Collects, per key returned by get_class_mappings, every value it was mapped
# to, each one tagged with the "[substring, ratio]" pair that produced it.
mapping_dict2 = {}

for i, j in threshold_pairs:
    mappings = get_class_mappings(substring_threshold=i, ratio_threshold=j)
    print(f"For thresholds {i, j}: a total of {len(mappings)} are found.")

    # The tag is identical for every mapping found under this threshold pair.
    tag = f" [{i}, {j}]"
    for category, schema_type in mappings.items():
        mapping_dict2.setdefault(category, []).append(schema_type + tag)

For thresholds (0.7, 0.5): a total of 91 are found.
For thresholds (0.7, 0.75): a total of 53 are found.
For thresholds (0.9, 0.5): a total of 27 are found.
For thresholds (0.9, 0.75): a total of 11 are found.


In [5]:
# Show the mappings found in the narrowed sweep: each key lists every mapped
# value together with the "[substring, ratio]" threshold pair that produced it.
mapping_dict2

{'Officiants': ['OfficialLegalValue [0.7, 0.5]'],
 'Vegan': ['VeganDiet [0.7, 0.5]', 'VeganDiet [0.9, 0.5]'],
 'Props': ['Property [0.7, 0.5]'],
 'Airlines': ['Airline [0.7, 0.5]', 'Airline [0.7, 0.75]'],
 'Hospitals': ['Hospital [0.7, 0.5]', 'Hospital [0.7, 0.75]'],
 'Jewelry': ['JewelryStore [0.7, 0.5]', 'JewelryStore [0.9, 0.5]'],
 'Courthouses': ['Courthouse [0.7, 0.5]',
  'Courthouse [0.7, 0.75]',
  'Courthouse [0.9, 0.5]',
  'Courthouse [0.9, 0.75]'],
 'Gastroenterologist': ['Gastroenterologic [0.7, 0.5]',
  'Gastroenterologic [0.7, 0.75]'],
 'Archery': ['Researcher [0.7, 0.5]'],
 'Halal': ['HalalDiet [0.7, 0.5]', 'HalalDiet [0.9, 0.5]'],
 'Painters': ['HousePainter [0.7, 0.5]'],
 'Zoos': ['Zoo [0.7, 0.5]', 'Zoo [0.7, 0.75]'],
 'Pubs': ['BarOrPub [0.7, 0.5]'],
 'Taxis': ['Taxi [0.7, 0.5]', 'Taxi [0.7, 0.75]'],
 'Optometrists': ['Optometric [0.7, 0.5]', 'Optometric [0.7, 0.75]'],
 'Synagogues': ['Synagogue [0.7, 0.5]',
  'Synagogue [0.7, 0.75]',
  'Synagogue [0.9, 0.5]',
  'Synago