In [10]:
# !pip install -U sentence-transformers

In [1]:
import re
import pandas as pd
import numpy as np
import inflect


def str_split(string):
    if isinstance(string, str):
        return string.split(", ")
    else:
        return string


def turn_words_singular(categories_dict):
    """
    For each key in the dictionary, the function takes the value (a list of words) and turns each word
    into its singular form

    :param categories_dict: a dictionary of categories and their associated words
    :return: A dictionary with the same keys as the original dictionary, but with the values being a
    list of singular words.
    """
    p = inflect.engine()
    categories_dict_singular = {}
    for key, value in categories_dict.items():
        new_value = []
        for word in value:
            word = word.lower()
            if p.singular_noun(word) is False:
                word = word
            else:
                word = p.singular_noun(word)
            new_value.append(word)
        categories_dict_singular[key] = new_value
    return categories_dict_singular

In [None]:
from sentence_transformers import SentenceTransformer

pd.options.mode.chained_assignment = None

In [2]:
def space_words_lower(string):
    return re.sub('(?<!^)([A-Z])([^A-Z])', r' \1\2', string).lower()

In [3]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [4]:
biz = pd.read_json("yelp_academic_dataset_business.json", lines=True)
schema = pd.read_csv("schemaorg-current-https-types.csv")[["label", "subTypeOf"]]

In [5]:
def clean_yelp_categories():
    categories_unique = list(set(biz["categories"].str.cat(sep=', ').split(sep=', ')))
    categories_dict = {categories_unique[i]: [categories_unique[i]] for i in range(len(categories_unique))}

    cat_string_manually_handled_dict = pd.read_excel("split_categories.xlsx", sheet_name="Sheet1", index_col=0, names=['column']).to_dict()['column']
    cat_string_manually_handled_dict = {k: v.split(', ') for k, v in cat_string_manually_handled_dict.items()}
    categories_dict.update(cat_string_manually_handled_dict)

    yelp_categories_dict = turn_words_singular(categories_dict)

    yelp_categories = list({category for sublist in yelp_categories_dict.values() for category in sublist})

    return yelp_categories, yelp_categories_dict

In [6]:
def clean_schema_categories():
    schema_categories = list(map(lambda x: space_words_lower(x), schema["label"].tolist()))
    schema_categories_dict = dict(zip(schema_categories, schema["label"].tolist()))

    return schema_categories, schema_categories_dict

In [8]:
def cos_sim_2d(x, y):
    norm_x = x / np.linalg.norm(x, axis=1, keepdims=True)
    norm_y = y / np.linalg.norm(y, axis=1, keepdims=True)
    return np.matmul(norm_x, norm_y.T)

In [9]:
def category_mappings(threshold):
    yelp_categories, yelp_categories_dict = clean_yelp_categories()
    schema_categories, schema_categories_dict = clean_schema_categories()

    swapped_yelp_categories = {sub_value: key for key, value in yelp_categories_dict.items() for sub_value in value}

    yelp_embeddings = model.encode(yelp_categories)
    schema_embeddings = model.encode(schema_categories)

    co_sim_matrix = pd.DataFrame(data=cos_sim_2d(yelp_embeddings, schema_embeddings),
                                 index=yelp_categories,
                                 columns=schema_categories
                                 ).apply(pd.to_numeric)

    mappings = pd.DataFrame(data={"mapped_schema": co_sim_matrix.idxmax(axis=1), "similarity": co_sim_matrix.max(axis=1)}, index=co_sim_matrix.index).sort_values(by="similarity", ascending=False).reset_index(names="yelp_category")

    # Getting correct names
    mappings["mapped_schema"] = mappings["mapped_schema"].apply(lambda x: schema_categories_dict.get(x))
    mappings["yelp_category"] = mappings["yelp_category"].apply(lambda x: swapped_yelp_categories.get(x))

    mappings = mappings[mappings['similarity'] >= threshold]
    print(len(mappings))
    # del dict_mappings["similarity"]
    mappings['mapped_schema'][mappings['mapped_schema'] == "None"] = None
    mappings.set_index('yelp_category', inplace=True)

    mapping_dictionary = mappings["mapped_schema"].squeeze().to_dict()

    print(len(mapping_dictionary))

    return mapping_dictionary

In [10]:
category_mappings(0.68)

400
383


{'Hostels': 'Hostel',
 'Pawn Shops': 'PawnShop',
 'Financial Services': 'FinancialService',
 'Beaches': 'Beach',
 'Adult Entertainment': 'AdultEntertainment',
 'Bus Stations': 'BusStation',
 'Department Stores': 'DepartmentStore',
 'Television Stations': 'TelevisionStation',
 'RV Parks': 'RVPark',
 'Amusement Parks': 'AmusementPark',
 'Elementary Schools': 'ElementarySchool',
 'Mosques': 'Mosque',
 'Pharmacy': 'Pharmacy',
 'Parks': 'Park',
 'Real Estate Agents': 'RealEstateAgent',
 'Bakeries': 'Bakery',
 'Art Galleries': 'ArtGallery',
 'Florists': 'Florist',
 'Nail Salons': 'NailSalon',
 'Courthouses': 'Courthouse',
 'Comedy Clubs': 'ComedyClub',
 'Playgrounds': 'Playground',
 'Hair Salons': 'HairSalon',
 'Hobby Shops': 'HobbyShop',
 'Preschools': 'Preschool',
 'Shopping Centers': 'ShoppingCenter',
 'Middle Schools & High Schools': 'MiddleSchool',
 'Aquariums': 'Aquarium',
 'Toy Stores': 'ToyStore',
 'Convenience Stores': 'ConvenienceStore',
 'Festivals': 'Festival',
 'Auto Repair': 'A

In [11]:
categories_unique = list(set(biz["categories"].str.cat(sep=', ').split(sep=', ')))
categories_dict = {categories_unique[i]: [categories_unique[i]] for i in range(len(categories_unique))}

cat_string_manually_handled_dict = pd.read_excel(get_path("split_categories.xlsx"), sheet_name="Sheet1", index_col=0, names=['column']).to_dict()['column']
cat_string_manually_handled_dict = {k: v.split(', ') for k, v in cat_string_manually_handled_dict.items()}
categories_dict.update(cat_string_manually_handled_dict)

yelp_categories_dict_singular = turn_words_singular(categories_dict)

yelp_categories = list({category for sublist in yelp_categories_dict_singular.values() for category in sublist})

In [12]:
yelp_categories

['aquarium',
 'buffet',
 'trailer rental',
 'venezuelan',
 'private tutor',
 'medical supply',
 'oriental',
 'valet service',
 'pressure washer',
 'department of motor vehicles',
 'window washing',
 'buddhist temple',
 'diy auto shop',
 'karate',
 'hair removal',
 'cremation service',
 'credit union',
 'faith-based crisis pregnancy center',
 'wedding chapel',
 'recording studio',
 'visitor center',
 'baby gear',
 'internal medicine',
 'interval training gym',
 'fur clothing',
 'door sale',
 'art gallery',
 'austrian',
 'windows installation',
 'meat shop',
 'supper club',
 'country dance hall',
 'security service',
 'installment loan',
 'cuban',
 'sperm clinic',
 'spray tanning',
 'scooter tour',
 'canadian (new)',
 'coffee supply',
 'arena',
 'pet training',
 'home inspector',
 'escape game',
 'television station',
 'pet groomer',
 'video',
 'clock repair',
 'florist',
 'historical building',
 'internet law',
 'hong kong style cafe',
 'government',
 'wine tour',
 'mongolian',
 'bocce 

In [13]:
schema_categories = list(map(lambda x: space_words_lower(x), schema["label"].tolist()))
schema_categories_dict = dict(zip(schema_categories, schema["label"].tolist()))

In [14]:
swapped_yelp_categories = {sub_value: key for key, value in yelp_categories_dict_singular.items() for sub_value in value}

In [15]:
yelp_embeddings = model.encode(yelp_categories)

In [16]:
schema_embeddings = model.encode(schema_categories)

In [17]:
co_sim_matrix = pd.DataFrame(data=cos_sim_2d(yelp_embeddings, schema_embeddings),
                             index=yelp_categories,
                             columns=schema_categories)

In [18]:
co_sim_matrix = co_sim_matrix.apply(pd.to_numeric)

In [19]:
co_sim_matrix

Unnamed: 0,3d model,am radio channel,api reference,abdomen,about page,accept action,accommodation,accounting service,achieve action,action,...,winery,withdrawn,work based program,workers union,write action,write permission,x path type,x ray,zone boarding policy,zoo
aquarium,0.194140,0.166180,0.107367,0.232968,0.097491,0.134389,0.246239,0.163466,0.157213,0.269890,...,0.267033,0.149620,0.109207,0.072035,0.066155,0.031288,0.032292,0.258573,-0.016667,0.557363
buffet,0.178702,0.079405,0.134935,0.248412,0.173229,0.045803,0.270982,0.296284,0.099117,0.167006,...,0.338663,0.169818,0.088270,0.189738,0.023237,-0.029982,-0.037868,0.062040,0.069411,0.359222
trailer rental,0.126763,0.080258,0.139991,0.002531,0.013091,0.040866,0.370656,0.126121,0.048690,0.160230,...,0.177687,0.106663,0.136700,0.078271,0.109225,0.112348,0.144158,0.041672,0.080879,0.273984
venezuelan,0.097291,0.110546,0.044107,0.199912,0.089531,0.121913,0.219158,0.064273,0.113156,0.206148,...,0.157055,0.162466,0.059780,0.162563,0.110618,0.109885,0.021141,0.139223,0.045815,0.221529
private tutor,0.112748,0.054525,0.133857,0.086723,0.162565,0.100321,0.197289,0.132844,0.149179,0.154061,...,0.118903,0.113177,0.203131,0.060000,0.186331,0.237676,0.063234,0.096052,0.061408,0.112287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
crepery,0.137858,0.064521,0.126296,0.216216,0.144789,0.096243,0.204162,0.166627,0.069804,0.145254,...,0.162239,0.271336,0.108061,0.146494,0.079377,0.053844,0.081561,0.179428,-0.041778,0.152277
maternity wear,0.096855,0.058868,0.074811,0.196418,0.074513,-0.019159,0.186593,0.079594,0.109707,0.096561,...,0.146855,0.082647,0.132960,0.288509,0.014970,-0.045101,0.021758,0.094771,0.047765,0.108860
excavation service,0.226101,0.043657,0.202529,0.085309,0.094744,0.126856,0.219173,0.338421,0.170093,0.215193,...,0.199801,-0.010108,0.281119,0.278930,0.055253,-0.036476,0.110615,0.218187,0.099704,0.207803
chicken wing,0.181211,0.000144,0.026832,0.197749,0.125397,0.066481,0.156071,0.042330,0.134012,0.203914,...,0.003213,0.165703,-0.013687,0.097708,0.045168,-0.037787,0.018621,0.141014,0.201918,0.296391


In [30]:
mappings = pd.DataFrame(data={"mapped_schema": co_sim_matrix.idxmax(axis=1), "similarity": co_sim_matrix.max(axis=1)}, index=co_sim_matrix.index).sort_values(by="similarity", ascending=False).reset_index(names="yelp_category")

In [31]:
# mappings["mapped_schema"] = mappings["mapped_schema"].apply(lambda x: schema_categories_dict.get(x))
# mappings["yelp_category"] = mappings["yelp_category"].apply(lambda x: swapped_yelp_categories.get(x))

In [46]:
threshold = 0.44 # Determine
dict_mappings = mappings[mappings['similarity'] >= threshold]
del dict_mappings["similarity"]
dict_mappings['mapped_schema'][dict_mappings['mapped_schema'] == "None"] = None
# dict_mappings.set_index('yelp_category', inplace=True)
# dict_mappings = dict_mappings.squeeze().to_dict()
#
# dict_mappings

In [47]:
mappings[["yelp_category", "mapped_schema"]].set_index('yelp_category').stack().groupby(level=0).agg(list).to_dict()

{' tea supply': ['how to supply'],
 '3d printing': ['3d model'],
 'acai bowl': ['bowling alley'],
 'accessory': ['medical device'],
 'accountant': ['accounting service'],
 'acne treatment': ['dermatologic'],
 'active life': ['active action status'],
 'acupuncture': ['medical therapy'],
 'addiction medicine': ['drug'],
 'adoption service': ['child care'],
 'adult': ['adult entertainment'],
 'adult education': ['adult entertainment'],
 'adult entertainment': ['adult entertainment'],
 'advertising': ['business audience'],
 'aerial fitnes': ['3d model'],
 'aerial tour': ['tourist trip'],
 'aesthetician': ['beauty salon'],
 'afghan': ['country'],
 'african': ['continent'],
 'air conditioning': ['hvac business'],
 'air duct cleaning': ['cleaning fee'],
 'aircraft dealer': ['motorcycle dealer'],
 'aircraft repair': ['auto repair'],
 'airline': ['airline'],
 'airport': ['airport'],
 'airport lounge': ['airport'],
 'airport shuttle': ['airport'],
 'airport terminal': ['airport'],
 'airsoft': ['

In [64]:
mappings.sample(10)

Unnamed: 0,yelp_category,mapped_schema,similarity
217,service station,bus station,0.767655
170,bowling,bowling alley,0.800881
1242,pest control,control action,0.433374
553,lighting store,electronics store,0.625327
456,tour,trip,0.657896
1027,garage door service,service,0.497877
1379,hydro-jetting,vessel,0.366921
497,donation center,donate action,0.643401
945,escape game,video game,0.517067
33,synagogue,synagogue,1.0


In [61]:
manual_mapping = pd.read_excel("yelp_schema_manual_mapping.xlsx", names=["yelp_category", "manual_mapped_schema"], usecols=[0, 1]).dropna()[:200]

In [36]:
mapp = mappings.copy()

mapp['mapped_schema'][mapp['similarity'] < 0.0 / 100] = "None"
merged = mapp.merge(manual_mapping, on='yelp_category', how='right')
merged

Unnamed: 0,yelp_category,mapped_schema,similarity,manual_mapped_schema
0,aviation service,airline,0.736034,airline
1,veterinarian,veterinary care,0.772105,veterinary care
2,shaved snow,mountain,0.426003,
3,game truck rental,auto rental,0.638553,
4,hunting supply,how to supply,0.499186,
...,...,...,...,...
195,cooking class,cook action,0.630217,
196,soup,throat,0.492700,
197,college counseling,college or university,0.509525,
198,magazine,hardcover,0.550991,


In [41]:
lst = []

for threshold in range(101):
    mapp = mappings.copy()

    mapp['mapped_schema'][mapp['similarity'] < threshold / 100] = "None"

    merged = mapp.merge(manual_mapping, on='yelp_category', how='right')

    tp = ((merged['mapped_schema'] == merged['manual_mapped_schema']) & (merged['mapped_schema'] != "None")).sum()  # True positive
    fp = ((merged['mapped_schema'] != "None") & (merged['manual_mapped_schema'] != merged['mapped_schema'])).sum()  # False negative
    fn = ((merged['mapped_schema'] == "None") & (merged['manual_mapped_schema'] != "None")).sum()  # False positive

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    beta = 1
    f1 = (1+beta**2)*(precision * recall) / (beta**2*precision + recall)
    f10 = (1+10**2)*(precision * recall) / (10**2*precision + recall)
    lst.append([threshold / 100, recall, precision, f1, f10])

In [42]:
res = pd.DataFrame(lst, columns=["Threshold", "Recall", "Precision", "F-1", "F-10"])

In [43]:
res

Unnamed: 0,Threshold,Recall,Precision,F-1,F-10
0,0.00,1.000000,0.25,0.400000,0.971154
1,0.01,1.000000,0.25,0.400000,0.971154
2,0.02,1.000000,0.25,0.400000,0.971154
3,0.03,1.000000,0.25,0.400000,0.971154
4,0.04,1.000000,0.25,0.400000,0.971154
...,...,...,...,...,...
96,0.96,0.245902,1.00,0.394737,0.247751
97,0.97,0.245902,1.00,0.394737,0.247751
98,0.98,0.245902,1.00,0.394737,0.247751
99,0.99,0.245902,1.00,0.394737,0.247751


In [45]:
import plotly.express as px

fig = px.line(res, x="Threshold", y=["Recall", "Precision", "F-1", "F-10"], title='')

fig['data'][0]['line']['color'] = "rgb(234,143,129)"
fig['data'][1]['line']['color'] = "rgb(255,191,0)"
fig['data'][2]['line']['color'] = "rgb(32,115,171)"

fig.add_vline(x=res["Threshold"][res["F-10"].idxmax()], line_width=1.5, line_dash="dash", line_color="gray")

fig.update_layout(
    height=600,
    width=600,
    yaxis_title="",
    xaxis_title="Threshold (Cosine Similarity)",
    legend_title_text='',
    legend=dict(orientation="h",
                yanchor="top",
                y=1.08,
                xanchor="left",
                x=0.0),
    # plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(dtick=0.1, range=[-0.02,1.02])
fig.update_yaxes(dtick=0.1, range=[-0.02,1.02])

# fig.update_layout(dict(yaxis2={'anchor': 'x', 'overlaying': 'y', 'side': 'left'},
#                        yaxis={'anchor': 'x', 'domain': [0.0, 1.0], 'side':'right'}))

fig.show()

In [None]:
# possible_matches = co_sim_matrix.apply(lambda x: list(zip(x.abs().nlargest(5).index.tolist(), x.abs().nlargest(5).round(4).tolist())), axis=1)
# possible_matches

In [65]:
threshold = res["Threshold"][res["F-10"].idxmax()]

In [66]:
mapp = mappings.copy()

mapp['mapped_schema'][mapp['similarity'] < threshold] = "None"
mapp

Unnamed: 0,yelp_category,mapped_schema,similarity
0,hostel,hostel,1.000000
1,pawn shop,pawn shop,1.000000
2,financial service,financial service,1.000000
3,beach,beach,1.000000
4,adult entertainment,adult entertainment,1.000000
...,...,...,...
1411,countertop installation,,0.304017
1412,conveyor belt sushi,,0.294110
1413,water heater installation,,0.293358
1414,lighting fixture,,0.282112


In [72]:
mapp.to_csv("semantic_mappings_csv", index=False)