In [1]:
# !pip install -U sentence-transformers

In [2]:
import re
import pandas as pd
import numpy as np

from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.string_functions import str_split, turn_words_singular

from sentence_transformers import SentenceTransformer

pd.options.mode.chained_assignment = None

ImportError: cannot import name 'str_split' from 'Code.UtilityFunctions.string_functions' (/home/ubuntu/DVML-P7/Code/UtilityFunctions/string_functions.py)

In [None]:
def space_words_lower(string):
    return re.sub('(?<!^)([A-Z])([^A-Z])', r' \1\2', string).lower()

In [None]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
schema = pd.read_csv(get_path("schemaorg-current-https-types.csv"))[["label", "subTypeOf"]]

In [None]:
def clean_yelp_categories():
    categories_unique = list(set(biz["categories"].str.cat(sep=', ').split(sep=', ')))
    categories_dict = {categories_unique[i]: [categories_unique[i]] for i in range(len(categories_unique))}

    cat_string_manually_handled_dict = pd.read_excel(get_path("split_categories.xlsx"), sheet_name="Sheet1", index_col=0, names=['column']).to_dict()['column']
    cat_string_manually_handled_dict = {k: v.split(', ') for k, v in cat_string_manually_handled_dict.items()}
    categories_dict.update(cat_string_manually_handled_dict)

    yelp_categories_dict = turn_words_singular(categories_dict)

    yelp_categories = list({category for sublist in yelp_categories_dict.values() for category in sublist})

    return yelp_categories, yelp_categories_dict

In [None]:
def clean_schema_categories():
    schema_categories = list(map(lambda x: space_words_lower(x), schema["label"].tolist()))
    schema_categories_dict = dict(zip(schema_categories, schema["label"].tolist()))

    return schema_categories, schema_categories_dict

In [None]:
len(clean_schema_categories()[0])

1353

In [None]:
def cos_sim_2d(x, y):
    norm_x = x / np.linalg.norm(x, axis=1, keepdims=True)
    norm_y = y / np.linalg.norm(y, axis=1, keepdims=True)
    return np.matmul(norm_x, norm_y.T)

In [None]:
def category_mappings(threshold):
    yelp_categories, yelp_categories_dict = clean_yelp_categories()
    schema_categories, schema_categories_dict = clean_schema_categories()

    swapped_yelp_categories = {sub_value: key for key, value in yelp_categories_dict.items() for sub_value in value}

    yelp_embeddings = model.encode(yelp_categories)
    schema_embeddings = model.encode(schema_categories)

    co_sim_matrix = pd.DataFrame(data=cos_sim_2d(yelp_embeddings, schema_embeddings),
                                 index=yelp_categories,
                                 columns=schema_categories
                                 ).apply(pd.to_numeric)

    mappings = pd.DataFrame(data={"mapped_schema": co_sim_matrix.idxmax(axis=1), "similarity": co_sim_matrix.max(axis=1)}, index=co_sim_matrix.index).sort_values(by="similarity", ascending=False).reset_index(names="yelp_category")

    # Getting correct names
    mappings["mapped_schema"] = mappings["mapped_schema"].apply(lambda x: schema_categories_dict.get(x))
    mappings["yelp_category"] = mappings["yelp_category"].apply(lambda x: swapped_yelp_categories.get(x))

    mappings = mappings[mappings['similarity'] >= threshold]
    print(len(mappings))
    # del dict_mappings["similarity"]
    mappings['mapped_schema'][mappings['mapped_schema'] == "None"] = None
    mappings.set_index('yelp_category', inplace=True)

    mapping_dictionary = mappings["mapped_schema"].squeeze().to_dict()

    print(len(mapping_dictionary))

    return mapping_dictionary

In [None]:
category_mappings(0.68)

400
384


{'Hostels': 'Hostel',
 'Financial Services': 'FinancialService',
 'Pawn Shops': 'PawnShop',
 'Mosques': 'Mosque',
 'Pharmacy': 'Pharmacy',
 'RV Parks': 'RVPark',
 'Art Galleries': 'ArtGallery',
 'Amusement Parks': 'AmusementPark',
 'Elementary Schools': 'ElementarySchool',
 'Bakeries': 'Bakery',
 'Adult Entertainment': 'AdultEntertainment',
 'Real Estate Agents': 'RealEstateAgent',
 'Television Stations': 'TelevisionStation',
 'Beaches': 'Beach',
 'Parks': 'Park',
 'Department Stores': 'DepartmentStore',
 'Bus Stations': 'BusStation',
 'Festivals': 'Festival',
 'Middle Schools & High Schools': 'MiddleSchool',
 'Convenience Stores': 'ConvenienceStore',
 'Hobby Shops': 'HobbyShop',
 'Aquariums': 'Aquarium',
 'Florists': 'Florist',
 'Shopping Centers': 'ShoppingCenter',
 'Notaries': 'Notary',
 'Preschools': 'Preschool',
 'Hair Salons': 'HairSalon',
 'Courthouses': 'Courthouse',
 'Comedy Clubs': 'ComedyClub',
 'Nail Salons': 'NailSalon',
 'Auto Repair': 'AutoRepair',
 'Toy Stores': 'ToySto

In [None]:
categories_unique = list(set(biz["categories"].str.cat(sep=', ').split(sep=', ')))
categories_dict = {categories_unique[i]: [categories_unique[i]] for i in range(len(categories_unique))}

cat_string_manually_handled_dict = pd.read_excel(get_path("split_categories.xlsx"), sheet_name="Sheet1", index_col=0, names=['column']).to_dict()['column']
cat_string_manually_handled_dict = {k: v.split(', ') for k, v in cat_string_manually_handled_dict.items()}
categories_dict.update(cat_string_manually_handled_dict)

yelp_categories_dict_singular = turn_words_singular(categories_dict)

yelp_categories = list({category for sublist in yelp_categories_dict_singular.values() for category in sublist})

In [None]:
yelp_categories

['windshield repair',
 'home decor',
 'bail bondsman',
 'oral surgeon',
 'meditation center',
 'valet service',
 'dialysis clinic',
 'knife sharpening',
 'health market',
 'hot tub service',
 'challenge course',
 'drywall installation',
 'uzbek',
 'nepalese',
 'dumpster rental',
 'art space rental',
 'blood donation center',
 'optician',
 'home staging',
 'sugaring',
 'mini golf',
 'sports betting',
 'it service',
 'billing service',
 'car wash',
 'wig',
 'cidery',
 'garden',
 'car share service',
 'juice bar',
 'restaurant supply',
 'medical law',
 'well drilling',
 'handyman',
 'prosthetic',
 'head shop',
 'stonemason',
 'safe store',
 'suppplement',
 'scooter tour',
 'landscaping',
 'hookah bar',
 'british',
 'printing service',
 'bike shop',
 'sailing',
 'magazine',
 'cambodian',
 'dietitian',
 'laundry service',
 'generator installation',
 'machine rental',
 'rheumatologist',
 'meadery',
 'furniture reupholstery',
 'delivery service',
 'outdoor power equipment service',
 'immigrat

In [None]:
schema_categories = list(map(lambda x: space_words_lower(x), schema["label"].tolist()))
schema_categories_dict = dict(zip(schema_categories, schema["label"].tolist()))

In [None]:
swapped_yelp_categories = {sub_value: key for key, value in yelp_categories_dict_singular.items() for sub_value in value}

In [None]:
yelp_embeddings = model.encode(yelp_categories)

In [None]:
schema_embeddings = model.encode(schema_categories)

In [None]:
co_sim_matrix = pd.DataFrame(data=cos_sim_2d(yelp_embeddings, schema_embeddings),
                             index=yelp_categories,
                             columns=schema_categories)

In [None]:
co_sim_matrix = co_sim_matrix.apply(pd.to_numeric)

In [None]:
co_sim_matrix

Unnamed: 0,3d model,am radio channel,api reference,abdomen,about page,accept action,accommodation,accounting service,achieve action,action,...,winery,withdrawn,work based program,workers union,write action,write permission,x path type,x ray,zone boarding policy,zoo
windshield repair,0.166756,0.087508,0.081118,0.049472,0.134348,0.046928,0.107238,0.192772,0.119944,0.188658,...,0.293492,0.080718,0.126126,0.122515,0.050046,0.044432,0.011668,0.245469,0.047809,0.142943
home decor,0.239683,0.032066,0.061307,0.121317,0.226014,0.095482,0.372689,0.139357,0.186673,0.275329,...,0.271768,0.082926,0.103699,0.195467,0.149618,0.070270,0.062545,0.143552,0.053353,0.271928
bail bondsman,0.055890,0.039353,0.068994,0.076917,0.044639,0.098138,0.110004,0.152378,0.076060,0.172300,...,0.061178,0.143091,0.003429,0.029254,0.119204,0.056337,-0.057062,0.074161,0.018550,0.124090
oral surgeon,0.161944,0.054144,0.104871,0.293079,0.123985,0.076816,0.142175,0.215740,0.135928,0.155813,...,0.190576,0.189891,0.139600,0.145375,0.117496,0.110604,0.040190,0.228359,-0.043729,0.152814
meditation center,0.082787,0.113344,0.131031,0.146631,0.105668,0.089245,0.398364,0.145187,0.149898,0.146208,...,0.265737,0.129809,0.127742,0.107701,0.055418,0.007742,0.077007,0.117857,0.051437,0.245461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
traditional clothing,0.116899,0.040039,0.064779,0.125075,0.083955,0.059935,0.150337,0.062923,0.135912,0.167903,...,0.232257,0.017010,0.077018,0.181513,0.029124,-0.074473,0.050523,-0.033423,0.013548,0.111725
personal assistant,0.255214,0.088899,0.263478,0.223524,0.126910,0.234296,0.267924,0.402887,0.232142,0.267464,...,0.114691,0.106661,0.418440,0.243352,0.217917,0.105129,-0.023650,0.147992,-0.050308,0.199811
deli,0.177560,0.089469,0.119397,0.204706,0.115743,0.202494,0.269470,0.222779,0.179210,0.332940,...,0.243349,0.280952,0.086047,0.163815,0.192188,0.092792,0.056010,0.192118,-0.007573,0.189681
door sale,0.137169,0.041028,-0.006442,0.095963,0.210668,0.219453,0.241950,0.112280,0.173180,0.244552,...,0.196073,0.187763,0.097118,0.090319,0.154394,0.063440,0.071788,0.137616,0.015678,0.142743


In [None]:
mappings = pd.DataFrame(data={"mapped_schema": co_sim_matrix.idxmax(axis=1), "similarity": co_sim_matrix.max(axis=1)}, index=co_sim_matrix.index).sort_values(by="similarity", ascending=False).reset_index(names="yelp_category")

In [None]:
# mappings["mapped_schema"] = mappings["mapped_schema"].apply(lambda x: schema_categories_dict.get(x))
# mappings["yelp_category"] = mappings["yelp_category"].apply(lambda x: swapped_yelp_categories.get(x))

In [None]:
threshold = 0.68 # Determine
dict_mappings = mappings[mappings['similarity'] >= threshold]
del dict_mappings["similarity"]
dict_mappings['mapped_schema'][dict_mappings['mapped_schema'] == "None"] = None
# dict_mappings.set_index('yelp_category', inplace=True)
# dict_mappings = dict_mappings.squeeze().to_dict()
#
# dict_mappings

In [None]:
mappings[["yelp_category", "mapped_schema"]].set_index('yelp_category').stack().groupby(level=0).agg(list).to_dict()

{' tea supply': ['how to supply'],
 '3d printing': ['3d model'],
 'acai bowl': ['bowling alley'],
 'accessory': ['medical device'],
 'accountant': ['accounting service'],
 'acne treatment': ['dermatologic'],
 'active life': ['active action status'],
 'acupuncture': ['medical therapy'],
 'addiction medicine': ['drug'],
 'adoption service': ['child care'],
 'adult': ['adult entertainment'],
 'adult education': ['adult entertainment'],
 'adult entertainment': ['adult entertainment'],
 'advertising': ['business audience'],
 'aerial fitnes': ['3d model'],
 'aerial tour': ['tourist trip'],
 'aesthetician': ['beauty salon'],
 'afghan': ['country'],
 'african': ['continent'],
 'air conditioning': ['hvac business'],
 'air duct cleaning': ['cleaning fee'],
 'aircraft dealer': ['motorcycle dealer'],
 'aircraft repair': ['auto repair'],
 'airline': ['airline'],
 'airport': ['airport'],
 'airport lounge': ['airport'],
 'airport shuttle': ['airport'],
 'airport terminal': ['airport'],
 'airsoft': ['

In [None]:
mappings

Unnamed: 0,yelp_category,mapped_schema,similarity
0,hostel,hostel,1.000000
1,financial service,financial service,1.000000
2,pawn shop,pawn shop,1.000000
3,mosque,mosque,1.000000
4,pharmacy,pharmacy,1.000000
...,...,...,...
1411,countertop installation,interaction counter,0.304017
1412,conveyor belt sushi,canal,0.294110
1413,water heater installation,lake body of water,0.293358
1414,lighting fixture,city hall,0.282112


In [None]:
manual_mapping = pd.read_excel(get_path("yelp_schema_manual_mapping.xlsx"), names=["yelp_category", "manual_mapped_schema"], usecols=[0, 1]).dropna()[:200]

In [None]:
mapp = mappings.copy()

mapp['mapped_schema'][mapp['similarity'] < 0.0 / 100] = "None"
merged = mapp.merge(manual_mapping, on='yelp_category', how='right')
merged

Unnamed: 0,yelp_category,mapped_schema,similarity,manual_mapped_schema
0,aviation service,airline,0.736034,airline
1,veterinarian,veterinary care,0.772105,veterinary care
2,shaved snow,mountain,0.426003,
3,game truck rental,auto rental,0.638553,
4,hunting supply,how to supply,0.499186,
...,...,...,...,...
195,cooking class,cook action,0.630217,
196,soup,throat,0.492700,
197,college counseling,college or university,0.509525,
198,magazine,hardcover,0.550991,


In [None]:
lst = []

for threshold in range(101):
    mapp = mappings.copy()

    mapp['mapped_schema'][mapp['similarity'] < threshold / 100] = "None"

    merged = mapp.merge(manual_mapping, on='yelp_category', how='right')

    tp = ((merged['mapped_schema'] == merged['manual_mapped_schema']) & (merged['mapped_schema'] != "None")).sum()  # True positive
    fp = ((merged['mapped_schema'] != "None") & (merged['manual_mapped_schema'] != merged['mapped_schema'])).sum()  # False negative
    fn = ((merged['mapped_schema'] == "None") & (merged['manual_mapped_schema'] != "None")).sum()  # False positive

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    beta = 1
    f1 = (1+beta**2)*(precision * recall) / (beta**2*precision + recall)
    lst.append([threshold / 100, recall, precision, f1])

In [None]:
res = pd.DataFrame(lst, columns=["Threshold", "Recall", "Precision", u"F\u2081-score"])

In [None]:
res

Unnamed: 0,Threshold,Recall,Precision,F₁-score
0,0.00,1.000000,0.25,0.400000
1,0.01,1.000000,0.25,0.400000
2,0.02,1.000000,0.25,0.400000
3,0.03,1.000000,0.25,0.400000
4,0.04,1.000000,0.25,0.400000
...,...,...,...,...
96,0.96,0.245902,1.00,0.394737
97,0.97,0.245902,1.00,0.394737
98,0.98,0.245902,1.00,0.394737
99,0.99,0.245902,1.00,0.394737


In [None]:
import plotly.express as px

fig = px.line(res, x="Threshold", y=["Recall", "Precision", u"F\u2081-score"], title='')

fig['data'][0]['line']['color'] = "rgb(234,143,129)"
fig['data'][1]['line']['color'] = "rgb(255,191,0)"
fig['data'][2]['line']['color'] = "rgb(32,115,171)"

fig.add_vline(x=res["Threshold"][res[u"F\u2081-score"].idxmax()], line_width=1.5, line_dash="dash", line_color="gray")
fig.add_hrect(y0=-0.05,
              y1=1.05,
              x0=res["Threshold"][res[u"F\u2081-score"].idxmax()] - 0.01,
              x1=res["Threshold"][res["Precision"].idxmax()] - 0.01,
              line_width=0,
              fillcolor="red",
              opacity=0.1)

fig.update_layout(
    height=600,
    width=600,
    yaxis_title="",
    xaxis_title="Threshold (Cosine Similarity)",
    legend_title_text='',
    legend=dict(orientation="h",
                yanchor="top",
                y=1.08,
                xanchor="left",
                x=0.0),
    # plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(dtick=0.1, range=[-0.02,1.02])
fig.update_yaxes(dtick=0.1, range=[-0.02,1.02])

# fig.update_layout(dict(yaxis2={'anchor': 'x', 'overlaying': 'y', 'side': 'left'},
#                        yaxis={'anchor': 'x', 'domain': [0.0, 1.0], 'side':'right'}))

fig.show()

In [None]:
# possible_matches = co_sim_matrix.apply(lambda x: list(zip(x.abs().nlargest(5).index.tolist(), x.abs().nlargest(5).round(4).tolist())), axis=1)
# possible_matches

In [None]:
threshold = res["Threshold"][res[r"$F_1-score$"].idxmax()]
# threshold = 0.44

In [None]:
mapp = mappings.copy()

mapp['mapped_schema'][mapp['similarity'] < threshold] = "None"
mapp

Unnamed: 0,yelp_category,mapped_schema,similarity
0,hostel,hostel,1.000000
1,financial service,financial service,1.000000
2,pawn shop,pawn shop,1.000000
3,mosque,mosque,1.000000
4,pharmacy,pharmacy,1.000000
...,...,...,...
1411,countertop installation,,0.304017
1412,conveyor belt sushi,,0.294110
1413,water heater installation,,0.293358
1414,lighting fixture,,0.282112


In [None]:
schema_category_mappings_df = pd.read_csv(get_path("class_mappings_manual.csv"))
schema_category_mappings_dict = dict([(i, eval(x)) for i, x in zip(schema_category_mappings_df['YelpCategory'],
                                                                   schema_category_mappings_df['SchemaType'])])

In [None]:
mapped_categories = mapp[mapp["mapped_schema"] != "None"]["yelp_category"].tolist()

In [None]:
mapped_categories

['hostel',
 'financial service',
 'pawn shop',
 'mosque',
 'pharmacy',
 'rv park',
 'art gallery',
 'amusement park',
 'elementary school',
 'bakery',
 'adult entertainment',
 'real estate agent',
 'television station',
 'beach',
 'park',
 'department store',
 'bus station',
 'festival',
 'high school',
 'convenience store',
 'hobby shop',
 'aquarium',
 'florist',
 'shopping center',
 'notary',
 'preschool',
 'hair salon',
 'courthouse',
 'comedy club',
 'nail salon',
 'auto repair',
 'toy store',
 'playground',
 'train station',
 'book',
 'professional service',
 'cemetery',
 'optician',
 'library',
 'furniture store',
 'self storage',
 'locksmith',
 'buddhist temple',
 'child care',
 'animal shelter',
 'day spa',
 'church',
 'campground',
 'synagogue',
 'hospital',
 'recycling center',
 'resort',
 'sports club',
 'middle school',
 'electrician',
 'brewery',
 'hindu temple',
 'winery',
 'motorcycle dealer',
 'museum',
 'zoo',
 'post office',
 'shoe store',
 'internet cafe',
 'pet',
 '

In [None]:
yelp_categories_dict_singular

{'Kiteboarding': ['kiteboarding'],
 'Siding': ['siding'],
 'Furniture Stores': ['furniture store'],
 'Go Karts': ['go kart'],
 'Tax Services': ['tax service'],
 'Religious Schools': ['religious school'],
 'Body Contouring': ['body contouring'],
 'Cupcakes': ['cupcake'],
 'Coffeeshops': ['coffeeshop'],
 'Comic Books': ['comic book'],
 'Fertility': ['fertility'],
 'Brazilian Jiu-jitsu': ['brazilian jiu-jitsu'],
 'Public Transportation': ['public transportation'],
 'Landscaping': ['landscaping'],
 'Juice Bars & Smoothies': ['juice bar', 'smoothy'],
 'Montessori Schools': ['montessori school'],
 'Pizza': ['pizza'],
 'Butcher': ['butcher'],
 'Supernatural Readings': ['supernatural reading'],
 'Bike Parking': ['bike parking'],
 'Tai Chi': ['tai chi'],
 'Title Loans': ['title loan'],
 'Airports': ['airport'],
 'Hepatologists': ['hepatologist'],
 'Personal Care Services': ['personal care service'],
 'Bicycles': ['bicycle'],
 'Elder Law': ['elder law'],
 'Solar Installation': ['solar installati

In [None]:
x = set()
for k, v in yelp_categories_dict_singular.items():
    for i in v:
        if i in mapped_categories:
            x.add(k)

x

{'Accountants',
 'Addiction Medicine',
 'Adult Entertainment',
 'Airlines',
 'Airport Lounges',
 'Airport Shuttles',
 'Airport Terminals',
 'Airports',
 'Amateur Sports Teams',
 'Amusement Parks',
 'Anesthesiologists',
 'Animal Physical Therapy',
 'Animal Shelters',
 'Apartment Agents',
 'Apartments',
 'Appliances & Repair',
 'Aquarium Services',
 'Aquariums',
 'Art Classes',
 'Art Consultants',
 'Art Galleries',
 'Art Installation',
 'Art Museums',
 'Art Supplies',
 'Art Tours',
 'Arts & Crafts',
 'Arts & Entertainment',
 'Auto Parts & Supplies',
 'Auto Repair',
 'Automotive',
 'Aviation Services',
 'Ayurveda',
 'Bakeries',
 'Banks & Credit Unions',
 'Bars',
 'Battery Stores',
 'Beach Bars',
 'Beaches',
 'Beauty & Spas',
 'Bed & Breakfast',
 'Beer',
 'Beer Bar',
 'Beer Gardens',
 'Beer Tours',
 'Beverage Store',
 'Bicycles',
 'Bike Parking',
 'Bike Repair',
 'Bike Repair/Maintenance',
 'Bike Shop',
 'Bikes',
 'Billing Services',
 'Blow Dry/Out Services',
 'Boat Charters',
 'Boat Parts

In [None]:
x = set(schema_category_mappings_df["YelpCategory"].to_list())

In [None]:
lst_cat_series = biz["categories"].apply(str_split)

In [None]:
lst_cat_series

0         [Doctors, Traditional Chinese Medicine, Naturo...
1         [Shipping Centers, Local Services, Notaries, M...
2         [Department Stores, Shopping, Fashion, Home & ...
3         [Restaurants, Food, Bubble Tea, Coffee & Tea, ...
4                               [Brewpubs, Breweries, Food]
                                ...                        
150341                         [Nail Salons, Beauty & Spas]
150342    [Pets, Nurseries & Gardening, Pet Stores, Hobb...
150343    [Shopping, Jewelry, Piercing, Toy Stores, Beau...
150344    [Fitness/Exercise Equipment, Eyewear & Opticia...
150345    [Beauty & Spas, Permanent Makeup, Piercing, Ta...
Name: categories, Length: 150346, dtype: object

In [None]:
cnt = 0
for cats in lst_cat_series:
    if cats is not None:
        cats = set(cats)
        cnt += bool(cats & x)
cnt

142269

In [None]:
# Percentage business's mapped
cnt / len(lst_cat_series) * 100

94.62772538012318

In [None]:
# Found mappings
len(x) / len(yelp_categories_dict_singular.keys()) * 100

23.64607170099161