## Recommender Based on Universal Sentence Encoder

In [1]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import ast

In [9]:
# Cache locations (NumPy .npy files) for the precomputed sentence embeddings.
EMBEDDED_ATTRIBUTES_FILE = 'data/embedded_attributes.npy'
EMBEDDED_CATEGORIES_FILE = 'data/embedded_categories.npy'

# Universal Sentence Encoder v4, fetched through TensorFlow Hub. Later cells
# call `embedder` on lists of strings and convert the result with `.numpy()`.
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [3]:
def cosine_similarity(mat, vec):
    """
    Cosine similarity between every row of a matrix and a single vector

    :param mat: 2D array, one candidate vector per row
    :param vec: 1D array, the query vector every row is compared against
    :return: column array of shape (n_rows, 1) holding one score per row
    """
    # Norm of each row, shaped as a column so it broadcasts against the dot products.
    row_norms = np.linalg.norm(mat, axis=1).reshape((-1, 1))
    inverse_query_norm = 1 / np.linalg.norm(vec)
    # Unnormalised similarities: dot product of each row with the query.
    dot_products = np.dot(mat, vec).reshape((-1, 1))
    return inverse_query_norm * dot_products / row_norms

def return_attribute_soup(input):
    """
    Converts dictionary of inputs into list of items that are true.  Can also handle nested examples

    Values are usually strings holding Python literals (e.g. ``'True'``, ``"u'free'"`` or a
    stringified dictionary).  Each string is parsed with ``ast.literal_eval``; when parsing
    fails the raw value is used as-is.  Nested dictionaries are flattened, contributing their
    own (inner) keys rather than the parent key.

    A key is dropped when its value is negative: ``False``, ``None``, ``0`` or, ignoring case
    and surrounding whitespace, one of the strings ``'no'``, ``'none'``, ``'false'``.

    :param input: dictionary of attributes
    :return: list of keys
    """
    negative_strings = ('no', 'none', 'false')
    current = []
    for key in input:
        value = input[key]
        if isinstance(value, str):
            # Inner dictionaries appear to be malformed in places.  literal_eval raises
            # SyntaxError for text that is not valid Python (e.g. "casual dress") and
            # ValueError for valid syntax that is not a literal; keep the raw text then.
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass

        if isinstance(value, dict):
            current.extend(return_attribute_soup(value))
            continue

        if isinstance(value, str):
            # "u'no'" / "u'none'" parse to plain strings that still mean "not offered".
            is_negative = value.strip().lower() in negative_strings
        else:
            is_negative = value in (False, None)
        if not is_negative:
            current.append(key)
    return current

In [4]:
# Load the business table, keep only rows that have categories, and give rows
# without attributes an empty-dict literal so they can be parsed like the rest.
df_business = (
    pd.read_csv('data/yelp_academic_dataset_business.csv')
    .dropna(subset=['categories'])
    .assign(attributes=lambda frame: frame['attributes'].fillna('{}'))
)

In [5]:
# Step 1: turn each raw attribute string into a Python dictionary.
parsed_attributes = df_business['attributes'].apply(ast.literal_eval)
df_business['attribute_dict'] = parsed_attributes

# Step 2: flatten every dictionary into the list of keys that return_attribute_soup keeps.
kept_keys = parsed_attributes.apply(return_attribute_soup)
df_business['attribute_soup'] = kept_keys

# Step 3: join the keys with spaces so each business has a single string to embed.
df_business['attribute_string'] = kept_keys.apply(
    lambda keys: ' '.join(str(key) for key in keys)
)

In [6]:
# Preview the first rows, including the attribute_dict / attribute_soup /
# attribute_string columns derived from `attributes`.
df_business.head()


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string
0,0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...","{'RestaurantsTableService': 'True', 'WiFi': 'u...","[RestaurantsTableService, WiFi, BikeParking, s...",RestaurantsTableService WiFi BikeParking stree...
1,1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[RestaurantsTakeOut, RestaurantsAttire, GoodFo...",RestaurantsTakeOut RestaurantsAttire GoodForKi...
2,2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0...","{'BusinessAcceptsCreditCards': 'True', 'Restau...","[BusinessAcceptsCreditCards, RestaurantsPriceR...",BusinessAcceptsCreditCards RestaurantsPriceRan...
3,3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...
4,4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': ""{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'...","{'GoodForKids': 'False', 'BusinessParking': '{...",[BusinessAcceptsCreditCards],BusinessAcceptsCreditCards


In [10]:
# Load the cached embeddings when both files exist; otherwise embed the
# category and attribute strings with `embedder` and write the cache.
try:
    # np.load opens the path itself, so no separate open() handle is needed.
    # The cache holds plain float arrays written by np.save below; loading with
    # allow_pickle=False avoids executing pickled code from a tampered file.
    embedded_categories = np.load(EMBEDDED_CATEGORIES_FILE, allow_pickle=False)
    embedded_attributes = np.load(EMBEDDED_ATTRIBUTES_FILE, allow_pickle=False)

    print('EMBEDDINGS LOADED')

except IOError:
    # Either file missing/unreadable: recompute both so the two arrays stay in step.
    print('ARRAY(S) NOT FOUND. EMBEDDING:')
    categories = list(df_business['categories'])
    embedded_categories = embedder(categories).numpy()
    np.save(EMBEDDED_CATEGORIES_FILE, embedded_categories)

    attributes = list(df_business['attribute_string'])
    embedded_attributes = embedder(attributes).numpy()
    np.save(EMBEDDED_ATTRIBUTES_FILE, embedded_attributes)
    print('EMBEDDINGS SAVED')

ARRAY(S) NOT FOUND. EMBEDDING:
EMBEDDINGS SAVED


In [14]:
# NOTE(review): the recorded output contains a single array, i.e. only the last
# expression is echoed; the bare `embedded_attributes` line displays nothing.
embedded_attributes
embedded_categories

array([[-0.0276633 ,  0.00761654, -0.02711659, ..., -0.03886824,
        -0.04586763, -0.08839873],
       [-0.0705768 ,  0.00129631, -0.06778074, ..., -0.06274288,
        -0.04824021,  0.0047615 ],
       [-0.04613002,  0.02605862, -0.02970327, ...,  0.00173576,
        -0.00088245, -0.02037748],
       ...,
       [-0.00770766,  0.03912623,  0.04667362, ..., -0.00783631,
        -0.0218939 , -0.06485061],
       [ 0.08441994, -0.02555969,  0.04712285, ...,  0.00609742,
         0.02619056, -0.04860131],
       [-0.07634011, -0.01020356, -0.02329722, ..., -0.06450093,
        -0.059017  , -0.07953931]], dtype=float32)

In [66]:
def get_knn(input, embedded_attributes, embedded_categories, df_business, k=5, category_weighting=0.75, min_rating=-1):
    """
    Get the most similar businesses by knn search and sort

    Every business is scored with a weighted average of two cosine similarities to the
    query business::

        score = category_weighting * category_similarity
                + (1 - category_weighting) * attribute_similarity

    so for a weighting between 0 and 1 the score stays within [-1, 1].  The query text is
    embedded with the module-level ``embedder`` and the query's name is printed.

    :param input: input business, in the form of a Series with 'name', 'categories' and
        'attribute_string' entries
    :param embedded_attributes: embedded attribute strings (services offered), one row per business
    :param embedded_categories: embedded categories (business type), one row per business
    :param df_business: DataFrame of all business info; its rows must be in the same order as
        the rows of embedded_attributes and embedded_categories
    :param k: Number of top results desired
    :param category_weighting: weight given to category similarity; the remainder
        (1 - category_weighting) is given to attribute similarity
    :param min_rating: minimum 'stars' value a result must have.  Applied after the top k are
        selected, so fewer than k rows may be returned
    :return: DataFrame of similar businesses, best match first, with an added 'scores' column
    """
    assert df_business.shape[0] == embedded_categories.shape[0]
    assert df_business.shape[0] == embedded_attributes.shape[0]
    query_category = input.loc['categories']
    query_attributes = input.loc['attribute_string']
    print(input.loc['name'])
    embedded_query_category = embedder([query_category]).numpy().flatten()
    embedded_query_attributes = embedder([query_attributes]).numpy().flatten()

    categ_simil_scores = cosine_similarity(embedded_categories, embedded_query_category).flatten()
    attri_simil_scores = cosine_similarity(embedded_attributes, embedded_query_attributes).flatten()

    # Weighted average: the attribute term is multiplied by its weight.  (It used to be
    # added unweighted next to a constant, which made category_weighting ineffective for
    # attributes and needed an extra 0.5 rescale.)
    weighted_simil_scores = (
        category_weighting * categ_simil_scores
        + (1 - category_weighting) * attri_simil_scores
    )

    # Reverse first, then slice: same order as taking the last k and reversing, but k=0
    # yields no rows instead of all of them.
    indices = np.argsort(weighted_simil_scores)[::-1][:k]
    # Copy only the selected rows rather than the whole table before adding the score column.
    ret = df_business.iloc[indices, :].copy()
    ret['scores'] = weighted_simil_scores[indices]
    return ret[ret['stars'] >= min_rating]

In [70]:
# Example query: the first row of df_business, with category_weighting=0.5.
# NOTE(review): the recorded output below prints "Great Clips", which is row 3 in the
# head() preview, while row 0 there is "Oskar Blues Taproom" - the saved output looks
# like it came from a different query; re-run to refresh it.
get_knn(
    df_business.iloc[0],
    embedded_attributes,
    embedded_categories,
    df_business,
    category_weighting=0.5,
)

Great Clips


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string,scores
10542,10542,KlVryc9EjFEwSwPzH2fbFg,Kadi Professional Hair Braiding,3167 E Main St,Columbus,OH,43213,39.956268,-82.913651,2.0,6,1,"{'BusinessParking': ""{'garage': False, 'street...","Beauty & Spas, Hair Salons",,"{'BusinessParking': '{'garage': False, 'street...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...,1.0
90849,90849,2SUKMNpuZGNWvwL6Yo9zKg,James Joseph Salon,666 Market St,Lynnfield,MA,1940,42.516201,-71.035242,4.0,11,1,"{'RestaurantsPriceRange2': '3', 'BusinessAccep...","Beauty & Spas, Hair Salons","{'Monday': '11:0-19:0', 'Tuesday': '10:0-20:0'...","{'RestaurantsPriceRange2': '3', 'BusinessAccep...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...,1.0
67658,67658,LznIz_siLV87z3_JZENLxg,Pigtails & Crewcuts,6618 Sitio Del Rio Blvd,Austin,TX,78730,30.391965,-97.845492,3.5,6,0,"{'RestaurantsPriceRange2': '2', 'ByAppointment...","Beauty & Spas, Hair Salons","{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'...","{'RestaurantsPriceRange2': '2', 'ByAppointment...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...,1.0
40613,40613,lrgoAgYR_TsU7P0R9kpjvw,Great Clips,6365 Perimeter Dr,Dublin,OH,43016,40.10458,-83.161648,1.5,8,1,"{'ByAppointmentOnly': 'False', 'BikeParking': ...","Beauty & Spas, Hair Salons",,"{'ByAppointmentOnly': 'False', 'BikeParking': ...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...,1.0
103475,103475,CPJg0Z84Nb-4N2Ghs2HE0A,Le Salon,"5150 Buford Hwy NE, Ste C190",Doraville,GA,30340,33.892665,-84.285492,4.0,7,1,"{'RestaurantsPriceRange2': '1', 'BikeParking':...","Beauty & Spas, Hair Salons",,"{'RestaurantsPriceRange2': '1', 'BikeParking':...","[RestaurantsPriceRange2, BusinessAcceptsCredit...",RestaurantsPriceRange2 BusinessAcceptsCreditCa...,1.0


How to improve this system:
- Semantically embedding `attributes` doesn't appear to contribute much to the ranking; would exact term matching on the attribute keys work better?
- Add the ability to filter by area or by distance from the user. Could the minimum rating be applied without reducing the number of results returned?
- Add ability to update on the fly, replacing `stars` by aggregating scores from the users?
- Add a description text option, where reviews from many users are aggregated and used semantically, rather than just comparing restaurant categories.

