## Recommender Based on Universal Sentence Encoder

In [1]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import ast

In [2]:
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
EMBEDDED_CATEGORIES_FILE = 'data/embedded_categories.npy'
EMBEDDED_ATTRIBUTES_FILE = 'data/embedded_attributes.npy'

In [7]:
def cosine_similarity(mat, vec):
    """
    Calculate cosine similarity scores

    :param mat: 2D array of all targets
    :param vec: 1D array of vector to compare to
    :return: all cosine similarity scores to vec
    """
    vec_mag = np.linalg.norm(vec)
    mat_mag = np.linalg.norm(mat, axis=1).reshape((-1, 1))
    return (1/vec_mag) * np.dot(mat, vec).reshape((-1, 1)) / mat_mag

def return_attribute_soup(input):
    """
    Converts dictionary of inputs into list of items that are true.  Can also handle nested examples

    :param input: dictionary of attributes
    :return: list of keys
    """
    current = []
    for key in input:
        # Inner dictionaries appear to be malformed in places
        try:
            value = ast.literal_eval(input[key])
        except ValueError:
            value = input[key]
        if isinstance(value, dict):
            current.extend(return_attribute_soup(value))
        elif value not in (False, 'No', 'False'):
            current.append(key)
    return current

In [4]:
df_business = pd.read_csv('data/yelp_academic_dataset_business_filtered.csv')
df_business = df_business.dropna(subset=['categories'])
df_business['attributes'] = df_business['attributes'].fillna('{}')

In [5]:
df_business['attribute_dict'] = df_business['attributes'].apply(ast.literal_eval)
df_business['attribute_soup'] = df_business['attribute_dict'].apply(return_attribute_soup)
df_business['attribute_string'] = df_business['attribute_soup'].apply(lambda x: ' '.join(map(str, x)))

In [8]:
df_business.head()


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string
0,0,0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...","{'RestaurantsTableService': 'True', 'WiFi': 'u...","[RestaurantsTableService, WiFi, BikeParking, s...",RestaurantsTableService WiFi BikeParking stree...
1,1,1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[RestaurantsTakeOut, RestaurantsAttire, GoodFo...",RestaurantsTakeOut RestaurantsAttire GoodForKi...
2,5,5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...","{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[GoodForKids, Alcohol, RestaurantsGoodForGroup...",GoodForKids Alcohol RestaurantsGoodForGroups R...
3,10,10,rYs_1pNB_RMtn5WQh55QDA,Chautauqua General Store,100 Clematis Dr,Boulder,CO,80302,39.998449,-105.281006,3.5,5,1,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...","Food, Shopping, Convenience Stores, Souvenir S...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","{'BikeParking': 'True', 'RestaurantsTakeOut': ...","[BikeParking, RestaurantsTakeOut, street, Whee...",BikeParking RestaurantsTakeOut street Wheelcha...
4,12,12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...","{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","[RestaurantsGoodForGroups, HasTV, GoodForKids,...",RestaurantsGoodForGroups HasTV GoodForKids Res...


In [9]:
try:
    with open(EMBEDDED_CATEGORIES_FILE, 'rb') as f:
        embedded_categories = np.load(EMBEDDED_CATEGORIES_FILE, allow_pickle=True)

    with open(EMBEDDED_ATTRIBUTES_FILE, 'rb') as f:
        embedded_attributes = np.load(EMBEDDED_ATTRIBUTES_FILE, allow_pickle=True)

    print('EMBEDDINGS LOADED')

except IOError as e:
    print('ARRAY(S) NOT FOUND. EMBEDDING:')
    categories = list(df_business['categories'])
    embedded_categories = embedder(categories).numpy()
    np.save(EMBEDDED_CATEGORIES_FILE, embedded_categories)

    attributes = list(df_business['attribute_string'])
    embedded_attributes = embedder(attributes).numpy()
    np.save(EMBEDDED_ATTRIBUTES_FILE, embedded_attributes)
    print('EMBEDDINGS SAVED')


ARRAY(S) NOT FOUND. EMBEDDING:
EMBEDDINGS SAVED


In [63]:
def get_knn(input, embedded_attributes, embedded_categories, df_business, k=5, category_weighting=0.75, min_rating=-1):
    """
    Get the most similar businesses by knn search and sort

    :param input: input business, in the form of a Series
    :param embedded_attributes: services offered at the business
    :param embedded_categories: embedded categories (business type)
    :param df_business: DataFrame of all business info that must match embedded_attributes
    :param k: Number of top results desired
    :param category_weighting: emphasis placed on the contents of the restaurant.
    :return: DataFrame of similar businesses
    """
    assert df_business.shape[0] == embedded_categories.shape[0]
    assert df_business.shape[0] == embedded_attributes.shape[0]

    # Filter by state before continuing
    df_business_copy = df_business.copy()
    state = input['state']
    # List of all matching indices
    match_dex = df_business.index[df_business['state'] == state].tolist()

    embedded_attributes = embedded_attributes[match_dex]
    embedded_categories = embedded_categories[match_dex]
    df_business_copy = df_business.loc[match_dex, :]


    query_category = input.loc['categories']
    query_attributes = input.loc['attribute_string']
    print(input.loc['name'])
    embedded_query_category = embedder([query_category]).numpy().flatten()
    embedded_query_attributes = embedder([query_attributes]).numpy().flatten()

    categ_simil_scores = cosine_similarity(embedded_categories, embedded_query_category).flatten()
    attri_simil_scores = cosine_similarity(embedded_attributes, embedded_query_attributes).flatten()

    weighted_simil_scores = category_weighting * categ_simil_scores + (1 - category_weighting) + attri_simil_scores
    weighted_simil_scores *= 0.5

    df_business_copy['scores'] = weighted_simil_scores
    indices = np.argsort(weighted_simil_scores)[-k:][::-1]
    ret = df_business_copy.iloc[indices, :]
    return ret[ret['stars'] >= min_rating]

In [64]:
df_business.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string
0,0,0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...","{'RestaurantsTableService': 'True', 'WiFi': 'u...","[RestaurantsTableService, WiFi, BikeParking, s...",RestaurantsTableService WiFi BikeParking stree...
1,1,1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[RestaurantsTakeOut, RestaurantsAttire, GoodFo...",RestaurantsTakeOut RestaurantsAttire GoodForKi...
2,5,5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...","{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[GoodForKids, Alcohol, RestaurantsGoodForGroup...",GoodForKids Alcohol RestaurantsGoodForGroups R...
3,10,10,rYs_1pNB_RMtn5WQh55QDA,Chautauqua General Store,100 Clematis Dr,Boulder,CO,80302,39.998449,-105.281006,3.5,5,1,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...","Food, Shopping, Convenience Stores, Souvenir S...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","{'BikeParking': 'True', 'RestaurantsTakeOut': ...","[BikeParking, RestaurantsTakeOut, street, Whee...",BikeParking RestaurantsTakeOut street Wheelcha...
4,12,12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...","{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","[RestaurantsGoodForGroups, HasTV, GoodForKids,...",RestaurantsGoodForGroups HasTV GoodForKids Res...


In [65]:
results = get_knn(df_business.iloc[2, :], embedded_attributes, embedded_categories, df_business, category_weighting=0.5)
results.head()

Bob Likes Thai Food


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string,scores
2,5,5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...","{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[GoodForKids, Alcohol, RestaurantsGoodForGroup...",GoodForKids Alcohol RestaurantsGoodForGroups R...,1.0
25424,78195,78195,VyHdjx6K9XNfjsh2_2PqFA,Thida Thai Resturant,1193 Davie St,Vancouver,BC,V6E 1N2,49.281778,-123.132975,3.0,55,1,"{'GoodForKids': 'True', 'Caters': 'False', 'Re...","Restaurants, Thai","{'Monday': '11:30-22:0', 'Tuesday': '11:30-22:...","{'GoodForKids': 'True', 'Caters': 'False', 'Re...","[GoodForKids, RestaurantsReservations, street,...",GoodForKids RestaurantsReservations street Res...,0.944792
8012,24884,24884,iBun5rZsOClH0rhyg7FUlA,U & I Thai Fine Cuisine,3364 Cambie Street,Vancouver,BC,V5Y 2A1,49.255267,-123.115072,3.5,167,0,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Restaurants, Thai","{'Monday': '11:30-22:0', 'Tuesday': '11:30-22:...","{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","[RestaurantsGoodForGroups, Alcohol, BikeParkin...",RestaurantsGoodForGroups Alcohol BikeParking W...,0.935762
32917,101012,101012,jR5KXCNnG89b8qPM_5mRbA,Baan Wasana Thai Restaurant,2143 41st Avenue W,Vancouver,BC,V6M 1Z6,49.234719,-123.156672,3.5,55,0,"{'Ambience': ""{'romantic': False, 'intimate': ...","Restaurants, Thai","{'Tuesday': '17:0-22:0', 'Wednesday': '17:0-22...","{'Ambience': '{'romantic': False, 'intimate': ...","[casual, WiFi, street, NoiseLevel, Restaurants...",casual WiFi street NoiseLevel RestaurantsReser...,0.932708
20364,62748,62748,CdBrsk1y814YgzVfxQ0tnQ,Urban Thai Bistro,1119 Hamilton Street,Vancouver,BC,V6B 5P6,49.275809,-123.121512,3.0,165,0,"{'OutdoorSeating': 'True', 'RestaurantsPriceRa...","Restaurants, Thai","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3...","{'OutdoorSeating': 'True', 'RestaurantsPriceRa...","[OutdoorSeating, RestaurantsPriceRange2, Resta...",OutdoorSeating RestaurantsPriceRange2 Restaura...,0.931687


### Load the users and their reviews into a train and testable form
Given a user:
- Find all reviews produced by this user
- Filter to only positive reviews (4 stars or above)
- For each positive review, find the top 5 most similar restaurants in the area specified by the review (it's unlikely the user would be willing to travel across the country to find a restaurant that might just be similar)
- Pool all of the results, and sort by similarity scores.  Therefore, the user should have a collection of potential places to visit.

By using a text embedder to calculate similarity, a training set isn't truly needed:
- Require at least 2 good reviews from each test user, otherwise skip the training instance although a larger threshold for minimum number of reviews may make more sense.
- For each user, having $$n$$ reviews, we randomly use 50% of the reviews for generating similar results.  Then, follow the above results to obtain $5n/2$ predictions.
- We examine whether the predictions and remaining $n/2$ test reviews have _any_ intersection (although we can change this later).


In [33]:
df_users = pd.read_csv('data/yelp_academic_dataset_user.csv')
df_reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [35]:
df_reviews.shape[0]
df_users.shape[0]

2189457

In [None]:
# Set of restaurant IDs
restaurant_set = set(df_business['business_id'].values)

In [37]:
df_reviews_final = df_reviews[df_reviews['business_id'].isin(restaurant_set)]
# TODO: remove bad reviews here?

In [38]:
df_reviews_sampled = df_reviews_final.sample(n=100_000, replace=False)

In [39]:
user_id_set = set(df_reviews_sampled['user_id'].values)
df_users_sampled = df_users[df_users['user_id'].isin(user_id_set)]

In [49]:
df_user_and_review = pd.merge(df_reviews_sampled, df_users_sampled, on='user_id')
df_all_filtered = df_user_and_review[df_user_and_review.groupby('user_id').transform('count')['review_id'] > 4]

In [59]:
user_id_set = set(df_all_filtered['user_id'].values)

# for user_id in user_id_set:
user_iter = iter(user_id_set)
for dex in range(2):
    user_id = next(user_iter)
    df_individual_reviews = df_all_filtered[df_all_filtered['user_id'] == user_id]
    all_rec = []
    for business_id in df_individual_reviews['business_id']:
        df_input_business = df_business[df_business['business_id'] == business_id].iloc[0, :]
        all_rec.append(get_knn(df_input_business, embedded_attributes, embedded_categories, df_business, category_weighting=0.5))
    pd.concat(all_rec)

Insomnia Cookies
GENKI Noodles and Sushi - Buckhead
Villa Vino
Il Bacio Pizzeria & Trattoria
Everybody's Pizza
Sushi House
One Star Ranch
Whole Foods Market
The Kroger Company
The Righteous Room
Baraonda Italian Restaurant
Cypress Street Pint & Plate
El Potro Mexican Restaurant
Cook Out
Allegro
       Unnamed: 0  Unnamed: 0.1             business_id  \
39211      119875        119875  97vNRu9r0LfqIf-CCOxD5w   
43092      131735        131735  RoOtGXMwxmaoJ-c_nKRQ6Q   
50484      153867        153867  0EILBP5TxMPujUxkAfjAaA   
35084      107458        107458  GxHtrn6FJy-eaBjBXbDiJA   
39780      121671        121671  33vTX7D5up56a1dm_AGsUQ   
...           ...           ...                     ...   
36754      112427        112427  YWTSQHwFpbVqm_wM-3te5w   
5240        16123         16123  rrQoE-CoQpWIMH6pla56Kw   
48638      148362        148362  ISZ7oCFzeb0JWG9ZAt9FCw   
47707      145634        145634  T8bpbSANpBoChPWbKb-LYg   
19577       60196         60196  KtzLhMuCE7a7k7bB95cVFA

In [52]:
df_business.iloc[2, :]

Unnamed: 0                                                          5
Unnamed: 0.1                                                        5
business_id                                    D4JtQNTI4X3KcbzacDJsMw
name                                              Bob Likes Thai Food
address                                                  3755 Main St
city                                                        Vancouver
state                                                              BC
postal_code                                                       V5V
latitude                                                    49.251342
longitude                                                 -123.101333
stars                                                             3.5
review_count                                                      169
is_open                                                             1
attributes          {'GoodForKids': 'True', 'Alcohol': "u'none'", ...
categories          

TypeError: 'set' object is not subscriptable