## Recommender Based on Universal Sentence Encoder

In [2]:
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import ast

In [3]:
embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
EMBEDDED_CATEGORIES_FILE = 'data/embedded_categories.npy'
EMBEDDED_ATTRIBUTES_FILE = 'data/embedded_attributes.npy'

In [4]:
def cosine_similarity(mat, vec):
    """
    Calculate cosine similarity scores

    :param mat: 2D array of all targets
    :param vec: 1D array of vector to compare to
    :return: all cosine similarity scores to vec
    """
    vec_mag = np.linalg.norm(vec)
    mat_mag = np.linalg.norm(mat, axis=1).reshape((-1, 1))
    return (1/vec_mag) * np.dot(mat, vec).reshape((-1, 1)) / mat_mag

def return_attribute_soup(input):
    """
    Converts dictionary of inputs into list of items that are true.  Can also handle nested examples

    :param input: dictionary of attributes
    :return: list of keys
    """
    current = []
    for key in input:
        # Inner dictionaries appear to be malformed in places
        try:
            value = ast.literal_eval(input[key])
        except ValueError:
            value = input[key]
        if isinstance(value, dict):
            current.extend(return_attribute_soup(value))
        elif value not in (False, 'No', 'False'):
            current.append(key)
    return current

In [5]:
df_business = pd.read_csv('data/yelp_academic_dataset_business_filtered.csv')
df_business = df_business.dropna(subset=['categories'])
df_business['attributes'] = df_business['attributes'].fillna('{}')

In [6]:
df_business['attribute_dict'] = df_business['attributes'].apply(ast.literal_eval)
df_business['attribute_soup'] = df_business['attribute_dict'].apply(return_attribute_soup)
df_business['attribute_string'] = df_business['attribute_soup'].apply(lambda x: ' '.join(map(str, x)))

In [7]:
df_business.head()


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string
0,0,0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...","{'RestaurantsTableService': 'True', 'WiFi': 'u...","[RestaurantsTableService, WiFi, BikeParking, s...",RestaurantsTableService WiFi BikeParking stree...
1,1,1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[RestaurantsTakeOut, RestaurantsAttire, GoodFo...",RestaurantsTakeOut RestaurantsAttire GoodForKi...
2,5,5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...","{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[GoodForKids, Alcohol, RestaurantsGoodForGroup...",GoodForKids Alcohol RestaurantsGoodForGroups R...
3,10,10,rYs_1pNB_RMtn5WQh55QDA,Chautauqua General Store,100 Clematis Dr,Boulder,CO,80302,39.998449,-105.281006,3.5,5,1,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...","Food, Shopping, Convenience Stores, Souvenir S...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","{'BikeParking': 'True', 'RestaurantsTakeOut': ...","[BikeParking, RestaurantsTakeOut, street, Whee...",BikeParking RestaurantsTakeOut street Wheelcha...
4,12,12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...","{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","[RestaurantsGoodForGroups, HasTV, GoodForKids,...",RestaurantsGoodForGroups HasTV GoodForKids Res...


In [8]:
try:
    with open(EMBEDDED_CATEGORIES_FILE, 'rb') as f:
        embedded_categories = np.load(EMBEDDED_CATEGORIES_FILE, allow_pickle=True)

    with open(EMBEDDED_ATTRIBUTES_FILE, 'rb') as f:
        embedded_attributes = np.load(EMBEDDED_ATTRIBUTES_FILE, allow_pickle=True)

    print('EMBEDDINGS LOADED')

except IOError as e:
    print('ARRAY(S) NOT FOUND. EMBEDDING:')
    categories = list(df_business['categories'])
    embedded_categories = embedder(categories).numpy()
    np.save(EMBEDDED_CATEGORIES_FILE, embedded_categories)

    attributes = list(df_business['attribute_string'])
    embedded_attributes = embedder(attributes).numpy()
    np.save(EMBEDDED_ATTRIBUTES_FILE, embedded_attributes)
    print('EMBEDDINGS SAVED')


EMBEDDINGS LOADED


In [9]:
def get_knn(input, embedded_attributes, embedded_categories, df_business, k=5, category_weighting=0.75, min_rating=-1):
    """
    Get the most similar businesses by knn search and sort

    :param input: input business, in the form of a Series
    :param embedded_attributes: services offered at the business
    :param embedded_categories: embedded categories (business type)
    :param df_business: DataFrame of all business info that must match embedded_attributes
    :param k: Number of top results desired
    :param category_weighting: emphasis placed on the contents of the restaurant.
    :return: DataFrame of similar businesses
    """
    assert df_business.shape[0] == embedded_categories.shape[0]
    assert df_business.shape[0] == embedded_attributes.shape[0]

    # Filter by state before continuing
    df_business_copy = df_business.copy()
    state = input['state']
    query_id = input['business_id']
    # List of all matching indices
    match_dex = df_business.index[df_business['state'] == state].tolist()

    embedded_attributes = embedded_attributes[match_dex]
    embedded_categories = embedded_categories[match_dex]
    df_business_copy = df_business.loc[match_dex, :]


    query_category = input.loc['categories']
    query_attributes = input.loc['attribute_string']
    print("INPUT BUSINESS:", input.loc['name'])
    embedded_query_category = embedder([query_category]).numpy().flatten()
    embedded_query_attributes = embedder([query_attributes]).numpy().flatten()

    categ_simil_scores = cosine_similarity(embedded_categories, embedded_query_category).flatten()
    attri_simil_scores = cosine_similarity(embedded_attributes, embedded_query_attributes).flatten()

    weighted_simil_scores = category_weighting * categ_simil_scores + (1 - category_weighting) + attri_simil_scores
    weighted_simil_scores *= 0.5

    df_business_copy['scores'] = weighted_simil_scores

    # After filtering, choose the smallest, account for perfect match
    available_businesses = weighted_simil_scores.shape[0]
    k = min(k + 1, available_businesses)

    indices = np.argsort(weighted_simil_scores)[-k:][::-1]
    ret = df_business_copy.iloc[indices, :]
    return ret[(ret['stars'] >= min_rating) & (ret['business_id'] != query_id)]

In [10]:
df_business.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string
0,0,0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...","{'RestaurantsTableService': 'True', 'WiFi': 'u...","[RestaurantsTableService, WiFi, BikeParking, s...",RestaurantsTableService WiFi BikeParking stree...
1,1,1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[RestaurantsTakeOut, RestaurantsAttire, GoodFo...",RestaurantsTakeOut RestaurantsAttire GoodForKi...
2,5,5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...","{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[GoodForKids, Alcohol, RestaurantsGoodForGroup...",GoodForKids Alcohol RestaurantsGoodForGroups R...
3,10,10,rYs_1pNB_RMtn5WQh55QDA,Chautauqua General Store,100 Clematis Dr,Boulder,CO,80302,39.998449,-105.281006,3.5,5,1,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...","Food, Shopping, Convenience Stores, Souvenir S...","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","{'BikeParking': 'True', 'RestaurantsTakeOut': ...","[BikeParking, RestaurantsTakeOut, street, Whee...",BikeParking RestaurantsTakeOut street Wheelcha...
4,12,12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...","{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","[RestaurantsGoodForGroups, HasTV, GoodForKids,...",RestaurantsGoodForGroups HasTV GoodForKids Res...


In [11]:
%%time
results = get_knn(df_business.iloc[3, :], embedded_attributes, embedded_categories, df_business, category_weighting=0.5)
results.head()

INPUT BUSINESS: Chautauqua General Store
CPU times: user 426 ms, sys: 38.7 ms, total: 465 ms
Wall time: 463 ms


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,attribute_dict,attribute_soup,attribute_string,scores
6644,20510,20510,Wzm2YtE9wXyhPXnx0JIV8A,Assorted Goods and Candy,706 Front St,Louisville,CO,80027,39.977083,-105.130589,4.5,14,1,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Food, Specialty Food, Toy Stores, Shopping, Ca...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-17:0', ...","{'BikeParking': 'True', 'BusinessAcceptsCredit...","[BikeParking, BusinessAcceptsCreditCards, Rest...",BikeParking BusinessAcceptsCreditCards Restaur...,0.794812
28605,87791,87791,2VUGEP7yFC7fex9xIf0F8g,Where the Buffalo Roam,1320 Pearl St Frnt,Boulder,CO,80302,40.018185,-105.27803,3.5,12,1,"{'BusinessParking': ""{'garage': False, 'street...","Souvenir Shops, Fashion, Men's Clothing, Shopping","{'Monday': '0:0-0:0', 'Tuesday': '11:0-18:0', ...","{'BusinessParking': '{'garage': False, 'street...","[street, RestaurantsPriceRange2, BikeParking, ...",street RestaurantsPriceRange2 BikeParking Busi...,0.78567
35038,107340,107340,pECKvlZBwUORYC3bnZ65Zw,Rocket Fizz,1441 Pearl St,Boulder,CO,80302,40.01859,-105.276827,3.5,38,1,"{'RestaurantsTakeOut': 'False', 'Caters': 'Fal...","Food, Candy Stores, Specialty Food","{'Monday': '10:0-20:30', 'Tuesday': '10:0-20:3...","{'RestaurantsTakeOut': 'False', 'Caters': 'Fal...","[RestaurantsPriceRange2, BikeParking, street, ...",RestaurantsPriceRange2 BikeParking street Busi...,0.73821
15556,47884,47884,oCsgfMHFvraMQfvfuSJheg,"Piece, Love and Chocolate",805 Pearl St,Boulder,CO,80302,40.017113,-105.284825,4.5,196,1,"{'BusinessParking': ""{'garage': False, 'street...","Chocolatiers & Shops, Bakeries, Specialty Food...","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ...","{'BusinessParking': '{'garage': False, 'street...","[street, BikeParking, RestaurantsPriceRange2, ...",street BikeParking RestaurantsPriceRange2 Busi...,0.724674
44319,135338,135338,HkcQyjtDxn2Bu_yBHBOAug,Crystal Springs Brewing Company,"657 S Taylor Ave, Unit E",Louisville,CO,80027,39.960583,-105.120277,4.5,43,1,"{'BusinessAcceptsBitcoin': 'False', 'BikeParki...","Breweries, Food","{'Monday': '0:0-0:0', 'Thursday': '15:0-18:0',...","{'BusinessAcceptsBitcoin': 'False', 'BikeParki...","[BikeParking, WheelchairAccessible, Restaurant...",BikeParking WheelchairAccessible RestaurantsPr...,0.71632


### Load the users and their reviews into a train and testable form
Given a user:
- Find all reviews produced by this user
- Filter to only positive reviews (4 stars or above)
- For each positive review, find the top 5 most similar restaurants in the area specified by the review (it's unlikely the user would be willing to travel across the country to find a restaurant that might just be similar)
- Pool all of the results, and sort by similarity scores.  Therefore, the user should have a collection of potential places to visit.

By using a text embedder to calculate similarity, a training set isn't truly needed:
- Require at least 2 good reviews from each test user, otherwise skip the training instance although a larger threshold for minimum number of reviews may make more sense.
- For each user, having $$n$$ reviews, we randomly use 50% of the reviews for generating similar results.  Then, follow the above results to obtain $5n/2$ predictions.
- We examine whether the predictions and remaining $n/2$ test reviews have _any_ intersection (although we can change this later).


In [12]:
df_users = pd.read_csv('data/yelp_academic_dataset_user.csv')
df_reviews = pd.read_csv('data/yelp_academic_dataset_review.csv')

In [13]:
df_reviews.shape[0]
df_users.shape[0]

2189457

In [14]:
# Set of restaurant IDs
restaurant_set = set(df_business['business_id'].values)

In [15]:
df_reviews_final = df_reviews[df_reviews['business_id'].isin(restaurant_set)]
# TODO: remove bad reviews here?

In [16]:
df_reviews_sampled = df_reviews_final.sample(n=100_000, replace=False)

In [17]:
user_id_set = set(df_reviews_sampled['user_id'].values)
df_users_sampled = df_users[df_users['user_id'].isin(user_id_set)]

In [18]:
df_user_and_review = pd.merge(df_reviews_sampled, df_users_sampled, on='user_id')
df_all_filtered = df_user_and_review[df_user_and_review.groupby('user_id').transform('count')['review_id'] > 4]

In [19]:
user_id_set = set(df_all_filtered['user_id'].values)

# for user_id in user_id_set:
user_iter = iter(user_id_set)
for dex in range(2):
    user_id = next(user_iter)
    df_individual_reviews = df_all_filtered[df_all_filtered['user_id'] == user_id]
    all_rec = []
    for business_id in df_individual_reviews['business_id']:
        df_input_business = df_business[df_business['business_id'] == business_id].iloc[0, :]
        all_rec.append(get_knn(df_input_business, embedded_attributes, embedded_categories, df_business, category_weighting=0.5))
    pd.concat(all_rec)

INPUT BUSINESS: All Seasons Table
INPUT BUSINESS: BerryLine
INPUT BUSINESS: BrickFire Pizza
INPUT BUSINESS: Caramel French Patisserie
INPUT BUSINESS: Flour Bakery + Café
INPUT BUSINESS: Centre Street Café
INPUT BUSINESS: Same Old Place
INPUT BUSINESS: Whole Foods Market
INPUT BUSINESS: Gitto's Pizza
INPUT BUSINESS: Donut Day
INPUT BUSINESS: The Hall Street Local
INPUT BUSINESS: Dapper Duck Bar
INPUT BUSINESS: Akira Sushi


In [52]:
df_business.iloc[2, :]

Unnamed: 0                                                          5
Unnamed: 0.1                                                        5
business_id                                    D4JtQNTI4X3KcbzacDJsMw
name                                              Bob Likes Thai Food
address                                                  3755 Main St
city                                                        Vancouver
state                                                              BC
postal_code                                                       V5V
latitude                                                    49.251342
longitude                                                 -123.101333
stars                                                             3.5
review_count                                                      169
is_open                                                             1
attributes          {'GoodForKids': 'True', 'Alcohol': "u'none'", ...
categories          

In [20]:
def get_all_user_recommendations(user_id, df_individual_reviews, df_business, embedded_attributes, embedded_categories):
    """
    Gets a DataFrame of all recommendations.  If the user reviews n items positively, return 5n items that they might enjoy.
    :param user_id: id of the queried user
    :param df_individual_reviews: reviews from a given user.
    :param df_business: DataFrame including all potential businesses
    :return: DataFrame of all recommendations
    """
    all_rec = []
    for business_id in df_individual_reviews['business_id']:
        # Retrieve first instance
        df_input_business = df_business[df_business['business_id'] == business_id].iloc[0, :]
        # Append all corresponding predictions
        all_rec.append(get_knn(df_input_business, embedded_attributes, embedded_categories, df_business, category_weighting=0.5))
    return pd.concat(all_rec)

In [33]:
user_iter = iter(user_id_set)
user_id = next(user_iter)
df_individual_reviews = df_all_filtered[df_all_filtered['user_id'] == user_id]
pd.merge(df_individual_reviews.columns, df_business)

array(['Unnamed: 0_x', 'review_id', 'user_id', 'business_id', 'stars',
       'useful_x', 'funny_x', 'cool_x', 'text', 'date', 'Unnamed: 0_y',
       'name', 'review_count', 'yelping_since', 'useful_y', 'funny_y',
       'cool_y', 'elite', 'friends', 'fans', 'average_stars',
       'compliment_hot', 'compliment_more', 'compliment_profile',
       'compliment_cute', 'compliment_list', 'compliment_note',
       'compliment_plain', 'compliment_cool', 'compliment_funny',
       'compliment_writer', 'compliment_photos'], dtype=object)

In [49]:
# TEST LOOP
from sklearn.model_selection import train_test_split

user_iter = iter(user_id_set)
hit = 0

for dex in range(50):
    user_id = next(user_iter)
    data = df_all_filtered[df_all_filtered['user_id'] == user_id]
    X, y = train_test_split(data, train_size=0.5)
    y_pred = get_all_user_recommendations(user_id, X, df_business, embedded_attributes, embedded_categories)

    y_businesses = set(y['business_id'].values)
    y_pred_businesses = set(y_pred['business_id'].values)
    if len(y_pred_businesses.intersection(y_businesses)) > 0:
        hit += 1

INPUT BUSINESS: All Seasons Table
INPUT BUSINESS: BerryLine
INPUT BUSINESS: Flour Bakery + Café
INPUT BUSINESS: Whole Foods Market
INPUT BUSINESS: Donut Day
INPUT BUSINESS: Akira Sushi
INPUT BUSINESS: Bella Tuscany Ristorante Italiano
INPUT BUSINESS: Michael's Ali Coal Fired Pizza
INPUT BUSINESS: Corona Cigar Company
INPUT BUSINESS: Rendezvous In Central Square
INPUT BUSINESS: Big Sky Bread Bakery & Cafe
INPUT BUSINESS: Bao Bao Bakery & Cafe
INPUT BUSINESS: Yoshi Ramen
INPUT BUSINESS: Muss & Turner's - Smyrna
INPUT BUSINESS: Haymaker
INPUT BUSINESS: Hawkers Delight Deli
INPUT BUSINESS: Banzai Sushi House
INPUT BUSINESS: Shiny Tea
INPUT BUSINESS: ZAC.ZAC Japanese Curry House
INPUT BUSINESS: Pizza Hut
INPUT BUSINESS: Passion8 Dessert Cafe
INPUT BUSINESS: Blenz Coffee
INPUT BUSINESS: Little Big Burger
INPUT BUSINESS: Pizza Maria
INPUT BUSINESS: Cajun Seafood Market
INPUT BUSINESS: Buckhead Saloon
INPUT BUSINESS: Kirkland convenience store
INPUT BUSINESS: Game On
INPUT BUSINESS: Petsi Pies