## Recommender System Internals

In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.preprocessing import OneHotEncoder

In [2]:
df_business = pd.read_csv('data_small/yelp_academic_dataset_business_filtered.csv')
# Remove average user rating.  This system will be using the user reviews instead
del df_business['stars']
df_users = pd.read_csv('data_small/yelp_academic_dataset_user.csv')
df_review = pd.read_csv('data_small/yelp_academic_dataset_review.csv')
# Remove reviews that are for businesses outside of df_business
business_id_set = set(df_business['business_id'])
df_review = df_review[df_review['business_id'].isin(business_id_set)]

In [4]:
temp = pd.merge(df_users, df_review, on='user_id')
df_all = pd.merge(temp, df_business, on='business_id')

print("DATAFRAME SHAPE: ", df_all.shape)
print("UNIQUE USERS: ", np.unique(df_all['user_id']).shape[0])
print("UNIQUE BUSINESSES: ", np.unique(df_all['business_id']).shape[0])

DATAFRAME SHAPE:  (6576, 49)
UNIQUE USERS:  6152
UNIQUE BUSINESSES:  697


In [5]:
enc = OneHotEncoder(handle_unknown='ignore', sparse=True)
enc.fit(df_all['user_id'].values.reshape(-1, 1))

OneHotEncoder(handle_unknown='ignore')

In [6]:
enc.transform([['-7yf2-ax6xpxCXPpHHfNLA']]).A

array([[0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# https://stackoverflow.com/questions/46622869/pandas-groupby-column-a-and-make-lists-of-tuples-from-other-columns
data = df_all.groupby('business_id')[['user_id', 'stars']].apply(lambda x: x.values.tolist())
data

business_id
-7yf2-ax6xpxCXPpHHfNLA                      [[kkWU93G18F7vgzdQB-8h-g, 3.0]]
-Eg1pMVoWg8YR6-O4QuTxw    [[KMmMQ8Dpx7wx15xG5caw0Q, 2.0], [yIG8aHbb3aFdD...
-OmEmU8sds0dcsDDSoce-g    [[HEB0nn9Hi5occxoGZR8pXA, 3.0], [UqqTaNJEoDp2Z...
-R_djOxD9Jd5qYqk06dKJA    [[Uke3Ob09vZgndCZ1ulHsdw, 4.0], [yyj1nv9Ee8TKk...
-SkwKPbo5oK1-NtKkupNvw                      [[AzMXhq_WAJxnyYi4eEytjA, 1.0]]
                                                ...                        
zdXd2X8oH4r7QCD0mF9tig    [[aieN8y6UxP-nDDevF7SOfA, 5.0], [a6NFgtvqzvUL8...
zfasUshwU5NhibqKVQ7FIw    [[E4SfIfuU0H3RtDKwDvc_Uw, 3.0], [eNFWxvso6FkM7...
zrEX83k18Zf-CeMKrHIOKA    [[jVYzrVblDFSuL3GHtt8ZSA, 2.0], [T0CyD8Y5wDOow...
zxbkDCJ85JHgC8CWcdMCZw    [[Zog0qSOFTjE4H0T044hA-g, 5.0], [2LVT_gi7IVCUm...
zyBC3BUkH9klhPhMyQmxAQ    [[z9B25fKtfrxzUws_OOHJSQ, 2.0], [MyW4TaKufhNxr...
Length: 697, dtype: object

In [8]:
def interaction_vec(data, enc):
    """
    Generates the interaction matrix.
    :param data: Series where indices correspond to business_id and values contains a list of pairs (user_id, star rating for the business)
    :param enc: one hot encoder trained on the user IDs
    :return: numpy array of star interactions of size (number of businesses, number of users)
    """
    user_interactions = []

    for business_id in data.index:
        vec = enc.transform([['']])
        for user_id, rating in data[business_id]:
            vec += rating * enc.transform([[user_id]])
        user_interactions.append(vec.A.flatten())

    return np.array(user_interactions)

In [9]:
out = interaction_vec(data, enc)

`out` is a matrix where each row corresponds to a business, and each column corresponds to a user.  This system was designed with item-to-item in mind.

In [10]:
print(out.shape)

(697, 6152)


In [11]:
def get_closest_businesses(business_index, interaction_matrix):
    business = interaction_matrix[business_index]
    interaction_magnitudes = np.linalg.norm(interaction_matrix, axis=1).reshape((-1, 1))
    similarities = np.dot(interaction_matrix, business).reshape((-1, 1)) / interaction_magnitudes
    similarities = similarities.flatten()
    n = similarities[similarities > 0].shape[0]
    return np.argsort(similarities)[::-1][:n]

indices = data.index[get_closest_businesses(45, out)]
df_business[df_business['business_id'].isin(indices)]


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,name,address,city,state,postal_code,latitude,longitude,review_count,is_open,attributes,categories,hours
67,67,195,195,zYMjNc-Q7-T238SHylKgKA,Chinatown Restaurant & Lucky Bistro,14455 SW Pacific Hwy,Tigard,OR,97224,45.415859,-122.791456,44,0,"{'RestaurantsPriceRange2': '2', 'RestaurantsTa...","Restaurants, Dim Sum, Chinese","{'Monday': '10:30-22:0', 'Tuesday': '10:30-22:..."
378,378,1170,1170,ZT8PWgmXw9G1TiVq6JlRWg,The Know,3728 NE Sandy Blvd,Portland,OR,97232,45.534061,-122.624405,66,0,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Dive Bars, Music Venues, Nightlife, Karaoke, A...","{'Monday': '14:0-2:30', 'Tuesday': '14:0-2:30'..."
404,404,1312,1312,Ak6ak8lb7qLlg0K4qdwAdg,Boxxes Video Bar,330 SW 11th St,Portland,OR,97205,45.522401,-122.681899,17,0,"{'Ambience': ""{'romantic': False, 'intimate': ...","Restaurants, Seafood, Gay Bars, Bars, Nightlife","{'Monday': '5:0-1:0', 'Tuesday': '5:0-1:0', 'W..."
557,557,1797,1797,EXU9J5LCGb-CTs8sTOt-Eg,Burnside Brewing,701 E Burnside St,Portland,OR,97214,45.523395,-122.658372,538,0,"{'Alcohol': ""u'full_bar'"", 'WiFi': ""u'free'"", ...","Pubs, Bars, Food, American (New), Nightlife, B...","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
573,573,1848,1848,-ehH_g5kTc1CSzydc6apOw,Over the Top,"A La Carts Food Pavillion, SE 50th Ave & SE Di...",Portland,OR,97206,45.505366,-122.599835,22,0,"{'RestaurantsDelivery': 'False', 'HasTV': 'Fal...","Restaurants, Food Stands, Burgers","{'Monday': '12:0-17:0', 'Tuesday': '12:0-17:0'..."
701,701,2267,2267,TDQ0lSTHW3RyfVWQuwbBGg,Breakfast At Valerie's,"516 SE Chkalov Dr, Ste 1",Vancouver,WA,98683,45.617161,-122.555714,461,1,"{'RestaurantsGoodForGroups': 'True', 'Business...","American (New), Restaurants, Breakfast & Brunch","{'Monday': '6:0-15:0', 'Tuesday': '6:0-15:0', ..."
752,752,2457,2457,VGxHDM-0Ic6E5CrXl0qMfQ,Oaks Bottom Public House,1621 SE Bybee Blvd,Portland,OR,97202,45.47388,-122.649422,217,1,"{'BusinessParking': ""{'garage': False, 'street...","Salad, Restaurants, Bars, Nightlife, Beverage ...","{'Monday': '11:0-0:0', 'Tuesday': '11:0-0:0', ..."
782,782,2559,2559,kGmMiW1qCoJ95-5Y9Vk8kw,Simpatica Dining Hall,828 SE Ash St,Portland,OR,97214,45.521371,-122.657057,247,0,"{'RestaurantsTakeOut': 'False', 'Alcohol': ""u'...","Diners, Event Planning & Services, Restaurants...","{'Friday': '19:30-19:45', 'Saturday': '19:0-22..."
820,820,2692,2692,3HNq9KV8A7OBiqlWm8kZ8g,Swirl Frozen Yogurt,3538 SE Hawthorne Blvd,Portland,OR,97214,45.511922,-122.627815,118,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Ice Cream & Frozen Yogurt, Food",
824,824,2700,2700,6yBizH8RnIYXk6vboLk3PA,Random Order Pie Bar,1800 NE Alberta St,Portland,OR,97211,45.558983,-122.646688,514,0,"{'HasTV': 'False', 'BikeParking': 'True', 'Noi...","Bars, Restaurants, Coffee & Tea, Bakeries, Foo...","{'Monday': '7:30-21:30', 'Wednesday': '7:30-21..."


In [12]:
out_magnitudes = np.linalg.norm(out, axis=1).reshape((-1, 1))
np.dot(out, out[3]).reshape((-1, 1)) / out_magnitudes

array([[0.        ],
       [0.        ],
       [0.        ],
       [4.89897949],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.   

In [13]:
import re
import ast
#out = re.sub('[\]','', df_business['attributes'][0])
attributes = ast.literal_eval(df_business['attributes'][5])
attributes

{'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
 'RestaurantsDelivery': 'False',
 'BusinessAcceptsCreditCards': 'True',
 'RestaurantsPriceRange2': '1',
 'DogsAllowed': 'False',
 'NoiseLevel': "u'quiet'",
 'RestaurantsTakeOut': 'True',
 'Caters': 'True',
 'BikeParking': 'True'}

In [14]:
df_business['attributes'][5]

'{\'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'RestaurantsDelivery\': \'False\', \'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsPriceRange2\': \'1\', \'DogsAllowed\': \'False\', \'NoiseLevel\': "u\'quiet\'", \'RestaurantsTakeOut\': \'True\', \'Caters\': \'True\', \'BikeParking\': \'True\'}'

In [110]:
def return_attribute_soup(input):
    current = []
    for key in input:
        # Inner dictionaries appear to be malformed in places
        try:
            value = ast.literal_eval(input[key])
        except ValueError:
            value = input[key]
        if isinstance(value, dict):
            current.extend(return_attribute_soup(value))
        elif value not in (False, 'No', 'False'):
            current.append(key)
    return current

return_attribute_soup(attributes)

['lot',
 'BusinessAcceptsCreditCards',
 'RestaurantsPriceRange2',
 'NoiseLevel',
 'RestaurantsTakeOut',
 'Caters',
 'BikeParking']

In [17]:
df_users = pd.read_csv('data_small/yelp_academic_dataset_user.csv')

(100000, 24)