# Recommendation systems collaborative filtering using SVD

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Chunking Dataset

In [4]:
businesses = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json", lines=True, orient='columns', chunksize=100000)
reviews = pd.read_json("/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json", lines=True, orient='columns', chunksize=100000)

In [5]:
# read the data
for business in businesses:
    business_chunk = business
    break

for review in reviews:
    review_chunk = review
    break

In [6]:
business_chunk.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."


In [8]:
review_chunk.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18


## Preprocessing data

Filter fitur yang digunakan dan hanya gunakan categories "Restaurants".

In [10]:
subset = business_chunk[['business_id','name','address', 'categories', 'attributes','stars']]
subset.shape

(100000, 6)

In [11]:
left_out = subset[subset['categories'].str.contains('Restaurant.*')==True].reset_index()
left_out.head()

Unnamed: 0,index,business_id,name,address,categories,attributes,stars
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0
1,5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'BusinessParking': 'None', 'BusinessAcceptsCr...",2.0
2,8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,"Pubs, Restaurants, Italian, Bars, American (Tr...","{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...",3.0
3,9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'RestaurantsAttire': ''casual'', 'Restaurants...",1.5
4,11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,"Vietnamese, Food, Restaurants, Food Trucks","{'Alcohol': ''none'', 'OutdoorSeating': 'None'...",4.0


Ekstark fitur atributes dan categories untuk mengeluarkannya dari bentuk objek dan array

In [12]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.pop(key)

# convert string to dictionary
import ast
def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}")

# get dummies from nested attributes
left_out['BusinessParking'] = left_out.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'BusinessParking')), axis=1)
left_out['Ambience'] = left_out.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Ambience')), axis=1)
left_out['GoodForMeal'] = left_out.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'GoodForMeal')), axis=1)
left_out['Dietary'] = left_out.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Dietary')), axis=1)
left_out['Music'] = left_out.apply(lambda x: str_to_dict(extract_keys(x['attributes'], 'Music')), axis=1)

In [13]:
left_out.head()

Unnamed: 0,index,business_id,name,address,categories,attributes,stars,BusinessParking,Ambience,GoodForMeal,Dietary,Music
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0,"{'garage': False, 'street': True, 'validated':...",{},{},{},{}
1,5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'BusinessAcceptsCreditCards': 'True', 'Restau...",2.0,,,{},{},{}
2,8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,"Pubs, Restaurants, Italian, Bars, American (Tr...","{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...",3.0,"{'garage': False, 'street': False, 'validated'...","{'romantic': False, 'intimate': False, 'touris...",{},{},{}
3,9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'RestaurantsAttire': ''casual'', 'Restaurants...",1.5,"{'garage': False, 'street': False, 'validated'...",{},{},{},{}
4,11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,"Vietnamese, Food, Restaurants, Food Trucks","{'Alcohol': ''none'', 'OutdoorSeating': 'None'...",4.0,"{'garage': False, 'street': False, 'validated'...","{'touristy': False, 'hipster': False, 'romanti...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}


In [15]:
df_review = review_chunk[['user_id','business_id','stars', 'date']]
df_review.head()

Unnamed: 0,user_id,business_id,stars,date
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,2018-07-07 22:09:11
1,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,2012-01-03 15:28:18
2,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,2014-02-05 20:30:30
3,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,2015-01-04 00:01:03
4,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,2017-01-14 20:54:15


In [17]:
df_rest = left_out[['business_id', 'name', 'address']]
df_rest.head()

Unnamed: 0,business_id,name,address
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St
1,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St
2,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd
3,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike
4,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,


## Integrasi data

In [19]:
all_combined = pd.merge(df_review, df_rest, on='business_id')
all_combined.head()

Unnamed: 0,user_id,business_id,stars,date,name,address
0,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,2018-07-07 22:09:11,Turning Point of North Wales,1460 Bethlehem Pike
1,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,2014-02-05 20:30:30,Kettle Restaurant,748 W Starr Pass Blvd
2,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,2015-01-04 00:01:03,Zaika,2481 Grant Ave
3,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,2017-01-14 20:54:15,Melt,2549 Banks St
4,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,2015-09-23 23:10:31,Dmitri's,795 S 3rd St


In [20]:
all_combined.shape

(72125, 6)

## Create Crosstab 
> Buat crosstab df_rest dengan df_review untuk interaksi antara kedua data

In [21]:
rating_crosstab = all_combined.pivot_table(values='stars', index='user_id', columns='name', fill_value=0)
rating_crosstab.head()

name,'feine,101 Taiwanese Cuisine,10th Street Italian,1200 Chophouse,12th & Porter,16th Street Seafood,1911 Smoke House Barbeque,1925 Cocktail Lounge,2 in One Cafe,211 York,...,eegee's,fat Rooster diner,honeygrow,iCafe,iLuv Pho,iPho Vietnamese Restaurant,il Tavolo Trattoria,la Madeleine,swah-rey,sweetgreen
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---2PmXbF47D870stH1jqA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4AjktZiHowEIBCMd4CZA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--E0uVPphTORm_OiZ5KCvA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--KMTwCrhKKUmr7riuS4WQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--S8M395r8NtOCvS2LRfDw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Transpose the Utility matrix
X = rating_crosstab.values.T
X.shape

(3410, 59367)

## Implementing Models SVD

In [23]:
#Implementing SVD
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score

SVD = TruncatedSVD(n_components=12, random_state=17)
result_matrix = SVD.fit_transform(X)
result_matrix.shape

(3410, 12)

In [24]:
import numpy as np
# PearsonR coef
corr_matrix = np.corrcoef(result_matrix)
corr_matrix.shape

(3410, 3410)

restoran yang populer menurut rating (stars) 

In [26]:
all_combined.groupby('business_id')['stars'].count().sort_values(ascending=False).head()

business_id
GBTPC53ZrG1ZBY3DT8Mbcw    950
W4ZEKkva9HpAdZG88juwyQ    433
vN6v8m4DO45Z4pp8yxxF_w    404
pSmOH4a3HNNpYM82J5ycLA    384
8uF-bhJFgT4Tn6DTb27viA    365
Name: stars, dtype: int64

In [27]:
# see the NAME of the most popular restaurant
Filter = all_combined['business_id'] == 'GBTPC53ZrG1ZBY3DT8Mbcw'
print("Name: ", all_combined[Filter]['name'].unique())
print("Address:", all_combined[Filter]['address'].unique())

Name:  ['Luke']
Address: ['333 Saint Charles Ave']


In [28]:
# get the index of the popular restaurant
restaurant_names = rating_crosstab.columns
restaurants_list = list(restaurant_names)

popular_rest = restaurants_list.index('Luke')
print("index of the popular restaurant: ", popular_rest)

index of the popular restaurant:  1719


In [29]:
# restaurant of interest
corr_popular_rest = corr_matrix[popular_rest]
corr_popular_rest.shape

(3410,)

## Recommendations

In [32]:
# Get the list of restaurant names that satisfy the condition
restaurant_list = list(restaurant_names[(corr_popular_rest < 1.0) & (corr_popular_rest > 0.9)])

# Print the list in green color
for restaurant in restaurant_list:
    print(restaurant)

Another Broken Egg Cafe
Baggin's
Banks Street Bar
Binder Alois J Bakery
Bobby Chez Crabcakes
Boizao
Buena Vista Mexican Restaurant
Cafe Bamboo
Cafe Reconcile
Café at The Square
Candela Restaurant
Central Park
Dickey's Barbecue Pit
Dots Diner
Foundation Room
Gyro Shack
HK NOLA
IHOP
Joto Thai-Sushi Clearwater
Kinjo's Japanese Restaurant
Kopper Kitchen
Le Croissant Express
Lucky Dogs
Luke
Mangia Nashville
Maple Street Patisserie
Naoki Ramen
New Orleans Social House
Pair Restaurant
Petite 4
Pizza Hut
Pravda!
Redemption
Rivista
Saltwater Grill
Singha Song Thai
Taj Mahal Homestyle Indian & Pakistani Cuisine
Tangelo's Grille
The Libertine Liquor Bar
Tonys Mexican Restaurant
Twin Peaks
Velvet Central
Wasabi Sushi & Asian Grill
Willie's Chicken Shack
Zimmer's Seafood
