In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('res_review')

In [3]:
df.shape

(60788, 10)

In [4]:
df.columns

Index(['Unnamed: 0', 'business_id', 'cool', 'date', 'funny', 'review_id',
       'stars', 'text', 'useful', 'user_id'],
      dtype='object')

In [5]:
vec = CountVectorizer(min_df = 2, max_df=0.95, max_features=500, stop_words='english')

In [6]:
review_matrix = vec.fit_transform(df['text'])

In [7]:
from sklearn.decomposition import NMF

In [8]:
nmf = NMF(n_components=20, solver='mu')
W = nmf.fit_transform(review_matrix)

In [9]:
idx_to_word = np.array(vec.get_feature_names())

In [10]:
H = nmf.components_

In [13]:
W.shape

(60788, 20)

In [20]:
array = []
for i in range(len(df)):
    array.append(np.argmax(W[i]))

In [21]:
df['category'] = array

In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,category
0,0,xSs1Z1OrWC_KmhY0SPKJ1Q,0,2016-06-01,0,DsPX83pa6vVSASrHF0U1aQ,2,My server was polite but not very prompt. The ...,0,ubgsST240v6cc5C7Kt4zKQ,2
1,1,xSs1Z1OrWC_KmhY0SPKJ1Q,0,2015-05-06,0,HGw5_lZQ3lpnRHFmbK0-Ww,4,The food was good and the service was great! W...,1,xdcMl8ghySzhDgKzc7OPtQ,10
2,2,xSs1Z1OrWC_KmhY0SPKJ1Q,0,2016-10-30,0,VsH5hnYRbksrYnH4Vi-ZSg,3,I like to get the habachi vegetables or chicke...,0,rzY8uwz-JtfLowr2EcHo1A,10
3,3,xSs1Z1OrWC_KmhY0SPKJ1Q,0,2010-06-17,0,xSRTe2zjKjzYqyC-0EThSQ,2,My daughter and I decided to try Shiki one nig...,1,A2l6pDAwA5SkfuOd7E_npw,11
4,4,xSs1Z1OrWC_KmhY0SPKJ1Q,0,2015-10-22,0,3gctU9pG-ikd8XVW58p_wg,5,My husband and I don't eat out frequently beca...,0,GljH-HX4zGc-c87NGBDhSQ,15


In [12]:
H.shape

(20, 500)

In [11]:
for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))

Topic 1: server,asked,said,did,good,didn,table,food,got,came
Topic 2: amazing,wait,fast,great,staff,mexican,quality,place,service,food
Topic 3: price,bit,definitely,try,overall,little,nice,pretty,place,good
Topic 4: area,don,nice,friendly,staff,try,like,love,great,place
Topic 5: night,staff,definitely,menu,atmosphere,nice,service,delicious,amazing,great
Topic 6: beef,curry,food,soup,spicy,sauce,fried,thai,rice,chicken
Topic 7: feel,eat,think,taste,know,tasted,just,food,don,like
Topic 8: style,garlic,best,order,delivery,new,wings,sauce,cheese,pizza
Topic 9: vegas,restaurants,thai,best,delicious,dish,dishes,service,menu,restaurant
Topic 10: come,second,don,times,lunch,went,long,just,wait,time
Topic 11: location,come,server,minutes,times,slow,bad,like,customer,service
Topic 12: say,wasn,don,better,bad,good,right,ok,service,just
Topic 13: tuna,rice,quality,salmon,fresh,sashimi,fish,rolls,roll,sushi
Topic 14: come,definitely,pretty,think,liked,little,enjoyed,just,nice,really
Topic 15: defin

In [23]:
df.category.value_counts()

1     5134
4     5102
3     4859
2     4804
10    4390
14    3486
0     3142
13    3076
8     3049
5     2860
9     2855
11    2684
18    2512
7     2493
6     2091
19    1960
12    1923
15    1894
17    1410
16    1064
Name: category, dtype: int64

In [24]:
df16 = df[df.category==16]

In [25]:
vec = CountVectorizer(min_df = 2, max_df=0.95, max_features=500, stop_words='english')

In [26]:
review_matrix = vec.fit_transform(df16['text'])

In [27]:
similarity = cosine_similarity(review_matrix)

In [35]:
df16.head(10)

Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,category
197,197,pF7994Vz1kuReP9f-4HOng,0,2018-06-22,0,lThZp4FI5OQzP9-dOS91gA,1,Gross!! I came here to have lunch but I'm tota...,0,rTIVsxvTEm5b94C2th5xQQ,16
342,342,8PNXDkWz3hyZFi7E4tYhjw,0,2014-01-01,0,kLqxpSamRULkkSoYKDcsXA,5,"Really exceptional food, nice menu, quality in...",0,wNs8Pu7Ia-_2Q-7J-WlEWQ,16
486,486,91FlIjn-FabJYjJ819gwbQ,0,2016-08-11,0,jlMbmYqicSiUluI6_OStDg,5,Solid breakfast / lunch spot with every bevera...,0,doXxdj4RJhk04-xxm5J1kw,16
603,603,91FlIjn-FabJYjJ819gwbQ,0,2016-06-21,0,xPfYShts-TSVlsRQn51LOw,5,10 minutes walk to the convention center. I w...,0,SXeMGP5lNgc03z7cl9Xihg,16
915,915,L23qeWacyCZFjF9DL6o04w,0,2014-08-28,0,hRlBNL7L1cpxkDzveGjlkA,1,"Awful,dried out, rubbery. Tried this as smart...",0,QjmhwWSEhIZzG7CETkLw6g,16
1333,1333,mjeolbU52n--a9Lg4ygg6w,0,2016-07-22,0,dngdvsy3kLXJXU9uTnhnSg,3,A normal and clean sub shop. I tried Engineer ...,0,8ZhO7ixP3HRtrry5ygfYFQ,16
1403,1403,nM2Jl9GS7IGlluey8ieivg,0,2016-08-20,0,PlXSkOqL03_zIMODPpYb0Q,1,Their salads suck! Mushy apples - overload of ...,0,pnLFy7olymhwmXqRaDXzpg,16
1424,1424,nM2Jl9GS7IGlluey8ieivg,0,2014-05-02,1,5PGrMCNSDQO_bIuyWInnuw,3,Highlights/lowlights \n\n1) lots of gluten fre...,0,Q1IENmNc6bdDruACmhy4mg,16
2076,2076,sU1sSEOHvd4ex72miOKMEQ,1,2017-12-13,1,5azk8uZnxQNFmnjB5tNing,1,The guy was nice but the sub looked gross and ...,0,TsoR_8KYAvvKzretmQ5Oew,16
2088,2088,sU1sSEOHvd4ex72miOKMEQ,0,2017-07-24,0,JlPXD8jewIemt2kIXtnKxA,5,I always prefer Blimpies over subway and was o...,0,AhJCWoJ1m2UjPrVc1D5Ejg,16


In [31]:
similar_restaurant = list(enumerate(similarity[197]))

In [32]:
similar_restaurant[:5]

[(0, 0.0),
 (1, 0.04212627318711346),
 (2, 0.04662524041201569),
 (3, 0.07944407119985641),
 (4, 0.0)]

In [33]:
sorted_similar_restaurants = sorted(similar_restaurant, key=lambda x:x[1], reverse=True)

In [34]:
i = 0
for restaurant in sorted_similar_restaurants:
    print(restaurant[0])
    i += 1
    if i> 10:
        break

197
200
412
207
198
280
276
349
318
394
251
