In [1]:
import pandas as pd

# Load the per-user table; the code below relies on its 'user_id' and
# 'expectation' columns.
users = pd.read_csv('processed_data/user_expectations.csv')

# Order the user ids by 'expectation' in both directions and keep the first
# 1000 ids of each ordering.
# NOTE(review): high_exp_users is taken from the ASCENDING sort (smallest
# 'expectation' values) and low_exp_users from the DESCENDING sort -- confirm
# this matches how the 'expectation' column is defined.
ids_ascending = users.sort_values(by='expectation')['user_id']
ids_descending = users.sort_values(by='expectation', ascending=False)['user_id']
high_exp_users = ids_ascending.tolist()[:1000]
low_exp_users = ids_descending.tolist()[:1000]

# Preview the first ten ids of the "high" group.
high_exp_users[:10]

['z3wkCACUTfarkwVS-A-yvA',
 'Vb3yGDoHB4QrdZfHzEf6ig',
 'T3cJe99-zCYgcxu2iwY1KA',
 'ag0Ko4cAXf1fdKXD52gGeA',
 'C9rP-btqLgDIBnYuNlKycg',
 'Ta8vU1VCejBxxgo2cfokYQ',
 '53I1ryVkUben5pTSidKyuw',
 'QjJChh0CPzGHaoyxszmYCA',
 'YwE_VkhMPO7a4LXQi3FfRg',
 'IOBbQDBPTk8bgc1fejdqyw']

In [2]:
# Load the review dump and keep only the reviews written by a user that is in
# either expectation group.
reviews = pd.read_csv('csv_data/yelp_academic_dataset_review.csv')
selected_user_ids = set(high_exp_users) | set(low_exp_users)

# .copy() detaches the filtered rows from the full dump, so the column that is
# assigned to `reviews` later is written to an independent frame rather than
# to a slice of the original.
reviews = reviews[reviews['user_id'].isin(selected_user_ids)].copy()

# Display the filtered frame to inspect how many reviews remain.
reviews

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
24,lUUhg8ltDsUZ9h0xnwY4Dg,RreNy--tOmXMl1en0wiBOg,cPepkJeRMtHapc_b2Oe_dw,4.0,1.0,0.0,1.0,I was really between 3 and 4 stars for this on...,2018-07-17 03:30:07
92,fC415u9adP0Xtamme7hcCw,mEOMAeEonZoUx2nPM3v6fg,f-WhNOSwN1aB4nRFekf01g,4.0,0.0,0.0,0.0,Id you haven't been to the Smoothie King cente...,2015-03-19 00:30:09
154,bi6GaeWDGceGv62lXTIKQA,RgtbLaiU22zqaCk20HgbiQ,bjhCtlYHrkgA5Ku8l-rB3g,1.0,1.0,0.0,0.0,Very disappointed. We went to eat at 2:15 on ...,2015-01-04 02:26:46
183,a5JHzBrWxRd_OmIvV7znDA,04JlTjJRbcv_kS9xVPhOdg,WM3q-7scdPUei6fu4SJFYw,1.0,0.0,0.0,0.0,Went here based on the high ratings and raves ...,2014-11-12 12:41:29
213,Oqh_qiy0kUEYY0IScSYTaQ,dCOonQ7Md1ooief37g1SHw,rrD5LY3nkyKMg1CYKKZomQ,5.0,0.0,0.0,0.0,Steve & his partner are the best! Love that he...,2015-11-06 16:49:25
...,...,...,...,...,...,...,...,...,...
6990174,m_WyTXe6z6FlAzG7qjebEA,uw9cwb4qvH0EKvUh-X_W-w,necj933-7IiKCyMGj6ZWGQ,5.0,8.0,0.0,2.0,We all need to splurge a little sometimes. To...,2016-03-09 16:59:45
6990210,FFNTjpmp4pR0h7c-rxifaQ,MTiEYQ_LHH3xdxOKoXetfw,Ee5liydIi6qRkN64W3LRwg,5.0,4.0,0.0,1.0,"I really enjoyed my first experience floating,...",2019-05-07 21:10:01
6990235,kZiKvXxK7o5i7fa32u5Jgw,6jjHo9Lilv3kTy87pm2ycw,pQAQwhBlSQdG1HuuLuCqXw,5.0,46.0,17.0,45.0,"Just $5 every SUNDAY in October! Do it!\n\nOh,...",2020-10-11 00:09:30
6990240,rtt1Ymczj-1Lb26JMsY2lA,M1cMsRL4L7IUr9RILDywEQ,vt_esoDw6HG5ClM12OPkMg,4.0,4.0,3.0,4.0,"5 stars for the Bonte waffle, 3 stars for this...",2009-03-03 20:59:10


In [14]:
# Label every review by the group of the user who wrote it: 'high' when the
# author is in high_exp_users, 'low' otherwise.
is_high_author = reviews['user_id'].isin(high_exp_users)
reviews = reviews.assign(expectation=is_high_author.map({True: 'high', False: 'low'}))

# Reduce the dataset to 500 reviews per label. Each label is sampled on its own
# with the same seed, and the pieces are concatenated with a fresh 0..N-1
# index. Sampling per group and concatenating (instead of groupby.apply +
# droplevel) keeps the 'expectation' column without depending on apply passing
# the grouping column through, and avoids the duplicated 0..499 index labels
# that the per-group reset_index produced.
# sample() raises ValueError when a label has fewer than 500 reviews.
reviews = pd.concat(
    [label_reviews.sample(500, random_state=0)
     for _, label_reviews in reviews.groupby('expectation')],
    ignore_index=True,
)

# Check that it worked: every column should count 500 rows per label.
reviews.groupby('expectation').count()

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
expectation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
high,500,500,500,500,500,500,500,500,500
low,500,500,500,500,500,500,500,500,500


In [15]:
def join_review_texts(labelled_reviews, label, separator='\n\n'):
    """Concatenate the 'text' of every review carrying the given label.

    Args:
        labelled_reviews: DataFrame with 'expectation' and 'text' columns.
        label: value of 'expectation' to select (e.g. 'high' or 'low').
        separator: string placed between consecutive reviews. A blank line
            keeps the last word of one review from fusing with the first word
            of the next one, which happens when the texts are glued together
            directly.

    Returns:
        One string holding all selected review texts, in frame order.
    """
    selected_texts = labelled_reviews.loc[labelled_reviews['expectation'] == label, 'text']
    return separator.join(selected_texts.tolist())


high_expectation_reviews_text = join_review_texts(reviews, 'high')
low_expectation_reviews_text = join_review_texts(reviews, 'low')

# Preview the beginning of the combined "high" text.
high_expectation_reviews_text[:500]

'Just around the corner of the Piazza comes this unique find! BARCADE! Came here on a Wednesday night, and street parking was easy to find. Atmosphere was industrial, casual, and dim. Pretty open layout with tables and stools for getting groups together. Large bar with an extensive beer menu. Definitely a place to try some brews that are not your typical "lite" beers. \n\nDid a walkthrough of some of the arcade games, and there were so many! Double Dragon, Punch Out, Paper Boy, Tetris, and Arkanoid'

In [20]:
import textacy
import textacy.extract.keyterms as ke

# Build the spaCy pipeline through textacy; the 'parser' component is passed
# in `disable`.
en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',))

# Turn each combined review text into a single spaCy doc for TextRank.
high_expectation_doc = textacy.make_spacy_doc(high_expectation_reviews_text, lang=en)
low_expectation_doc = textacy.make_spacy_doc(low_expectation_reviews_text, lang=en)

# Both groups use the same TextRank settings: lemma normalization, top 10 terms.
textrank_options = {'normalize': 'lemma', 'topn': 10}

# textrank yields (phrase, weight) pairs; only the phrases are kept.
high_ranked = ke.textrank(high_expectation_doc, **textrank_options)
high_expectation_kp = [phrase for phrase, _score in high_ranked]
print('done part 1')

low_ranked = ke.textrank(low_expectation_doc, **textrank_options)
low_expectation_kp = [phrase for phrase, _score in low_ranked]
print('done part 2')

done part 1
done part 2


In [21]:
# Report the extracted key-phrases of both groups, one group per line.
print(
    f'High Expectation Key-Phrases: {high_expectation_kp}',
    f'Low Expectation Key-Phrases: {low_expectation_kp}',
    sep='\n',
)

High Expectation Key-Phrases: ['convenient plug in', 'good good', 'good french comfort food', 'great food option', 'good non greasy food', 'good solid comfort food', 'good mexican food', 'good indian food', 'good comfort food', 'good chinese food']
Low Expectation Key-Phrases: ['walk in', 'good quality food', 'good breakfast place', 'good sushi place', 'good chinese food', 'great bar food', 'good southern comfort food', 'good food Seis', 'good cheesesteak place', 'great little chicken shop']
