> This notebook demonstrates loading and stratified sampling of the Yelp datasets.

# Setup

##### Imports

In [1]:
import os

import pandas as pd

##### Set Root Dir as CWD

In [2]:
# Navigate to project root (if not already there)
if os.path.split(os.getcwd())[1] != "cs374-project":
    %cd ../

D:\Documents\A_DIGIPEN\PersonalSVN\Fall22SVN\CS374\Project\cs374-project


# Load

In [3]:
# Common filename prefix of the Yelp dataset files (relative to the working
# directory); append e.g. "tip.json" or "review.json" to get a full file path.
path = "/".join(("data", "yelp", "yelp_academic_dataset_"))

##### Tip (~0.18 GB)

> Testing loading with a smaller dataset.

In [5]:
# Read the tip file as JSON Lines (one JSON record per line) into a DataFrame
tips_file = path + "tip.json"
tips = pd.read_json(tips_file, lines=True)

In [6]:
# Display the loaded tips DataFrame as the cell output
tips

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0
...,...,...,...,...,...
908910,eYodOTF8pkqKPzHkcxZs-Q,3lHTewuKFt5IImbXJoFeDQ,Disappointed in one of your managers.,2021-09-11 19:18:57,0
908911,1uxtQAuJ2T5Xwa_wp7kUnA,OaGf0Dp56ARhQwIDT90w_g,Great food and service.,2021-10-30 11:54:36,0
908912,v48Spe6WEpqehsF2xQADpg,hYnMeAO77RGyTtIzUSKYzQ,Love their Cubans!!,2021-11-05 13:18:56,0
908913,ckqKGM2hl7I9Chp5IpAhkw,s2eyoTuJrcP7I_XyjdhUHQ,Great pizza great price,2021-11-20 16:11:44,0


##### Reviews (~5.22 GB)

> Testing loading of the review dataset. Given its size, testing is done with only the first 1000 rows.

In [4]:
# Location of the review dataset file, reused by the cells below
review_path = f"{path}review.json"

In [7]:
# Open the review file as a chunked JSON Lines reader: iterating over it yields
# DataFrames of up to 1000 rows each, so only a sample needs to be viewed.
review_samples = pd.read_json(
    review_path,
    chunksize=1000,
    lines=True,
)

In [8]:
# View single sample: take only the first chunk from the reader, show which
# star ratings occur in it, and render the chunk itself.
for sample in review_samples:
    print(sample.stars.unique())
    display(sample)
    break

# The reader above was abandoned after one chunk, so its underlying file handle
# is still open; close it explicitly before replacing the reader.
review_samples.close()

# Reset to first sample
review_samples = pd.read_json(review_path, lines=True, chunksize=1000)

[3 5 4 1 2]


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15
...,...,...,...,...,...,...,...,...,...
999995,t-2o35kr7Q9DSaeuKhaDuQ,oX7o1TH0PHUWp9r9ry9_vw,jLn69WQupjsDKrbPw_nlGQ,3,0,1,0,Never really had any issues here other than th...,2017-11-15 09:43:07
999996,fLIwWCvdul9PNWYfJt5QWA,v8wlapFKVLs2qTYCGhCdiw,t6v8g8UeNiq3O2GoEc7R4Q,4,0,0,0,Fish recently moved a couple of doors down the...,2014-09-03 18:27:33
999997,ETAiy6wEM-r9ve4SKDhBpg,rLlYc1RzIBnOmnX3AbpEYw,ZYRul0i1bhOjirHED6Kd0w,3,0,0,0,I've been to South House around a dozen or so ...,2016-02-20 22:25:29
999998,8OgvSXuc6KjAt2fSz9LuzA,eEH-8CEPU5ndPxDGzVfHiQ,onGXKwnxPLtKnO8yqQMPSA,1,1,0,1,Wow! I am shocked at these reviews. I have tri...,2010-06-27 02:17:30


# Sample

> Create a stratified sample using a small percentage of the dataset.

In [11]:
from sklearn.model_selection import train_test_split

In [10]:
# Read the complete review file into a single DataFrame (no chunking here,
# so the whole dataset is held in memory at once).
reviews = pd.read_json(path_or_buf=review_path, lines=True)

In [18]:
# Create stratified sample using 1% of data.
# train_test_split returns (train, test): X receives the remaining 99% of the
# reviews and y receives the 1% sample. Stratifying on "stars" keeps the star
# rating proportions of the full dataset in both parts.
# A fixed random_state makes the split (and therefore the saved sample)
# reproducible across kernel restarts.
X, y = train_test_split(
    reviews,
    test_size=0.01,
    stratify=reviews["stars"],
    random_state=42,
)

In [26]:
# Display the 1% stratified sample (second value returned by train_test_split)
y

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3309054,-Ci6Vj1DgDLKdW68E_2cuw,cPcStFGBlk-6lWWsttIfVQ,E7RtIisUoRMcADSebBgNAA,3,6,2,6,"No wait time, beautiful patio and indoor seati...",2021-06-06 21:13:08
2947147,H74lqMTYQpD46w3mia3FQg,6XE7-ZJavi56DTf8XMdDcg,BeOUoIGCwelzFJJYES6iog,2,1,0,1,I was so disappointed when I heard Bella sold ...,2015-09-17 13:40:05
2328516,hQV40DurGWAOlSXTj4gVUA,cZj8t1H2SycG0qspnrU_WQ,fEyW48Cdk_dJO2pxUWa8kQ,5,0,0,0,We rented a house just a couple of blocks down...,2018-06-24 03:14:57
4854559,AQFCnNEQjSp0EcwSCcRaCQ,xW9oeMlDySMc5BeCCmPA9Q,gUhP-qSOXBHdY46xb0qmjA,2,0,1,1,This was my first time in the Polished Nail Sp...,2021-12-08 18:51:09
1685746,CAZSey0Pp-mITCSmr1-MVA,4A0yQChqSVQGZDBwf2UOAQ,GV50rl_ahHaZCuTCd4o1Cg,3,1,0,1,Food is average Italian. Service is always sl...,2017-02-19 01:06:48
...,...,...,...,...,...,...,...,...,...
3058890,rnVBmyZ2UXdjF_1K3i3dug,SERp7ZPIpmSrs9F3foQuPw,VUTUDiBC0K7HJec5uzXucA,5,2,0,0,Had an amazing experience here. Riding a bike ...,2013-07-10 22:33:13
679639,JNAZE3KeD9E3RTZGYOpAqA,VVs0OoDD7ikOS5Qtme6YQQ,Ims_IgNKoDts2FdFNROQJQ,4,5,1,1,Everything was extremely flavorful. We definit...,2020-06-23 12:46:09
3866664,MS20s-neZCRvK9Qh6j_vBw,Rm9xW2AyC-0eC2wC-vQdeg,EuG0fFtMLby-VejJORM_hg,2,3,0,0,I've been interested in purchasing a house fro...,2017-11-28 00:42:45
6454,RUNhUjMT2_pK8Usgfk6VkQ,dMy9qOWJY1hF_JLMn35HWg,geUiM_VTRmUz6dViO7E-jg,5,0,0,0,I seek out brick oven pizza places as craft pi...,2015-11-19 00:16:58


##### Save Sample

In [35]:
# Destination file for the stratified review sample (xz-compressed pickle)
sample_path = "data/yelp/samples/" + "stratified_review_sample.xz"

In [36]:
# Save stratified sample as compressed dataframe.
# to_pickle does not create missing parent folders, so make sure the target
# directory exists first (skipped when the path has no directory component).
sample_dir = os.path.dirname(sample_path)
if sample_dir:
    os.makedirs(sample_dir, exist_ok=True)
y.to_pickle(sample_path, compression="xz")

In [37]:
# Test loading: read the pickled sample back from disk and display it.
# compression="infer" is the default and picks the codec from the ".xz" suffix.
y = pd.read_pickle(sample_path, compression="infer")
y

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3309054,-Ci6Vj1DgDLKdW68E_2cuw,cPcStFGBlk-6lWWsttIfVQ,E7RtIisUoRMcADSebBgNAA,3,6,2,6,"No wait time, beautiful patio and indoor seati...",2021-06-06 21:13:08
2947147,H74lqMTYQpD46w3mia3FQg,6XE7-ZJavi56DTf8XMdDcg,BeOUoIGCwelzFJJYES6iog,2,1,0,1,I was so disappointed when I heard Bella sold ...,2015-09-17 13:40:05
2328516,hQV40DurGWAOlSXTj4gVUA,cZj8t1H2SycG0qspnrU_WQ,fEyW48Cdk_dJO2pxUWa8kQ,5,0,0,0,We rented a house just a couple of blocks down...,2018-06-24 03:14:57
4854559,AQFCnNEQjSp0EcwSCcRaCQ,xW9oeMlDySMc5BeCCmPA9Q,gUhP-qSOXBHdY46xb0qmjA,2,0,1,1,This was my first time in the Polished Nail Sp...,2021-12-08 18:51:09
1685746,CAZSey0Pp-mITCSmr1-MVA,4A0yQChqSVQGZDBwf2UOAQ,GV50rl_ahHaZCuTCd4o1Cg,3,1,0,1,Food is average Italian. Service is always sl...,2017-02-19 01:06:48
...,...,...,...,...,...,...,...,...,...
3058890,rnVBmyZ2UXdjF_1K3i3dug,SERp7ZPIpmSrs9F3foQuPw,VUTUDiBC0K7HJec5uzXucA,5,2,0,0,Had an amazing experience here. Riding a bike ...,2013-07-10 22:33:13
679639,JNAZE3KeD9E3RTZGYOpAqA,VVs0OoDD7ikOS5Qtme6YQQ,Ims_IgNKoDts2FdFNROQJQ,4,5,1,1,Everything was extremely flavorful. We definit...,2020-06-23 12:46:09
3866664,MS20s-neZCRvK9Qh6j_vBw,Rm9xW2AyC-0eC2wC-vQdeg,EuG0fFtMLby-VejJORM_hg,2,3,0,0,I've been interested in purchasing a house fro...,2017-11-28 00:42:45
6454,RUNhUjMT2_pK8Usgfk6VkQ,dMy9qOWJY1hF_JLMn35HWg,geUiM_VTRmUz6dViO7E-jg,5,0,0,0,I seek out brick oven pizza places as craft pi...,2015-11-19 00:16:58
