In [3]:
import json
import math
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# TO BE RE-ARANGED AT THE END

## Data Wangling
Prepare the dataframe for business.

In [4]:
# Read original dataframe
df_business_ori = pd.read_json("./data/yelp_academic_dataset_business.json", lines = True)

In [6]:
# Keep only relevant colunms
df_business = df_business_ori[['business_id','state','latitude','longitude', 'stars', 'categories']].copy()
# Show an example
df_business[:3]

Unnamed: 0,business_id,state,latitude,longitude,stars,categories
0,f9NumwFMBDn751xgFiRbNA,NC,35.462724,-80.852612,3.5,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,AZ,33.569404,-111.890264,5.0,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,QC,45.479984,-73.58007,5.0,"Pets, Pet Services, Pet Groomers"


Find the state with the most number of business.

''''
count = 0
target_state = ''
for state in df_business_ori['state'].unique():
    temps_count = len(np.where(df_business_ori['state'] == state)[0])
    if(temps_count>count):
        count = temps_count
        target_state = state
print("Target state is ",target_state)

# Count the number of business per state
state_count = []
for state in df_business_ori['state'].unique():
    state_count.append(len(np.where(df_business_ori['state'] == state)[0]))
    
# List of states with most business, descending order
sorted_idx = sorted(range(len(state_count)), key=lambda k: -state_count[k])
df_business_ori['state'].unique()[sorted_idx]

In [7]:
# Keep only business in target_state
#df_business = df_business.loc[df_business['state'] == target_state]
# Reset index
df_business.reset_index(drop=True,inplace=True)
# Show an examlpe
df_business.head()

Unnamed: 0,business_id,state,latitude,longitude,stars,categories
0,f9NumwFMBDn751xgFiRbNA,NC,35.462724,-80.852612,3.5,"Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh..."
1,Yzvjg0SayhoZgCljUJRF9Q,AZ,33.569404,-111.890264,5.0,"Health & Medical, Fitness & Instruction, Yoga,..."
2,XNoUzKckATkOD1hP6vghZg,QC,45.479984,-73.58007,5.0,"Pets, Pet Services, Pet Groomers"
3,6OAZjbxqM5ol29BuHsil3w,NV,36.219728,-115.127725,2.5,"Hardware Stores, Home Services, Building Suppl..."
4,51M2Kk903DFYI6gnB5I6SQ,AZ,33.428065,-111.726648,4.5,"Home Services, Plumbing, Electricians, Handyma..."


In [8]:
# Save smaller dataframe as csv in order to not load the entire dataset next time
df_business.to_csv('./data/df_business.csv',index=False)

In [2]:
# Read the save smaller dataset
df_business = pd.read_csv('./data/df_business.csv')

Prepare the dataframe for users.

In [9]:
# Read large json file of user
# The whole 
with open("./data/yelp_academic_dataset_user.json") as json_file:      
    data = json_file.readlines()
    data = list(map(json.loads, data)) 

df_user_ori = pd.DataFrame(data)    

In [10]:
# Keep only relevant colunms
df_user = df_user_ori[['user_id','friends']].copy()
# Show an example
df_user[:3]

Unnamed: 0,user_id,friends
0,ntlvfPzc8eglqvk92iDIAw,"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg..."
1,FOBRPlBHa3WPHFB5qYDlVg,"ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug..."
2,zZUnPeh2hEp0WydbAZEOOg,"Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnPQ..."


In [11]:
# Save smaller dataframe as csv in order to not load the entire dataset next time
df_user.to_csv('./data/df_user.csv',index=False)

In [2]:
# Read the save smaller dataset
df_user = pd.read_csv('./data/df_user.csv')

In [47]:
# Create a dictionary mapping from user to his friends
dict_friendship = pd.Series(df_user.friends.values,index=df_user.user_id).to_dict()

Load pre-processed dataframe for reviews.

In [4]:
df_review = pd.read_csv("./data/df_reviews.csv")
df_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,date
0,xQY8N_XvtGbearJ5X4QryQ,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,2015-04-15 05:21:16
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,2015-12-05 03:18:11
3,i6g_oA9Yf9Y31qt0wibXpw,ofKDkJKXSKZXu5xJNGiiBQ,5JxlZaqCnk1MnbgRirs40Q,1,2011-05-27 05:30:52
4,6TdNDKywdbjoTkizeMce8A,UgMW8bLE0QMJDCkQ1Ax5Mg,IS4cv902ykd8wj1TR0N3-A,4,2017-01-14 21:56:57


## Working..

Find reviews for business in the target state AZ and associate the location of business to each review.

In [87]:
valid_business_id = df_business['business_id'].values

In [88]:
df_review_business = df_review.merge(df_business, left_on='business_id', right_on='business_id',how='inner')

In [89]:
df_review_business.drop(columns=['state'],inplace=True)
df_review_business.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,latitude,longitude
0,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,2013-12-07 03:16:52,33.348382,-111.859189
1,i5AtsDlF3fSnBQ5gAgTbeg,VraV4Ci-oJsONsoIWCNeXA,lbrU8StCq3yDfr-QMnGrmQ,2,2012-06-07 00:51:41,33.348382,-111.859189
2,kSRygZjrxPIem7X6FDFZPA,V0bjhLRLphO2OYj-enP-dw,lbrU8StCq3yDfr-QMnGrmQ,1,2012-12-03 00:52:50,33.348382,-111.859189
3,ZayJ1zWyWgY9S_TRLT_y9Q,aq_ZxGHiri48TUXJlpRkCQ,Pthe4qk5xh4n-ef-9bvMSg,5,2015-11-05 23:11:05,33.339962,-111.859727
4,-zN-et1Klryec6ZNwXMtEA,4kux7ad959LcfTy6usKpzA,Pthe4qk5xh4n-ef-9bvMSg,5,2017-11-06 19:07:21,33.339962,-111.859727


In [8]:
len(df_review_business)

2504395

In [9]:
len(df_user)

1968703

In [10]:
2504395/1968703.0

1.2721040197531064

For each review, associate also with user's friends.

In [90]:
df_main = df_review_business.merge(df_user, left_on='user_id', right_on='user_id',how='inner')
df_main.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,latitude,longitude,friends
0,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,2013-12-07 03:16:52,33.348382,-111.859189,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw..."
1,5VS8E8Ys3IhQRJU3nW8YtQ,nIJD_7ZXHq-FX8byPMOkMQ,IsN0qEzgAXBjbT3qn6lokA,4,2009-02-04 17:36:27,33.527428,-111.924536,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw..."
2,T0JLHmc_1Nt_Uv2DNmzhhA,nIJD_7ZXHq-FX8byPMOkMQ,jJDnxINrCKstFyeH3F8Cfw,2,2013-11-11 20:46:13,33.373513,-111.840704,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw..."
3,0zjdJuiptDYTKGP-rZtLcg,nIJD_7ZXHq-FX8byPMOkMQ,paJaapnNsrWexFHBq18uDw,5,2014-08-10 22:00:50,33.265405,-111.683521,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw..."
4,Np9TadgdzpqWNa0Nikax5w,nIJD_7ZXHq-FX8byPMOkMQ,nxM6jNyKDgAOh7MAG-VJ_g,1,2015-06-14 02:36:28,33.384363,-111.80511,"B5vnnBub9sscTix_tPAwUw, FKFWX9kiyTvJY8_P9j_Rmw..."


In [91]:
len(df_main)

2504395

In [19]:
test=df_main[['review_id','user_id']].copy()
a = test.groupby('user_id').count().values

In [27]:
max(a)

array([1910])

In [24]:
len(np.where(a>=4)[0])

144921

In [26]:
len(test['user_id'].unique())

607861

how far people are willing to go when a good review is posted on a restaurant by a friend

In [85]:
len(df_main[df_main['user_id']==u[9]])

219

In [80]:
u = df_main['user_id'].unique()
for j in range(50):    
    # user j's friends
    f0 = dict_friendship.get(u[j])
    for i in range(len(f0.split(','))):
        # when friend made a review somewhere
        if(len(np.where(df_main['user_id']==f0.split(',')[i])[0])!=0 ):
            print(j,i)
            
# need to check whether is the business that user has checkin ----TODO

9 0
10 0


KeyboardInterrupt: 