In [1]:
import json
import math
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# I. Load the data & some insights

In [2]:
# We load the 3 datasets 
df_business = pd.read_csv('./data/df_business.csv')
df_reviews = pd.read_csv('./data/df_reviews.csv')
df_users = pd.read_csv('./data/df_user.csv')

In [3]:
# Convert the string of friends to a list
df_users['friend_list'] = df_users.apply(lambda x: x.friends.split(', '), axis=1)
df_user = df_users.drop('friends', axis=1)

For reading purposes, the 3 datasets have the following form

In [4]:
df_business.head(3)

Unnamed: 0,business_id,state,latitude,longitude,stars
0,f9NumwFMBDn751xgFiRbNA,NC,35.462724,-80.852612,3.5
1,Yzvjg0SayhoZgCljUJRF9Q,AZ,33.569404,-111.890264,5.0
2,XNoUzKckATkOD1hP6vghZg,QC,45.479984,-73.58007,5.0


In [5]:
df_reviews.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,date
0,xQY8N_XvtGbearJ5X4QryQ,OwjRMXRC0KyPrIlcjaXeFQ,-MhfebM0QIsKt87iDN-FNw,2,2015-04-15 05:21:16
1,UmFMZ8PyXZTY2QcwzsfQYA,nIJD_7ZXHq-FX8byPMOkMQ,lbrU8StCq3yDfr-QMnGrmQ,1,2013-12-07 03:16:52
2,LG2ZaYiOgpr2DK_90pYjNw,V34qejxNsCbcgD8C0HVk-Q,HQl28KMwrEKHqhFrrDqVNQ,5,2015-12-05 03:18:11


In [6]:
df_users.head(3)

Unnamed: 0,user_id,friends,friend_list
0,ntlvfPzc8eglqvk92iDIAw,"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg...","[oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfD..."
1,FOBRPlBHa3WPHFB5qYDlVg,"ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOug...","[ly7EnE8leJmyqyePVYFlug, pRlR63iDytsnnniPb3AOu..."
2,zZUnPeh2hEp0WydbAZEOOg,"Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnPQ...","[Uwlk0txjQBPw_JhHsQnyeg, Ybxr1tSCkv3lYA0I1qmnP..."


In [7]:
print('There are', df_business.shape[0], 'businesses')
print('There are', df_reviews.shape[0], 'reviews')
print('There are', df_users.shape[0], 'users')

There are 209393 businesses
There are 8021122 reviews
There are 1968703 users


For our study, we need to infer user's home, so we need **at least 3 reviews**. Also, we will study users with **at least 3 friends**.  

In [8]:
# We count the number of reviewe per user
df_numberOfReviews = df_reviews.groupby('user_id').count().drop(['business_id', 'stars', 'date'], axis = 1).rename(columns={'review_id': 'review_count'})

In [9]:
df_numberOfReviews.head()

Unnamed: 0_level_0,review_count
user_id,Unnamed: 1_level_1
---1lKK3aKOuomHnwAkAow,131
---3o4ZsKYoBYBe7H6xG8A,1
---89pEy_h9PvHwcHNbpyg,1
---94vtJ_5o_nikEs6hUjg,5
---PLwSf5gKdIoVnyRHgBA,3


In [10]:
# We merge the df_users dataset to the df_numberOfReviews dataset
df_users = pd.merge(df_users, df_numberOfReviews, left_on='user_id', right_on='user_id', how='inner')

In [11]:
# We create a mask for users with more than 3 friends
mask = df_users.apply(lambda x: len(x.friend_list) >= 3, axis=1)

In [12]:
print('Number of users with more than 3 reviews:', df_numberOfReviews[df_numberOfReviews['review_count'] >= 3].shape[0])
print('Number of users with more than 3 friends:', df_users[mask].shape[0])

Number of users with more than 3 reviews: 602736
Number of users with more than 3 friends: 957463


In [13]:
# Select users that satisfy both conditions
df_selectedUsers = df_users[mask&(df_users['review_count'] >= 3)]
print('Number of users with more than 4 reviews and with more than 3 friends:', df_selectedUsers.shape[0])

Number of users with more than 4 reviews and with more than 3 friends: 343424


In [69]:
friends_dict = df_selectedUsers.set_index("user_id")["friend_list"].to_dict()

In [14]:
# Number of checkins of the selected users
print('Number of checkins of selected users:', df_selectedUsers['review_count'].sum())

Number of checkins of selected users: 4516345


In [15]:
df_reviews = pd.merge(df_reviews, df_business, left_on='business_id', right_on='business_id', how='inner').drop(['state'], axis=1)

In [17]:
df_selectedReviews = df_reviews[df_reviews["user_id"].isin(df_selectedUsers.user_id)]

In [57]:
df_selectedReviews["date"] = pd.to_datetime(df_selectedReviews.date.values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selectedReviews["date"] = pd.to_datetime(df_selectedReviews.date.values)


In [61]:
df_selectedReviews.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,date,latitude,longitude,stars_y
1,t7xOZF5UKXjSpVcXLOSAgw,owbC7FP8SNAlwv6f9S5Stw,-MhfebM0QIsKt87iDN-FNw,2,2014-03-14 08:24:25,36.112896,-115.177637,3.5
2,MimB5Xh85rG7phUMPrShag,v9vGnjphb0Hta0lvtf5haA,-MhfebM0QIsKt87iDN-FNw,3,2015-10-07 22:16:59,36.112896,-115.177637,3.5
4,cnV5xtm6WuyaLfot9uWbDg,LkWNo83Lg92C5V4JEyxOZA,-MhfebM0QIsKt87iDN-FNw,3,2010-10-10 01:27:31,36.112896,-115.177637,3.5
5,i593z2rGxk5Lj23Bx9o1Lg,mLtaUzSjIFO_3BmfPNUKqg,-MhfebM0QIsKt87iDN-FNw,1,2015-08-20 01:10:53,36.112896,-115.177637,3.5
6,TWpbq_vbpJRotCQ6l8SeXA,wKX1tAeRIYPU4NtM-R5N2w,-MhfebM0QIsKt87iDN-FNw,5,2012-03-31 22:00:15,36.112896,-115.177637,3.5


# Getting the Reviewed Label

In [33]:
df_reviews["date"] = pd.to_datetime(df_reviews.date.values)

In [47]:
df_reviews = df_reviews.rename({"stars_x" : "rating", "stars_y" : "average_rating"},axis=1)

In [49]:
df_reviews_groupedUser = df_reviews.groupby(["user_id","business_id"])[["date","rating"]].agg(list)

In [95]:
df_reviews_groupedUser.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,rating
user_id,business_id,Unnamed: 2_level_1,Unnamed: 3_level_1
---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,[2008-11-11 04:40:05],[4]
---1lKK3aKOuomHnwAkAow,-ErwgUmZ1-jHW_rSu55jlg,[2010-11-09 20:21:52],[5]
---1lKK3aKOuomHnwAkAow,0YhT9HCBkU394IG6tQVtNQ,[2012-05-01 16:17:54],[1]
---1lKK3aKOuomHnwAkAow,1JgaRBX0oiRsvEhHF3ZMjw,[2011-02-16 03:58:48],[1]
---1lKK3aKOuomHnwAkAow,1Vn_lex3LGGwuTo-xeJnww,[2011-02-16 04:16:12],[5]


In [127]:
#takes a row of the shape ["user_id","business_id","date"]
#returns a label : if a friend has left a good review in the same place before : 2, if a friend has left a bad review
#in the same place before : 1, and if a friend hasn't left a review before : 0.
def get_visit_label(user_id,business_id,date_user):
    #get friend list of user:
    friends = friends_dict.get(user_id)
    for friend_id in friends:
        if((friend_id,business_id) in df_reviews_groupedUser.index):
            friend_review = df_reviews_groupedUser.loc[friend_id].loc[business_id]
            for date,rating in zip(friend_review.date,friend_review.rating):
                if(date < date_user):
                    if(rating >= 3.5):
                        return 2
                    else:
                        return 1
    return 0

In [105]:
selected_reviews_array = np.array(df_selectedReviews[["user_id","business_id","date"]])

In [137]:
get_visit_label_vec = np.vectorize(get_visit_label)

In [None]:
labels = get_visit_label_n(selected_reviews_array[:,0],selected_reviews_array[:,1],selected_reviews_array[:,2])

In [124]:
labels = np.apply_along_axis(get_visit_label,axis = 1, arr = selected_reviews_array[:100])

In [125]:
labels

array([0, 0, 1, 0, 2, 2, 2, 1, 0, 2, 1, 0, 2, 1, 1, 2, 2, 0, 0, 0, 2, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 2,
       0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0,
       0, 2, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0])