In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

In [2]:
data_dir = os.getcwd() + '/yelp_dataset'
print('Loading review data ...')
def load_dataset(data_dir, file):
    df = pd.DataFrame([])
    data_chunk = pd.read_json(os.path.join(data_dir, file), lines=True, chunksize=100000)
    for chunk in data_chunk:
          df = pd.concat([df, chunk])
    return df
df_review = load_dataset(data_dir, 'review.json')
print('Loading business data ...')
df_business = pd.read_json(os.path.join(data_dir, 'business.json'), lines=True)
print('Done!')

Loading review data ...
Loading business data ...
Done!


In [49]:
def review_features(df):
    print('Extracting review features ...')
    # get total number of 'cool', 'funny' and 'useful'
    df_review_statistic = df.groupby('business_id')[['cool', 'funny', 'useful']].sum()
    # get average stars
    df_review_statistic['stars_review'] = df.groupby('business_id')[['stars']].mean()
    # group review timestamps into a list
    df_review_statistic['date'] = df.groupby('business_id')['date'].apply(list)
    review_date = df_review_statistic['date'].values
    # extract 1st and last review year and month
    review_start = [min(i) for i in review_date]
    review_latest = [max(i) for i in review_date]
    df_review_statistic['review_start_year'] = list(map(lambda x: x.year, review_start))
    df_review_statistic['review_start_month'] = list(map(lambda x: x.month, review_start))
    df_review_statistic['review_latest_year'] = list(map(lambda x: x.year, review_latest))
    df_review_statistic['review_latest_month'] = list(map(lambda x: x.month, review_latest))
    # get duration of review in years
    review_duration = np.array(review_latest) - np.array(review_start)
    print('Calculating business duration accoring to review time...')
    df_review_statistic['review_duration'] = [item/pd.Timedelta(days=365.25) for item in tqdm(review_duration)]
    # get review rate per month
    df_review_statistic['review_per_month'] = df_review_statistic['review_count']/df_review_statistic['review_duration']/12
    df_review_statistic.drop(columns=['date'], inplace=True)
    print('Review features extracted!')
    
    return df_review_statistic

df_review_statistic = review_features(df_review)
df_business = df_business.merge(df_review_statistic, on='business_id', how='inner')

Extracting review features ...


  4%|▍         | 7641/192606 [00:00<00:02, 76409.16it/s]

Calculating business duration accoring to review time...


100%|██████████| 192606/192606 [00:02<00:00, 72156.68it/s]


Review features extracted!


In [51]:
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,...,funny,useful,stars_review,review_count_y,review_start_year,review_start_month,review_latest_year,review_latest_month,review_duration,review_per_month
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,...,0,2,3.0,5,2015,3,2016,5,1.147214,0.363199
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,...,80,170,2.738806,134,2009,4,2018,10,9.511438,1.174025
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,...,50,195,4.073034,178,2008,5,2018,11,10.454523,1.418844
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,...,0,2,5.0,3,2013,1,2018,4,5.272819,0.047413
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,...,0,6,4.0,4,2014,6,2017,7,3.12944,0.106515


In [None]:
df_business = df_business.merge(df_review_statistic, how='left', on='business_id')