In [None]:
################################################################################
################################################################################
#
# FILE: yelp_academic_reviews_entry.ipynb 
#
# BY: Dmitry Sedov 
#
# CREATED: Mon Apr 13 2020
#
# DESC: This code imports the Yelp academic dataset with reviews and backs out the
#       entry date from the reviews.
#
# EXEC: 
#      
################################################################################
################################################################################

In [None]:
############################### Libraries ######################################

import json
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import sqlalchemy as db

################################################################################

In [None]:
################################## Constants ###################################

# Folder holding the Yelp academic dataset. The YELP_ACADEMIC_DIR environment
# variable overrides the location so the notebook can run on another machine;
# when it is unset the original path is used unchanged.
input_folder_path = os.environ.get(
    'YELP_ACADEMIC_DIR',
    '/home/user/projects/urban/data/input/Yelp/academic',
)
# Name of the line-delimited JSON file with the reviews inside that folder.
review_file_name = 'review.json'
# Fields kept from every review record; everything else is discarded on load.
keys = ['business_id', 'stars', 'date']

################################################################################

In [None]:
%%time

# Read the review file one JSON document per line, keeping only the fields
# listed in `keys` so the full review payload never accumulates in memory.
all_reviews = []
review_path = os.path.join(input_folder_path, review_file_name)
# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform default, which is not UTF-8 everywhere.
with open(review_path, 'r', encoding='utf-8') as the_file:
    for line in the_file:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. trailing newlines) instead of letting
            # json.loads fail on an empty string.
            continue
        review = json.loads(line)
        all_reviews.append({key: review[key] for key in keys})

In [None]:
# Turn the list of per-review dicts into a DataFrame (one row per review).
all_reviews = pd.DataFrame(data=all_reviews)

In [None]:
# Preview the first five rows of the reviews frame.
all_reviews.head(n=5)

In [None]:
# Parse the 'date' strings (formatted as 'YYYY-MM-DD HH:MM:SS') into datetimes
# and store them in a new 'review_date' column.
review_timestamps = pd.to_datetime(
    all_reviews['date'],
    format='%Y-%m-%d %H:%M:%S',
)
all_reviews['review_date'] = review_timestamps

In [None]:
# The string 'date' column is superseded by 'review_date'; remove it in place.
all_reviews.drop('date', axis='columns', inplace=True)

In [None]:
# Preview again now that 'date' has been replaced by the parsed 'review_date'.
all_reviews.head(n=5)

In [None]:
# Per-business review statistics: number of reviews plus the earliest and
# latest review date.
#
# The aggregations are given as strings rather than np.min / np.max: with the
# NumPy callables the resulting column label depends on the installed NumPy
# version ('amin' on older releases, 'min' on newer ones), and newer pandas
# deprecates passing NumPy callables here.
reviews_by_business = (
    all_reviews[['business_id', 'review_date']]
    .groupby('business_id')
    .aggregate({'review_date': ['count', 'min', 'max']})
    # Pin the legacy 'amin' / 'amax' labels: the later cells look up the
    # flattened column 'review_date_amin'.
    .rename(columns={'min': 'amin', 'max': 'amax'}, level=1)
)

In [None]:
# Move 'business_id' out of the index and back into an ordinary column (in place).
reviews_by_business.reset_index(drop=False, inplace=True)

In [None]:
# Preview the per-business aggregates; the columns are still two-level here.
reviews_by_business.head(n=5)

In [None]:
# Collapse the two-level column labels into single strings joined by '_',
# e.g. ('review_date', 'count') -> 'review_date_count'.
# strip() only removes whitespace, so the ('business_id', '') label becomes
# 'business_id_' with a trailing underscore; the merge further down uses that name.
flat_column_names = []
for column_levels in reviews_by_business.columns.values:
    flat_column_names.append('_'.join(column_levels).strip())
reviews_by_business.columns = flat_column_names

In [None]:
# Window of interest for a business's first review: strictly after start_date
# and up to and including end_date.
start_date = '2017-06-01 00:00:00'
end_date = '2019-07-01 00:00:00'

first_review_dates = reviews_by_business['review_date_amin']
after_start = first_review_dates > start_date
not_after_end = first_review_dates <= end_date
mask = after_start & not_after_end

# Businesses whose earliest review falls inside the window.
test = reviews_by_business.loc[mask]

In [None]:
# Display the businesses whose first review date lies in (start_date, end_date].
test

In [None]:
######################### Database connections #################################

# Connect to the database via SQLAlchemy.
# The URL template is filled from the standard libpq environment variables
# (PGUSER, PGPASSWORD, PGHOST, PGDATABASE), so no credentials are stored in
# the notebook and the placeholders are actually substituted. A missing
# variable raises KeyError here instead of a connection failure later.
engine = db.create_engine(
    'postgresql://{user}:{user_pass}@{host}/{dataname2}'.format(
        user=os.environ['PGUSER'],
        # Percent-encode the password so characters such as '@' or '/' cannot
        # break URL parsing.
        user_pass=quote_plus(os.environ['PGPASSWORD']),
        host=os.environ['PGHOST'],
        dataname2=os.environ['PGDATABASE'],
    )
)
connection = engine.connect()

################################################################################

In [None]:
# Load the y_id and zip_code columns of every row in the restaurants table.
restaurants_table_statement = """
SELECT
    y_id,
    zip_code
FROM
    restaurants;
"""
restaurants_table = pd.read_sql(sql=restaurants_table_statement, con=engine)

In [None]:
####################### Close database connections #############################

# Return the explicitly opened connection first: engine.dispose() only closes
# connections that are checked in to the pool, so a connection that is still
# checked out would otherwise stay open.
connection.close()
engine.dispose()

################################################################################

In [None]:
# Preview the first five rows returned by the restaurants query.
restaurants_table.head(n=5)

In [None]:
# Inner-join the restaurants with the businesses selected in `test`, matching
# restaurants_table.y_id against the flattened 'business_id_' column.
merged = restaurants_table.merge(
    test,
    how='inner',
    left_on='y_id',
    right_on='business_id_',
)

In [None]:
# (rows, columns) of the inner join between restaurants_table and test.
merged.shape