<h1 style="text-align: center">Yelp Rating Prediction</h1>
<hr style="border-top: 1px solid #444">

## Development Environment

In [1]:
# python typings
from typing import TypedDict, Dict, List
import json, time
# libraries
import sys, numpy, pandas, sklearn, tensorflow
# record the interpreter and library versions that produced this notebook's outputs,
# one line per component
environment_report = [
    f"Python: {sys.version}",
    f"Numpy {numpy.__version__}",
    f"Pandas {pandas.__version__}",
    f"Scikit-Learn {sklearn.__version__}",
    f"Tensor Flow Version: {tensorflow.__version__} (Keras Version: {tensorflow.keras.__version__})",
]
print("\n".join(environment_report))

Python: 3.8.5 (tags/v3.8.5:580fbb0, Jul 20 2020, 15:57:54) [MSC v.1924 64 bit (AMD64)]
Numpy 1.18.5
Pandas 1.1.1
Scikit-Learn 0.23.2
Tensor Flow Version: 2.3.0 (Keras Version: 2.4.0)


<hr style="border-top: 1px solid #444">

# I. Data Importation

## Dataset Location

In [2]:
# businesses file: read line by line, one JSON record per line (path is relative to the working directory)
file_businesses = "data/yelp_academic_dataset_business.json"
# user reviews file: read line by line, one JSON record per line (path is relative to the working directory)
file_user_reviews = "data/yelp_academic_dataset_review.json"

## Import Yelp Businesses

In [3]:
start_time = time.time()

# business structure
class Business(TypedDict):
    """Static type description of one record parsed from the businesses JSON file.

    The annotations serve readers and type checkers only: records come straight from
    `json.loads`, so nothing validates these keys or value types at runtime.
    """
    # key under which the record is stored in `businesses_by_id`; reviews are grouped by the same id
    business_id: str
    name: str
    address: str
    city: str
    state: str
    postal_code: str
    latitude: float
    longitude: float
    stars: float
    # compared against MINIMUM_REVIEW_COUNT when selecting businesses for training
    review_count: int
    # NOTE(review): presumably a 0/1 flag — confirm against the dataset documentation
    is_open: int
    attributes: Dict
    # NOTE(review): confirm this is a list in the loaded data; it may instead be a single delimited string
    categories: List[str]
    hours: Dict

# lookup table of every parsed business, keyed by its id
# (i.e. {business['business_id']: Business})
businesses_by_id: Dict[str, Business] = {}

# json.loads returns an untyped dict, so declare the loop variable's type up front
business: Business

# stream the file line by line; each line holds one newline-delimited JSON record
with open(file_businesses, mode='r', encoding='utf-8') as file:
    for business in map(json.loads, file):
        # a repeated business_id overwrites the earlier record, so only distinct ids remain
        businesses_by_id[business['business_id']] = business

elapsed_seconds = time.time() - start_time
print(f"Imported {len(businesses_by_id):,} distinct businesses in {elapsed_seconds:.6f} seconds")

Imported 209,393 distinct businesses in 3.802714 seconds


## Import User Reviews

In [4]:
start_time = time.time()

# user review structure
class UserReview(TypedDict):
    """Static type description of one record parsed from the user reviews JSON file.

    The annotations serve readers and type checkers only: records come straight from
    `json.loads`, so nothing validates these keys or value types at runtime.
    """
    review_id: str
    user_id: str
    # business the review is about; reviews are grouped by this id in `business_reviews`
    business_id: str
    date: str
    stars: int # [0, 1, 2, 3, 4, 5]
    # body of the review: text, so `str` (was mistakenly annotated as `int`)
    text: str
    # review ratings
    useful: int
    funny: int
    cool: int

# user reviews indexed by business_id (i.e. {business_id: UserReview[]});
# every known business starts with an empty list, so businesses without reviews still have an entry
business_reviews: Dict[str, List[UserReview]] = {
    business_id: [] for business_id in businesses_by_id.keys()
}

# parse user reviews
with open(file_user_reviews, 'r', encoding='utf-8') as file:
    # iterate over newline-delimited JSON records
    for record in file:
        # parse JSON record
        review: UserReview = json.loads(record)
        # map user review by business_id
        # (raises KeyError if the review references a business missing from businesses_by_id)
        business_reviews[review['business_id']].append(review)

# total number of review records across all businesses (generator avoids a throwaway list)
imported_review_count = sum(len(reviews) for reviews in business_reviews.values())
# the count is of reviews, not businesses, so the message says so
print(f"Imported {imported_review_count:,} user reviews in {time.time() - start_time:.6f} seconds")

Imported 8,021,122 distinct businesses in 112.990777 seconds


<hr style="border-top: 1px solid #444">

# II. Training Data Selection

## Select businesses with at least X reviews

In [6]:
# inclusive threshold on business["review_count"]; businesses below it are dropped
MINIMUM_REVIEW_COUNT = 10

start_time = time.time()

# keep only the businesses whose review_count reaches MINIMUM_REVIEW_COUNT
filtered_businesses = [
    business
    for business in businesses_by_id.values()
    if business["review_count"] >= MINIMUM_REVIEW_COUNT
]

# number of businesses that did not reach the threshold
removed_count = len(businesses_by_id) - len(filtered_businesses)
print(f"{len(filtered_businesses):,} remaining businesses (filtered {removed_count}) in {time.time() - start_time:.6f} seconds")

103,803 remaining businesses (filtered 105590) in 0.090101 seconds


## Partition businesses for model training and testing

In [9]:
# both sizes are absolute sample counts (ints), not fractions of the dataset; see
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
# "represents the absolute number of train samples"
TRAINING_SIZE = 10_000
# "represents the absolute number of test samples"
TESTING_SIZE = 10_000

start_time = time.time()

# imports
from sklearn.model_selection import train_test_split

# draw two disjoint random subsets of the filtered businesses: training and testing;
# shuffling with a fixed PRNG seed keeps the partition deterministic across runs
train_businesses, test_businesses = train_test_split(
    filtered_businesses,
    train_size=TRAINING_SIZE,
    test_size=TESTING_SIZE,
    shuffle=True,
    random_state=42,
)

train_count = len(train_businesses)
test_count = len(test_businesses)
print(f"Partitioned {train_count + test_count:,} businesses into {{training: {train_count}, testing: {test_count}}} in {time.time() - start_time:.6f} seconds")

Partitioned 20,000 businesses into {training: 10000, testing: 10000} in 0.023006 seconds
