## Introduction
This notebook creates a subset of the [Yelp Reviews Dataset](https://business.yelp.com/data/resources/open-dataset/). This is a **research-only** dataset released by Yelp for educational purposes. More details about the license and terms of use of the dataset can be found in the [description of the dataset](../../data/Yelp_Dataset_Documentation_and_ToS_copy.pdf).

In [1]:
import os

# Directory holding the extracted Yelp JSON files; every input path below is derived from it.
DATA_ROOT = "../../data/yelp/Yelp_JSON/"

# One file per Yelp table, all named yelp_academic_dataset_<table>.json.
REVIEWS, BUSINESS, CHECKIN, TIP, USER = (
    os.path.join(DATA_ROOT, f"yelp_academic_dataset_{table}.json")
    for table in ("review", "business", "checkin", "tip", "user")
)

In [2]:
# Use a small subset of the data for this certification challenge; the full dataset is not needed.
# Number of reviews sampled into the subset; also embedded in the output file names.
SUBSET_SIZE = 100_000

In [17]:
from datasets import load_dataset, Dataset
import pandas as pd

# Reviews are loaded as a Hugging Face dataset; the business table is read into a
# pandas DataFrame from line-delimited JSON (lines=True).
review_dataset = load_dataset("json", data_files=REVIEWS, split="train")
business_dataset = pd.read_json(BUSINESS, lines=True)

In [18]:
# Turn each attributes value into its string form so it can be searched with the .str accessor.
business_dataset['attributes'] = business_dataset['attributes'].map(str)

In [20]:
# Keep only businesses whose stringified attributes mention 'Restaurant'.
business_dataset = business_dataset.loc[
    lambda frame: frame['attributes'].str.contains('Restaurant')
]

In [21]:
# Set of the remaining business ids, used for fast membership tests when filtering reviews.
restaurant_ids = set(business_dataset['business_id'])

In [22]:
# Keep only reviews that belong to one of the selected restaurants.
# Batched filtering with input_columns hands the callback just the business_id column
# for a whole batch, instead of materialising every full review row one at a time.
review_dataset = review_dataset.filter(
    lambda business_ids: [business_id in restaurant_ids for business_id in business_ids],
    input_columns="business_id",
    batched=True,
)

Filter:   0%|          | 0/6990280 [00:00<?, ? examples/s]

In [25]:
# Index businesses by id so reviews can be joined to them by label lookup.
# The guard makes re-running this cell a no-op (a second set_index would raise KeyError
# because the column has already become the index). verify_integrity fails fast on
# duplicate ids, which would make the per-review `.loc` lookup ambiguous.
if business_dataset.index.name != 'business_id':
    business_dataset = business_dataset.set_index('business_id', verify_integrity=True)

In [27]:
def denormalize_reviews(reviews):
    """Join selected business fields onto a batch of reviews.

    Written for ``Dataset.map(..., batched=True)``: ``reviews`` maps column names to
    lists, and every returned list lines up one-to-one with ``reviews['business_id']``.

    Relies on the module-level ``business_dataset`` DataFrame being indexed by
    business id, with each id appearing once.

    Args:
        reviews: Batch of reviews; only the ``'business_id'`` column is read.

    Returns:
        dict with the keys ``business_name``, ``business_address``, ``business_city``,
        ``business_state`` and ``business_postal_code``, each a list holding one
        value per review in the batch.

    Raises:
        KeyError: If a review references an id that is not in ``business_dataset``.
    """
    business_ids = list(reviews['business_id'])
    # One vectorised label lookup for the whole batch, instead of one `.loc` call
    # (and one temporary row Series) per review.
    businesses = business_dataset.loc[
        business_ids, ['name', 'address', 'city', 'state', 'postal_code']
    ]
    return {'business_name': businesses['name'].tolist(),
            'business_address': businesses['address'].tolist(),
            'business_city': businesses['city'].tolist(),
            'business_state': businesses['state'].tolist(),
            'business_postal_code': businesses['postal_code'].tolist()}

In [35]:
# A fixed seed makes the sampled subset reproducible across kernel restarts.
# The size is clamped so a dataset with fewer than SUBSET_SIZE rows does not make
# select() fail on out-of-range indices.
review_subset = review_dataset.shuffle(seed=42).select(
    range(min(SUBSET_SIZE, len(review_dataset)))
)

In [36]:
# Add the business name/address columns to each sampled review, one batch at a time.
review_subset = review_subset.map(denormalize_reviews, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [41]:
# Build the path with os.path.join, as the input paths are: DATA_ROOT already ends in a
# slash, so f"{DATA_ROOT}/..." would produce a doubled separator.
review_subset.to_json(
    os.path.join(DATA_ROOT, f"review_subset_denormalized_{SUBSET_SIZE}.json")
)

Creating json from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

89176858

In [32]:
# Reload the denormalized subset from disk. The path is built with os.path.join because
# DATA_ROOT already ends in a slash and f"{DATA_ROOT}/..." would double the separator.
review_subset = load_dataset(
    "json",
    data_files=os.path.join(DATA_ROOT, f"review_subset_denormalized_{SUBSET_SIZE}.json"),
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [39]:
def map_text_column(review):
    """Render one review and its business details as a single labelled text block.

    Reads the ``business_*`` columns and ``text`` from ``review`` and returns a dict
    holding the combined text under the ``full_review`` key. Every content line is
    indented by four spaces, and the text starts with a newline and ends with an
    indented empty line.
    """
    indent = " " * 4
    labelled_fields = (
        ("Restaurant Name", "business_name"),
        ("Address", "business_address"),
        ("City", "business_city"),
        ("State", "business_state"),
        ("Postal Code", "business_postal_code"),
    )

    # The leading empty entry yields the newline that opens the text block.
    lines = [""]
    lines.extend(f"{indent}{label}: {review[key]}" for label, key in labelled_fields)
    lines.append(f"{indent}Review: ")
    lines.append(f"{indent}{review['text']}")
    # The trailing entry is the bare indentation that follows the last newline.
    lines.append(indent)

    return {"full_review": "\n".join(lines)}


In [40]:
# Add the combined `full_review` text column to each review in the subset.
review_subset_text = review_subset.map(map_text_column)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [42]:
# Drop every column except the combined text.
columns_to_remove = [
    column for column in review_subset_text.features if column != 'full_review'
]
review_subset_text = review_subset_text.remove_columns(columns_to_remove)

In [43]:
# Build the path with os.path.join, as the input paths are: DATA_ROOT already ends in a
# slash, so f"{DATA_ROOT}/..." would produce a doubled separator.
review_subset_text.to_json(
    os.path.join(DATA_ROOT, f"review_subset_text_{SUBSET_SIZE}.json")
)

Creating json from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

73574712

In [44]:
# Reload the text-only subset from its JSON export and save it in the datasets on-disk
# format. Paths are built with os.path.join because DATA_ROOT already ends in a slash
# and f"{DATA_ROOT}/..." would double the separator.
review_subset_text = load_dataset(
    "json",
    data_files=os.path.join(DATA_ROOT, f"review_subset_text_{SUBSET_SIZE}.json"),
    split="train",
)
review_subset_text.save_to_disk(
    os.path.join(DATA_ROOT, f"review_subset_text_{SUBSET_SIZE}.hf")
)

Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [45]:
# Same business join as for the subset, now applied to the whole filtered review dataset.
review_dataset = review_dataset.map(denormalize_reviews, batched=True)

Map:   0%|          | 0/5999608 [00:00<?, ? examples/s]

In [46]:
# Add the combined `full_review` text column to every review in the full dataset.
review_text = review_dataset.map(map_text_column)

Map:   0%|          | 0/5999608 [00:00<?, ? examples/s]

In [47]:
# Drop every column except the combined text.
columns_to_remove = [
    column for column in review_text.features if column != 'full_review'
]
review_text = review_text.remove_columns(columns_to_remove)

In [48]:
# Write the full text-only dataset both as a JSON export and in the datasets on-disk
# format. Paths are built with os.path.join because DATA_ROOT already ends in a slash
# and f"{DATA_ROOT}/..." would double the separator.
review_text.to_json(os.path.join(DATA_ROOT, "review_text.jsonl"))
review_text.save_to_disk(os.path.join(DATA_ROOT, "review_text.hf"))

Creating json from Arrow format:   0%|          | 0/6000 [00:00<?, ?ba/s]

Saving the dataset (0/9 shards):   0%|          | 0/5999608 [00:00<?, ? examples/s]