## Data Collection

In [None]:
from tqdm.auto import tqdm
from collections import Counter
import json

In [None]:
from datasets import load_dataset, concatenate_datasets


# Both review configs are loaded with trust_remote_code=True so they are
# fetched the same way as the metadata configs loaded elsewhere in this file.
# NOTE(review): presumably the repository relies on a custom loading script,
# which is why the flag is needed; confirm against the dataset card.

# Load the "Health and Household" category
Health_and_Household = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Health_and_Household",
    split='full',
    trust_remote_code=True,
)

# Load the "Health and Personal Care" category
health_personal_care = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Health_and_Personal_Care",
    split='full',
    trust_remote_code=True,
)

# Concatenate both datasets
dataset = concatenate_datasets([Health_and_Household, health_personal_care])

# Bare expression: as the last statement of a notebook cell this renders the
# combined dataset's repr.
dataset

In [None]:
# Product metadata for the "Health and Household" category
Health_and_Household_MetaData = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Health_and_Household",
    split="full",
    trust_remote_code=True,
)

# Product metadata for the "Health and Personal Care" category
health_personal_care_meta = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_Health_and_Personal_Care",
    split="full",
    trust_remote_code=True,
)

In [None]:
# Show the first review record to inspect which fields a review carries.
Health_and_Household[0]

In [None]:
# Show the first metadata record to inspect which fields a product carries.
Health_and_Household_MetaData[0]

In [None]:
# Define keywords for title and categories
title_keywords = ['lavender', 'herbal', 'stress', 'anxiety', 'ashwagandha', 'chamomile']
category_keywords = ['vitamin', 'supplement']


def _is_relevant_product(item):
    """Return True when a metadata record matches the keyword filter.

    A record matches if its title contains any of ``title_keywords``, or if
    any of its categories contains any of ``category_keywords``. Both checks
    are case-insensitive substring comparisons.

    Args:
        item: A metadata record (mapping) that may carry ``title`` and
            ``categories`` fields.

    Returns:
        bool: Whether the record should be kept.
    """
    # A missing or None title/categories is treated as empty instead of
    # raising, so a single incomplete record cannot abort the whole scan.
    # The title is lower-cased once rather than once per keyword.
    title = (item.get('title') or '').lower()
    if any(keyword in title for keyword in title_keywords):
        return True

    categories = item.get('categories') or []
    return any(
        keyword in category.lower()
        for category in categories
        for keyword in category_keywords
    )


# Filter items based on title and categories, keyed by parent_asin so that
# reviews can later be joined to their product record with a dict lookup.
metadata_dict = {
    item['parent_asin']: item
    for item in tqdm(Health_and_Household_MetaData, desc="Processing Items")
    if _is_relevant_product(item)
}

In [None]:
# # Sample list
# my_list = ['Health & Household',
#    'Vitamins, Minerals & Supplements',
#    'Vitamins',
#    'Multivitamins']

# index_values = [
#     index
#     for index, item in enumerate(tqdm(Health_and_Household_MetaData, desc='Cat'))
#     if item.get('main_category') and 'health' in item['main_category'].lower()
# ]

# print(index_values)
#len(index_values)


In [None]:
# metadata_p_dict = {
#     item['parent_asin']: item 
#     for item in tqdm(health_personal_care_meta, desc="Processing Items")
#     if 'lavender' in item['title'].lower() or 'herbal' in item['title'].lower()
# }

In [None]:
# Take the first (parent_asin, metadata) entry of the filtered dict as a
# sanity check; the bare expression at the end displays it in the notebook.
first_pair = next(iter(metadata_dict.items()))
first_key, first_value = first_pair
first_pair

In [None]:
# Keep only the reviews whose product survived the metadata filter, and copy
# a few product-level fields onto each kept review.
filtered_reviews = []
for review in tqdm(Health_and_Household, desc="Filtering and Enriching Reviews"):
    # One lookup decides membership and fetches the product record.
    product_meta = metadata_dict.get(review['parent_asin'])
    if product_meta is None:
        continue

    enriched_review = dict(review)
    enriched_review['main_category'] = product_meta['main_category']
    enriched_review['average_rating'] = product_meta['average_rating']
    enriched_review['product_title'] = product_meta['title']
    filtered_reviews.append(enriched_review)

In [None]:

# Collect the rating of every kept review, then tally how often each
# distinct rating value occurs.
ratings = [entry['rating'] for entry in filtered_reviews]
rating_counter = Counter(ratings)

# Print the rating -> count tally.
print(rating_counter)

In [None]:

# Specify the filename to save the data
filename = 'amazon_dataset.json'

# Save the list of dictionaries to a JSON file.
# The encoding is pinned explicitly so the write does not depend on the
# platform's default locale encoding.
with open(filename, 'w', encoding='utf-8') as file:
    json.dump(filtered_reviews, file, indent=4)

print(f"Data saved to {filename}.")