In [4]:
# Test the Beauty dataset with real data
import sys
sys.path.insert(0, '/home/zhenkai/personal/Projects/AgenticRecommender')

from agentic_recommender.datasets import BeautyDataset

# Create dataset instance
dataset = BeautyDataset()
print(f"Dataset paths:")
print(f"  Reviews: {dataset.data_path}")
print(f"  Metadata: {dataset.metadata_path}")

print("\nüìä Testing data loading...")
try:
    # Test raw data loading
    raw_data = dataset._load_raw_data()
    print(f"‚úÖ Raw data loaded: {len(raw_data)} interactions")
    
    # Test metadata processing
    metadata = dataset._process_metadata()
    print(f"‚úÖ Metadata processed: {len(metadata)} items have names")
    
    # Show a few examples
    sample_items = list(metadata.items())[:3]
    print(f"\nüè∑Ô∏è Sample item names:")
    for item_id, name in sample_items:
        print(f"  {item_id}: {name}")
    
    print("\nüéâ Dataset loading successful! Real Amazon Beauty data is working.")
    
except Exception as e:
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()

Dataset paths:
  Reviews: /home/zhenkai/personal/Projects/AgenticRecommender/agentic_recommender/data/inputs/reviews_Beauty.json
  Metadata: /home/zhenkai/personal/Projects/AgenticRecommender/agentic_recommender/data/inputs/meta_Beauty.json

üìä Testing data loading...
üîç Applying 5-core filtering...
  Iteration 1: 313823 interactions
  Iteration 2: 224229 interactions
  Iteration 3: 205760 interactions
  Iteration 4: 200771 interactions
  Iteration 5: 199277 interactions
  Iteration 6: 198741 interactions
  Iteration 7: 198554 interactions
  Iteration 8: 198506 interactions
  Iteration 9: 198502 interactions
  Iteration 10: 198502 interactions
‚úÖ 5-core filtering completed after 10 iterations
‚úÖ Raw data loaded: 198502 interactions
‚úÖ Metadata processed: 258760 items have names

üè∑Ô∏è Sample item names:
  0205616461: Bio-Active Anti-Aging Serum (Firming Ultra-Hydrating Serum)
  0558925278: Eco Friendly Ecotools Quality Natural Bamboo Cosmetic Mineral Brush Set Kit of 4 Soft Br

In [8]:
# Explore Input JSON Files (Raw Amazon Beauty Data)
import json
import pandas as pd

print("üîç Exploring Input JSON Files\n")

# Load and explore reviews data
reviews_path = '/home/zhenkai/personal/Projects/AgenticRecommender/agentic_recommender/data/inputs/reviews_Beauty.json'
meta_path = '/home/zhenkai/personal/Projects/AgenticRecommender/agentic_recommender/data/inputs/meta_Beauty.json'

print("üìä REVIEWS DATA:")
reviews = []
with open(reviews_path, 'r') as f:
    for i, line in enumerate(f):
        if i >= 5:  # Only show first 5 for exploration
            break
        review = json.loads(line.strip())
        reviews.append(review)

# Display sample reviews
for i, review in enumerate(reviews):
    print(f"\nReview {i+1}:")
    for key, value in list(review.items())[:6]:  # Show first 6 fields
        if isinstance(value, str) and len(value) > 80:
            value = value[:80] + "..."
        print(f"  {key}: {value}")
    if len(review) > 6:
        print(f"  ... and {len(review)-6} more fields")

print(f"\nüìà Total reviews processed: {i+1} (sample)")

print("\n" + "="*60)
print("üè∑Ô∏è METADATA:")
metadata = []
with open(meta_path, 'r') as f:
    for i, line in enumerate(f):
        if i >= 3:  # Only show first 3 for exploration  
            break
        try:
            # Handle Python dict format
            meta = eval(line.strip(), {"__builtins__": {}}, {})
            metadata.append(meta)
        except:
            print(f"Could not parse line {i+1}")
            continue

# Display sample metadata
for i, meta in enumerate(metadata):
    print(f"\nProduct {i+1}:")
    for key, value in list(meta.items())[:6]:  # Show first 6 fields
        if isinstance(value, str) and len(value) > 80:
            value = value[:80] + "..."
        elif isinstance(value, list) and len(value) > 2:
            value = f"[{len(value)} items: {value[:2]}...]"
        elif isinstance(value, dict):
            value = f"dict with {len(value)} keys"
        print(f"  {key}: {value}")
    if len(meta) > 6:
        print(f"  ... and {len(meta)-6} more fields")

print(f"\nüìà Total metadata processed: {i+1} (sample)")

üîç Exploring Input JSON Files

üìä REVIEWS DATA:

Review 1:
  reviewerID: A39HTATAQ9V7YF
  asin: 0205616461
  reviewerName: cheryl roberts
  helpful: [0, 0]
  reviewText: i do love this moisturizer and would recommend it to someone for dry skin ,fine ...
  overall: 5.0
  ... and 3 more fields

Review 2:
  reviewerID: A3JM6GV9MNOF9X
  asin: 0558925278
  reviewerName: Patty
  helpful: [0, 1]
  reviewText: I received this product before the deadline.I tested only Baby Kabuki, and the q...
  overall: 3.0
  ... and 3 more fields

Review 3:
  reviewerID: A1Z513UWSAAO0F
  asin: 0558925278
  reviewerName: Renita M
  helpful: [0, 0]
  reviewText: I love this set. Great buy for the price. I don't wear makeup all the time, but ...
  overall: 5.0
  ... and 3 more fields

Review 4:
  reviewerID: A1WMRR494NWEWV
  asin: 0733001998
  reviewerName: Amazon Shopper
  helpful: [0, 0]
  reviewText: A nice moisturizer, all natural ingredients and no parabens. A bit pricey, but y...
  overall: 4.0
  ... a

In [9]:
# Explore Output Files (Processed Dataset)
import json
import pickle
from pathlib import Path

print("üéØ Exploring Output Files (Processed Dataset)\n")

outputs_dir = Path('/home/zhenkai/personal/Projects/AgenticRecommender/agentic_recommender/data/outputs')

# Load dataset statistics
print("üìä DATASET STATISTICS:")
with open(outputs_dir / 'beauty_stats.json', 'r') as f:
    stats = json.load(f)

for key, value in stats.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.4f}")
    else:
        print(f"  {key}: {value:,}")

print("\n" + "="*60)
print("üèÜ EVALUATION SAMPLES:")

# Load evaluation samples
with open(outputs_dir / 'beauty_evaluation_samples.json', 'r') as f:
    eval_samples = json.load(f)

print(f"Total evaluation samples: {len(eval_samples)}")
print(f"\nSample evaluation task:")

sample = eval_samples[0]
print(f"Session ID: {sample['session_id']}")
print(f"User ID: {sample['user_id']}")
print(f"Prompt items ({len(sample['prompt_items'])}): {sample['prompt_items'][:3]}...")
print(f"Target item: {sample['target_item']}")
print(f"Candidates pool size: {len(sample['candidates'])}")
print(f"Target position in candidates: {sample['target_index']}")

print(f"\nüè∑Ô∏è Item names in this session:")
for item_id, name in list(sample['item_names'].items())[:5]:
    print(f"  {item_id}: {name}")

print("\n" + "="*60)
print("üìà SPLIT SIZES:")

for split in ['train', 'val', 'test']:
    with open(outputs_dir / f'beauty_{split}.json', 'r') as f:
        split_data = json.load(f)
    print(f"  {split}: {len(split_data):,} sessions")

print("\n" + "="*60)
print("üî§ ITEM MAPPINGS:")

# Load item mappings
with open(outputs_dir / 'beauty_item_to_name.json', 'r') as f:
    item_to_name = json.load(f)

print(f"Total items with names: {len(item_to_name):,}")
print(f"\nSample item mappings:")
sample_items = list(item_to_name.items())[:5]
for item_id, name in sample_items:
    print(f"  {item_id}: {name}")

üéØ Exploring Output Files (Processed Dataset)

üìä DATASET STATISTICS:
  num_sessions: 22,363
  num_items: 12,101
  num_users: 22,363
  total_interactions: 198,502
  avg_session_length: 8.8764
  min_session_length: 5
  max_session_length: 204
  density: 0.0007

üèÜ EVALUATION SAMPLES:
Total evaluation samples: 10

Sample evaluation task:
Session ID: 10948
User ID: A2UIAW0X0S94HL
Prompt items (4): ['B004O3UE46', 'B000052ZB4', 'B001DOA73C']...
Target item: B002WEBTPW
Candidates pool size: 100
Target position in candidates: 78

üè∑Ô∏è Item names in this session:
  B004O3UE46: Almay One Coat Get Up and Grow Waterproof Mascara, Black Brown, 0.21 Fluid Ounce
  B000052ZB4: Neutrogena Deep Clean Facial Cleanser, Normal to Oily Skin, 6.7 Ounce
  B001DOA73C: L'Oreal Paris True Match Naturale Mineral Foundation, Cocoa, 0.35 Ounce
  B0019QQ13Y: e.l.f. Liquid Eyeliner (Plum) elf
  B002WEBTPW: 13 Piece Brush Set With Case

üìà SPLIT SIZES:
  train: 17,890 sessions
  val: 2,236 sessions
  test:

In [10]:
# Interactive Data Analysis
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

print("üìä INTERACTIVE DATA ANALYSIS\n")

# Load processed dataset for analysis
with open(outputs_dir / 'beauty_dataset.pkl', 'rb') as f:
    dataset = pickle.load(f)

print("üîç Session Length Distribution:")
session_lengths = [len(session['items']) for session in dataset.sessions]
length_counts = Counter(session_lengths)

# Show distribution
print(f"Min length: {min(session_lengths)}")
print(f"Max length: {max(session_lengths)}")
print(f"Average length: {np.mean(session_lengths):.2f}")
print(f"Median length: {np.median(session_lengths):.2f}")

print(f"\nMost common session lengths:")
for length, count in length_counts.most_common(10):
    print(f"  {length} items: {count:,} sessions ({count/len(session_lengths)*100:.1f}%)")

print("\n" + "="*60)
print("üè∑Ô∏è Product Category Analysis:")

# Analyze product categories from names
category_words = []
for item_name in list(dataset.item_to_name.values())[:1000]:  # Sample for performance
    words = item_name.lower().split()
    # Look for beauty-related keywords
    beauty_keywords = ['serum', 'cream', 'lotion', 'mascara', 'lipstick', 'foundation', 
                      'shampoo', 'conditioner', 'moisturizer', 'cleanser', 'oil']
    for word in words:
        if any(keyword in word for keyword in beauty_keywords):
            category_words.append(word)

if category_words:
    category_counts = Counter(category_words)
    print(f"Most common beauty product types:")
    for category, count in category_counts.most_common(10):
        print(f"  {category}: {count} products")

print("\n" + "="*60)
print("üéØ Recommendation Task Preview:")

# Show what a typical recommendation task looks like
test_session = dataset.sessions[100]  # Pick a test session
prompt_items, target = dataset.prepare_to_predict(test_session)
candidates, target_idx = dataset.create_candidate_pool(test_session)

print(f"User session example:")
print(f"  User ID: {test_session['user_id']}")
print(f"  Total items in session: {len(test_session['items'])}")
print(f"  Items for prediction context: {len(prompt_items)}")
print(f"  Target item to predict: {target}")

print(f"\nüìù Context items (what user has purchased):")
for i, item_id in enumerate(prompt_items[:5]):  # Show first 5
    item_name = dataset.get_item_name(item_id)
    print(f"  {i+1}. {item_name}")
if len(prompt_items) > 5:
    print(f"  ... and {len(prompt_items)-5} more items")

print(f"\nüéØ Prediction target:")
target_name = dataset.get_item_name(target)
print(f"  {target_name}")

print(f"\nüîÄ Candidate pool:")
print(f"  Total candidates: {len(candidates)}")
print(f"  Target is at position: {target_idx + 1}")
print(f"  Random sample of other candidates:")
other_candidates = [c for i, c in enumerate(candidates) if i != target_idx][:3]
for item_id in other_candidates:
    item_name = dataset.get_item_name(item_id)
    print(f"    - {item_name}")

üìä INTERACTIVE DATA ANALYSIS

üîç Session Length Distribution:
Min length: 5
Max length: 204
Average length: 8.88
Median length: 6.00

Most common session lengths:
  5 items: 7,162 sessions (32.0%)
  6 items: 4,221 sessions (18.9%)
  7 items: 2,680 sessions (12.0%)
  8 items: 1,811 sessions (8.1%)
  9 items: 1,366 sessions (6.1%)
  10 items: 881 sessions (3.9%)
  11 items: 695 sessions (3.1%)
  12 items: 558 sessions (2.5%)
  13 items: 439 sessions (2.0%)
  14 items: 361 sessions (1.6%)

üè∑Ô∏è Product Category Analysis:
Most common beauty product types:
  toilette: 194 products
  cream: 31 products
  lotion: 26 products
  oil: 14 products
  shampoo,: 13 products
  lotion,: 13 products
  cleanser,: 13 products
  conditioner,: 11 products
  cream,: 10 products
  shampoo: 9 products

üéØ Recommendation Task Preview:
User session example:
  User ID: A10CRH1O1H6RSV
  Total items in session: 5
  Items for prediction context: 4
  Target item to predict: B0002FCD5I

üìù Context items (w

# Beauty Dataset Exploration Summary

This notebook provides comprehensive exploration of the Amazon Beauty dataset:

## üìÅ File Structure
- **Input files**: `/agentic_recommender/data/inputs/`
  - `reviews_Beauty.json`: Raw user-item interactions  
  - `meta_Beauty.json`: Product metadata with titles and descriptions

- **Output files**: `/agentic_recommender/data/outputs/`
  - `beauty_dataset.pkl`: Complete processed dataset object
  - `beauty_stats.json`: Dataset statistics
  - `beauty_train/val/test.json`: Evaluation splits
  - `beauty_evaluation_samples.json`: Ready-to-use evaluation tasks
  - `beauty_item_to_name.json`: Item ID to name mappings

## üéØ What You Can Do
1. **Explore raw data**: See original Amazon review format and product metadata
2. **Analyze processed data**: View statistics, splits, and data quality
3. **Understand recommendation tasks**: See how sessions become prediction problems  
4. **Interactive analysis**: Examine distributions and product categories

## ‚úÖ Dataset Ready For
- Sequential recommendation model training
- Agentic recommendation workflows  
- Evaluation and benchmarking
- Product discovery and analysis