# Setup

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2 

In [6]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from datasets import load_dataset
from pydantic import BaseModel
from dotenv import load_dotenv
import os

In [5]:
# Read environment variables from a .env file. override=True lets values from
# the file replace variables already set in the process environment.
# The result is bound to `_` so the call's return value is not echoed as cell output.
_ = load_dotenv(override=True)

## Arguments

In [11]:
class Args(BaseModel):
    """Notebook-wide configuration: column names, table names and dataset identifiers.

    Every field has a default, so ``Args()`` yields the standard configuration.
    """

    # Seed value; not consumed in the cells shown here — presumably used for
    # reproducible sampling later in the notebook (TODO confirm).
    random_seed: int = 41

    # Column names in the ratings data. user_col and item_col together form
    # the key used to de-duplicate ratings after loading.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Table names for the raw ratings and raw metadata — presumably the
    # destination tables in a SQL database (TODO confirm against the cells
    # that write them).
    transaction_table_name: str = "amz_review_rating_raw"
    metadata_table_name: str = "amz_review_metadata_raw"

    # Hugging Face dataset repository passed to load_dataset.
    hf_dataset_name: str = "McAuley-Lab/Amazon-Reviews-2023"
    # Dataset configuration for the ratings. The 0-core variant is loaded to
    # demonstrate real-world problems: cold start and sparse data.
    amz_rating_hf_dataset_path: str = "0core_rating_only_Electronics"
    # Dataset configuration for the item metadata (not loaded in the cells shown here).
    amz_metadata_hf_dataset_path: str = "raw_metadata_Electronics"

# Build the configuration from its defaults and print it as indented JSON so
# the settings used for this run are recorded in the cell output.
args = Args()
print(args.model_dump_json(indent=2))


{
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "transaction_table_name": "amz_review_rating_raw",
  "metadata_table_name": "amz_review_metadata_raw",
  "hf_dataset_name": "McAuley-Lab/Amazon-Reviews-2023",
  "amz_rating_hf_dataset_path": "0core_rating_only_Electronics",
  "amz_metadata_hf_dataset_path": "raw_metadata_Electronics"
}


## Load dataset

In [15]:
# Fetch the Electronics ratings from the Hugging Face Hub. The recorded output
# of this cell shows a download of roughly 2.5 GB, so expect it to be slow on
# first run.
# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading script to run locally — only keep it for repositories you trust.
amz_rating_raw = load_dataset(
    path=args.hf_dataset_name,
    name=args.amz_rating_hf_dataset_path,
    split="full",
    trust_remote_code=True,
)

Downloading data:   0%|          | 0.00/2.52G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [None]:
# Convert the Hugging Face dataset to a pandas DataFrame and keep only the
# first row for each (user, item) pair, so a user contributes at most one
# rating per item. The chained call replaces the in-place mutation and yields
# the same frame.
amz_rating_df = amz_rating_raw.to_pandas().drop_duplicates(
    subset=[args.user_col, args.item_col]
)

# `logger` is not imported or defined by any earlier cell, so the previous
# logger.info(...) call raised NameError on a fresh kernel. Report the shape
# with print instead, as the Arguments cell does for its output.
print(f"amz_rating_df.shape: {amz_rating_df.shape}")