In [None]:
!wget https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Electronics.jsonl.gz

--2025-11-10 20:32:42--  https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Electronics.jsonl.gz
Resolving mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)... 169.228.63.88
Connecting to mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)|169.228.63.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6474438619 (6.0G) [application/gzip]
Saving to: ‘Electronics.jsonl.gz’


2025-11-10 20:37:42 (20.7 MB/s) - ‘Electronics.jsonl.gz’ saved [6474438619/6474438619]



In [None]:
!gunzip Electronics.jsonl.gz

Now, we will try a stratified random sampling method, stratifying by rating and by whether the review received any helpful votes.

In [None]:
import json
from collections import defaultdict

# Name of the helpful-vote field in each review record; change it if your
# data uses a different column name.
helpful_field = 'helpful_vote'  # or 'helpful_votes', as appropriate

# First pass over the full review file: tally how many reviews fall into each
# (rating, helpful / not_helpful) stratum.
strata_counts = defaultdict(int)

with open('Electronics.jsonl', 'r') as reviews_file:
    for raw_line in reviews_file:
        review = json.loads(raw_line)
        # A review is "helpful" when it has at least one helpful vote; a
        # missing field is treated as zero votes.
        if review.get(helpful_field, 0) > 0:
            helpful_bin = 'helpful'
        else:
            helpful_bin = 'not_helpful'
        strata_counts[(review.get('rating'), helpful_bin)] += 1

# Every record lands in exactly one stratum, so the grand total is the sum of
# the per-stratum tallies.
total_count = sum(strata_counts.values())

print('Total records:', total_count)
print('Example stratum and count:', list(strata_counts.items())[:5])


Total records: 43886944
Example stratum and count: [((3.0, 'not_helpful'), 2108960), ((1.0, 'not_helpful'), 3456772), ((5.0, 'not_helpful'), 23049103), ((5.0, 'helpful'), 4780645), ((4.0, 'not_helpful'), 4241541)]


In [None]:
# Overall number of reviews we aim to draw across all strata.
sample_size = 20000


def _proportional_target(stratum_size):
    """Return the sampling quota for a stratum holding `stratum_size` records.

    The quota is the stratum's proportional share of `sample_size`, rounded,
    capped at the stratum's own size and raised to at least 1 so that every
    observed stratum is represented.
    """
    share = int(round((sample_size / total_count) * stratum_size))
    capped = min(share, stratum_size)  # never ask for more than exists
    return max(1, capped)


# Quota per (rating, helpful_bin) stratum. Because each quota is rounded
# independently and floored at 1, the quotas may not sum exactly to sample_size.
strata_targets = {
    stratum: _proportional_target(size)
    for stratum, size in strata_counts.items()
}

print('Example stratum targets:', list(strata_targets.items())[:5])


Example stratum targets: [((3.0, 'not_helpful'), 961), ((1.0, 'not_helpful'), 1575), ((5.0, 'not_helpful'), 10504), ((5.0, 'helpful'), 2179), ((4.0, 'not_helpful'), 1933)]


In [None]:
import random
# Seed a dedicated generator so the reservoir draws, and therefore the saved
# sample, are identical on every run of the notebook.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

# Per-stratum reservoirs and the number of records seen so far in each stratum.
strata_samples = defaultdict(list)
strata_seen = defaultdict(int)

# Second pass over the review file: reservoir sampling within each stratum, so
# every record of a stratum has the same chance of ending up in its quota
# without holding the whole file in memory.
with open('Electronics.jsonl', 'r') as f:
    for line in f:
        record = json.loads(line)
        rating = record.get('rating')
        helpful_votes = record.get(helpful_field, 0)
        helpful_bin = 'helpful' if helpful_votes > 0 else 'not_helpful'
        key = (rating, helpful_bin)
        target = strata_targets.get(key, 0)
        if target == 0:
            continue  # skip strata not sampled
        strata_seen[key] += 1
        reservoir = strata_samples[key]
        if len(reservoir) < target:
            # Reservoir not full yet: keep the record unconditionally.
            reservoir.append(record)
        else:
            # Pick a slot uniformly among all records seen so far in this
            # stratum; the new record replaces an existing one only when the
            # slot falls inside the reservoir.
            slot = sample_rng.randrange(strata_seen[key])
            if slot < target:
                reservoir[slot] = record

In [None]:
# Flatten the per-stratum reservoirs into one list of review records.
final_sample = [
    review
    for reservoir in strata_samples.values()
    for review in reservoir
]

print('Final sample size:', len(final_sample))

# Persist the sample as JSON Lines: one review object per line.
with open('stratified_sample_20k.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(review) + '\n' for review in final_sample)


Final sample size: 20002


Finally, we will join the sampled reviews with the product metadata.

In [None]:
!wget https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Electronics.jsonl.gz

--2025-11-11 00:29:31--  https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Electronics.jsonl.gz
Resolving mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)... 169.228.63.88
Connecting to mcauleylab.ucsd.edu (mcauleylab.ucsd.edu)|169.228.63.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1312900427 (1.2G) [application/gzip]
Saving to: ‘meta_Electronics.jsonl.gz’


2025-11-11 00:30:34 (20.4 MB/s) - ‘meta_Electronics.jsonl.gz’ saved [1312900427/1312900427]



In [None]:
!gunzip meta_Electronics.jsonl.gz

In [None]:
import pandas as pd
# Load the saved stratified sample (JSON Lines, one review per line) into a DataFrame.
sample_df = pd.read_json(
    path_or_buf='stratified_sample_20k.jsonl',
    lines=True,
)

In [None]:
# Distinct parent ASINs present in the sample. Despite the name this is a set,
# which keeps the membership test in the metadata scan fast.
parent_asin_list = {parent_asin for parent_asin in sample_df['parent_asin']}

In [None]:
import json
# Metadata columns carried into the join; pick only what you need
# (bulky fields such as 'images' and 'videos' are excluded).
metadata_fields = [
    'parent_asin',
    'main_category',
    'title',
    'average_rating',
    'rating_number',
    'features',
    'description',
    'price',
    'store',
    'categories',
    'details',
    'bought_together',
]

# Stream the metadata file and keep only products referenced by the sample,
# projected onto the selected columns (absent columns become None).
filtered_meta = []
with open('meta_Electronics.jsonl', 'r') as meta_file:
    for raw_line in meta_file:
        meta = json.loads(raw_line)
        if meta.get('parent_asin') not in parent_asin_list:
            continue
        filtered_meta.append({col: meta.get(col) for col in metadata_fields})

meta_df_small = pd.DataFrame(filtered_meta)

In [None]:
# Left join on 'parent_asin' (must exist in both frames): every sampled review
# is kept and gains its product's metadata columns where a match exists.
merged_df = pd.merge(
    sample_df,
    meta_df_small,
    how='left',
    on='parent_asin',
)

In [None]:
# Preview the first five joined rows.
merged_df.head(n=5)

Unnamed: 0,rating,title_x,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,...,title_y,average_rating,rating_number,features,description,price,store,categories,details,bought_together
0,3,Seems like very cheap material,Not what I was expecting out of a tablet case....,[],B01IN62VMC,B01IN62VMC,AHEDCDWYN3VNREN65L2YVP4J6VYQ,2016-12-21 01:48:46.000,0,True,...,NEWSTYLE Tab E Lite 7.0 & Tab 3 Lite 7.0 Kids ...,4.3,838,[NEWSTYLE Exclusively Designed for Samsung Gal...,[],,NEWSTYLE,"[Electronics, Computers & Accessories, Tablet ...",{'Package Dimensions': '8.1 x 7.7 x 0.8 inches...,
1,3,Toshiba TV,I very much dislike this TV. I don't like the ...,[],B086VR2KY8,B086VR2KY8,AFXFVX322RYVWZIJUGENUI2JBYTA,2021-10-08 23:12:39.493,0,True,...,Toshiba 50LF621U21 50-inch Smart 4K UHD with D...,4.5,14684,[Fire TV brings together live-over-the air TV ...,[],,Toshiba,"[Electronics, Television & Video, Televisions,...","{'Brand Name': 'Toshiba', 'Item Weight': '23.1...",
2,3,"Great Picture, sound concerns",I purchased this item and it has great feature...,[],B004U5T2PA,B004U5T2PA,AGVUUJTVEDTII2BGR7JQIYI4OLAQ,2012-03-11 17:42:46.000,0,False,...,Vizio E472VL 47-Inch 1080p LCD TV - Black,3.1,39,"[VIZIO Internet Apps, Built-in WiFi, 1080p Ful...","[Product Description, VIZIO's 47"" Class LCD HD...",,VIZIO,"[Electronics, Television & Video, Televisions,...","{'Brand Name': 'VIZIO', 'Item Weight': '39.7 P...",
3,3,Okay,I usually purchase the Lexar Platinum II SD ca...,[],B007ADFV2M,B007CQRSZ0,AEZNLOY2SWMHZCVZ5SBQGUNP2H6Q,2013-04-30 22:30:44.000,0,True,...,Lexar Professional 400x 16GB SDHC UHS-I Flash ...,4.5,393,"[Impressive high-speed, Class 10 performance -...","[Product Description, The premium Lexar Profes...",,Lexar,"[Electronics, Computers & Accessories, Compute...","{'RAM': '16 GB', 'Brand': 'Lexar', 'Item model...",
4,3,Hard puter shell and breaks apart when falling,"These fell on the floor and it broke open, i g...",[],B07HNCL7G2,B07HNCL7G2,AGTB4RB6V7AT2DMGM7RG5RRNE6SQ,2019-08-07 21:59:36.700,0,True,...,SoundPEATS Trueair Wireless Earbuds with Charg...,4.2,833,[[Incredible Audio Performance] - Adopt advanc...,[],,SoundPEATS,"[Electronics, Headphones, Earbuds & Accessorie...",{'Product Dimensions': '0.98 x 0.86 x 0.64 inc...,


In [None]:
# Write the joined sample to CSV without the DataFrame index column.
merged_df.to_csv(
    path_or_buf='merged_electronics_sample.csv',
    index=False,
)