# Import products and queries dataset

In [1]:
!wget https://github.com/amazon-science/esci-data/raw/refs/heads/main/shopping_queries_dataset/shopping_queries_dataset_products.parquet
!wget https://github.com/amazon-science/esci-data/raw/refs/heads/main/shopping_queries_dataset/shopping_queries_dataset_examples.parquet

--2025-06-24 12:30:59--  https://github.com/amazon-science/esci-data/raw/refs/heads/main/shopping_queries_dataset/shopping_queries_dataset_products.parquet
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/amazon-science/esci-data/refs/heads/main/shopping_queries_dataset/shopping_queries_dataset_products.parquet [following]
--2025-06-24 12:30:59--  https://media.githubusercontent.com/media/amazon-science/esci-data/refs/heads/main/shopping_queries_dataset/shopping_queries_dataset_products.parquet
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1108857465 (1.0G) [application/octet-st

In [2]:
import pandas as pd

# Load both tables from the current working directory, which is where the
# download cell saves them. Relative paths keep this cell runnable outside of
# Kaggle, where /kaggle/working does not exist.
df_products = pd.read_parquet('shopping_queries_dataset_products.parquet')
df_queries = pd.read_parquet('shopping_queries_dataset_examples.parquet')

In [3]:
# One row per non-US query (first occurrence of each query_id), showing only
# the query text and the locale it belongs to.
(
    df_queries
    .loc[df_queries['product_locale'].ne('us')]
    .drop_duplicates(subset='query_id')
    .loc[:, ['query', 'product_locale']]
)

Unnamed: 0,query,product_locale
70,!solid camiseta sin manga,es
109,"""vitamina c""",es
990,#27 rubio oscuro extensiones sin clip,es
3289,% pura manteca de karite sin aditivos sin olor,es
3502,&ハニー シャンプー,jp
...,...,...
2621208,ﾎﾙｽﾀｰ,jp
2621224,ﾏｼﾞｯｸﾘﾝ,jp
2621240,ﾒｽﾃｨﾝ,jp
2621256,ﾚﾃﾞｨｰｽ水着,jp


It seems that there are no cross-language product recommendations.

# Preprocessing

## Number of unique products and queries in the small version of the dataset

In [4]:
# Keep only the examples flagged as belonging to the small version of the
# dataset, then count how many distinct queries they contain.
df_queries_small = df_queries.loc[df_queries['small_version'].eq(1)]
df_queries_small['query_id'].nunique(dropna=False)

48300

In [5]:
# Restrict the catalogue to products referenced by the small-version queries.
# Both tables carry product_locale, so a product is keyed by
# (product_id, product_locale) here: product_id alone is not guaranteed to be
# unique across locales, and keying on it alone would collapse such a product
# onto whichever locale happens to come first. If ids never repeat across
# locales the result is identical to joining on product_id only.
product_key = ['product_id', 'product_locale']
df_products_small = pd.merge(
    df_products.drop_duplicates(subset=product_key),
    df_queries_small[product_key].drop_duplicates(),
    on=product_key,
)

In [6]:
# (rows, columns) of the full product table as loaded from the parquet file.
df_products.shape

(1814924, 7)

In [7]:
# (rows, columns) of the product table after restricting it to products
# referenced by the small-version queries.
df_products_small.shape

(879141, 7)

There are approximately 50k unique queries and 0.9M unique products in the small version of the dataset.

## Replace null values with empty strings

In [8]:
# Number of missing values in each column of the small-version query examples.
null_counts = df_queries_small.isna().sum()
print(null_counts)

example_id        0
query             0
query_id          0
product_id        0
product_locale    0
esci_label        0
small_version     0
large_version     0
split             0
dtype: int64


In [9]:
# Number of missing values in each column of the small-version product table.
null_counts = df_products_small.isna().sum()
print(null_counts)

product_id                   0
product_title                0
product_description     433869
product_bullet_point    144381
product_brand            75769
product_color           348724
product_locale               0
dtype: int64


In [10]:
# Replace missing values with '' and lower-case the text of every column
# except product_id. product_id is the join key against df_queries_small,
# which is saved with its original casing; lower-casing the ids on this side
# only would stop the two exports from matching wherever an id contains
# upper-case characters. Series.map is used per column so the cell does not
# depend on DataFrame.map, which only exists in newer pandas releases.
text_columns = [col for col in df_products_small.columns if col != 'product_id']
df_products_small = df_products_small.assign(**{
    col: df_products_small[col].map(lambda x: str(x).lower() if pd.notna(x) else '')
    for col in text_columns
})

In [11]:
# Re-check the per-column missing-value counts after the clean-up step.
null_counts = df_products_small.isna().sum()
print(null_counts)

product_id              0
product_title           0
product_description     0
product_bullet_point    0
product_brand           0
product_color           0
product_locale          0
dtype: int64


# Save the dataset

In [12]:
# Write one product CSV per locale, then the small-version query examples.
for locale in ('us', 'es', 'jp'):
    locale_products = df_products_small.loc[df_products_small['product_locale'] == locale]
    locale_products.to_csv(f'amazon_esci_products_{locale}_small.csv', index=False)

df_queries_small.to_csv('amazon_esci_queries_small.csv', index=False)

## Subsample 100k and 10k

Maintain the us/jp/es locale distribution in each subsample.

In [13]:
def sample_df(total_rows, df, random_state=42):
    """Write a locale-stratified random subsample of ``df`` to per-locale CSV files.

    Every locale contributes rows in proportion to its share of ``df``. The
    per-locale counts are allocated with the largest-remainder method, so they
    always add up to exactly ``total_rows``; rounding each locale's count
    independently can leave the total one or more rows off (e.g. three equally
    sized locales and ``total_rows=1000`` would yield 999 rows).

    Only the ``us``, ``es`` and ``jp`` locales are written, each to
    ``<total_rows/1000>k_products_<locale>.csv`` in the current directory.
    Rows sampled for any other locale are not saved.

    Args:
        total_rows: Total number of rows to draw across all locales.
        df: DataFrame with a ``product_locale`` column.
        random_state: Seed passed to ``DataFrame.sample`` so the subsample is
            reproducible.

    Raises:
        ValueError: If ``df`` has no rows with a locale, or ``total_rows`` is
            negative or larger than the number of such rows.
    """
    total_rows = int(total_rows)
    distribution = df['product_locale'].value_counts(normalize=True)
    print("Distribution:", distribution)

    counts = df['product_locale'].value_counts()
    population = int(counts.sum())
    if population == 0:
        raise ValueError("Cannot sample from a DataFrame with no product_locale values")
    if total_rows < 0 or total_rows > population:
        raise ValueError(
            f"total_rows must be between 0 and {population}, got {total_rows}"
        )

    # Integer arithmetic keeps the allocation exact: floor of each locale's
    # proportional share first, then hand the rows still missing to the
    # locales with the largest remainders.
    scaled = counts * total_rows
    samples_per_class = scaled // population
    shortfall = total_rows - int(samples_per_class.sum())
    if shortfall > 0:
        leftovers = (scaled % population).sort_values(ascending=False, kind='stable')
        samples_per_class.loc[leftovers.index[:shortfall]] += 1

    df_sample = pd.concat([
        df[df['product_locale'] == cls].sample(n=int(n), random_state=random_state)
        for cls, n in samples_per_class.items()
    ])

    label = f'{int(total_rows / 1000)}k'
    for locale in ('us', 'es', 'jp'):
        locale_sample = df_sample[df_sample['product_locale'] == locale]
        locale_sample.to_csv(f'{label}_products_{locale}.csv', index=False)

    print(f"{label} samples saved")

In [14]:
# Draw 100,000 products in total and write them to 100k_products_{us,es,jp}.csv.
sample_df(100_000, df_products_small)

Distribution: product_locale
us    0.545592
jp    0.263991
es    0.190418
Name: proportion, dtype: float64
100k samples saved


In [15]:
# Draw 10,000 products in total and write them to 10k_products_{us,es,jp}.csv.
sample_df(10_000, df_products_small)

Distribution: product_locale
us    0.545592
jp    0.263991
es    0.190418
Name: proportion, dtype: float64
10k samples saved
