# Imports and paths

In [30]:
import os
import sys

# root path
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root to the Python path
if ROOT not in sys.path:
    sys.path.append(ROOT)

import polars as pl
import pandas as pd

## Paths

In [31]:
DATA_PATH = os.path.join(ROOT, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')

USERS_RAW_PATH = os.path.join(RAW_DATA_PATH, 'user_batches')
USERS_CLEAN_PATH = os.path.join(DATA_PATH, 'processed', 'users.parquet')

TRAIN_PATH = os.path.join(DATA_PATH, 'raw', 'train.csv')

PRODUCTS_PATH = os.path.join(DATA_PATH, 'raw', 'products.pkl')

# User Data (API Call)

In [32]:
from src.data.api_calls import fetch_all_user_ids, fetch_user_data

all_ids = fetch_all_user_ids()

In [33]:
import random
random_id = random.choice(all_ids)
fetch_user_data(random_id)

{'user_id': 487435,
 'values': {'country': [25], 'R': [23], 'F': [21], 'M': [25.084799999999994]}}

## Load all batches, merge and save

In [34]:
from src.data.loaders import PolarsLoader

loader = PolarsLoader(sampling=False, file_type='parquet')

In [35]:
down_users = []

# Iterate through all files in the directory
for file_name in os.listdir(USERS_RAW_PATH):
    file_path = os.path.join(USERS_RAW_PATH, file_name)
    
    # Ensure the file is a parquet file before processing
    if file_name.endswith('.parquet'):
        data = loader.load_data(path=file_path)
        users_ids = data['user_id']
        down_users.extend(users_ids)  # Combine all user_ids into a single list
        print(f"Processed {file_name}: {len(users_ids)} user_ids")

print("Final amount of users: ", len(down_users))

Processed user_batch_1.parquet: 100000 user_ids
Processed user_batch_2.parquet: 100000 user_ids
Processed user_batch_3.parquet: 100000 user_ids
Processed user_batch_5.parquet: 100000 user_ids
Processed user_batch_6.parquet: 57006 user_ids
Processed user_batch_4.parquet: 100000 user_ids
Final amount of users:  557006


In [36]:
final_df = pl.DataFrame()

# Iterate through all files in the directory
for file_name in os.listdir(USERS_RAW_PATH):
    file_path = os.path.join(USERS_RAW_PATH, file_name)

    data = loader.load_data(path=file_path)
    final_df = pl.concat([final_df, data])
    print(f"Processed {file_name}")

# Save the final DataFrame
final_df.write_parquet(USERS_CLEAN_PATH)

print(f"Final combined DataFrame saved at {USERS_CLEAN_PATH}")
print('\n', final_df)

Processed user_batch_1.parquet
Processed user_batch_2.parquet
Processed user_batch_3.parquet
Processed user_batch_5.parquet
Processed user_batch_6.parquet
Processed user_batch_4.parquet
Final combined DataFrame saved at /home/ezemriv/other_projects/hackathon-inditex-data-recommender/data/processed/users.parquet

 shape: (557_006, 5)
┌─────────┬─────┬─────┬───────────┬─────────┐
│ country ┆ R   ┆ F   ┆ M         ┆ user_id │
│ ---     ┆ --- ┆ --- ┆ ---       ┆ ---     │
│ i64     ┆ i64 ┆ i64 ┆ f64       ┆ i64     │
╞═════════╪═════╪═════╪═══════════╪═════════╡
│ 25      ┆ 30  ┆ 0   ┆ 0.0       ┆ 430102  │
│ 25      ┆ 177 ┆ 1   ┆ 75.9      ┆ 134198  │
│ 25      ┆ 32  ┆ 61  ┆ 37.694058 ┆ 134207  │
│ 25      ┆ 74  ┆ 86  ┆ 11.64094  ┆ 180365  │
│ 25      ┆ 79  ┆ 5   ┆ 30.283333 ┆ 430101  │
│ …       ┆ …   ┆ …   ┆ …         ┆ …       │
│ 25      ┆ 155 ┆ 9   ┆ 17.423636 ┆ 389294  │
│ 25      ┆ 62  ┆ 16  ┆ 45.104706 ┆ 389292  │
│ 25      ┆ 8   ┆ 74  ┆ 36.052632 ┆ 389298  │
│ 25      ┆ 15  ┆ 26 

# Train/Test Data

In [48]:
loader = PolarsLoader(sampling=True)
train = loader.load_data(path=TRAIN_PATH)
print(train.estimated_size("mb"))

65.05634880065918


In [49]:
train.sample(5)

session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
i64,date,datetime[μs],i64,f64,i64,i64,i64,f64
3197684,2024-06-10,2024-06-10 20:07:40.460,1,,34,12600,1,24.0
4637688,2024-06-07,2024-06-07 17:14:17.201,0,217888.0,25,26812,1,24.0
1770026,2024-06-07,2024-06-07 19:16:29.666,0,,25,16138,3,24.0
411964,2024-06-11,2024-06-11 13:28:47.619,0,,57,5818,1,24.0
1979467,2024-06-13,2024-06-13 23:38:23.669,0,208183.0,25,21999,1,24.0


In [50]:
train.null_count()

session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,852533,0,0,0,15


In [52]:
# Get value counts for all columns
value_counts = {col: train[col].n_unique() for col in train.columns}

# Print value counts for each column
for col, counts in value_counts.items():
    print(f"Unique counts for {col}: {counts}\n")

Unique counts for session_id: 95364

Unique counts for date: 15

Unique counts for timestamp_local: 999488

Unique counts for add_to_cart: 2

Unique counts for user_id: 10786

Unique counts for country: 4

Unique counts for partnumber: 26380

Unique counts for device_type: 3

Unique counts for pagetype: 20



In [54]:
# # Check distribution of variables so I can downcast some
# df = train.to_pandas()
# df.describe()

In [55]:
train = train.with_columns([
    pl.col("session_id").cast(pl.UInt32),       # Downcast to unsigned 32-bit integer
    pl.col("add_to_cart").cast(pl.UInt8),       # Downcast to unsigned 8-bit integer
    pl.col("user_id").cast(pl.UInt32),         # Downcast to 32-bit float
    pl.col("country").cast(pl.UInt8),     # Convert to categorical
    pl.col("partnumber").cast(pl.UInt16),       # Downcast to unsigned 32-bit integer
    pl.col("device_type").cast(pl.UInt8),       # Downcast to unsigned 8-bit integer
    pl.col("pagetype").cast(pl.UInt8),        # Downcast to 32-bit float
])

print(train.estimated_size("mb"))

25.033950805664062


In [56]:
def caster(df, train=True):
    print(f"Initial Size: {df.estimated_size('mb')}")
    
    df = df.with_columns([
        pl.col("session_id").cast(pl.UInt32),
        pl.col("user_id").cast(pl.UInt32),
        pl.col("country").cast(pl.UInt8),
        pl.col("partnumber").cast(pl.UInt16),
        pl.col("device_type").cast(pl.UInt8),
        pl.col("pagetype").cast(pl.UInt8),
    ])

    if train:
        df = df.with_columns([
                pl.col("add_to_cart").cast(pl.UInt8)])

    print(f"Final Size: {df.estimated_size('mb')}")

In [57]:
caster(train)

Initial Size: 25.033950805664062
Final Size: 25.033950805664062


### Cast down and save as parquet (also test)

Done with script on ROOT

# Products

In [5]:
prods = pd.read_pickle(PRODUCTS_PATH)
print(prods.head())
print(prods.shape)

  discount                                          embedding  partnumber  \
0        0  [-0.13401361, -0.1200429, -0.016117405, -0.167...       32776   
1        0  [-0.0949274, -0.107294075, -0.16559914, -0.174...       41431   
2        0  [-0.12904441, -0.07724628, -0.09799071, -0.164...       39419   
3        1  [-0.12783332, -0.133868, -0.10101265, -0.18888...       36087   
4        1  [-0.14092924, -0.1258284, -0.10809927, -0.1765...       34132   

   color_id  cod_section  family  
0        85          4.0      73  
1       135          4.0      73  
2       339          4.0      73  
3       135          4.0      73  
4         3          4.0      73  
(43692, 6)
