# Imports and paths

In [2]:
import os
import sys

# root path
ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root to the Python path
if ROOT not in sys.path:
    sys.path.append(ROOT)

import polars as pl
import pandas as pd

## Paths

In [3]:
DATA_PATH = os.path.join(ROOT, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')

USERS_RAW_PATH = os.path.join(RAW_DATA_PATH, 'user_batches')
USERS_CLEAN_PATH = os.path.join(DATA_PATH, 'processed', 'users.parquet')

TRAIN_PATH = os.path.join(DATA_PATH, 'raw', 'train.csv')

PRODUCTS_PATH = os.path.join(DATA_PATH, 'raw', 'products.pkl')

# User Data (API Call)

In [14]:
from src.data.api_calls import fetch_all_user_ids, fetch_user_data

all_ids = fetch_all_user_ids()

In [15]:
import random
random_id = random.choice(all_ids)
fetch_user_data(random_id)

{'user_id': 305432,
 'values': {'country': [25], 'R': [25], 'F': [163], 'M': [17.012304147465397]}}

## Load all batches, merge and save

In [17]:
from src.data.loaders import PolarsLoader

loader = PolarsLoader(sampling=False, file_type='parquet')

In [18]:
down_users = []

# Iterate through all files in the directory
for file_name in os.listdir(USERS_RAW_PATH):
    file_path = os.path.join(USERS_RAW_PATH, file_name)
    
    # Ensure the file is a parquet file before processing
    if file_name.endswith('.parquet'):
        data = loader.load_data(path=file_path)
        users_ids = data['user_id']
        down_users.extend(users_ids)  # Combine all user_ids into a single list
        print(f"Processed {file_name}: {len(users_ids)} user_ids")

print("Final amount of users: ", len(down_users))

Processed user_batch_1.parquet: 100000 user_ids
Processed user_batch_2.parquet: 100000 user_ids
Processed user_batch_3.parquet: 100000 user_ids
Processed user_batch_5.parquet: 100000 user_ids
Processed user_batch_6.parquet: 57006 user_ids
Processed user_batch_4.parquet: 100000 user_ids
Final amount of users:  557006


In [20]:
final_df = pl.DataFrame()

# Iterate through all files in the directory
for file_name in os.listdir(USERS_RAW_PATH):
    file_path = os.path.join(USERS_RAW_PATH, file_name)

    data = loader.load_data(path=file_path)
    final_df = pl.concat([final_df, data])
    print(f"Processed {file_name}")

# Save the final DataFrame
final_df.write_parquet(USERS_CLEAN_PATH)

print(f"Final combined DataFrame saved at {USERS_CLEAN_PATH}")
print('\n', final_df)

Processed user_batch_1.parquet
Processed user_batch_2.parquet
Processed user_batch_3.parquet
Processed user_batch_5.parquet
Processed user_batch_6.parquet
Processed user_batch_4.parquet
Final combined DataFrame saved at /home/ezemriv/other_projects/hackathon-inditex-data-recommender/data/processed/users.parquet

 shape: (557_006, 5)
┌─────────┬─────┬─────┬───────────┬─────────┐
│ country ┆ R   ┆ F   ┆ M         ┆ user_id │
│ ---     ┆ --- ┆ --- ┆ ---       ┆ ---     │
│ i64     ┆ i64 ┆ i64 ┆ f64       ┆ i64     │
╞═════════╪═════╪═════╪═══════════╪═════════╡
│ 25      ┆ 30  ┆ 0   ┆ 0.0       ┆ 430102  │
│ 25      ┆ 177 ┆ 1   ┆ 75.9      ┆ 134198  │
│ 25      ┆ 32  ┆ 61  ┆ 37.694058 ┆ 134207  │
│ 25      ┆ 74  ┆ 86  ┆ 11.64094  ┆ 180365  │
│ 25      ┆ 79  ┆ 5   ┆ 30.283333 ┆ 430101  │
│ …       ┆ …   ┆ …   ┆ …         ┆ …       │
│ 25      ┆ 155 ┆ 9   ┆ 17.423636 ┆ 389294  │
│ 25      ┆ 62  ┆ 16  ┆ 45.104706 ┆ 389292  │
│ 25      ┆ 8   ┆ 74  ┆ 36.052632 ┆ 389298  │
│ 25      ┆ 15  ┆ 26 

# Train/Test Data

In [21]:
loader = PolarsLoader(sampling=True)
train = loader.load_data(path=TRAIN_PATH)

In [22]:
train.sample(10)

session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
i64,date,datetime[μs],i64,f64,i64,i64,i64,f64
4846732,2024-06-11,2024-06-12 01:53:30.481,0,,29,971,1,24.0
2242123,2024-06-05,2024-06-05 15:42:51.003,0,,57,27628,1,24.0
2073222,2024-06-01,2024-06-01 23:10:52.713,0,,29,15694,1,24.0
178190,2024-06-01,2024-06-01 22:33:21.154,0,,34,30304,1,24.0
4872638,2024-06-08,2024-06-08 17:16:06.273,0,,29,16284,1,24.0
938754,2024-06-02,2024-06-02 15:01:16.679,0,,57,1131,1,24.0
3572864,2024-06-01,2024-06-01 04:02:04.379,0,,57,27247,1,24.0
3743878,2024-06-01,2024-06-01 23:27:48.841,0,,34,40182,1,24.0
2025525,2024-06-11,2024-06-11 06:41:57.564,0,216793.0,25,8572,1,24.0
290421,2024-06-09,2024-06-09 20:42:22.176,0,,29,40006,1,24.0


In [23]:
train.null_count()

session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,852533,0,0,0,15


# Products

In [5]:
prods = pd.read_pickle(PRODUCTS_PATH)
print(prods.head())
print(prods.shape)

  discount                                          embedding  partnumber  \
0        0  [-0.13401361, -0.1200429, -0.016117405, -0.167...       32776   
1        0  [-0.0949274, -0.107294075, -0.16559914, -0.174...       41431   
2        0  [-0.12904441, -0.07724628, -0.09799071, -0.164...       39419   
3        1  [-0.12783332, -0.133868, -0.10101265, -0.18888...       36087   
4        1  [-0.14092924, -0.1258284, -0.10809927, -0.1765...       34132   

   color_id  cod_section  family  
0        85          4.0      73  
1       135          4.0      73  
2       339          4.0      73  
3       135          4.0      73  
4         3          4.0      73  
(43692, 6)
