# Task 1

Original submission file format:

```
{'target': {'query_1': {'partnumber': 17265},
  'query_2': {'user_id': 34572},
  'query_3': {'average_previous_visits': 5.52},
  'query_4': {'device_type': 23},
  'query_5': {'user_id': 123734},
  'query_6': {'unique_families': 2357},
  'query_7': {'1': 3, '2': 5, '3': 3, '4': 9, '5': 5, '6': 1}}}
```

In [1]:
import os
import sys

# Project root: the parent of the current working directory.
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the project root on the Python path, unless it is already listed.
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

import polars as pl
import pandas as pd
from src.data.loaders import PolarsLoader

## Paths

In [2]:
from config import (USERS_DATA_PATH, TRAIN_DATA_PATH, 
                    TEST_DATA_PATH, SUBMISSION_1_PATH, EX_SUBMISSION_1_PATH, 
                    PRODUCTS_PARQUET_PATH, PRODUCTS_DATA_PATH,
                    TRAIN_PARQUET_PATH, TEST_PARQUET_PATH)

In [3]:
# Load the sample submission JSON; later cells overwrite entries under
# submission['target'] with the computed answers.
import json

# Use a context manager so the file handle is closed as soon as parsing finishes.
with open(EX_SUBMISSION_1_PATH) as sample_file:
    submission = json.load(sample_file)

## Query 1

**Q1:** Which product (`partnumber`) with `color_id` equal to 3 belongs to the lowest `family` code with a `discount`?

In [4]:
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load PRODUCTS_DATA_PATH from a trusted source.
prods = pl.from_pandas(pd.read_pickle(PRODUCTS_DATA_PATH))

# Narrow down to discounted products with color_id 3 first, so that the minimum
# `family` is taken over these candidates only. An aggregate such as MIN(family)
# placed directly in a SQL WHERE clause is not standard SQL and is not restricted
# to the rows matched by the other predicates, so it returns no row whenever the
# globally lowest family has no discounted product with color_id 3.
q1_candidates = prods.filter((pl.col("color_id") == 3) & (pl.col("discount") == 1))
q1_rows = q1_candidates.filter(pl.col("family") == pl.col("family").min())

# .item() requires exactly one value, so an empty or ambiguous answer fails loudly.
q1 = q1_rows["partnumber"].item()

print(f"q1: {q1}")
submission['target']['query_1'] = {'partnumber': q1}

q1: 17265


## Query 2

**Q2:** In the country where most users have made purchases totaling less than 500 (`M`), which user has the lowest purchase frequency (`F`), the most recent purchase (highest `R`) and the lowest `user_id`? Follow the given order of variables as the sorting priority.

In [5]:
# Users table, read through the project loader with sampling disabled.
loader = PolarsLoader(file_type='parquet', sampling=False)
users = loader.load_data(USERS_DATA_PATH)

In [6]:
# Count the rows of `users` with M below 500, per country.
low_spenders_by_country = (
    users
    .filter(pl.col('M') < 500)
    .group_by('country')
    .len()
)

# Keep the country with the largest count.
country = (
    low_spenders_by_country
    .sort(by='len', descending=True)
    .head(1)
)['country'].item()
country

25

In [7]:
# Candidates: every row of `users` for the selected country.
users_in_country = users.filter(pl.col('country') == country)

# Sorting priority: F ascending, then R descending, then user_id ascending.
ranked_users = users_in_country.sort(
    ["F", "R", "user_id"],
    descending=[False, True, False],
)
q2 = ranked_users['user_id'].head(1).item()

print(f"Q2: {q2}")
submission['target']['query_2'] = {'user_id': q2}

Q2: 187374


## Query 3 (Run on Kaggle)

**Q3:** Among the products that were added to the cart at least once, how many times is a product visited, on average, before it is added to the cart? Give the answer with 2 decimals.

In [8]:
# NOTE: this whole cell is disabled (every line is commented out); the Query 3
# heading states that the computation is run on Kaggle instead.
# NOTE(review): the `add_to_cart_cumsum == 0` filter below also keeps every
# interaction from sessions in which the product was never added to the cart,
# and visit_count is then totalled per product across all sessions — confirm
# this matches the intended "visits before adding to cart" definition.
# %%time
# loader = PolarsLoader(sampling=True)

# # Load data lazily
# train = (loader
#          .load_data_lazy(path=TRAIN_DATA_PATH)
#          .select(["session_id", "partnumber", "add_to_cart", "timestamp_local"])
#         )

# # Products that were added to the cart
# products_added = (
#     train
#     .filter(pl.col("add_to_cart") == 1)
#     .select("partnumber")
#     .unique()  # Stay in LazyFrame
# )

# # Keep only interactions for these products
# interactions_for_cart_products = (
#     train
#     .join(products_added, on="partnumber", how="inner")  # Lazy join instead of is_in
#     .sort(["session_id", "partnumber", "timestamp_local"])  # Sorting will also be deferred
# )

# # Add a cumulative flag for add_to_cart in each group
# grouped_data = (
#     interactions_for_cart_products
#     .group_by(["session_id", "partnumber"])
#     .agg([
#         pl.col("add_to_cart").cum_sum().alias("add_to_cart_cumsum"),
#         pl.col("add_to_cart"),
#         pl.col("timestamp_local"),
#     ])
#     .explode(["add_to_cart_cumsum", "add_to_cart", "timestamp_local"]) 
# )

# # Calculate pre-cart visits
# pre_cart_visits = (
#     grouped_data
#     .filter(pl.col("add_to_cart_cumsum") == 0)
#     .group_by("partnumber")
#     .agg(pl.col("session_id").count().alias("visit_count"))
# )

# # Collect only the final aggregated result
# final_result = pre_cart_visits.collect(streaming=True)

# # Calculate the average visits before add_to_cart
# average_visits = round(final_result["visit_count"].mean(), 2)

# print("Average Visits Before Adding to Cart:", average_visits)

**Testing loading the new train data as down-cast parquet**

In [9]:
%%time
loader = PolarsLoader(sampling=True, file_type="csv")
df = loader.load_data(TRAIN_DATA_PATH)
df.group_by("session_id").agg(pl.col("add_to_cart").sum().alias("add_to_cart")).head()

CPU times: user 6.95 s, sys: 536 ms, total: 7.48 s
Wall time: 1.36 s


session_id,add_to_cart
i64,i64
2719156,0
3630006,0
2026934,0
1318164,8
2691071,0


In [10]:
%%time
loader = PolarsLoader(sampling=True, file_type="parquet")
df = loader.load_data(TRAIN_PARQUET_PATH)
df.group_by("session_id").agg(pl.col("add_to_cart").sum().alias("add_to_cart")).head()

CPU times: user 324 ms, sys: 51.4 ms, total: 375 ms
Wall time: 104 ms


session_id,add_to_cart
u32,i64
3310055,0
1118147,0
3938209,1
4356452,0
1717290,0


In [11]:
# Query 3 answer, hard-coded from the Kaggle run.
average_visits = 1458.40
submission['target']['query_3'] = dict(average_previous_visits=average_visits)

## Query 4

**Q4:** Which device (`device_type`) is most frequently used by users to make purchases (`add_to_cart` = 1) of discounted products (`discount` = 1)?

In [12]:
# Passed as `sampling=` to the PolarsLoader instances used for Query 4.
# NOTE(review): presumably True makes the loader read only a subset of the rows —
# confirm the Query 4 answer also holds with SAMPLING = False.
SAMPLING = True

In [13]:
# Load only the two columns Query 4 needs and keep the discounted products.
prod_loader = PolarsLoader(file_type='parquet', sampling=SAMPLING)
prods = (
    prod_loader
    .load_data(PRODUCTS_PARQUET_PATH, select_cols=['discount', 'partnumber'])
    .filter(pl.col('discount') == 1)
)

In [14]:
# Train interactions, restricted to the rows where add_to_cart equals 1.
loader = PolarsLoader(file_type="parquet", sampling=SAMPLING)
is_cart_add = pl.col("add_to_cart") == 1
train = loader.load_data(TRAIN_PARQUET_PATH).filter(is_cart_add)

In [15]:
# Inner join: keep only the `train` rows whose partnumber also appears in `prods`.
filtered_data = train.join(
    prods,
    how='inner',
    on='partnumber',
)
filtered_data

session_id,date,timestamp_local,add_to_cart,user_id,country,partnumber,device_type,pagetype,discount
u32,date,datetime[μs],u8,u32,u8,u16,u8,u8,u8
3181,2024-06-01,2024-06-01 13:32:33.136,1,,57,42698,1,24,1
3181,2024-06-01,2024-06-01 13:33:46.088,1,,57,25177,1,24,1
3181,2024-06-01,2024-06-01 13:31:31.108,1,,57,25177,1,24,1
16251,2024-06-10,2024-06-10 14:07:42.937,1,,57,1865,1,24,1
16251,2024-06-10,2024-06-10 14:29:55.528,1,,57,3879,1,24,1
…,…,…,…,…,…,…,…,…,…
644293,2024-06-11,2024-06-11 09:23:03.061,1,,29,17254,1,24,1
646630,2024-06-09,2024-06-09 16:59:47.453,1,451583,25,28796,1,24,1
647335,2024-06-10,2024-06-10 13:53:32.479,1,,57,42529,1,24,1
655524,2024-06-04,2024-06-04 22:51:00.642,1,,29,26778,1,24,1


In [16]:
# import random
# filtered_data.filter(pl.col("session_id") == random.choice(filtered_data["session_id"].unique())) # one device per session id

In [17]:
# Tally the joined rows per device type (counts non-null session_id values).
device_counts = filtered_data.group_by("device_type").agg(
    count=pl.col("session_id").count()
)
device_counts

device_type,count
u8,u32
3,311
1,3637
2,60


In [18]:
# Rank devices by count, largest first, and read off the top device_type.
devices_by_usage = device_counts.sort("count", descending=True)
most_frequent_device = devices_by_usage.select(pl.col("device_type").first()).item()

print("Most Frequent Device for Discounted Purchases:", most_frequent_device)

Most Frequent Device for Discounted Purchases: 1


In [19]:
# Record the Query 4 answer in the submission payload.
submission['target']['query_4'] = dict(device_type=most_frequent_device)

## Query 5

**Q5:** Among users with purchase frequency (`F`) in the top 3 within their purchase country, who has interacted with the most products (`partnumber`) in sessions conducted from a device with identifier 3 (`device_type` = 3)?

# Replace submission file

In [20]:
# Write the assembled answers to the submission file as JSON with a 4-space indent.
with open(SUBMISSION_1_PATH, 'w') as submission_file:
    json.dump(submission, fp=submission_file, indent=4)