# Task 1

Original submission file format:

```
{'target': {'query_1': {'partnumber': 17265},
  'query_2': {'user_id': 34572},
  'query_3': {'average_previous_visits': 5.52},
  'query_4': {'device_type': 23},
  'query_5': {'user_id': 123734},
  'query_6': {'unique_families': 2357},
  'query_7': {'1': 3, '2': 5, '3': 3, '4': 9, '5': 5, '6': 1}}}
```

In [15]:
import os
import sys

# Project root: the parent of the current working directory
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the root on the import path once, so project-local packages
# (e.g. `src`) can be imported from this notebook
if ROOT not in sys.path:
    sys.path.append(ROOT)

import polars as pl
import pandas as pd

## Paths

In [16]:
# Data directories, all anchored at the project root
DATA_PATH = os.path.join(ROOT, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')

# Raw inputs
USERS_RAW_PATH = os.path.join(RAW_DATA_PATH, 'user_batches')
TRAIN_PATH = os.path.join(RAW_DATA_PATH, 'train.csv')
PRODUCTS_PATH = os.path.join(RAW_DATA_PATH, 'products.pkl')

# Processed inputs
USERS_CLEAN_PATH = os.path.join(DATA_PATH, 'processed', 'users.parquet')

# Submission file: loaded as a template and written back at the end of the notebook
SUBMISSION_1_PATH = os.path.join(ROOT, 'predictions', 'example_predictions_1.json')

In [17]:
# Load sample submission json
import json

# Open through a context manager so the file handle is closed as soon as
# the JSON has been parsed, instead of lingering until garbage collection.
with open(SUBMISSION_1_PATH) as f:
    submission = json.load(f)

## Query 1

**Q1:** Which product (`partnumber`) with `color_id` equal to 3 belongs to the lowest `family` code with a `discount`?

In [18]:
# Load the product catalogue into Polars.
# NOTE: unpickling can execute arbitrary code, so PRODUCTS_PATH must point to a trusted file.
prods = pl.from_pandas(pd.read_pickle(PRODUCTS_PATH))

# Q1: keep only discounted products with color_id 3, and only then look for
# the lowest family code. The two filters are chained on purpose: the second
# one evaluates the minimum over the rows that survived the first, so the
# minimum is taken over the matching products rather than the whole catalogue.
q1 = (prods
      .filter((pl.col('color_id') == 3) & (pl.col('discount') == 1))
      .filter(pl.col('family') == pl.col('family').min())
      )['partnumber'].item()  # .item() raises unless exactly one product matches

submission['target']['query_1'] = {'partnumber': q1}

## Query 2

**Q2:** In the country where most users have made purchases totaling less than 500 (`M`), which user has the lowest purchase frequency (`F`), the most recent purchase (highest `R`) and the lowest `user_id`? Follow the given order of variables as the sorting priority.

In [19]:
from src.data.loaders import PolarsLoader

# Project-local loader, configured through its keyword arguments with
# sampling disabled and parquet as the file type.
loader = PolarsLoader(sampling=False, file_type='parquet')
# Processed users table; the Query 2 cells read its 'M', 'country', 'F',
# 'R' and 'user_id' columns.
users = loader.load_data(USERS_CLEAN_PATH)

In [20]:
# Users whose total purchase amount M is below 500
low_spenders = users.filter(pl.col('M') < 500)

# Number of such users per country (the count column is named 'len')
low_spenders_per_country = low_spenders.group_by('country').len()

# Country with the largest count
top_country_row = low_spenders_per_country.sort(by='len', descending=True).head(1)
country = top_country_row['country'].item()
country

25

In [21]:
# Users of the selected country
country_users = users.filter(pl.col('country') == country)

# Sorting priority: lowest F, then highest R, then lowest user_id
ranked_users = country_users.sort(
    by=["F", "R", "user_id"],
    descending=[False, True, False],
)

# The first row after sorting is the answer
q2 = ranked_users.head(1)['user_id'].item()

submission['target']['query_2'] = {'user_id': q2}

## Query 3 (Run on Kaggle)

**Q3:** Among the products that were added to the cart at least once, how many times is a product visited on average before it is added to the cart? Give the answer with 2 decimals.

In [22]:
# loader = PolarsLoader(sampling=True)

# # Load data lazily
# train = (loader
#          .load_data_lazy(path=TRAIN_PATH)
#          .select(["session_id", "partnumber", "add_to_cart", "timestamp_local"])
#         )

# # Products that were added to the cart
# products_added = (
#     train
#     .filter(pl.col("add_to_cart") == 1)
#     .select("partnumber")
#     .unique()  # Stay in LazyFrame
# )

# # Keep only interactions for these products
# interactions_for_cart_products = (
#     train
#     .join(products_added, on="partnumber", how="inner")  # Lazy join instead of is_in
#     .sort(["session_id", "partnumber", "timestamp_local"])  # Sorting will also be deferred
# )

# # Add a cumulative flag for add_to_cart in each group
# grouped_data = (
#     interactions_for_cart_products
#     .group_by(["session_id", "partnumber"])
#     .agg([
#         pl.col("add_to_cart").cum_sum().alias("add_to_cart_cumsum"),
#         pl.col("add_to_cart"),
#         pl.col("timestamp_local"),
#     ])
#     .explode(["add_to_cart_cumsum", "add_to_cart", "timestamp_local"]) 
# )

# # Calculate pre-cart visits
# pre_cart_visits = (
#     grouped_data
#     .filter(pl.col("add_to_cart_cumsum") == 0)
#     .group_by("partnumber")
#     .agg(pl.col("session_id").count().alias("visit_count"))
# )

# # Collect only the final aggregated result
# final_result = pre_cart_visits.collect(streaming=True)

# # Calculate the average visits before add_to_cart
# average_visits = round(final_result["visit_count"].mean(), 2)

# print("Average Visits Before Adding to Cart:", average_visits)

In [23]:
# Hard-coded result of the disabled cell above, which was run on Kaggle.
# NOTE(review): that cell counts pre-cart rows per partnumber across all
# sessions, including sessions where the product was never added to the
# cart - confirm this matches the intended definition of the metric.
average_visits = 1458.40  # From Kaggle run
submission['target']['query_3'] = dict(average_previous_visits=average_visits)

## Query 4

**Q4:** Which device (`device_type`) is most frequently used by users to make purchases (`add_to_cart` = 1) of discounted products (`discount` = 1)?

## Replace submission file

In [24]:
# Overwrite the submission file with the answers collected above
with open(SUBMISSION_1_PATH, mode='w') as out_file:
    json.dump(submission, out_file, indent=4)