In [1]:
import polars as pl

In [2]:
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

def _auc_per_session(y, p, s):
    # y: (0/1), p: tahmin, s: session_id
    df = pd.DataFrame({'y': y, 'p': p, 's': s})
    aucs = []
    for _, g in df.groupby('s', sort=False):
        if g['y'].nunique() == 2:        # o oturumda hem 0 hem 1 varsa
            aucs.append(roc_auc_score(g['y'], g['p']))
    return float(np.mean(aucs)) if len(aucs) else np.nan

def trendyol_score(y_click, y_order, p, session_id, w_click=0.3, w_order=0.7):
    auc_c = _auc_per_session(y_click, p, session_id)
    auc_o = _auc_per_session(y_order, p, session_id)
    if np.isnan(auc_c): auc_c = 0.0   
    if np.isnan(auc_o): auc_o = 0.0
    return w_click*auc_c + w_order*auc_o, auc_c, auc_o


In [3]:
DATA_PATH = "/kaggle/input/trendyol-e-ticaret-hackathonu-2025-kaggle/data"

train_sessions = pl.read_parquet(f"{DATA_PATH}/train_sessions.parquet")
test_sessions = pl.read_parquet(f"{DATA_PATH}/test_sessions.parquet")

content_metadata = pl.read_parquet(f"{DATA_PATH}/content/metadata.parquet")
content_price_data = pl.read_parquet(f"{DATA_PATH}/content/price_rate_review_data.parquet")

In [4]:
train_sessions = train_sessions.with_columns(train_sessions["ts_hour"].cast(pl.Date).alias("ts_date"))

test_sessions = test_sessions.with_columns(test_sessions["ts_hour"].cast(pl.Date).alias("ts_date"))

content_price_data = content_price_data.with_columns(content_price_data["update_date"].cast(pl.Date).alias("ts_date"))

In [5]:
content_sw = pl.read_parquet(f"{DATA_PATH}/content/sitewide_log.parquet").with_columns(
    pl.col("date").cast(pl.Date)
)

max_d = content_sw.select(pl.max("date")).item()
d7  = max_d - pl.duration(days=7)
d30 = max_d - pl.duration(days=30)

sw7  = (content_sw.filter(pl.col("date") > d7)
        .group_by("content_id_hashed")
        .agg(click7=pl.sum("total_click"), order7=pl.sum("total_order")))

sw30 = (content_sw.filter(pl.col("date") > d30)
        .group_by("content_id_hashed")
        .agg(click30=pl.sum("total_click"), order30=pl.sum("total_order")))

pop = sw30.join(sw7, on="content_id_hashed", how="full").fill_null(0.0)

# min–max normalizer
def mm(c): 
    return (pl.col(c) - pl.min(c)) / (pl.max(c) - pl.min(c) + 1e-12)

pop = (pop
       .with_columns([
           mm("order7").alias("order7_n"),
           mm("click7").alias("click7_n"),
           mm("order30").alias("order30_n"),
           mm("click30").alias("click30_n"),
       ])
       # kısa dönem ve order daha ağırlıklı
       .with_columns((0.6*pl.col("order7_n") + 0.2*pl.col("click7_n") 
                      + 0.15*pl.col("order30_n") + 0.05*pl.col("click30_n")).alias("pop_score"))
       .select(["content_id_hashed","pop_score"])
)

train_sessions = train_sessions.join(pop, on="content_id_hashed", how="left").with_columns(pl.col("pop_score").fill_null(0.0))
test_sessions  = test_sessions.join(pop,  on="content_id_hashed", how="left").with_columns(pl.col("pop_score").fill_null(0.0))



In [6]:
ctt = pl.read_parquet(f"{DATA_PATH}/content/top_terms_log.parquet").with_columns(
    pl.col("date").cast(pl.Date)
)


max_d = ctt.select(pl.max("date")).item()
win = max_d - pl.duration(days=60)

ctt60 = (ctt.filter(pl.col("date") > win)
         .with_columns((pl.col("total_search_click") / (pl.col("total_search_impression") + 1e-9)).alias("ctr60")))

term_rel = (ctt60.group_by(["content_id_hashed","search_term_normalized"])
            .agg(pl.mean("ctr60").clip(0,1).alias("term_ctr")))

join_keys = ["content_id_hashed","search_term_normalized"]
train_sessions = train_sessions.join(term_rel, on=join_keys, how="left").with_columns(pl.col("term_ctr").fill_null(0.0))
test_sessions  = test_sessions.join(term_rel,  on=join_keys, how="left").with_columns(pl.col("term_ctr").fill_null(0.0))

In [7]:
# ===USER META + USER 30D ACTIVITY===
user_meta = pl.read_parquet(f"{DATA_PATH}/user/metadata.parquet").select(
    ["user_id_hashed", "user_birth_year", "user_tenure_in_days", "user_gender"]
).with_columns((2025 - pl.col("user_birth_year")).alias("user_age"))

u_site = pl.read_parquet(f"{DATA_PATH}/user/sitewide_log.parquet").with_columns(
    pl.col("ts_hour").cast(pl.Date).alias("ts_date")
)

max_ut = u_site.select(pl.max("ts_date")).item()

u30 = u_site.filter(pl.col("ts_date") > (max_ut - pl.duration(days=30)))

u30_agg = (
    u30.group_by("user_id_hashed")
       .agg([
           pl.col("total_click").sum().alias("u30_click_sum"),
           pl.col("total_cart").sum().alias("u30_cart_sum"),
           pl.col("total_fav").sum().alias("u30_fav_sum"),
           pl.col("total_order").sum().alias("u30_order_sum"),
       ])
       .with_columns([
           (pl.col("u30_click_sum") + 1e-6).log().alias("u30_click_log"),
           (pl.col("u30_order_sum") + 1e-6).log().alias("u30_order_log"),
           (pl.col("u30_order_sum")/(pl.col("u30_click_sum")+1e-9)).clip(0,1).alias("u30_conv_rate"),
       ])
)

for name in ["train_sessions","test_sessions"]:
    locals()[name] = (
        locals()[name]
        .join(user_meta, on="user_id_hashed", how="left")
        .join(u30_agg,   on="user_id_hashed", how="left")
        .with_columns([
            pl.col("user_age").fill_null(0),
            pl.col("user_tenure_in_days").fill_null(0),
            pl.col("user_gender").fill_null("U"),
            pl.col("u30_click_log").fill_null(0.0),
            pl.col("u30_order_log").fill_null(0.0),
            pl.col("u30_conv_rate").fill_null(0.0),
        ])
    )


In [8]:
# -------------------------
# EWMA Pop Score 
# -------------------------
alpha = 0.3
max_d = content_sw.select(pl.max("date")).item()

# Son 60 günü al 
content_sw = content_sw.filter(pl.col("date") > (max_d - pl.duration(days=60)))

# Günlük bazda toplama
daily = (
    content_sw
    .group_by(["content_id_hashed", "date"])
    .agg([
        pl.col("total_click").sum().alias("click"),
        pl.col("total_order").sum().alias("order"),
    ])
    .sort(["content_id_hashed", "date"])
)

# ewma_t = alpha*x_t + (1-alpha)*ewma_(t-1)
def ewma_expr(col):
    return (
        (alpha * pl.col(col))
        + (1 - alpha) * pl.col(f"{col}_ewma_prev")
    )

# İlk değer için aynı değer 
for metric in ["click", "order"]:
    daily = daily.with_columns(pl.col(metric).alias(f"{metric}_ewma"))

    daily = daily.with_columns(
        pl.col(f"{metric}_ewma").shift(1).over("content_id_hashed").alias(f"{metric}_ewma_prev")
    )
    daily = daily.with_columns(
        ewma_expr(metric).over("content_id_hashed").alias(f"{metric}_ewma")
    )

# Son gün
ewma_last = (
    daily.group_by("content_id_hashed")
    .agg([
        pl.col("click_ewma").last().alias("click_ewma_last"),
        pl.col("order_ewma").last().alias("order_ewma_last"),
    ])
    # Normalize
    .with_columns([
        ((pl.col("order_ewma_last") - pl.min("order_ewma_last")) /
         (pl.max("order_ewma_last") - pl.min("order_ewma_last") + 1e-12)).alias("o_ewma_n"),
        ((pl.col("click_ewma_last") - pl.min("click_ewma_last")) /
         (pl.max("click_ewma_last") - pl.min("click_ewma_last") + 1e-12)).alias("c_ewma_n"),
    ])
    .with_columns((0.75 * pl.col("o_ewma_n") + 0.25 * pl.col("c_ewma_n")).alias("pop_ewma"))
    .select(["content_id_hashed", "pop_ewma"])
)


train_sessions = train_sessions.join(ewma_last, on="content_id_hashed", how="left").with_columns(pl.col("pop_ewma").fill_null(0.0))
test_sessions = test_sessions.join(ewma_last, on="content_id_hashed", how="left").with_columns(pl.col("pop_ewma").fill_null(0.0))


In [9]:
# ===SESSION POPULARITY RELATIVE RANK===
def add_session_pop_rank(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.with_columns([
            pl.col("pop_ewma").rank("ordinal", descending=True).over("session_id").alias("pop_rank_in_sess"),
            pl.len().over("session_id").alias("_n")
        ])
        .with_columns(((pl.col("pop_rank_in_sess")-1)/(pl.col("_n")-1+1e-9)).alias("pop_rr_in_sess"))
        .drop("_n")
    )

train_sessions = add_session_pop_rank(train_sessions)
test_sessions  = add_session_pop_rank(test_sessions)


In [10]:
ctt = pl.read_parquet(f"{DATA_PATH}/content/top_terms_log.parquet").with_columns(
    pl.col("date").cast(pl.Date)
)

max_d = ctt.select(pl.max("date")).item()
win = max_d - pl.duration(days=60)
ctt60 = ctt.filter(pl.col("date") > win)

glob_ctr = (
    ctt60["total_search_click"].sum() /
    (ctt60["total_search_impression"].sum() + 1e-9)
)

k = 50.0
alpha0 = glob_ctr * k
beta0  = (1.0 - glob_ctr) * k

agg = (
    ctt60.group_by(["content_id_hashed","search_term_normalized"])
         .agg([
             pl.col("total_search_click").sum().alias("click"),
             pl.col("total_search_impression").sum().alias("imp")
         ])
         .with_columns(
             ((pl.col("click") + alpha0) / (pl.col("imp") + alpha0 + beta0))
             .alias("term_ctr_smoothed")
         )
)

train_sessions = train_sessions.join(agg.select(["content_id_hashed","search_term_normalized","term_ctr_smoothed"]),
                                     on=["content_id_hashed","search_term_normalized"], how="left") \
                               .with_columns(pl.col("term_ctr_smoothed").fill_null(0.0))

test_sessions  = test_sessions.join(agg.select(["content_id_hashed","search_term_normalized","term_ctr_smoothed"]),
                                    on=["content_id_hashed","search_term_normalized"], how="left") \
                              .with_columns(pl.col("term_ctr_smoothed").fill_null(0.0))


In [11]:
train_sessions = train_sessions.join(content_metadata, on="content_id_hashed", how="left")
train_sessions = train_sessions.join(content_price_data, on=["content_id_hashed", "ts_date"], how="left")

test_sessions = test_sessions.join(content_metadata, on="content_id_hashed", how="left")
test_sessions = test_sessions.join(content_price_data, on=["content_id_hashed", "ts_date"], how="left")

In [12]:
for df_name in ["train_sessions","test_sessions"]:
    df = locals()[df_name]
    df = df.with_columns(
        (((pl.col("original_price") - pl.col("selling_price")) /
           (pl.col("original_price")+1e-9)).alias("discount_rate"))
    ).with_columns(
        pl.when(pl.col("discount_rate").is_nan() | ~pl.col("discount_rate").is_finite())
          .then(0.0).otherwise(pl.col("discount_rate")).alias("discount_rate")
    )
    locals()[df_name] = df

In [13]:
##fotoğraf/videolu yorumlar
for df_name in ["train_sessions","test_sessions"]:
    df = locals()[df_name]
    df = df.with_columns(
        (pl.col("content_review_wth_media_count") / (pl.col("content_review_count")+1e-9))
        .fill_null(0.0).alias("media_review_ratio")
    )
    locals()[df_name] = df

In [14]:
# Fiyatın kategoriye göre ucuzluğu
both = pl.concat([
    train_sessions.select(["leaf_category_name","ts_date","selling_price"]).with_columns(pl.lit("tr").alias("_src")),
    test_sessions.select(["leaf_category_name","ts_date","selling_price"]).with_columns(pl.lit("te").alias("_src")),
])

leaf_day = both.group_by(["leaf_category_name","ts_date"]).agg(
    med_price = pl.median("selling_price")
)

def add_rel(df):
    return (df.join(leaf_day, on=["leaf_category_name","ts_date"], how="left")
              .with_columns((pl.col("selling_price")/(pl.col("med_price")+1e-9)).alias("price_vs_leaf_med"))
              .drop("med_price"))

train_sessions = add_rel(train_sessions)
test_sessions  = add_rel(test_sessions)


In [15]:
# === PRICE TREND / VOLATILITY + BAYESIAN RATING===
pcols = content_price_data.select(["content_id_hashed","ts_date","selling_price"])
max_pd = pcols.select(pl.max("ts_date")).item()
p7  = pcols.filter(pl.col("ts_date") > (max_pd - pl.duration(days=7)))
p30 = pcols.filter(pl.col("ts_date") > (max_pd - pl.duration(days=30)))

p7_agg = p7.group_by("content_id_hashed").agg([
    pl.col("selling_price").mean().alias("p7_mean"),
    pl.col("selling_price").std(ddof=1).fill_null(0.0).alias("p7_std"),
])
p30_agg = p30.group_by("content_id_hashed").agg([
    pl.col("selling_price").mean().alias("p30_mean"),
    pl.col("selling_price").std(ddof=1).fill_null(0.0).alias("p30_std"),
])

for name in ["train_sessions","test_sessions"]:
    df = locals()[name]
    df = (df.join(p7_agg,  on="content_id_hashed", how="left")
            .join(p30_agg, on="content_id_hashed", how="left")
            .with_columns([
                pl.col("p7_mean").fill_null(pl.col("selling_price")),
                pl.col("p30_mean").fill_null(pl.col("selling_price")),
                pl.col("p7_std").fill_null(0.0),
                pl.col("p30_std").fill_null(0.0),
                (pl.col("selling_price")/(pl.col("p7_mean")+1e-9)).alias("price_vs_p7"),
                (pl.col("selling_price")/(pl.col("p30_mean")+1e-9)).alias("price_vs_p30"),
            ]))
    locals()[name] = df

# Bayesian rating from (content_rate_avg, content_rate_count)
global_mean = float(train_sessions["content_rate_avg"].mean())
k_bayes = 20.0
for name in ["train_sessions","test_sessions"]:
    locals()[name] = locals()[name].with_columns(
        (
            (pl.col("content_rate_avg") * pl.col("content_rate_count") + global_mean * k_bayes) /
            (pl.col("content_rate_count") + k_bayes + 1e-9)
        ).alias("rating_bayes")
    ).with_columns(pl.col("rating_bayes").fill_null(global_mean))


In [16]:
#Term global CTR (smoothed)
#Arama teriminin genel tıklanabilirliği.

term = pl.read_parquet(f"{DATA_PATH}/term/search_log.parquet").with_columns(pl.col("ts_hour").cast(pl.Date).alias("ts_date"))
max_dt = term.select(pl.max("ts_date")).item()
term60 = term.filter(pl.col("ts_date") > (max_dt - pl.duration(days=60)))

glob_ctr_t = (term60["total_search_click"].sum() / (term60["total_search_impression"].sum()+1e-9))
k=50.0; a0=glob_ctr_t*k; b0=(1-glob_ctr_t)*k

term_ctr = (term60.group_by("search_term_normalized")
    .agg([
        pl.col("total_search_click").sum().alias("clk"),
        pl.col("total_search_impression").sum().alias("imp")
    ])
    .with_columns(((pl.col("clk")+a0) / (pl.col("imp")+a0+b0)).alias("term_global_ctr_sm"))
    .select(["search_term_normalized","term_global_ctr_sm"])
)

for df_name in ["train_sessions","test_sessions"]:
    locals()[df_name] = locals()[df_name].join(term_ctr, on="search_term_normalized", how="left")\
        .with_columns(pl.col("term_global_ctr_sm").fill_null(0.0))


In [17]:
# User × Term CTR (smoothed)
# Kullanıcı bu terimde genelde tıklıyor mu?
ut = pl.read_parquet(f"{DATA_PATH}/user/top_terms_log.parquet")\
       .with_columns(pl.col("ts_hour").cast(pl.Date).alias("ts_date"))

max_du = ut.select(pl.max("ts_date")).item()
ut60 = ut.filter(pl.col("ts_date") > (max_du - pl.duration(days=60)))

glob = (ut60["total_search_click"].sum() / (ut60["total_search_impression"].sum()+1e-9))
k=30.0; a0=glob*k; b0=(1-glob)*k

u_t = (ut60.group_by(["user_id_hashed","search_term_normalized"])
    .agg([
        pl.col("total_search_click").sum().alias("clk"),
        pl.col("total_search_impression").sum().alias("imp")
    ])
    .with_columns(((pl.col("clk")+a0) / (pl.col("imp")+a0+b0)).alias("user_term_ctr_sm"))
    .select(["user_id_hashed","search_term_normalized","user_term_ctr_sm"])
)

for df_name in ["train_sessions","test_sessions"]:
    locals()[df_name] = locals()[df_name].join(u_t, on=["user_id_hashed","search_term_normalized"], how="left")\
        .with_columns(pl.col("user_term_ctr_sm").fill_null(0.0))


In [18]:
# === ADD-ON: TERM × CONTENT LIFT ===
for name in ["train_sessions","test_sessions"]:
    locals()[name] = locals()[name].with_columns(
        ( (pl.col("term_ctr_smoothed") + 1e-6) / (pl.col("term_global_ctr_sm") + 1e-6) )
        .log()
        .alias("term_content_lift_log")
    )

In [19]:
# intent alignment
# Bu terim geçmişte bu leaf’le tıklanmış mı?


leaf_map = content_metadata.select(["content_id_hashed","leaf_category_name"])
ctt60 = ctt60.join(leaf_map, on="content_id_hashed", how="left")

term_leaf = (ctt60.group_by(["search_term_normalized","leaf_category_name"])
    .agg(pl.col("total_search_click").sum().alias("clk"))
)

term_tot = term_leaf.group_by("search_term_normalized").agg(pl.col("clk").sum().alias("clk_sum"))
term_leaf = term_leaf.join(term_tot, on="search_term_normalized", how="left")\
    .with_columns((pl.col("clk")/(pl.col("clk_sum")+1e-9)).alias("term_leaf_share"))\
    .select(["search_term_normalized","leaf_category_name","term_leaf_share"])

for df_name in ["train_sessions","test_sessions"]:
    locals()[df_name] = locals()[df_name].join(term_leaf, on=["search_term_normalized","leaf_category_name"], how="left")\
        .with_columns(pl.col("term_leaf_share").fill_null(0.0))

In [20]:
# Session-local özellik
def add_session_local(df):
    return (df.with_columns([
                pl.col("selling_price").rank("ordinal").over("session_id").alias("_rk"),
                pl.len().over("session_id").alias("_n")
            ])
            .with_columns(((pl.col("_rk")-1)/(pl.col("_n")-1+1e-9)).alias("price_rank_in_sess"))
            .drop(["_rk","_n"]))

train_sessions = add_session_local(train_sessions)
test_sessions  = add_session_local(test_sessions)

In [21]:
# === ADD-ON: SESSION SIZE ===
def add_sess_size(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns(pl.len().over("session_id").alias("sess_size"))
train_sessions = add_sess_size(train_sessions)
test_sessions  = add_sess_size(test_sessions)

In [22]:
# =========================
# USER-BASED PERSONALIZATION (60g)
# =========================
UF_PATH = f"{DATA_PATH}/user/fashion_sitewide_log.parquet"
uf = (
    pl.read_parquet(UF_PATH)
      .with_columns(pl.col("ts_hour").cast(pl.Date).alias("ts_date"))
      .select(["user_id_hashed","content_id_hashed","ts_date","total_click","total_order"])
)

# Son 60 gün
max_du = uf.select(pl.max("ts_date")).item()
uf = uf.filter(pl.col("ts_date") > (max_du - pl.duration(days=60)))

# --- USER–CONTENT yakınlığı (personal_pop_uc)
uc = (
    uf.group_by(["user_id_hashed","content_id_hashed"])
      .agg([
          pl.col("total_click").sum().alias("uc_click"),
          pl.col("total_order").sum().alias("uc_order")
      ])
)
uc = uc.join(
    uc.group_by("user_id_hashed")
      .agg([
          pl.col("uc_click").max().alias("max_uc_click"),
          pl.col("uc_order").max().alias("max_uc_order"),
      ]),
    on="user_id_hashed", how="left"
).with_columns([
    (pl.col("uc_click")/(pl.col("max_uc_click")+1e-9)).alias("uc_click_n"),
    (pl.col("uc_order")/(pl.col("max_uc_order")+1e-9)).alias("uc_order_n"),
]).with_columns(
    (0.3*pl.col("uc_click_n") + 0.7*pl.col("uc_order_n")).alias("personal_pop_uc")
).select(["user_id_hashed","content_id_hashed","personal_pop_uc"])

# --- USER–LEAF-CATEGORY yakınlığı (personal_aff_leaf)
leaf_map = content_metadata.select(["content_id_hashed","leaf_category_name"])

uf_leaf = uf.join(leaf_map, on="content_id_hashed", how="left")

ul = (
    uf_leaf.group_by(["user_id_hashed","leaf_category_name"])
           .agg([
               pl.col("total_click").sum().alias("ul_click"),
               pl.col("total_order").sum().alias("ul_order"),
           ])
)
ul = ul.join(
    ul.group_by("user_id_hashed")
      .agg([
          pl.col("ul_click").max().alias("max_ul_click"),
          pl.col("ul_order").max().alias("max_ul_order"),
      ]),
    on="user_id_hashed", how="left"
).with_columns([
    (pl.col("ul_click")/(pl.col("max_ul_click")+1e-9)).alias("ul_click_n"),
    (pl.col("ul_order")/(pl.col("max_ul_order")+1e-9)).alias("ul_order_n"),
]).with_columns(
    (0.3*pl.col("ul_click_n") + 0.7*pl.col("ul_order_n")).alias("personal_aff_leaf")
).select(["user_id_hashed","leaf_category_name","personal_aff_leaf"])


train_sessions = (
    train_sessions
    .join(uc, on=["user_id_hashed","content_id_hashed"], how="left")
    .join(ul, on=["user_id_hashed","leaf_category_name"], how="left")
    .with_columns([
        pl.col("personal_pop_uc").fill_null(0.0),
        pl.col("personal_aff_leaf").fill_null(0.0),
    ])
)
test_sessions = (
    test_sessions
    .join(uc, on=["user_id_hashed","content_id_hashed"], how="left")
    .join(ul, on=["user_id_hashed","leaf_category_name"], how="left")
    .with_columns([
        pl.col("personal_pop_uc").fill_null(0.0),
        pl.col("personal_aff_leaf").fill_null(0.0),
    ])
)

In [23]:
# ===TIME & TERM-SHAPE===
def add_time_and_term(df: pl.DataFrame) -> pl.DataFrame:
    return (
        df.with_columns([
            pl.col("ts_hour").dt.hour().alias("hour"),
            pl.col("ts_hour").dt.weekday().alias("dow"),
            (pl.col("ts_hour").dt.weekday().is_in([5,6])).cast(pl.Int8).alias("is_weekend"),
            pl.col("search_term_normalized").str.len_chars().alias("term_len"),
            (pl.col("search_term_normalized").str.count_matches(r"\s+") + 1).fill_null(1).alias("term_word_count"),
        ])
    )

train_sessions = add_time_and_term(train_sessions)
test_sessions  = add_time_and_term(test_sessions)

In [24]:
# === LEAF ORDER SHARE PRIOR ===
leaf_map = content_metadata.select(["content_id_hashed","leaf_category_name"])
leaf_orders = (content_sw.join(leaf_map, on="content_id_hashed", how="left")
               .group_by("leaf_category_name")
               .agg(pl.col("total_order").sum().alias("leaf_orders")))
leaf_prior = leaf_orders.with_columns(
    (pl.col("leaf_orders")/pl.col("leaf_orders").sum()).alias("leaf_order_share")
).select(["leaf_category_name","leaf_order_share"])

for name in ["train_sessions","test_sessions"]:
    locals()[name] = (
        locals()[name]
        .join(leaf_prior, on="leaf_category_name", how="left")
        .with_columns(pl.col("leaf_order_share").fill_null(0.0))
    )

In [25]:
# === OOF TARGET ENCODING FOR LEAF ===
from sklearn.model_selection import GroupKFold

_te_pd = train_sessions.select(
    ["leaf_category_name","ordered","clicked","user_id_hashed"]
).to_pandas()

te_oof_leaf_order = np.zeros(len(_te_pd))
te_oof_leaf_click = np.zeros(len(_te_pd))

gkf_te = GroupKFold(n_splits=5)
for tr, va in gkf_te.split(_te_pd, groups=_te_pd["user_id_hashed"]):
    m_o = _te_pd.iloc[tr].groupby("leaf_category_name")["ordered"].mean()
    m_c = _te_pd.iloc[tr].groupby("leaf_category_name")["clicked"].mean()
    te_oof_leaf_order[va] = _te_pd.iloc[va]["leaf_category_name"].map(m_o).fillna(m_o.mean()).values
    te_oof_leaf_click[va] = _te_pd.iloc[va]["leaf_category_name"].map(m_c).fillna(m_c.mean()).values

train_sessions = train_sessions.with_columns([
    pl.Series("te_leaf_order_oof", te_oof_leaf_order),
    pl.Series("te_leaf_click_oof", te_oof_leaf_click),
])

_full_o = _te_pd.groupby("leaf_category_name")["ordered"].mean()
_full_c = _te_pd.groupby("leaf_category_name")["clicked"].mean()
test_te_o = test_sessions.select(["leaf_category_name"]).to_pandas()["leaf_category_name"].map(_full_o).fillna(_full_o.mean()).values
test_te_c = test_sessions.select(["leaf_category_name"]).to_pandas()["leaf_category_name"].map(_full_c).fillna(_full_c.mean()).values
test_sessions = test_sessions.with_columns([
    pl.Series("te_leaf_order_oof", test_te_o),
    pl.Series("te_leaf_click_oof", test_te_c),
])

In [26]:
#-------Advanced Interactions-------------------
def add_advanced_interactions(df):
    return df.with_columns([
        # Price-Quality interactions
        (pl.col("selling_price") * pl.col("content_rate_avg")).alias("price_x_quality"),
        (pl.col("discount_rate") * pl.col("content_rate_avg")).alias("discount_x_quality"),
        (pl.col("selling_price") * pl.col("pop_ewma")).alias("price_x_popularity"),
        # User-Context interactions
        (pl.col("user_age") * pl.col("selling_price")).alias("age_x_price"),
        (pl.col("u30_conv_rate") * pl.col("pop_ewma")).alias("user_conversion_x_popularity"),
        (pl.col("user_tenure_in_days") * pl.col("content_rate_avg")).alias("tenure_x_quality"),
        # Search-Content interactions
        (pl.col("term_ctr_smoothed") * pl.col("pop_ewma")).alias("term_ctr_x_popularity"),
        (pl.col("term_global_ctr_sm") * pl.col("content_rate_avg")).alias("term_global_x_quality"),
        (pl.col("user_term_ctr_sm") * pl.col("personal_pop_uc")).alias("user_term_x_personal"),
        # Time-Context interactions
        (pl.col("hour") * pl.col("selling_price")).alias("hour_x_price"),
        (pl.col("is_weekend") * pl.col("pop_ewma")).alias("weekend_x_popularity"),
        # Session-Context interactions
        (pl.col("sess_size") * pl.col("relative_position")).alias("session_size_x_position"),
        (pl.col("session_category_diversity") * pl.col("pop_ewma")).alias("diversity_x_popularity"),
    ])
# === 1. CART/FAV BASED FEATURES ===
def add_cart_fav_features():
    
    # Content sitewide cart/fav verileri
    max_d = content_sw.select(pl.max("date")).item()
    
    # Son 7 ve 30 gün cart/fav activity
    sw7_cf = (content_sw.filter(pl.col("date") > (max_d - pl.duration(days=7)))
    .group_by("content_id_hashed")
    .agg([
        pl.col("total_cart").sum().alias("cart7"),
        pl.col("total_fav").sum().alias("fav7"),
        pl.col("total_click").sum().alias("click7_cf"),
        pl.col("total_order").sum().alias("order7_cf")
    ]))
    
    sw30_cf = (content_sw.filter(pl.col("date") > (max_d - pl.duration(days=30)))
    .group_by("content_id_hashed")
    .agg([
        pl.col("total_cart").sum().alias("cart30"),
        pl.col("total_fav").sum().alias("fav30"),
        pl.col("total_click").sum().alias("click30_cf"),
        pl.col("total_order").sum().alias("order30_cf")
    ]))
    
    cart_fav_features = sw7_cf.join(sw30_cf, on="content_id_hashed", how="full").fill_null(0.0)
    
    cart_fav_features = cart_fav_features.with_columns([
        (pl.col("order7_cf") / (pl.col("cart7") + 1e-9)).alias("cart_to_order_7d"),
        (pl.col("order30_cf") / (pl.col("cart30") + 1e-9)).alias("cart_to_order_30d"),

        (pl.col("cart7") / (pl.col("click7_cf") + 1e-9)).alias("click_to_cart_7d"),
        (pl.col("cart30") / (pl.col("click30_cf") + 1e-9)).alias("click_to_cart_30d"),

        (pl.col("fav7") / (pl.col("cart7") + 1e-9)).alias("fav_cart_ratio_7d"),
        (pl.col("fav30") / (pl.col("cart30") + 1e-9)).alias("fav_cart_ratio_30d"),

        ((pl.col("cart7") + pl.col("fav7")) / 2).log().alias("engagement_score_7d"),
        ((pl.col("cart30") + pl.col("fav30")) / 2).log().alias("engagement_score_30d"),
    ])
    return cart_fav_features.select([
        "content_id_hashed", "cart_to_order_7d", "cart_to_order_30d",
        "click_to_cart_7d", "click_to_cart_30d", "fav_cart_ratio_7d",
        "fav_cart_ratio_30d", "engagement_score_7d", "engagement_score_30d"
    ])
    
cart_fav_data = add_cart_fav_features()
train_sessions = train_sessions.join(cart_fav_data, on="content_id_hashed", how="left").with_columns([
    pl.col(c).fill_null(0.0) for c in cart_fav_data.columns if c != "content_id_hashed"
])
test_sessions = test_sessions.join(cart_fav_data, on="content_id_hashed", how="left").with_columns([
    pl.col(c).fill_null(0.0) for c in cart_fav_data.columns if c != "content_id_hashed"
])

In [ ]:
# === TRAIN: GroupKFold CV + Early Stopping ===
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, log_loss

feature_cols = [
    "content_rate_avg","filterable_label_count",
    "original_price","selling_price","discounted_price",
    "pop_score","term_ctr","pop_ewma","term_ctr_smoothed",
    "discount_rate","personal_pop_uc","personal_aff_leaf",
    "media_review_ratio","price_vs_leaf_med",
    "term_global_ctr_sm","user_term_ctr_sm","term_leaf_share",
    "price_rank_in_sess","user_age",
    "user_tenure_in_days","u30_click_log","u30_order_log","u30_conv_rate",
    "p7_mean","p30_mean","p7_std","p30_std","price_vs_p7","price_vs_p30",
    "rating_bayes","term_content_lift_log","pop_rr_in_sess",
    "leaf_order_share","hour","dow","is_weekend","term_len","term_word_count",
    "te_leaf_order_oof","te_leaf_click_oof","sess_size",
    "cart_to_order_7d", "cart_to_order_30d", "click_to_cart_7d", "click_to_cart_30d",
    "fav_cart_ratio_7d", "fav_cart_ratio_30d", "engagement_score_7d", "engagement_score_30d",
]


# Eksik sütunları güvene al
need = set(feature_cols)
missing_train = [c for c in need if c not in train_sessions.columns]
missing_test  = [c for c in need if c not in test_sessions.columns]
for c in missing_train:
    train_sessions = train_sessions.with_columns(pl.lit(0.0).alias(c))
for c in missing_test:
    test_sessions  = test_sessions.with_columns(pl.lit(0.0).alias(c))
assert "user_id_hashed" in train_sessions.columns
if "user_id_hashed" not in test_sessions.columns:
    test_sessions = test_sessions.with_columns(pl.lit("unknown").alias("user_id_hashed"))


In [27]:
# === EMBEDDING-BASED FEATURE ===
from sentence_transformers import SentenceTransformer, util
import numpy as np
import polars as pl
import os

# Model yükle (GPU varsa otomatik hızlanır)
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Unique query ve product stringlerini çıkar
def prepare_texts(df):
    queries = df["search_term_normalized"].unique().to_list()
    products = (df["leaf_category_name"].fill_null("") + " " + df["cv_tags"].fill_null("")).unique().to_list()
    return queries, products

# Cacheli encode
def encode_unique(texts, fname):
    if os.path.exists(fname):
        return np.load(fname)
    embs = model.encode(texts, batch_size=128, convert_to_numpy=True, show_progress_bar=True)
    np.save(fname, embs)
    return embs

# Cosine similarity lookup
def build_sim_feature(df, qmap, pmap):
    def cos_sim_row(q, p, t):
        q = q if q is not None else ""
        p = p if p is not None else ""
        t = t if t is not None else ""
        key = p + " " + t
        if q in qmap and key in pmap:
            return float(util.cos_sim(qmap[q], pmap[key]).item())
        return 0.0

    return (
        df
        .with_columns(
            pl.struct(["search_term_normalized", "leaf_category_name", "cv_tags"])
            .map_elements(lambda row: cos_sim_row(row["search_term_normalized"], row["leaf_category_name"], row["cv_tags"]))
            .alias("query_prod_cos")
        )
    )

# === Çalıştırma ===
# Train
q_texts, p_texts = prepare_texts(train_sessions)
q_embs = encode_unique(q_texts, "query_emb.npy")
p_embs = encode_unique(p_texts, "product_emb.npy")

qmap = dict(zip(q_texts, q_embs))
pmap = dict(zip(p_texts, p_embs))

train_sessions = build_sim_feature(train_sessions, qmap, pmap)

# Test
q_texts_t, p_texts_t = prepare_texts(test_sessions)
# encode_unique tekrar çağırınca cache dosyası varsa direkt load eder
q_embs_t = encode_unique(q_texts_t, "query_emb_test.npy")
p_embs_t = encode_unique(p_texts_t, "product_emb_test.npy")

qmap_t = dict(zip(q_texts_t, q_embs_t))
pmap_t = dict(zip(p_texts_t, p_embs_t))

test_sessions = build_sim_feature(test_sessions, qmap_t, pmap_t)

# Feature listene ekle
feature_cols.append("query_prod_cos")
print("✅ Embedding feature eklendi:", "query_prod_cos")



2025-08-22 12:16:46.724894: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755865006.917916      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755865006.973696      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/3355 [00:00<?, ?it/s]

  .with_columns(


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/3315 [00:00<?, ?it/s]

  .with_columns(


✅ Embedding feature eklendi: query_prod_cos


In [28]:
# Train DF: hedef ve meta kolonları ayrıca seç
train_df = train_sessions.select(feature_cols + ["user_id_hashed","clicked","ordered","session_id"]).to_pandas()

X          = train_df.drop(columns=["ordered","clicked","user_id_hashed","session_id"])
y_order    = train_df["ordered"].astype(int)
y_click    = train_df["clicked"].astype(int)
session_id = train_df["session_id"]
groups     = train_df["user_id_hashed"]

# Test matrisi aynı kolon sırasıyla
test_X = test_sessions.select(X.columns.to_list()).to_pandas()

# GroupKFold CV + Early Stopping
gkf = GroupKFold(n_splits=5)
oof = np.zeros(len(X), dtype=float)
test_pred = np.zeros(len(test_X), dtype=float)

params = dict(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=5000,     
    learning_rate=0.03,
    num_leaves=127,
    reg_alpha=0.5,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1,
    bagging_fraction=0.8,
    bagging_freq=1,
    min_data_in_leaf=200,
    feature_fraction=0.8,
    min_gain_to_split= 0.03,
    lambda_l1= 0.5, 
    lambda_l2= 3,
    max_bin= 255,
    drop_rate=0.1, 
    skip_drop=0.5,
)

fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X, y_order, groups=groups), 1):
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X.iloc[tr_idx], y_order.iloc[tr_idx],
        eval_set=[(X.iloc[va_idx], y_order.iloc[va_idx])],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
    )

    # valid tahmin
    p_va = model.predict_proba(X.iloc[va_idx])[:, 1]
    oof[va_idx] = p_va

    # Trendyol Score
    sc, auc_c, auc_o = trendyol_score(
        y_click.iloc[va_idx].values,
        y_order.iloc[va_idx].values,
        p_va,
        session_id.iloc[va_idx].values
    )
    fold_scores.append(sc)
    print(f"[Fold {fold}] TrendyolScore={sc:.5f} | AUC_click={auc_c:.5f} | AUC_order={auc_o:.5f}")

    # test tahmini
    test_pred += model.predict_proba(test_X)[:, 1] / gkf.get_n_splits()


# OOF genel skor
final_sc, final_auc_c, final_auc_o = trendyol_score(
    y_click.values, y_order.values, oof, session_id.values
)
print(f"OOF TrendyolScore={final_sc:.5f} | clickAUC={final_auc_c:.5f} | orderAUC={final_auc_o:.5f}")


print("OOF AUC:", roc_auc_score(y_order, oof))
print("OOF Logloss:", log_loss(y_order, oof))

# === Mevcut order modelinin çıktısını isimlendir ===
oof_order  = oof.copy()
test_order = test_pred.copy()

# Aynı fold split'lerini tekrar kullanalım 
folds = list(gkf.split(X, y_order, groups=groups))

# === Click modeli (aynı paramlarla) ===
oof_click  = np.zeros(len(X), dtype=float)
test_click = np.zeros(len(test_X), dtype=float)

# class imbalance ayarı 
pos_c = y_click.sum(); neg_c = len(y_click) - pos_c
params_click = {**params, "scale_pos_weight": float(neg_c) / max(float(pos_c), 1.0)}

for fold, (tr_idx, va_idx) in enumerate(folds, 1):
    m = lgb.LGBMClassifier(**params_click)
    m.fit(
        X.iloc[tr_idx], y_click.iloc[tr_idx],
        eval_set=[(X.iloc[va_idx], y_click.iloc[va_idx])],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
    )
    oof_click[va_idx] = m.predict_proba(X.iloc[va_idx])[:, 1]
    test_click += m.predict_proba(test_X)[:, 1] / len(folds)

# === 2) OOF üzerinde en iyi w'yu TrendyolScore ile tara ===
ws = np.linspace(0.40, 0.95, 36)  

best = (-1, None, None, None)  # (score, w, auc_c, auc_o)

for w in ws:
    p = w*oof_order + (1-w)*oof_click
    sc, auc_c, auc_o = trendyol_score(
        y_click.values, y_order.values, p, session_id.values
    )
    if sc > best[0]:
        best = (sc, w, auc_c, auc_o)

print(f"[BLEND] best TrendyolScore={best[0]:.6f}  w={best[1]:.3f}  (click={best[2]:.5f}, order={best[3]:.5f})")

# === Test için aynı w'yu uygula (+ küçük tie-breaker opsiyonel) ===
w = best[1]
test_blend = w*test_order + (1-w)*test_click

# Eşitlik kırma (çok küçük tut): pop_ewma sinyali
epsilon = 0.02
pop_ewma_te = test_sessions.select("pop_ewma").to_pandas().values.ravel()
test_final = test_blend + epsilon * pop_ewma_te


Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.0175723
[200]	valid_0's binary_logloss: 0.0174867
[300]	valid_0's binary_logloss: 0.0174852
[400]	valid_0's binary_logloss: 0.0174988
[500]	valid_0's binary_logloss: 0.017528
Early stopping, best iteration is:
[333]	valid_0's binary_logloss: 0.0174767
[Fold 1] TrendyolScore=0.70509 | AUC_click=0.60238 | AUC_order=0.74910
Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.018261
[200]	valid_0's binary_logloss: 0.0181862
[300]	valid_0's binary_logloss: 0.0182165
[400]	valid_0's binary_logloss: 0.0182364
Early stopping, best iteration is:
[203]	valid_0's binary_logloss: 0.0181854
[Fold 2] TrendyolScore=0.69390 | AUC_click=0.59979 | AUC_order=0.73423
Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.0176111
[200]	valid_0's binary_logloss: 0.0175137
[300]	valid_0's binary_logloss: 0.0175235
[400]	valid_0's 

In [29]:
# === 4) Submission üret ===
tmp = test_sessions.with_columns(
    pl.Series(name="prediction", values=test_final)
).select(["session_id","content_id_hashed","prediction"])\
 .sort(["session_id","prediction"], descending=True)


submission_df = tmp.group_by("session_id").agg(
    pl.col("content_id_hashed").alias("prediction")
).with_columns(
    pl.col("prediction").list.join(" ")
)

submission_df.write_csv("submission.csv")
print("Saved:", "submission.csv", "| w used:", w)

Saved: submission.csv | w used: 0.887142857142857
