In [4]:
import polars as pl
import joblib
from src.data_loader import get_all_data, split_date_lazy
from src.feature_engineering import build_features_from_purchases, build_labels
from src.trainer import train_model
from src import config


In [5]:
# ======================================================================
# 2. LOAD & SPLIT DATA USING CUSTOM UTILITY
# ======================================================================
print("üìÇ Loading and splitting data...")

# 1. Load d·ªØ li·ªáu
lf_items, lf_users, lf_purchases = get_all_data("/home/genyonguyen/Data/CS116_Reccommender_System/data")

# 2. S·ª≠ d·ª•ng h√†m c·ªßa b·∫°n ƒë·ªÉ chia t·∫≠p
# H√†m n√†y t·ª± ƒë·ªông x·ª≠ l√Ω cast datetime n·∫øu created_date ƒëang l√† String
lf_purchase_hist, lf_purchase_recent, lf_purchase_val = split_date_lazy(
    lf_purchases, 
    date_column_name="created_date"
)

# 3. Ki·ªÉm tra nhanh k·∫øt qu·∫£ b·∫±ng collect_schema (kh√¥ng t·ªën RAM)
print("‚úÖ Data split completed.")
print(f"   - Hist samples: {lf_purchase_hist.select(pl.len()).collect().item():,}")
print(f"   - Recent samples: {lf_purchase_recent.select(pl.len()).collect().item():,}")
print(f"   - Val samples: {lf_purchase_val.select(pl.len()).collect().item():,}")

üìÇ Loading and splitting data...
üìÇ ƒêang qu√©t d·ªØ li·ªáu t·ª´ th∆∞ m·ª•c: /home/genyonguyen/Data/CS116_Reccommender_System/data
‚úÖ ƒê√£ load Items
‚úÖ ƒê√£ load Users
‚úÖ ƒê√£ load Purchases
‚úÖ Data split completed.
   - Hist samples: 29,580,965
   - Recent samples: 3,099,667
   - Val samples: 3,049,193


In [3]:
# Ki·ªÉm tra xem c√≥ bao nhi√™u user xu·∫•t hi·ªán ·ªü c·∫£ Hist v√† Recent
user_hist = lf_purchase_hist.select("customer_id").unique().collect()
user_recent = lf_purchase_recent.select("customer_id").unique().collect()

intersection = user_hist.join(user_recent, on="customer_id", how="inner")
print(f"S·ªë user trong Hist: {user_hist.height:,}")
print(f"S·ªë user trong Recent: {user_recent.height:,}")
print(f"S·ªë user giao thoa: {intersection.height:,}")

S·ªë user trong Hist: 2,170,516
S·ªë user trong Recent: 650,383
S·ªë user giao thoa: 507,012


In [6]:
# ======================================================================
# 3. INITIALIZE CORE DATA & CO-OCCURRENCE (FIXED FOR SCHEMA)
# ======================================================================
import os
import joblib
import polars as pl

print("üèóÔ∏è ƒêang chu·∫©n b·ªã d·ªØ li·ªáu n·ªÅn t·∫£ng cho Model v√† Inference...")

# 1. G·ªôp to√†n b·ªô l·ªãch s·ª≠ mua h√†ng 
# L∆ØU √ù: Ph·∫£i g·ªôp c·∫£ lf_purchase_hist n·∫øu mu·ªën t√≠nh X1-X3 ƒë·∫ßy ƒë·ªß nh·∫•t
full_history = pl.concat([lf_purchase_val])

# 2. N·∫°p Ground Truth
gt_path = "groundtruth_main.pkl"
if os.path.exists(gt_path):
    gt_raw = joblib.load(gt_path)
    target_users_set = set(str(u) for u in gt_raw.keys())
    print(f" ‚úÖ ƒê√£ n·∫°p Ground Truth: {len(target_users_set):,} users.")
else:
    print(" ‚ö†Ô∏è C·∫£nh b√°o: Kh√¥ng t√¨m th·∫•y file groundtruth_main.pkl!")

# 3. T·∫°o b·∫£ng l·ªãch s·ª≠ mua h√†ng r√∫t g·ªçn
# S·ª¨A L·ªñI: item_id l√† String, customer_id l√† Int32 theo Schema c·ªßa b·∫°n
user_items = (
    full_history
    .select([
        pl.col("customer_id").cast(pl.Int32), # Kh·ªõp Schema Int32
        pl.col("item_id").cast(pl.String)     # Kh·ªõp Schema String
    ])
    .drop_nulls()
    .unique() 
    .group_by("customer_id")
    .agg(pl.col("item_id").alias("item_id_list"))
    .collect()
)

# 4. T√≠nh to√°n Ma tr·∫≠n ƒë·ªìng xu·∫•t hi·ªán (top_item_links)
print(" üõ†Ô∏è ƒêang t√≠nh to√°n Ma tr·∫≠n Co-occurrence...")
user_items_clean = user_items.explode("item_id_list").rename({"item_id_list": "item_id"})

# Self-join ƒë·ªÉ t√¨m c√°c c·∫∑p s·∫£n ph·∫©m mua c√πng nhau
item_pairs = (
    user_items_clean.join(user_items_clean, on="customer_id", suffix="_right")
    .filter(pl.col("item_id") < pl.col("item_id_right")) # Tr√°nh tr√πng l·∫∑p c·∫∑p (A,B) v√† (B,A)
    .group_by(["item_id", "item_id_right"])
    .len()
    .filter(pl.col("len") > 2) 
)

top_item_links = (
    pl.concat([
        item_pairs.select([pl.col("item_id"), pl.col("item_id_right"), "len"]),
        item_pairs.select([pl.col("item_id_right").alias("item_id"), pl.col("item_id").alias("item_id_right"), "len"])
    ])
    .sort(["item_id", "len"], descending=[False, True])
    .group_by("item_id")
    .head(12) 
    .select([
        pl.col("item_id").alias("item_id_source"), 
        pl.col("item_id_right").alias("item_id_rec")
    ])
)

# 5. T·∫°o ƒë·∫∑c tr∆∞ng ph·ª•c v·ª• Inference (infer_features_df)
print(" üõ†Ô∏è ƒêang x√¢y d·ª±ng infer_features_df t·ª´ full_history...")
# H√†m n√†y s·∫Ω t·ª± ƒë·ªông nh·∫≠n di·ªán item_id l√† String v√† customer_id l√† Int32
infer_features_df = build_features_from_purchases(
    lf_purchases=full_history, 
    lf_items=lf_items, 
    lf_users=lf_users
)

print("\nüöÄ KH·ªûI T·∫†O XONG! C√°c bi·∫øn ƒë√£ kh·ªõp Schema String/Int32.")

üèóÔ∏è ƒêang chu·∫©n b·ªã d·ªØ li·ªáu n·ªÅn t·∫£ng cho Model v√† Inference...
 ‚ö†Ô∏è C·∫£nh b√°o: Kh√¥ng t√¨m th·∫•y file groundtruth_main.pkl!
 üõ†Ô∏è ƒêang t√≠nh to√°n Ma tr·∫≠n Co-occurrence...
 üõ†Ô∏è ƒêang x√¢y d·ª±ng infer_features_df t·ª´ full_history...

üöÄ KH·ªûI T·∫†O XONG! C√°c bi·∫øn ƒë√£ kh·ªõp Schema String/Int32.


In [5]:
# ======================================================================
# 4. BUILDING TRAIN DATASET (CHU·∫®N H√ìA THEO SCHEMA)
# ======================================================================

def build_features_from_purchases_fixed(lf_purchases, lf_items, lf_users=None):
    # 1. JOIN metadata - Ch√∫ √Ω: item_id ƒë·ªÅu l√† String n√™n Join r·∫•t an to√†n
    data_lf = lf_purchases.join(
        lf_items.select(['item_id', 'brand', 'age_group', 'category']),
        on='item_id',
        how='inner'
    )

    # 2. T√çNH TO√ÅN V√Ä GI·ªÆ L·∫†I C·ªòT KEY
    feature_df = (
        data_lf
        .select([
            pl.col("customer_id").alias("X_-1"), # Gi·ªØ Int32 theo Schema
            pl.col("item_id").alias("X_0"),      # Gi·ªØ String theo Schema
            pl.col("brand"),                     # String
            pl.col("category"),                  # String
            pl.col("age_group"),                 # String

            # T√≠nh to√°n t·∫ßn su·∫•t
            pl.len().over(["customer_id", "brand"]).alias("X_1"),
            pl.len().over(["customer_id", "age_group"]).alias("X_2"),
            pl.len().over(["customer_id", "category"]).alias("X_3"),
        ])
        .unique(subset=["X_-1", "X_0"])
        .with_columns([
            pl.col("X_1").cast(pl.Float64),
            pl.col("X_2").cast(pl.Float64),
            pl.col("X_3").cast(pl.Float64),
        ])
    )
    return feature_df

print("üõ† Building features from Hist...")
train_features_df = build_features_from_purchases_fixed(lf_purchase_hist, lf_items)

print("üéØ Building labels from Recent...")
train_label_df = build_labels(lf_purchase_hist, lf_purchase_recent, lf_items, negative_ratio=1.0)

print("üîó Merging via Entity Profile...")
# Ch√∫ √Ω: L·∫•y Metadata g·ªëc (String) ƒë·ªÉ Join
train_label_with_meta = train_label_df.join(
    lf_items.select(["item_id", "brand", "category", "age_group"]),
    on="item_id", 
    how="left"
)

# B∆∞·ªõc 4: T√°ch profile (Kh√¥ng cast Int64 cho brand/category n·ªØa)
user_brand_feat = train_features_df.select(["X_-1", "brand", "X_1"]).unique(subset=["X_-1", "brand"])
user_age_feat   = train_features_df.select(["X_-1", "age_group", "X_2"]).unique(subset=["X_-1", "age_group"])
user_cat_feat   = train_features_df.select(["X_-1", "category", "X_3"]).unique(subset=["X_-1", "category"])

# B∆∞·ªõc 5: Join t·ªïng h·ª£p
train_df = (
    train_label_with_meta
    .select([
        pl.col("customer_id").alias("X_-1"),
        pl.col("item_id").alias("X_0"),
        pl.col("label").alias("Y"),
        "brand", "category", "age_group"
    ])
    .join(user_brand_feat, on=["X_-1", "brand"], how="left")
    .join(user_age_feat, on=["X_-1", "age_group"], how="left")
    .join(user_cat_feat, on=["X_-1", "category"], how="left")
    .with_columns(
        pl.col(["X_1", "X_2", "X_3"]).fill_null(0)
    )
    .select(["X_-1", "X_0", "X_1", "X_2", "X_3", "Y"])
)

print(f"‚úÖ Train dataset created: {train_df.select(pl.len()).collect().item():,} rows.")
display(train_df.limit(5).collect())

üõ† Building features from Hist...
üéØ Building labels from Recent...
üöÄ Building Labels with Vectorized Hard Negative Strategy...
üîó Merging via Entity Profile...
‚úÖ Train dataset created: 18,765,984 rows.


X_-1,X_0,X_1,X_2,X_3,Y
i32,str,f64,f64,f64,i8
5624205,"""0007160000078""",0.0,99.0,0.0,1
6642730,"""5481000000002""",0.0,63.0,0.0,1
5777942,"""6849000000003""",3.0,21.0,3.0,1
6288770,"""5952000000001""",0.0,41.0,0.0,1
5415343,"""0952000000074""",1.0,8.0,0.0,1


In [6]:
# ======================================================================
# CHECK DATA DENSITY (KI·ªÇM TRA S·ªê D√íNG B·∫∞NG 0)
# ======================================================================

# T√≠nh to√°n th·ªëng k√™
check_df = train_df.select([
    pl.len().alias("total_rows"),
    # ƒê·∫øm s·ªë d√≤ng m√† c·∫£ 3 ƒë·∫∑c tr∆∞ng ƒë·ªÅu b·∫±ng 0
    pl.struct(["X_1", "X_2", "X_3"])
      .filter((pl.col("X_1") == 0) & (pl.col("X_2") == 0) & (pl.col("X_3") == 0))
      .count().alias("all_zeros"),
    
    # ƒê·∫øm ri√™ng l·∫ª t·ª´ng c·ªôt
    (pl.col("X_1") == 0).sum().alias("X_1_zero"),
    (pl.col("X_2") == 0).sum().alias("X_2_zero"),
    (pl.col("X_3") == 0).sum().alias("X_3_zero")
]).collect()

# Tr√≠ch xu·∫•t gi√° tr·ªã
total = check_df["total_rows"][0]
all_zero = check_df["all_zeros"][0]
x1_z = check_df["X_1_zero"][0]
x2_z = check_df["X_2_zero"][0]
x3_z = check_df["X_3_zero"][0]

print(f"üìä TH·ªêNG K√ä TRAIN_DF (T·ªïng s·ªë d√≤ng: {total:,})")
print("-" * 45)
print(f"üö´ S·ªë d√≤ng c√≥ X1, X2, X3 ƒë·ªÅu b·∫±ng 0: {all_zero:,} ({all_zero/total*100:.2f}%)")
print(f"üî∏ C·ªôt X_1 (Brand) b·∫±ng 0: {x1_z:,} ({x1_z/total*100:.2f}%)")
print(f"üî∏ C·ªôt X_2 (Age Group) b·∫±ng 0: {x2_z:,} ({x2_z/total*100:.2f}%)")
print(f"üî∏ C·ªôt X_3 (Category) b·∫±ng 0: {x3_z:,} ({x3_z/total*100:.2f}%)") 
print("-" * 45)

if all_zero / total > 0.8:
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: T·ª∑ l·ªá d√≤ng tr·ªëng qu√° cao! H√£y ki·ªÉm tra l·∫°i logic t·∫°o Candidate.")
else:
    print("‚úÖ T·ª∑ l·ªá d·ªØ li·ªáu ·ªïn ƒë·ªãnh. B·∫°n c√≥ th·ªÉ ti·∫øn h√†nh hu·∫•n luy·ªán.")

üìä TH·ªêNG K√ä TRAIN_DF (T·ªïng s·ªë d√≤ng: 18,766,176)
---------------------------------------------
üö´ S·ªë d√≤ng c√≥ X1, X2, X3 ƒë·ªÅu b·∫±ng 0: 695,393 (3.71%)
üî∏ C·ªôt X_1 (Brand) b·∫±ng 0: 4,761,292 (25.37%)
üî∏ C·ªôt X_2 (Age Group) b·∫±ng 0: 4,276,069 (22.79%)
üî∏ C·ªôt X_3 (Category) b·∫±ng 0: 1,698,387 (9.05%)
---------------------------------------------
‚úÖ T·ª∑ l·ªá d·ªØ li·ªáu ·ªïn ƒë·ªãnh. B·∫°n c√≥ th·ªÉ ti·∫øn h√†nh hu·∫•n luy·ªán.


In [3]:
# ======================================================================
# 5. TRAINING MODEL (OPTIMIZED FOR X1, X2, X3 & SCHEMA)
# ======================================================================
import xgboost as xgb
import joblib
import os
import polars as pl
from src import config

In [3]:
model_xgb = joblib.load(config.MODEL_PATH)

In [None]:
# Danh s√°ch feature b·∫°n ƒë√£ ch·ªçn (X1: Brand, X2: Age, X3: Cat)
feature_cols = ["X_1", "X_2", "X_3"] 

# Th·ª±c thi hu·∫•n luy·ªán
model_xgb = train_model(train_df, feature_cols)

# 4. L∆∞u m√¥ h√¨nh
if not os.path.exists('model'):
    os.makedirs('model')

joblib.dump(model_xgb, config.MODEL_PATH)
print(f"‚úÖ Model trained and saved at: {config.MODEL_PATH}")

‚è≥ Collecting and processing training data...


KeyboardInterrupt: 

In [8]:
import matplotlib.pyplot as plt

# L·∫•y ƒëi·ªÉm quan tr·ªçng c·ªßa c√°c t√≠nh nƒÉng
importance = model_xgb.get_score(importance_type='gain') # Ho·∫∑c 'weight'
sorted_importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)

# Hi·ªÉn th·ªã
print("üìä T·∫¶M QUAN TR·ªåNG C·ª¶A C√ÅC T√çNH NƒÇNG (FEATURE IMPORTANCE):")
for feat, score in sorted_importance:
    print(f"  - {feat}: {score:.2f}")

üìä T·∫¶M QUAN TR·ªåNG C·ª¶A C√ÅC T√çNH NƒÇNG (FEATURE IMPORTANCE):
  - X_3: 4404.66
  - X_1: 873.54
  - X_2: 678.32


In [4]:
import pickle as pkl

gt = pkl.load(open("groundtruth.pkl", "rb"))
target_users_set = set(gt.keys())

In [5]:
warm_users_set = set(
    lf_purchase_val
    .select("customer_id")
    .unique()
    .collect(engine="streaming")
    .to_series()
    .to_list()
) & target_users_set

In [6]:
from collections import defaultdict

df_purchase_val = (
    lf_purchase_val
    .select(["customer_id", "item_id"])
    .join(lf_items.select(["item_id", "brand", "age_group", "category"]), on="item_id", how="left")
    .collect(engine="streaming")
)

# Aggregate counts for each attribute in parallel
brand_counts = (
    df_purchase_val.group_by(['customer_id', 'brand'])
    .agg(pl.len().alias('count'))
)

age_counts = (
    df_purchase_val.group_by(['customer_id', 'age_group'])
    .agg(pl.len().alias('count'))
)

category_counts = (
    df_purchase_val.group_by(['customer_id', 'category'])
    .agg(pl.len().alias('count'))
)

# Build the nested defaultdict structure
feature_dict = defaultdict(lambda: {
    'brand': {},
    'age_group': {},
    'category': {}
})

# Populate brand counts
for row in brand_counts.iter_rows(named=True):
    feature_dict[row['customer_id']]['brand'][row['brand']] = row['count']

# Populate age_group counts
for row in age_counts.iter_rows(named=True):
    feature_dict[row['customer_id']]['age_group'][row['age_group']] = row['count']

# Populate category counts
for row in category_counts.iter_rows(named=True):
    feature_dict[row['customer_id']]['category'][row['category']] = row['count']

del df_purchase_val
del brand_counts
del age_counts
del category_counts

In [21]:
item_feature_dict = (
    lf_items
    .with_columns(
        pl.concat_str(["brand", "age_group", "category"], separator="|")
        .str.split("|")
        .alias("values")
    )
    .select(["item_id", "values"])
    .collect()
    .to_dict(as_series=False)
)

item_feature_dict = dict(zip(item_feature_dict["item_id"], item_feature_dict["values"]))

In [27]:
# ======================================================================
# 6. RUN INFERENCE FOR WARM USERS (OPTIMIZED WITH ABSOLUTE BOOSTING)
# ======================================================================
import xgboost as xgb
import polars as pl
import numpy as np
from src.candidate_selector import CandidateSelector
from tqdm import tqdm

customer_ids = []
pred_ids = []

purchases_recent_val_lf = pl.concat([lf_purchase_recent, lf_purchase_val])
cs = CandidateSelector(purchases_recent_val_lf, lf_users, lf_items)

# --- B∆Ø·ªöC 3: D·ª∞ ƒêO√ÅN V√Ä RERANK THEO BATCH ---
for customer_id in tqdm(warm_users_set):

    candidates = cs.get_candidates(customer_id)

    # Feature extract
    features = []
    for c_id in candidates:
        brand_count = feature_dict[customer_id]["brand"].get(item_feature_dict[c_id][0], 0)
        age_count = feature_dict[customer_id]["age_group"].get(item_feature_dict[c_id][1], 0)
        category_count = feature_dict[customer_id]["category"].get(item_feature_dict[c_id][2], 0)
        features.append([brand_count, age_count, category_count])

    X_infer = pl.DataFrame(features, schema=["X_1", "X_2", "X_3"], orient='row')
    probs = model_xgb.predict(xgb.DMatrix(X_infer))

    ranked_indices = np.argsort(probs)[::-1]
    ranked_results = []
    
    # Rerank
    for idx in ranked_indices[:10]:
        ranked_results.append(candidates[idx])
    
    customer_ids.extend([customer_id] * len(ranked_results))
    pred_ids.extend(ranked_results)


recommendations_warm = pl.DataFrame({"X_-1": customer_ids, "X_0": pred_ids})

[Build lookup structures] Completed in 2.89 seconds
[Build item co-purchase matrix] Completed in 2.55 seconds
[Compute item popularity] Completed in 0.59 seconds
[Build item similarity map] Completed in 7.54 seconds
[Collect item features] Completed in 0.07 seconds
[Collect user purchases with features] Completed in 0.57 seconds
[Cache business rule candidates] Completed in 0.58 seconds
[Build user reorder candidates] Completed in 0.79 seconds


664277it [00:00, 1105148.84it/s]


[ERROR] Timer was not started.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 244990/244990 [2:00:48<00:00, 33.80it/s]  


In [25]:
import json

recommendations_warm = json.load(open("recommendations_warm.json", "rb"))

In [26]:
recommendations_warm = {int(k): list(map(str, v)) for k, v in recommendations_warm.items()}

In [6]:
hist_dict = (
    lf_purchase_val.select(["customer_id", "item_id"])
    .group_by("customer_id")
    .agg(pl.col("item_id"))
    .collect()
    .to_dict(as_series=False)
)

In [7]:
hist_dict = dict(zip(hist_dict["customer_id"], hist_dict["item_id"]))

In [8]:
import pickle as pkl

gt = pkl.load(open("groundtruth.pkl", "rb"))

In [None]:
import numpy as np
from tqdm import tqdm

def precision_at_k_fast(pred, gt, hist, filter_bought_items=True, K=10):
    precisions = []
    cold_start_users = []

    # users that have both prediction and history
    valid_users = gt.keys() & pred.keys() & hist.keys()

    skipped = 0
    for user in tqdm(gt.keys()):
        if user not in valid_users:
            cold_start_users.append(user)
            continue

        # ground truth as set
        relevant_items = set(gt[user])

        if filter_bought_items:
            relevant_items.difference_update(hist[user])

        if not relevant_items:
            skipped += 1
            precisions.append(0.0)
            continue

        # prediction@K
        pred_topk = pred[user][:K]
        hits = 0
        for item in pred_topk:
            if item in relevant_items:
                hits += 1

        precisions.append(hits / K)
    
    print(f"Skipped: {skipped}")

    return np.mean(precisions), cold_start_users

In [30]:
score, _ = precision_at_k_fast(recommendations_warm, gt, hist_dict)
score

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 391900/391900 [00:00<00:00, 711886.98it/s]

Skipped: 47470





np.float64(0.004439773051961304)

In [32]:
score, _ = precision_at_k_fast(recommendations_warm, gt, hist_dict, filter_bought_items=False)
score

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 391900/391900 [00:00<00:00, 1053644.60it/s]

Skipped: 0





np.float64(0.004439773051961304)

In [28]:
# ======================================================================
# 5. CREATE POPULAR ITEMS LIST (FOR COLD START STRATEGY)
# ======================================================================

print("üì¶ ƒêang chu·∫©n b·ªã danh s√°ch Popular Items t·ª´ d·ªØ li·ªáu g·∫ßn nh·∫•t...")

# N√™n s·ª≠ d·ª•ng d·ªØ li·ªáu c·ªßa th√°ng g·∫ßn nh·∫•t (lf_purchase_val) ƒë·ªÉ l·∫•y xu h∆∞·ªõng m·ªõi nh·∫•t
# ƒêi·ªÅu n√†y gi√∫p tƒÉng Precision cho nh√≥m Cold l√™n m·ª©c ~0.011668
popular_items_list = (
    lf_purchase_val.group_by("item_id")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
    .head(100) # L·∫•y 100 m√≥n ƒë·ªÉ l√†m kho d·ª± ph√≤ng (Candidate Pool)
    .collect()
    .get_column("item_id")
    .to_list()
)

# √âp ki·ªÉu sang String ƒë·ªÉ ƒë·ªìng nh·∫•t v·ªõi schema c·ªßa Submission v√† Item-Links
popular_items_list_str = [str(i) for i in popular_items_list]

print(f"‚úÖ ƒê√£ t·∫°o xong danh s√°ch {len(popular_items_list_str)} m√≥n ph·ªï bi·∫øn nh·∫•t.")
print(f"üìç V√≠ d·ª• 5 m√≥n ƒë·∫ßu b·∫£ng: {popular_items_list_str[:5]}")

üì¶ ƒêang chu·∫©n b·ªã danh s√°ch Popular Items t·ª´ d·ªØ li·ªáu g·∫ßn nh·∫•t...
‚úÖ ƒê√£ t·∫°o xong danh s√°ch 100 m√≥n ph·ªï bi·∫øn nh·∫•t.
üìç V√≠ d·ª• 5 m√≥n ƒë·∫ßu b·∫£ng: ['4690000000001', '1512000000004', '2803000000013', '6768000000005', '0020020000185']


In [38]:
# ======================================================================
# 7. HYBRID HANDLING (REPURCHASE + ITEM-LINKS + POPULAR) - FIXED VERSION
# ======================================================================
import os
import polars as pl

print("[1/3] Ph√¢n t√°ch nh√≥m Warm v√† Cold Users...")

# 1. Thu th·∫≠p k·∫øt qu·∫£ t·ª´ Cell 6
if isinstance(recommendations_warm, pl.LazyFrame):
    warm_results = recommendations_warm.select([
        pl.col("X_-1").cast(pl.String),
        pl.col("X_0").cast(pl.String)
    ]).collect()
else:
    warm_results = recommendations_warm.select([
        pl.col("X_-1").cast(pl.String),
        pl.col("X_0").cast(pl.String)
    ])

cold_users_ids = list(target_users_set - warm_users_set)
print(f" ‚úÖ Th·ªëng k√™: {len(warm_users_set & target_users_set):,} Warm users v√† {len(cold_users_ids):,} Cold users.")

# --- 2. Sinh g·ª£i √Ω b·ªï tr·ª£ ƒëa t·∫ßng ---
print("[2/3] ƒêang t·∫°o g·ª£i √Ω b·ªï tr·ª£ (Repurchase & Item-Links)...")
df_cold_base = pl.DataFrame({"X_-1": [str(u) for u in cold_users_ids]}).lazy()

# Nh√°nh A: Repurchase (ƒê·ªì c≈© cho 47k users)
batch_repur = (
    df_cold_base.join(
        user_items.lazy().with_columns(pl.col("customer_id").cast(pl.String)).rename({"customer_id": "X_-1"}),
        on="X_-1", how="inner"
    )
    .explode("item_id_list")
    .select([pl.col("X_-1"), pl.col("item_id_list").alias("X_0")])
)

# Nh√°nh B: Semi-cold (ƒê·ªì mua k√®m)
batch_semi = (
    df_cold_base.join(
        user_items.lazy().with_columns(pl.col("customer_id").cast(pl.String)).rename({"customer_id": "X_-1"}), 
        on="X_-1", how="inner"
    )
    .explode("item_id_list")
    .rename({"item_id_list": "item_id"})
    .join(
        top_item_links.lazy().with_columns([
            pl.col("item_id_source").cast(pl.String),
            pl.col("item_id_rec").cast(pl.String)
        ]), 
        left_on="item_id", right_on="item_id_source", how="inner"
    )
    .select([pl.col("X_-1"), pl.col("item_id_rec").alias("X_0")])
)

# Nh√°nh C: Pure-cold (Popular)
users_with_any_rec = pl.concat([
    batch_semi.select("X_-1").unique(),
    batch_repur.select("X_-1").unique()
]).unique()

batch_pure = (
    df_cold_base.join(users_with_any_rec, on="X_-1", how="anti")
    .with_columns(pl.lit(popular_items_list_str).alias("X_0"))
    .explode("X_0")
    .select(["X_-1", "X_0"])
)

# G·ªôp k·∫øt qu·∫£ Cold Start v·ªõi th·ª© t·ª± ∆∞u ti√™n
recommendations_cold = pl.concat([
    batch_repur.with_columns(pl.lit(1).alias("sub_prio")),
    batch_semi.with_columns(pl.lit(2).alias("sub_prio")),
    batch_pure.with_columns(pl.lit(3).alias("sub_prio"))
]).unique(subset=["X_-1", "X_0"]).sort(["X_-1", "sub_prio"]).collect()

# --- 3. K·∫æT H·ª¢P V√Ä XU·∫§T SUBMISSION (√âP THEO GT_DICT) ---
print("[3/3] ƒêang g·ªôp k·∫øt qu·∫£ v√† l·ªçc theo ƒë√∫ng danh s√°ch m·ª•c ti√™u...")

# L·∫•y danh s√°ch ID chu·∫©n t·ª´ Ground Truth (ƒê√°p √°n c·ªßa th·∫ßy)
final_target_ids = list(map(str, list(gt.keys())))

submission = (
    pl.concat([
        warm_results.with_columns(pl.lit(1).alias("prio")),
        recommendations_cold.select(["X_-1", "X_0"]).with_columns(pl.lit(2).alias("prio"))
    ])
    .cast({"X_-1": pl.String})
    # B∆Ø·ªöC QUAN TR·ªåNG: Ch·ªâ gi·ªØ l·∫°i nh·ªØng ng∆∞·ªùi c√≥ trong ƒë√°p √°n th·ª±c t·∫ø
    .filter(pl.col("X_-1").is_in(final_target_ids)) 
    .unique(subset=["X_-1", "X_0"]) 
    .sort(["X_-1", "prio"]) 
    .group_by("X_-1")
    .agg(pl.col("X_0").head(12)) 
    .with_columns(
        pl.col("X_0").list.join(" ").alias("prediction")
    )
    .select([
        pl.col("X_-1").alias("customer_id"),
        "prediction"
    ])
    .unique(subset=["customer_id"]) 
)

# Ghi file
submission.write_csv("final_submission.csv")

print(f" ‚úÖ T·∫§T C·∫¢ HO√ÄN T·∫§T!")
print(f" üìä T·ªïng s·ªë User th·ª±c t·∫ø trong file: {submission.height:,}")

[1/3] Ph√¢n t√°ch nh√≥m Warm v√† Cold Users...
 ‚úÖ Th·ªëng k√™: 244,990 Warm users v√† 146,910 Cold users.
[2/3] ƒêang t·∫°o g·ª£i √Ω b·ªï tr·ª£ (Repurchase & Item-Links)...
[3/3] ƒêang g·ªôp k·∫øt qu·∫£ v√† l·ªçc theo ƒë√∫ng danh s√°ch m·ª•c ti√™u...
 ‚úÖ T·∫§T C·∫¢ HO√ÄN T·∫§T!
 üìä T·ªïng s·ªë User th·ª±c t·∫ø trong file: 391,900


In [40]:
import pickle as pkl

# N·∫°p file pkl ch·ª©a ƒë√°p √°n th·ª±c t·∫ø
print("üìÇ ƒêang n·∫°p Ground Truth t·ª´ file groundtruth_main.pkl...")
gt_dict_raw = pkl.load(open("groundtruth.pkl", "rb"))

# Chuy·ªÉn ƒë·ªïi key sang String ƒë·ªÉ ƒë·∫£m b·∫£o kh·ªõp v·ªõi customer_id trong submission
gt_dict = {str(k): v for k, v in gt_dict_raw.items()}

print(f"‚úÖ ƒê√£ n·∫°p xong {len(gt_dict):,} users trong Ground Truth.")

üìÇ ƒêang n·∫°p Ground Truth t·ª´ file groundtruth_main.pkl...
‚úÖ ƒê√£ n·∫°p xong 391,900 users trong Ground Truth.


In [41]:
import polars as pl

# 1. ƒê·ªçc l·∫°i file k·∫øt qu·∫£ v·ª´a xu·∫•t ra
df_pred = pl.read_csv("final_submission.csv")

# 2. Chuy·ªÉn ƒë·ªïi t·ª´ b·∫£ng sang Dictionary ƒë·ªÉ tra c·ª©u nhanh trong h√†m ƒë√°nh gi√°
# Format: { "customer_id": ["item1", "item2", ... , "item12"] }
pred_dict = {
    str(row["customer_id"]): row["prediction"].split(" ") 
    for row in df_pred.iter_rows(named=True)
}

print(f"‚úÖ ƒê√£ chu·∫©n b·ªã pred_dict cho {len(pred_dict):,} users.")

‚úÖ ƒê√£ chu·∫©n b·ªã pred_dict cho 391,900 users.


In [42]:
# --- T·∫†O HIST_LOOKUP (S·ª¨A L·ªñI .COLLECT) ---
print("üîç ƒêang t·∫°o hist_lookup t·ª´ d·ªØ li·ªáu l·ªãch s·ª≠...")

# Ki·ªÉm tra n·∫øu user_items l√† LazyFrame th√¨ m·ªõi d√πng .collect()
if isinstance(user_items, pl.LazyFrame):
    user_items_df = user_items.select(["customer_id", "item_id_list"]).collect()
else:
    user_items_df = user_items.select(["customer_id", "item_id_list"])

# T·∫°o Dictionary ƒë·ªÉ tra c·ª©u
hist_lookup = {
    str(row["customer_id"]): set(map(str, row["item_id_list"]))
    for row in user_items_df.iter_rows(named=True)
}

print(f"‚úÖ ƒê√£ chu·∫©n b·ªã xong l·ªãch s·ª≠ cho {len(hist_lookup):,} kh√°ch h√†ng.")

üîç ƒêang t·∫°o hist_lookup t·ª´ d·ªØ li·ªáu l·ªãch s·ª≠...
‚úÖ ƒê√£ chu·∫©n b·ªã xong l·ªãch s·ª≠ cho 628,069 kh√°ch h√†ng.


In [43]:
import polars as pl
import numpy as np

def evaluate_flexible(pred_dict, gt_dict, hist_lookup, warm_users_trained, K=10, include_repurchase=True):
    warm_precs = []
    semi_precs = []
    pure_precs = []
    
    # 47k users n√†y th∆∞·ªùng thu·ªôc nh√≥m Warm/Semi-cold v√¨ h·ªç c√≥ l·ªãch s·ª≠
    repurchase_only_count = 0 

    for user_id, gt_items in gt_dict.items():
        u_str = str(user_id)
        current_gt = set(map(str, gt_items))
        past_items = hist_lookup.get(u_str, set())
        
        # X√°c ƒë·ªãnh t·∫≠p ƒë·ªì m·ªõi
        relevant_new_items = current_gt - past_items
        
        # LOGIC SI√äU THAM S·ªê:
        if not relevant_new_items:
            if not include_repurchase:
                # T·∫Øt: B·ªè qua 47k users ch·ªâ mua ƒë·ªì c≈©
                continue 
            else:
                # B·∫≠t: Coi ƒë·ªì c≈© kh√°ch mua l·∫°i l√† m·ª•c ti√™u ƒë√°nh gi√° (Ground Truth)
                target_items = current_gt
                repurchase_only_count += 1
        else:
            # Lu√¥n t√≠nh nh·ªØng ng∆∞·ªùi c√≥ mua ƒë·ªì m·ªõi
            target_items = relevant_new_items

        # T√≠nh Precision@K
        user_preds = pred_dict.get(u_str, [])[:K]
        if not user_preds:
            precision = 0.0
        else:
            hits = len(set(user_preds) & target_items)
            precision = hits / K
        
        # Ph√¢n lu·ªìng b√°o c√°o d·ª±a tr√™n tr·∫°ng th√°i User
        if u_str in warm_users_trained:
            warm_precs.append(precision)
        elif u_str in hist_lookup:
            semi_precs.append(precision)
        else:
            pure_precs.append(precision)
            
    return warm_precs, semi_precs, pure_precs, repurchase_only_count

# --- CH·∫†Y TH·ª¨ NGHI·ªÜM ---
# B·∫°n c√≥ th·ªÉ thay ƒë·ªïi include_repurchase = True ho·∫∑c False ·ªü ƒë√¢y
include_47k = True 

warm_p, semi_p, pure_p, re_count = evaluate_flexible(
    pred_dict=pred_dict, 
    gt_dict=gt_dict, 
    hist_lookup=hist_lookup,
    warm_users_trained=warm_users_set,
    K=10,
    include_repurchase=include_47k
)

# --- IN B√ÅO C√ÅO ---
status = "B·∫¨T (T√≠nh c·∫£ 47k user mua ƒë·ªì c≈©)" if include_47k else "T·∫ÆT (Ch·ªâ t√≠nh ƒë·ªì m·ªõi)"
print(f"\nüöÄ CH·∫æ ƒê·ªò ƒê√ÅNH GI√Å: {status}")
print("="*60)
print(f"üî• NH√ìM WARM      : {len(warm_p):,} users - P@10: {np.mean(warm_p):.6f}")
print(f"‚ö° NH√ìM SEMI-COLD : {len(semi_p):,} users - P@10: {np.mean(semi_p):.6f}")
print(f"‚ùÑÔ∏è NH√ìM PURE-COLD : {len(pure_p):,} users - P@10: {np.mean(pure_p):.6f}")
print("-" * 60)
total_evaluated = len(warm_p) + len(semi_p) + len(pure_p)
print(f"üèÜ GLOBAL MEAN P@10 : {np.mean(warm_p + semi_p + pure_p):.6f}")
print(f"üì¶ T·ªïng User ƒë∆∞·ª£c t√≠nh: {total_evaluated:,} / {len(gt_dict):,}")
if include_47k:
    print(f"‚ÑπÔ∏è Trong ƒë√≥ c√≥ {re_count:,} users l√† nh√≥m 'Ch·ªâ mua l·∫°i ƒë·ªì c≈©'.")
print("="*60)


üöÄ CH·∫æ ƒê·ªò ƒê√ÅNH GI√Å: B·∫¨T (T√≠nh c·∫£ 47k user mua ƒë·ªì c≈©)
üî• NH√ìM WARM      : 0 users - P@10: nan
‚ö° NH√ìM SEMI-COLD : 244,990 users - P@10: 0.005122
‚ùÑÔ∏è NH√ìM PURE-COLD : 146,910 users - P@10: 0.005982
------------------------------------------------------------
üèÜ GLOBAL MEAN P@10 : 0.005444
üì¶ T·ªïng User ƒë∆∞·ª£c t√≠nh: 391,900 / 391,900
‚ÑπÔ∏è Trong ƒë√≥ c√≥ 47,470 users l√† nh√≥m 'Ch·ªâ mua l·∫°i ƒë·ªì c≈©'.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
