In [None]:
import pandas as pd
import numpy as np
import gc
import warnings
import sys
import os
import joblib
import random
from pathlib import Path
from tqdm import tqdm
from catboost import CatBoostRanker, Pool
import scipy.sparse as sparse
from sklearn.decomposition import TruncatedSVD

# Optional deep-learning stack: torch + transformers are only needed for the
# BERT-based features, so the script degrades gracefully without them.
try:
    import torch
    from torch.utils.data import DataLoader, Dataset
    from transformers import AutoTokenizer, AutoModel

    BERT_AVAILABLE = True
except ImportError:
    BERT_AVAILABLE = False
    # TextDataset subclasses ``Dataset`` at class-definition time. Without a
    # placeholder base the module would raise NameError on import whenever
    # torch is missing, defeating the purpose of this fallback.
    Dataset = object

# Optional XGBoost: main() refuses to run the ensemble without it.
try:
    import xgboost as xgb
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False

# Silence every warning category for the whole process (keeps ensemble logs short).
warnings.filterwarnings('ignore')


class Config:
    """Run configuration: directory layout, sampling/BERT settings and model hyper-parameters.

    Evaluating the class body creates the processed-data, model and
    submission directories if they do not exist yet.
    """

    # --- Directory layout ---
    ROOT_DIR = Path(".")
    DATA_DIR = ROOT_DIR / "data"
    RAW_DATA_DIR = DATA_DIR / "raw"
    PROCESSED_DATA_DIR = DATA_DIR / "processed"
    MODEL_DIR = Path("output/models")
    SUBMISSION_DIR = Path("output/submissions")

    # Make sure every output location exists before anything is written to it.
    for _out_dir in (PROCESSED_DATA_DIR, MODEL_DIR, SUBMISSION_DIR):
        _out_dir.mkdir(parents=True, exist_ok=True)
    del _out_dir  # keep the loop variable from becoming a class attribute

    # IMPORTANT: the ensemble trains one model pair per seed in this list.
    SEEDS = [42, 1337, 777, 2024, 100]

    # Target number of sampled negative books per user.
    NEGATIVES_PER_USER = 15

    # --- BERT settings ---
    USE_BERT = True
    BERT_MODEL_NAME = "DeepPavlov/rubert-base-cased"
    BERT_BATCH_SIZE = 8
    BERT_MAX_LEN = 128

    # Fraction of the (time-sorted) interactions held out for validation.
    VAL_SIZE_RATIO = 0.2

    # --- CatBoost ranker ---
    CB_PARAMS = dict(
        loss_function='YetiRank',
        # Original note: could be lowered, since early stopping halts training very early anyway.
        iterations=3000,
        learning_rate=0.03,
        depth=6,
        task_type='CPU',
        verbose=0,  # silent mode for the ensemble
        eval_metric='NDCG:top=20',
        early_stopping_rounds=50,
    )

    # --- XGBoost ranker ---
    XGB_PARAMS = dict(
        objective='rank:pairwise',
        eval_metric='ndcg@20',
        max_depth=6,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        n_estimators=3000,
        random_state=42,
        verbosity=0,
        early_stopping_rounds=80,
    )


class Constants:
    """String constants shared across the pipeline: input file names, column names, feature names."""

    # Input CSV file names (load_and_prep and main resolve these against Config.RAW_DATA_DIR).
    TRAIN_FILENAME = "train.csv"
    TARGETS_FILENAME = "targets.csv"
    CANDIDATES_FILENAME = "candidates.csv"
    USER_DATA_FILENAME = "users.csv"
    BOOK_DATA_FILENAME = "books.csv"
    BOOK_GENRES_FILENAME = "book_genres.csv"
    GENRES_FILENAME = "genres.csv"
    BOOK_DESCRIPTIONS_FILENAME = "book_descriptions.csv"

    # Column names used in the interaction, candidate and submission tables.
    COL_USER_ID = "user_id"
    COL_BOOK_ID = "book_id"
    COL_TIMESTAMP = "timestamp"
    COL_HAS_READ = "has_read"
    COL_RELEVANCE = "relevance"
    COL_DESCRIPTION = "description"
    COL_BOOK_ID_LIST = "book_id_list"

    # Names of engineered feature columns produced by build_features.
    F_SVD_SCORE = "svd_score"
    F_BERT_SIM = "bert_cosine_sim"



def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy's global RNG, ``PYTHONHASHSEED`` and, when available, torch.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    if not BERT_AVAILABLE:
        # torch may not be bound when the optional BERT stack failed to import
        return
    torch.manual_seed(seed)


# --- BERT ---
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its book id.

    Args:
        texts: Indexable collection of texts (each item is passed through ``str``).
        ids: Indexable collection of book ids aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (presumably a Hugging Face tokenizer).
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, ids, tokenizer, max_len):
        self.texts, self.ids = texts, ids
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return flattened ``input_ids``/``attention_mask`` tensors and the ``book_id`` tensor."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(str(self.texts[item]), **tokenizer_options)
        book_id = torch.tensor(self.ids[item], dtype=torch.long)
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'book_id': book_id,
        }


def compute_bert_embeddings(desc_df):
    """Return cached BERT embeddings, or an empty dict when they are unavailable.

    The embedding computation itself is not part of this file; only the
    joblib cache under ``Config.PROCESSED_DATA_DIR`` is consulted.

    Args:
        desc_df: Book descriptions table (not used by this cache-only version).

    Returns:
        Whatever object is stored in ``bert_embeddings.pkl``, or ``{}`` when
        BERT is disabled/unavailable or the cache file does not exist.
    """
    bert_enabled = BERT_AVAILABLE and Config.USE_BERT
    if not bert_enabled:
        return {}

    cache_file = Config.PROCESSED_DATA_DIR / "bert_embeddings.pkl"
    if not cache_file.exists():
        # The BERT computation was left out of this version; it relies on the cache from an earlier run.
        print("Warning: Cache not found, please ensure bert_embeddings.pkl exists from previous run to save time!")
        return {}

    print("Loading cached BERT embeddings...")
    # NOTE: joblib.load unpickles the file, so only load caches you produced yourself.
    return joblib.load(cache_file)


# --- SVD  ---
def train_svd_model(train_df):
    """Fit a 64-component truncated SVD on the weighted user x book interaction matrix.

    Interactions with ``has_read == 1`` get weight 2 and ``has_read == 0`` get
    weight 1. Repeated (user, book) pairs are summed when the sparse matrix
    is built.

    Unlike the previous version, the weights are kept in a local array, so the
    caller's DataFrame is no longer modified (no ``weight`` column is added).

    Args:
        train_df: Interactions with user id, book id and ``has_read`` columns.

    Returns:
        Tuple ``(u_fac, i_fac, user_map, book_map)``: user factors, item
        factors (one row per book), and dicts mapping raw user/book ids to
        the row index of the corresponding factor matrix.
    """
    print("Training SVD...")
    weights = train_df[Constants.COL_HAS_READ].map({1: 2, 0: 1}).values
    users = train_df[Constants.COL_USER_ID].unique()
    books = train_df[Constants.COL_BOOK_ID].unique()
    user_map = {u: i for i, u in enumerate(users)}
    book_map = {b: i for i, b in enumerate(books)}
    row = train_df[Constants.COL_USER_ID].map(user_map).values
    col = train_df[Constants.COL_BOOK_ID].map(book_map).values
    sparse_matrix = sparse.csr_matrix((weights, (row, col)), shape=(len(users), len(books)))
    svd = TruncatedSVD(n_components=64, random_state=42)  # fixed seed keeps the factors reproducible
    u_fac = svd.fit_transform(sparse_matrix)
    i_fac = svd.components_.T
    return u_fac, i_fac, user_map, book_map


def get_svd_score(user_ids, book_ids, u_fac, i_fac, u_map, b_map):
    """Score (user, book) pairs as the dot product of their SVD factors.

    Args:
        user_ids: Iterable of user ids, one per pair.
        book_ids: Iterable of book ids aligned with ``user_ids``.
        u_fac: User factor matrix indexed by ``u_map`` values.
        i_fac: Item factor matrix indexed by ``b_map`` values.
        u_map: Dict mapping user id to row index in ``u_fac``.
        b_map: Dict mapping book id to row index in ``i_fac``.

    Returns:
        float32 array with one score per pair; pairs whose user or book is
        missing from the maps get 0.
    """
    user_rows = np.array([u_map.get(uid, -1) for uid in user_ids])
    book_rows = np.array([b_map.get(bid, -1) for bid in book_ids])
    result = np.zeros(len(user_ids), dtype=np.float32)

    # A pair is scorable only when both ids were seen while fitting the SVD.
    known = ~((user_rows == -1) | (book_rows == -1))
    if not known.any():
        return result

    user_vecs = u_fac[user_rows[known]]
    book_vecs = i_fac[book_rows[known]]
    result[known] = np.sum(user_vecs * book_vecs, axis=1)
    return result


# --- NEW FEATURES: AUDIENCE PROFILE ---
def add_audience_features(df, train_history_df, user_meta):
    """Attach per-book audience-age statistics and the user's distance from them.

    Adds ``book_audience_age_mean``, ``book_audience_age_std`` and
    ``age_diff_with_audience``; an ``age`` column is merged in from
    ``user_meta`` when ``df`` does not have one yet.

    Args:
        df: Rows to enrich; needs user id and book id columns.
        train_history_df: Interaction history used to profile each book's readers.
        user_meta: User table with an ``age`` column.

    Returns:
        A new DataFrame with the audience features appended.
    """
    print("Generating Audience Demographics features...")

    # Join readers' metadata onto the history and aggregate age per book.
    # pandas mean/std skip NaN, so only readers with a known age contribute.
    age_by_book = (
        train_history_df
        .merge(user_meta, on=Constants.COL_USER_ID, how='left')
        .groupby(Constants.COL_BOOK_ID)
        .agg(
            book_audience_age_mean=('age', 'mean'),
            book_audience_age_std=('age', 'std'),
        )
        .reset_index()
    )
    enriched = df.merge(age_by_book, on=Constants.COL_BOOK_ID, how='left')

    # Books with no usable reader ages fall back to the global median age
    # (and a fixed spread of 10).
    fallback_age = user_meta['age'].median()
    enriched['book_audience_age_mean'] = enriched['book_audience_age_mean'].fillna(fallback_age)
    enriched['book_audience_age_std'] = enriched['book_audience_age_std'].fillna(10.0)

    # The age gap needs the current user's own age.
    has_age = 'age' in enriched.columns
    if not has_age:
        enriched = enriched.merge(
            user_meta[[Constants.COL_USER_ID, 'age']], on=Constants.COL_USER_ID, how='left'
        )

    # Users with an unknown age are treated as median-aged for the gap only;
    # the 'age' column itself keeps its NaN.
    enriched['age_filled'] = enriched['age'].fillna(fallback_age)
    age_gap = enriched['age_filled'] - enriched['book_audience_age_mean']
    enriched['age_diff_with_audience'] = age_gap.abs()

    # The helper column is only needed for the computation above.
    return enriched.drop(columns=['age_filled'])

def add_heuristic_features(df, train_history_df, book_meta):
    """Attach book-popularity and user-author affinity statistics.

    Adds ``book_pop_count``, ``book_global_mean``, ``user_author_count`` and
    ``user_author_mean``; ``author_id`` is merged in from ``book_meta`` when
    ``df`` does not have it yet.

    Args:
        df: Rows to enrich; needs user id and book id columns.
        train_history_df: Interaction history the statistics are computed from.
        book_meta: Book table with book id and ``author_id`` columns.

    Returns:
        A new DataFrame with the heuristic features appended and gaps filled.
    """
    book_authors = book_meta[[Constants.COL_BOOK_ID, 'author_id']]

    # How often each book appears in the history and its average has_read.
    per_book = (
        train_history_df
        .groupby(Constants.COL_BOOK_ID)
        .agg(
            book_pop_count=(Constants.COL_USER_ID, 'count'),
            book_global_mean=(Constants.COL_HAS_READ, 'mean'),
        )
        .reset_index()
    )

    # How often each user interacted with each author, and the average has_read.
    history_with_author = train_history_df.merge(book_authors, on=Constants.COL_BOOK_ID, how='left')
    per_user_author = (
        history_with_author
        .groupby([Constants.COL_USER_ID, 'author_id'])[Constants.COL_HAS_READ]
        .agg(user_author_count='count', user_author_mean='mean')
        .reset_index()
    )

    df = df.merge(per_book, on=Constants.COL_BOOK_ID, how='left')
    if 'author_id' not in df.columns:
        df = df.merge(book_authors, on=Constants.COL_BOOK_ID, how='left')
    df = df.merge(per_user_author, on=[Constants.COL_USER_ID, 'author_id'], how='left')

    # Unseen books / user-author pairs: zero counts, global mean for the book rate.
    defaults = {
        'book_pop_count': 0,
        'book_global_mean': train_history_df[Constants.COL_HAS_READ].mean(),
        'user_author_count': 0,
        'user_author_mean': 0,
    }
    for column, default in defaults.items():
        df[column] = df[column].fillna(default)
    return df


# --- BUILD FEATURES ---
def build_features(df, u_meta, b_meta, desc_df, svd_data, bert_embs, train_history_full, user_profiles=None):
    """Assemble the full feature table for a set of (user, book) rows.

    Steps: merge user and book metadata, add the SVD score, heuristic
    popularity/author features, audience-age features and (when embeddings
    are available) BERT features, then fill gaps in categorical/numeric columns.

    Args:
        df: Rows to featurize; needs user id and book id columns.
        u_meta: User metadata table.
        b_meta: Book metadata table.
        desc_df: Book descriptions (not used by this function).
        svd_data: Tuple ``(u_fac, i_fac, u_map, b_map)`` from train_svd_model.
        bert_embs: Dict mapping book id to embedding vector; may be empty.
        train_history_full: Interaction history used for the statistical features.
        user_profiles: Optional dict mapping user id to a mean embedding vector.

    Returns:
        Tuple ``(features_df, cat_cols)``. ``cat_cols`` is the fixed list of
        candidate categorical columns; not all are guaranteed to be present.
    """
    print("Building features...")
    df = df.merge(u_meta, on=Constants.COL_USER_ID, how='left')
    # Only bring in book columns that df does not already have (plus the join key).
    cols_to_use = [c for c in b_meta.columns if c not in df.columns or c == Constants.COL_BOOK_ID]
    df = df.merge(b_meta[cols_to_use], on=Constants.COL_BOOK_ID, how='left')

    u_fac, i_fac, u_map, b_map = svd_data
    df[Constants.F_SVD_SCORE] = get_svd_score(df[Constants.COL_USER_ID], df[Constants.COL_BOOK_ID], u_fac, i_fac, u_map,
                                              b_map)

    # 1. Standard heuristics
    df = add_heuristic_features(df, train_history_full, b_meta)

    # 2. Audience demographics
    df = add_audience_features(df, train_history_full, u_meta)

    # 3. BERT features (similarity + raw leading dimensions)
    if bert_embs:
        dim = len(next(iter(bert_embs.values())))
        b_ids = df[Constants.COL_BOOK_ID].values

        # Cosine similarity between the user's profile and the book embedding.
        if user_profiles:
            zero_vec = np.zeros(dim)  # shared stand-in for unknown users/books
            u_ids = df[Constants.COL_USER_ID].values
            u_vecs = np.array([user_profiles.get(u, zero_vec) for u in u_ids])
            b_vecs = np.array([bert_embs.get(b, zero_vec) for b in b_ids])

            dot = np.sum(u_vecs * b_vecs, axis=1)
            n_u = np.linalg.norm(u_vecs, axis=1)
            n_b = np.linalg.norm(b_vecs, axis=1)
            # The epsilon keeps all-zero vectors from dividing by zero.
            df[Constants.F_BERT_SIM] = dot / (n_u * n_b + 1e-9)

        # Raw BERT: first 64 embedding dimensions as individual columns.
        dim_raw = 64
        emb_matrix = np.zeros((len(df), dim_raw), dtype=np.float32)
        for i, bid in enumerate(b_ids):
            vec = bert_embs.get(bid)
            if vec is None:
                continue
            head = np.asarray(vec)[:dim_raw]
            # Embeddings shorter than dim_raw are zero-padded instead of
            # raising a broadcast error.
            emb_matrix[i, :len(head)] = head
        bert_df = pd.DataFrame(emb_matrix, columns=[f"bert_{i}" for i in range(dim_raw)], index=df.index)
        df = pd.concat([df, bert_df], axis=1)

    # Clean: categorical gaps become "unk", numeric gaps become 0.
    cat_cols = ['gender', 'author_id', 'publisher', 'language']
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].fillna("unk").astype(str)

    num_cols = ['age', 'publication_year', 'avg_rating', 'book_pop_count',
                'user_author_count', 'book_audience_age_mean', 'age_diff_with_audience']
    for c in num_cols:
        if c in df.columns:
            df[c] = df[c].fillna(0)

    return df, cat_cols


# --- HELPERS (Load, Negatives, Clean, Expand) - same as v5 ---
def load_and_prep():
    """Read the raw CSV inputs and apply light cleaning.

    Returns:
        Tuple ``(train, candidates, user_meta, book_meta, book_desc)``.
        ``train`` gains a ``relevance`` column (2 for has_read == 1, 1 for
        has_read == 0); implausible ages in ``user_meta`` are set to NaN;
        ``book_meta`` is de-duplicated on book id.
    """
    print("Loading data...")
    raw_dir = Config.RAW_DATA_DIR
    id_dtypes = {
        Constants.COL_USER_ID: "int32",
        Constants.COL_BOOK_ID: "int32",
        Constants.COL_HAS_READ: "int32",
    }

    train = pd.read_csv(
        raw_dir / Constants.TRAIN_FILENAME,
        dtype=id_dtypes,
        parse_dates=[Constants.COL_TIMESTAMP],
    )
    graded = train[Constants.COL_HAS_READ].map({1: 2, 0: 1})
    train[Constants.COL_RELEVANCE] = graded.astype("int8")

    candidates = pd.read_csv(
        raw_dir / Constants.CANDIDATES_FILENAME,
        dtype={Constants.COL_USER_ID: "int32"},
    )

    # --- Load and clean users ---
    user_meta = pd.read_csv(raw_dir / Constants.USER_DATA_FILENAME)

    # Ages of 5 or below and of 95 or above are treated as invalid and blanked out.
    if 'age' in user_meta.columns:
        too_young = user_meta['age'] <= 5
        too_old = user_meta['age'] >= 95
        user_meta.loc[too_young | too_old, 'age'] = np.nan
        print(f"Cleaned Age. NaNs count: {user_meta['age'].isna().sum()}")

    # Gender is left as-is; it could be validated too, but age matters more.

    book_meta = pd.read_csv(raw_dir / Constants.BOOK_DATA_FILENAME)
    book_meta = book_meta.drop_duplicates(Constants.COL_BOOK_ID)
    book_desc = pd.read_csv(raw_dir / Constants.BOOK_DESCRIPTIONS_FILENAME)

    return train, candidates, user_meta, book_meta, book_desc


def generate_negatives(train_df, all_books):
    """Append randomly sampled negative (relevance 0) rows for every user.

    For each user, ``Config.NEGATIVES_PER_USER + 5`` books are drawn with
    replacement from ``all_books``. Books the user already interacted with and
    books already picked for that user are skipped, so a user never receives
    the same negative twice. A user may still end up with fewer than
    ``NEGATIVES_PER_USER`` negatives when too many draws are rejected.

    Args:
        train_df: Interactions with user id and book id columns.
        all_books: Array-like catalogue of book ids to sample from.

    Returns:
        A new DataFrame (index reset) holding the original rows followed by
        the negatives. Columns that negatives do not define are NaN there.
    """
    all_books_arr = np.array(all_books)
    rows = []
    # An empty catalogue cannot be sampled from; return the data without negatives.
    if all_books_arr.size:
        user_inter = train_df.groupby(Constants.COL_USER_ID)[Constants.COL_BOOK_ID].apply(set).to_dict()
        draw_size = Config.NEGATIVES_PER_USER + 5
        for uid, seen_books in user_inter.items():
            picked = set()
            # Uses the global NumPy RNG so seed_everything() keeps the sampling reproducible.
            for b in np.random.choice(all_books_arr, size=draw_size):
                if b in seen_books or b in picked:
                    continue
                picked.add(b)
                rows.append({Constants.COL_USER_ID: uid, Constants.COL_BOOK_ID: b, Constants.COL_RELEVANCE: 0})
                if len(picked) >= Config.NEGATIVES_PER_USER:
                    break
    return pd.concat([train_df, pd.DataFrame(rows)], ignore_index=True)


def clean_data_for_models(df, cat_cols):
    """Drop stray object-typed columns and cast the categorical ones to ``str``.

    Args:
        df: Feature table. It is modified in place when no column has to be dropped.
        cat_cols: Names of the categorical columns to keep and stringify.

    Returns:
        The cleaned DataFrame.
    """
    stray_text_cols = [
        name
        for name in df.select_dtypes(include=['object']).columns
        if name not in cat_cols
    ]
    if stray_text_cols:
        df = df.drop(columns=stray_text_cols)

    for name in cat_cols:
        if name not in df.columns:
            continue
        df[name] = df[name].astype(str)
    return df


def calculate_user_bert_profiles(train_history_df, bert_embs):
    """Average the embeddings of the books in each user's history.

    Args:
        train_history_df: Interactions with user id and book id columns.
        bert_embs: Dict mapping book id to embedding vector.

    Returns:
        Tuple ``(profiles, dim)``: ``profiles`` maps user id to the mean
        embedding of that user's books that have an embedding, and ``dim`` is
        the embedding length. ``({}, 0)`` when ``bert_embs`` is empty.
    """
    if not bert_embs:
        return {}, 0

    dim = len(next(iter(bert_embs.values())))
    emb_cols = [f"b{i}" for i in range(dim)]

    # One row per book: the id followed by its embedding components.
    emb_rows = [[bid, *vec] for bid, vec in bert_embs.items()]
    emb_table = pd.DataFrame(emb_rows, columns=[Constants.COL_BOOK_ID] + emb_cols)

    # The inner join drops interactions whose book has no embedding.
    joined = train_history_df.merge(emb_table, on=Constants.COL_BOOK_ID, how='inner')
    mean_by_user = joined.groupby(Constants.COL_USER_ID)[emb_cols].mean()

    profiles = {}
    for uid, row in mean_by_user.iterrows():
        profiles[uid] = row.values
    return profiles, dim


def expand_candidates(df):
    """Explode comma-separated candidate lists into one (user, book) row per candidate.

    Args:
        df: Table with a user id column and a ``book_id_list`` column holding
            comma-separated book ids; rows with a missing list are skipped.

    Returns:
        DataFrame with user id and integer book id columns.
    """
    pairs = []
    for _, record in df.iterrows():
        raw_list = record[Constants.COL_BOOK_ID_LIST]
        if pd.isna(raw_list):
            continue
        tokens = (piece.strip() for piece in str(raw_list).split(','))
        # Empty tokens (e.g. from a trailing comma) are ignored.
        pairs.extend((record[Constants.COL_USER_ID], int(token)) for token in tokens if token)
    return pd.DataFrame(pairs, columns=[Constants.COL_USER_ID, Constants.COL_BOOK_ID])


def prepare_group_info(group_ids):
    """Convert per-row group IDs to group sizes for XGBoost.

    Sizes are the lengths of consecutive runs of equal IDs, in row order,
    which is the layout the ``group`` argument describes. For input already
    sorted by ID this equals the per-ID counts; unlike ``np.unique`` counts it
    also stays aligned with the rows when groups are contiguous but unsorted.

    Args:
        group_ids: 1-D array-like of group IDs, one per row.

    Returns:
        List of ints: the size of each consecutive group (``[]`` for empty input).
    """
    ids = np.asarray(group_ids)
    if ids.size == 0:
        return []
    # Positions where a new run starts, bracketed by the start and end of the data.
    run_starts = np.flatnonzero(ids[1:] != ids[:-1]) + 1
    boundaries = np.concatenate(([0], run_starts, [ids.size]))
    return np.diff(boundaries).tolist()


# === Calculating metrics === 

from sklearn.metrics import ndcg_score

def calculate_ndcg_at_k(y_true, y_pred, k=20):
    """Compute NDCG@k for a single ranked list.

    The full relevance and score vectors are handed to ``ndcg_score``, which
    applies the top-k cut-off itself. Truncating to the predicted top-k
    beforehand (as done previously) would build the ideal ranking from the
    selected items only and overstate the metric whenever relevant items are
    ranked below position k.

    Args:
        y_true: 1-D array-like of true relevance grades.
        y_pred: 1-D array-like of predicted scores aligned with ``y_true``.
        k: Rank cut-off.

    Returns:
        NDCG@k as a float. An empty list scores 0.0; a single-item list scores
        1.0 when the item is relevant and 0.0 otherwise.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size == 0:
        return 0.0
    if y_true.size == 1:
        # A one-item ranking is trivially ideal; handled here so that
        # single-row groups never depend on the library's treatment of them.
        return 1.0 if y_true[0] > 0 else 0.0

    return float(ndcg_score(y_true.reshape(1, -1), y_pred.reshape(1, -1), k=k))

def calculate_user_ndcg_for_dataset(df, k=20):
    """Compute NDCG@k separately for every user in a scored table.

    Args:
        df: Table with ``user_id``, ``relevance`` and ``score`` columns.
        k: Rank cut-off passed on to calculate_ndcg_at_k.

    Returns:
        Dict mapping each user id to that user's NDCG@k.
    """
    return {
        uid: calculate_ndcg_at_k(rows['relevance'].values, rows['score'].values, k)
        for uid, rows in df.groupby('user_id')
    }

def calculate_mean_ndcg(ndcg_scores_dict):
    """Average the per-user NDCG values; an empty or missing dict yields 0.0."""
    per_user = list(ndcg_scores_dict.values()) if ndcg_scores_dict else []
    return np.mean(per_user) if per_user else 0.0

# === MAIN ===


def main():
    """Run the full pipeline: validate, refit a seed ensemble, write the submission.

    Step 1 splits the interactions by time, trains one CatBoost and one
    XGBoost ranker with early stopping on the validation part, and reports
    the mean per-user NDCG@20 of each model.

    Step 2 rebuilds the features on all interactions, refits both rankers
    once per seed in ``Config.SEEDS`` using the tree counts found in step 1,
    averages the candidate scores (50/50 between the two model families) and
    writes the top-20 books per user to the submission file.

    Returns early with a message when XGBoost is not installed.
    """
    if not XGB_AVAILABLE:
        print("XGBoost is not available. Please install it to use this ensemble.")
        return

    seed_everything()

    # Load
    train_df, cand_df, u_meta, b_meta, desc_df = load_and_prep()

    bert_embs = compute_bert_embeddings(desc_df)

    # --- STEP 1: find the best iterations (single run) and compute NDCG ---
    print("Finding optimal iterations and calculating NDCG...")
    train_df_sorted = train_df.sort_values(Constants.COL_TIMESTAMP)
    split_idx = int(len(train_df_sorted) * (1 - Config.VAL_SIZE_RATIO))
    train_part = train_df_sorted.iloc[:split_idx].copy()
    val_part = train_df_sorted.iloc[split_idx:].copy()

    svd_data_val = train_svd_model(train_part)
    u_prof_val, _ = calculate_user_bert_profiles(train_part, bert_embs)

    all_books = b_meta[Constants.COL_BOOK_ID].unique()
    train_part_neg = generate_negatives(train_part, all_books)
    val_part_neg = generate_negatives(val_part, all_books)

    # NOTE(review): the training rows are featurized with statistics computed
    # from those same interactions (train_part); this looks like it could leak
    # label information into the features - confirm whether that is intended.
    train_feat, cat_cols = build_features(train_part_neg, u_meta, b_meta, desc_df, svd_data_val, bert_embs, train_part,
                                          u_prof_val)
    val_feat, _ = build_features(val_part_neg, u_meta, b_meta, desc_df, svd_data_val, bert_embs, train_part, u_prof_val)

    # Sort by user so that every ranking group is contiguous, then clean.
    train_feat = train_feat.sort_values(Constants.COL_USER_ID).reset_index(drop=True)
    val_feat = val_feat.sort_values(Constants.COL_USER_ID).reset_index(drop=True)

    drop_cols = [Constants.COL_USER_ID, Constants.COL_BOOK_ID, Constants.COL_RELEVANCE, Constants.COL_HAS_READ,
                 Constants.COL_TIMESTAMP,
                 "description", "als_weight", "weight", "title", "author_name", "image_url", "book_id_list"]

    X_tr = train_feat.drop(columns=[c for c in drop_cols if c in train_feat.columns], errors='ignore')
    X_val = val_feat.drop(columns=[c for c in drop_cols if c in val_feat.columns], errors='ignore')

    real_cats = [c for c in cat_cols if c in X_tr.columns]
    X_tr_clean = clean_data_for_models(X_tr.copy(), real_cats)
    X_val_clean = clean_data_for_models(X_val.copy(), real_cats)

    # Prepare CatBoost pools
    train_pool = Pool(data=X_tr_clean, label=train_feat[Constants.COL_RELEVANCE],
                      group_id=train_feat[Constants.COL_USER_ID],
                      cat_features=real_cats)
    val_pool = Pool(data=X_val_clean, label=val_feat[Constants.COL_RELEVANCE],
                    group_id=val_feat[Constants.COL_USER_ID],
                    cat_features=real_cats)

    # --- Calculate NDCG for CatBoost ---
    model = CatBoostRanker(**Config.CB_PARAMS)
    model.fit(train_pool, eval_set=val_pool)
    best_cb_iter = model.best_iteration_
    # best_iteration_ is a 0-based index, so the matching tree count is one more.
    cb_n_trees = best_cb_iter + 1

    cb_val_preds = model.predict(X_val_clean)
    val_with_cb_preds = val_feat.copy()
    val_with_cb_preds['score'] = cb_val_preds
    cb_ndcg_scores = calculate_user_ndcg_for_dataset(val_with_cb_preds, k=20)
    mean_cb_ndcg = calculate_mean_ndcg(cb_ndcg_scores)
    print(f">>> CatBoost - Best Iteration: {best_cb_iter}, Mean NDCG@20 on Val: {mean_cb_ndcg:.4f}")

    # --- Calculate NDCG for XGBoost ---
    # XGBoost is trained on the numeric columns only.
    X_tr_xgb = X_tr_clean.select_dtypes(include=[np.number])
    X_val_xgb = X_val_clean.select_dtypes(include=[np.number])

    group_train = prepare_group_info(train_feat[Constants.COL_USER_ID].values)
    group_val = prepare_group_info(val_feat[Constants.COL_USER_ID].values)

    xgb_params_temp = Config.XGB_PARAMS.copy()
    # Cap the boosting budget at twice CatBoost's best iteration, but never
    # below one round (a best iteration of 0 would otherwise train nothing).
    xgb_params_temp['n_estimators'] = min(max(best_cb_iter * 2, 1), 2000)

    xgb_model = xgb.XGBRanker(**xgb_params_temp)
    xgb_model.fit(
        X_tr_xgb, train_feat[Constants.COL_RELEVANCE].values,
        group=group_train,
        eval_set=[(X_val_xgb, val_feat[Constants.COL_RELEVANCE].values)],
        eval_group=[group_val],
        verbose=0
    )
    best_xgb_iter = xgb_model.best_iteration
    # Same 0-based convention as CatBoost: index -> number of trees.
    xgb_n_trees = best_xgb_iter + 1

    xgb_val_preds = xgb_model.predict(X_val_xgb)
    val_with_xgb_preds = val_feat.copy()
    val_with_xgb_preds['score'] = xgb_val_preds
    xgb_ndcg_scores = calculate_user_ndcg_for_dataset(val_with_xgb_preds, k=20)
    mean_xgb_ndcg = calculate_mean_ndcg(xgb_ndcg_scores)
    print(f">>> XGBoost - Best Iteration: {best_xgb_iter}, Mean NDCG@20 on Val: {mean_xgb_ndcg:.4f}")

    del train_part, val_part, train_feat, val_feat, train_pool, val_pool, model, xgb_model
    gc.collect()

    # --- STEP 2: ENSEMBLE REFIT ---
    print("\n" + "=" * 30 + " ENSEMBLE TRAINING " + "=" * 30)

    svd_data_full = train_svd_model(train_df)
    u_prof_full, _ = calculate_user_bert_profiles(train_df, bert_embs)

    train_full_neg = generate_negatives(train_df, all_books)
    train_full_feat, _ = build_features(train_full_neg, u_meta, b_meta, desc_df, svd_data_full, bert_embs, train_df,
                                        u_prof_full)
    train_full_feat = train_full_feat.sort_values(Constants.COL_USER_ID).reset_index(drop=True)

    X_full = train_full_feat.drop(columns=[c for c in drop_cols if c in train_full_feat.columns], errors='ignore')
    X_full_clean = clean_data_for_models(X_full.copy(), real_cats)

    cand_exp = expand_candidates(cand_df)
    cand_feat, _ = build_features(cand_exp, u_meta, b_meta, desc_df, svd_data_full, bert_embs, train_df, u_prof_full)
    X_test = cand_feat.drop(columns=[Constants.COL_USER_ID, Constants.COL_BOOK_ID], errors='ignore')
    X_test = X_test.drop(columns=[c for c in drop_cols if c in X_test.columns], errors='ignore')
    # Align the candidate features with the training columns (missing ones become 0).
    for f in X_full_clean.columns:
        if f not in X_test.columns:
            X_test[f] = 0
    X_test = X_test[X_full_clean.columns]
    X_test_clean = clean_data_for_models(X_test.copy(), real_cats)

    # Numeric-only views for XGBoost, matching the validation run above.
    X_full_xgb = X_full_clean.select_dtypes(include=[np.number])
    X_test_xgb = X_test_clean.select_dtypes(include=[np.number])
    group_full = prepare_group_info(train_full_feat[Constants.COL_USER_ID].values)

    cb_final_scores = np.zeros(len(X_test_clean))
    xgb_final_scores = np.zeros(len(X_test_xgb))

    for i, seed in enumerate(Config.SEEDS):
        print(f"Training Ensemble Models {i + 1}/{len(Config.SEEDS)} with seed {seed}...")

        # CatBoost
        cb_params = Config.CB_PARAMS.copy()
        cb_params['iterations'] = cb_n_trees
        cb_params['random_seed'] = seed
        # No eval set in the final fit, so early stopping has nothing to monitor.
        cb_params.pop('early_stopping_rounds', None)

        cb_m = CatBoostRanker(**cb_params)
        cb_pool = Pool(data=X_full_clean, label=train_full_feat[Constants.COL_RELEVANCE],
                       group_id=train_full_feat[Constants.COL_USER_ID], cat_features=real_cats)
        cb_m.fit(cb_pool)
        cb_preds = cb_m.predict(X_test_clean)
        cb_final_scores += cb_preds
        cb_m.save_model(str(Config.MODEL_DIR / f"catboost_seed_{seed}.cbm"))
        del cb_m, cb_pool
        gc.collect()

        # XGBoost
        xgb_params = Config.XGB_PARAMS.copy()
        xgb_params['n_estimators'] = xgb_n_trees
        xgb_params['random_state'] = seed
        # Remove early stopping since no eval_set is provided in final training
        xgb_params.pop('early_stopping_rounds', None)

        xgb_m = xgb.XGBRanker(**xgb_params)
        xgb_m.fit(X_full_xgb, train_full_feat[Constants.COL_RELEVANCE].values, group=group_full)
        xgb_preds = xgb_m.predict(X_test_xgb)
        xgb_final_scores += xgb_preds
        xgb_m.save_model(str(Config.MODEL_DIR / f"xgboost_seed_{seed}.json"))
        del xgb_m
        gc.collect()

    cb_avg_scores = cb_final_scores / len(Config.SEEDS)
    xgb_avg_scores = xgb_final_scores / len(Config.SEEDS)
    # NOTE(review): raw scores of the two model families are blended without
    # rescaling; if their ranges differ, one family may dominate - confirm.
    final_ensemble_scores = 0.5 * cb_avg_scores + 0.5 * xgb_avg_scores
    cand_feat['score'] = final_ensemble_scores

    # Keep the 20 highest-scored candidates per user, best first.
    cand_feat = cand_feat.sort_values([Constants.COL_USER_ID, 'score'], ascending=[True, False])
    top_20 = cand_feat.groupby(Constants.COL_USER_ID).head(20)
    sub = top_20.groupby(Constants.COL_USER_ID)[Constants.COL_BOOK_ID].apply(
        lambda x: ",".join(map(str, x))).reset_index()
    sub.columns = [Constants.COL_USER_ID, Constants.COL_BOOK_ID_LIST]

    # Every target user gets a row; users without candidates get an empty list.
    targets = pd.read_csv(Config.RAW_DATA_DIR / Constants.TARGETS_FILENAME)
    final_sub = targets.merge(sub, on=Constants.COL_USER_ID, how='left').fillna("")

    out = Config.SUBMISSION_DIR / "sunny_xgb_cb.csv"
    final_sub.to_csv(out, index=False)
    print(f"Ensemble Done! Saved to {out}")



# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading data...
Cleaned Age. NaNs count: 97
Loading cached BERT embeddings...
Finding optimal iterations and calculating NDCG...
Training SVD...
Building features...
Generating Audience Demographics features...
Building features...
Generating Audience Demographics features...
>>> CatBoost - Best Iteration: 54, Mean NDCG@20 on Val: 0.9986
>>> XGBoost - Best Iteration: 3, Mean NDCG@20 on Val: 0.9972

Training SVD...
Building features...
Generating Audience Demographics features...
Building features...
Generating Audience Demographics features...
Training Ensemble Models 1/5 with seed 42...
Training Ensemble Models 2/5 with seed 1337...
Training Ensemble Models 3/5 with seed 777...
Training Ensemble Models 4/5 with seed 2024...
Training Ensemble Models 5/5 with seed 100...
Ensemble Done! Saved to output/submissions/sunny_xgb_cb.csv
>>> Final Ensemble - Mean NDCG@20 on Validation: 0.9979
