In [1]:
# -*- coding: utf-8 -*-
# =====================================================
# Full preprocessing code (version including group stats)
# =====================================================
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import OrdinalEncoder

# ---------------------------------------------------
# 0) Normalize special placeholder strings to NaN
# ---------------------------------------------------
# String tokens that the raw data uses in place of a real missing value.
MISSING_LIKE = ["\\N", "/N", "NA", "N/A", "null", "NULL",
                "NaN", "-", "", " ", "na", "Na", "None", "none"]

def normalize_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` in which placeholder strings become missing.

    In every text column (object dtype or a pandas string dtype) the cells
    equal to one of ``MISSING_LIKE`` are blanked out. Matching is exact and
    whole-cell: values are not stripped or case-folded, so only the spellings
    listed in ``MISSING_LIKE`` are caught. All other columns are left as-is.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and index.
    """
    out = df.copy()
    for name in out.columns:
        dtype = out[name].dtype
        # A bare ``dtype == object`` test skips columns stored with a pandas
        # string dtype, which would leave their placeholders in place.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            column = out[name]
            # isin + mask works the same way for object and string dtypes.
            out[name] = column.mask(column.isin(MISSING_LIKE))
    return out

# ---------------------------------------------------
# Shared helper: extract only the number from a string
# ---------------------------------------------------
# First run of digits with an optional fractional part, e.g. "20.4" in "20.4 kmpl".
num_pat = re.compile(r'(\d+\.?\d*)')

def extract_number_series(s: pd.Series) -> pd.Series:
    """Pull the first unsigned number out of every value in ``s``.

    Each value is converted to text, the first match of ``num_pat`` is taken,
    and the match is parsed as a number. Values with no digits come back as
    NaN. The index and name of ``s`` are preserved.
    """
    as_text = s.astype(str)
    first_number = as_text.str.extract(num_pat, expand=False)
    return pd.to_numeric(first_number, errors="coerce")

# ---------------------------------------------------
# New_Price strings -> numeric value, unified to Lakh
#   e.g. "Rs. 5.8 Lakh"  -> 5.8
#        "Rs. 1.2 Crore" -> 120   (1 Crore = 100 Lakh)
#        "Rs. 850000"    -> 8.5   (850000 Rupee = 8.5 Lakh)
# ---------------------------------------------------
def clean_new_price_simple(s: pd.Series) -> pd.Series:
    """Parse ``New_Price`` strings into a float price expressed in Lakh.

    The first number in each string is taken as the amount, and the unit is
    decided by a case-insensitive substring check:

    * contains "crore" -> amount * 100 (1 Crore = 100 Lakh); this wins if
      both unit words appear in the same string
    * contains "lakh"  -> amount unchanged
    * neither          -> amount is treated as Rupees and divided by 1e5

    Strings without any number (including missing values) give NaN.

    Parameters
    ----------
    s : pd.Series
        Raw price strings.

    Returns
    -------
    pd.Series
        float64 prices in Lakh, aligned with ``s``.
    """
    text = s.astype(str).str.strip()
    lowered = text.str.lower()

    # Force float64: an all-integer input with nothing missing would otherwise
    # parse as int64, which cannot hold the fractional results computed below.
    amount = extract_number_series(text).astype("float64")

    has_lakh = lowered.str.contains("lakh", na=False)
    has_crore = lowered.str.contains("crore", na=False)

    # Default is Rupees -> Lakh; "lakh" keeps the amount; "crore" is applied
    # last so it takes precedence, as in the original assignment order.
    price_lakh = amount.where(has_lakh, amount / 1e5)
    price_lakh = price_lakh.where(~has_crore, amount * 100)
    return price_lakh

# ---------------------------------------------------
# Mileage: parse number + unit -> unified to kmpl
#   e.g. "20.4 kmpl", "19.8 km/kg", etc.
# ---------------------------------------------------
def clean_mileage_to_kmpl(s: pd.Series, factor_kg_to_liter: float = 1.4) -> pd.Series:
    """Parse mileage strings and express every value in kmpl.

    The first number in each string is the value; the first occurrence of
    ``kmpl`` or ``km/kg`` is the unit. Values given in km/kg are divided by
    ``factor_kg_to_liter``; everything else (kmpl, or no recognised unit) is
    returned as parsed. Strings without a number give NaN.

    Parameters
    ----------
    s : pd.Series
        Raw mileage strings such as "20.4 kmpl" or "19.8 km/kg".
    factor_kg_to_liter : float, default 1.4
        Approximate km/kg -> kmpl divisor; adjust if a better conversion is
        known.

    Returns
    -------
    pd.Series
        float64 mileage in kmpl, aligned with ``s``.

    Raises
    ------
    ValueError
        If ``factor_kg_to_liter`` is not positive.
    """
    if factor_kg_to_liter <= 0:
        raise ValueError("factor_kg_to_liter must be positive")

    text = s.astype(str)
    # float64 up front so the divided values never have to be written into an
    # integer Series.
    value = extract_number_series(text).astype("float64")
    unit = text.str.extract(r'(kmpl|km/kg)', expand=False)

    is_km_per_kg = unit == "km/kg"
    return value.where(~is_km_per_kg, value / factor_kg_to_liter)

# ---------------------------------------------------
# Mileage / Engine / Power:
# 0 and NaN -> imputed with the Brand/Model group median
# ---------------------------------------------------
def fill_zero_and_missing_by_brand_model(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Impute zeros and NaNs in ``col`` with group medians.

    Zeros are treated as missing. Missing values are filled first with the
    median of the row's Brand/Model group (using whichever of those two
    columns exist), then with the overall median of ``col`` for rows whose
    group has no usable value.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    col : str
        Numeric column to impute. If it is absent, an unchanged copy of
        ``df`` is returned.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with ``col`` imputed. ``col`` stays all-NaN if it
        has no non-zero, non-missing value at all.
    """
    df = df.copy()
    if col not in df.columns:
        return df

    # Treat 0 as missing. mask() builds a new column and upcasts integers to
    # float, instead of writing NaN in place into an integer column.
    df[col] = df[col].mask(df[col] == 0)

    group_cols = [c for c in ["Brand", "Model"] if c in df.columns]
    if group_cols:
        group_median = df.groupby(group_cols)[col].transform('median')
        df[col] = df[col].fillna(group_median)

    # Global fallback for rows whose group had nothing to offer.
    df[col] = df[col].fillna(df[col].median())
    return df

# ---------------------------------------------------
# Name -> Brand, Model
# ---------------------------------------------------
def split_name_to_brand_model(df: pd.DataFrame) -> pd.DataFrame:
    """Replace ``Name`` with ``Brand`` (first word) and ``Model`` (the rest).

    The name is converted to text, stripped and split on whitespace. The
    first token becomes ``Brand``; the remaining tokens, re-joined with single
    spaces, become ``Model`` (an empty string for one-word names). ``Name`` is
    dropped. A frame without a ``Name`` column is returned as an unchanged
    copy. The input frame is never modified.
    """
    if "Name" not in df.columns:
        return df.copy()

    out = df.copy()
    tokens = out["Name"].astype(str).str.strip().str.split()
    out["Brand"] = tokens.str[0]
    out["Model"] = tokens.str[1:].str.join(" ")
    return out.drop(columns=["Name"])

# ---------------------------------------------------
# New_Price missing-value imputation (Model -> Brand -> Location -> overall median)
# ---------------------------------------------------
def fill_new_price_with_hierarchy_train(df: pd.DataFrame):
    """Impute ``New_Price`` on the training frame and return the medians used.

    Medians of the observed ``New_Price`` are computed per ``Model``, per
    ``Brand``, per ``Location`` and overall, all before any filling. Missing
    prices are then filled from the first level that has a value, in the
    order Model -> Brand -> Location -> overall.

    Returns
    -------
    (pd.DataFrame, dict)
        A filled copy of ``df`` and a dict with keys ``model_median``,
        ``brand_median``, ``loc_median`` (Series indexed by group value) and
        ``global_median`` (scalar).
    """
    out = df.copy()
    levels = ("Model", "Brand", "Location")

    # All statistics come from the not-yet-filled prices.
    medians = {level: out.groupby(level)["New_Price"].median() for level in levels}
    global_median = out["New_Price"].median()

    filled = out["New_Price"]
    for level in levels:
        filled = filled.fillna(out[level].map(medians[level]))
    out["New_Price"] = filled.fillna(global_median)

    artifacts = dict(
        model_median=medians["Model"],
        brand_median=medians["Brand"],
        loc_median=medians["Location"],
        global_median=global_median,
    )
    return out, artifacts

def fill_new_price_with_hierarchy_test(df: pd.DataFrame, artifacts):
    """Impute ``New_Price`` using medians that were fitted elsewhere.

    ``artifacts`` must provide ``model_median``, ``brand_median`` and
    ``loc_median`` (mappings from group value to median) and
    ``global_median`` (scalar). Missing prices are filled from the first
    level that has a value, in the order Model -> Brand -> Location ->
    overall. Returns a filled copy; ``df`` is not modified.
    """
    out = df.copy()
    fallback = artifacts["global_median"]
    lookups = (
        ("Model", artifacts["model_median"]),
        ("Brand", artifacts["brand_median"]),
        ("Location", artifacts["loc_median"]),
    )

    price = out["New_Price"]
    for column, median_by_group in lookups:
        price = price.fillna(out[column].map(median_by_group))
    out["New_Price"] = price.fillna(fallback)

    return out

# ---------------------------------------------------
# Categorical OrdinalEncoder (plays the role of label encoding)
# ---------------------------------------------------
# Columns that are ordinal-encoded when they are present and hold text.
CAT_COLS_TO_ENCODE = [
    "Fuel_Type", "Transmission", "Location",
    "Owner_Type", "Brand", "Model",
    "Seats", "No. of Doors"
]

def fit_encode_categoricals_train(df: pd.DataFrame):
    """Fit an ``OrdinalEncoder`` on the text categoricals and encode them.

    A column is encoded when it is listed in ``CAT_COLS_TO_ENCODE``, exists in
    ``df`` and has a text dtype (object or a pandas string dtype). Values are
    cast to ``str`` before fitting. The encoder is configured to map
    categories it has not seen to -1 at transform time.

    Returns
    -------
    (pd.DataFrame, dict)
        An encoded copy of ``df`` and a dict with ``ordinal_encoder`` (the
        fitted encoder, or ``None`` when nothing qualified) and ``cat_cols``
        (the encoded column names, in ``CAT_COLS_TO_ENCODE`` order).
    """
    df = df.copy()

    def _is_text(column: pd.Series) -> bool:
        # ``dtype == object`` alone misses pandas string dtypes, which would
        # leave those columns as raw text in the returned frame.
        dtype = column.dtype
        return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

    cat_cols = [c for c in CAT_COLS_TO_ENCODE
                if c in df.columns and _is_text(df[c])]

    oe = None
    if cat_cols:
        oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        df[cat_cols] = oe.fit_transform(df[cat_cols].astype(str))

    artifacts = {
        "ordinal_encoder": oe,
        "cat_cols": cat_cols,
    }
    return df, artifacts

def encode_categoricals_test(df: pd.DataFrame, artifacts):
    """Encode categoricals with an encoder that was fitted elsewhere.

    ``artifacts`` must hold ``ordinal_encoder`` and ``cat_cols``. When an
    encoder is present and ``cat_cols`` is non-empty, those columns are cast
    to ``str`` and replaced by the encoder's output; otherwise the frame is
    returned as an unchanged copy. ``df`` itself is never modified.
    """
    encoder = artifacts["ordinal_encoder"]
    columns = artifacts["cat_cols"]

    out = df.copy()
    if encoder is None or not columns:
        return out

    out[columns] = encoder.transform(out[columns].astype(str))
    return out

# =====================================================
# Main preprocessing functions (train / test)
# =====================================================
def preprocess_train(train: pd.DataFrame, reference_year: int = 2025):
    """Clean the training frame and fit every statistic reused on test data.

    Steps, in order: placeholder strings -> NaN; ``Name`` -> ``Brand`` /
    ``Model``; ``Year`` -> ``Age``; ``New_Price`` parsing plus a missing flag;
    numeric ``Mileage`` / ``Engine`` / ``Power`` with Brand/Model median
    imputation; ``Kilometers_Driven`` cleaning plus a log feature;
    hierarchical ``New_Price`` imputation; per-Model / per-Brand median
    features; ``NewPrice_log``; ordinal encoding of the text categoricals.
    Every step is skipped when its source column is absent.

    Parameters
    ----------
    train : pd.DataFrame
        Raw training data; it is not modified.
    reference_year : int, default 2025
        Year that ``Age`` is measured from (``Age = reference_year - Year``).
        Test data must be measured from the same year for ``Age`` to be
        comparable.

    Returns
    -------
    (pd.DataFrame, dict)
        The processed frame and an artifacts dict with keys
        ``newprice_artifacts`` (medians from the ``New_Price`` imputation, or
        ``None``), ``cat_artifacts`` (encoder and encoded column names),
        ``km_median`` (median of non-zero ``Kilometers_Driven``, or ``None``)
        and ``group_stats`` (per-Model/Brand and overall medians of Engine,
        Power and Mileage, or ``None``).

    Notes
    -----
    When ``New_Price`` is present, the hierarchical imputation also needs the
    ``Model``, ``Brand`` and ``Location`` columns.
    """
    df = train.copy()

    # Placeholder strings ("NA", "-", ...) -> NaN.
    df = normalize_missing(df)

    # Name -> Brand, Model.
    df = split_name_to_brand_model(df)

    # Year -> Age (missing Age gets the median Age); Year itself is dropped.
    if "Year" in df.columns:
        age = reference_year - pd.to_numeric(df["Year"], errors="coerce")
        df["Age"] = age.fillna(age.median())
        df = df.drop(columns=["Year"])

    # New_Price: string -> number in Lakh, plus a flag recorded before any
    # imputation so the model can tell real prices from filled ones.
    if "New_Price" in df.columns:
        df["New_Price"] = clean_new_price_simple(df["New_Price"])
        df["NewPrice_missing"] = df["New_Price"].isna().astype(int)
    else:
        df["NewPrice_missing"] = 1

    # Mileage / Engine / Power: parse to numbers, then impute zeros and NaNs
    # from the Brand/Model group median.
    if "Mileage" in df.columns:
        df["Mileage"] = clean_mileage_to_kmpl(df["Mileage"])
        df = fill_zero_and_missing_by_brand_model(df, "Mileage")
    for spec_col in ("Engine", "Power"):
        if spec_col in df.columns:
            df[spec_col] = extract_number_series(df[spec_col])
            df = fill_zero_and_missing_by_brand_model(df, spec_col)

    # Kilometers_Driven: numeric, zeros treated as missing, median-filled,
    # plus a log1p feature.
    km_median = None
    if "Kilometers_Driven" in df.columns:
        km = pd.to_numeric(df["Kilometers_Driven"], errors="coerce")
        # mask() upcasts an integer column cleanly; writing NaN into it in
        # place relies on deprecated silent upcasting.
        km = km.mask(km == 0)
        km_median = km.median()
        df["Kilometers_Driven"] = km.fillna(km_median)
        df["Kilometers_log"] = np.log1p(df["Kilometers_Driven"])

    # New_Price: hierarchical imputation (Model -> Brand -> Location -> overall).
    newprice_art = None
    if "New_Price" in df.columns:
        df, newprice_art = fill_new_price_with_hierarchy_train(df)

    # Per-Model / per-Brand New_Price median features, with the overall median
    # as fallback for groups that have no observed price.
    if newprice_art is not None:
        overall_price_median = newprice_art["global_median"]
        for level, art_key in (("Model", "model_median"), ("Brand", "brand_median")):
            feature = f"{level.lower()}_newprice_median"
            if level in df.columns:
                df[feature] = df[level].map(newprice_art[art_key])
            else:
                df[feature] = np.nan
            df[feature] = df[feature].fillna(overall_price_median)

    # Per-Model / per-Brand medians of Engine, Power and Mileage. The lookup
    # tables and overall medians are kept so test rows can be mapped with the
    # same statistics.
    group_stats = None
    spec_cols = ("Engine", "Power", "Mileage")
    if all(col in df.columns for col in ("Brand", "Model", *spec_cols)):
        overall = {spec: df[spec].median() for spec in spec_cols}
        group_stats = {}
        for level in ("Model", "Brand"):
            for spec in spec_cols:
                feature = f"{level.lower()}_{spec.lower()}_median"
                group_stats[feature] = df.groupby(level)[spec].median()
                df[feature] = df[level].map(group_stats[feature]).fillna(overall[spec])
        for spec in spec_cols:
            group_stats[f"global_{spec.lower()}_median"] = overall[spec]

    # log1p of the (now fully imputed) New_Price.
    if "New_Price" in df.columns:
        df["NewPrice_log"] = np.log1p(df["New_Price"])

    # Ordinal-encode the text categoricals.
    df, cat_art = fit_encode_categoricals_train(df)

    artifacts = {
        "newprice_artifacts": newprice_art,
        "cat_artifacts":      cat_art,
        "km_median":          km_median,
        "group_stats":        group_stats,
    }

    return df, artifacts


def preprocess_test(test: pd.DataFrame, artifacts, reference_year: int = 2025):
    """Apply the preprocessing to test data using train-fitted artifacts.

    Mirrors the training pipeline step by step. ``Kilometers_Driven``
    imputation, ``New_Price`` imputation, the per-Model / per-Brand median
    features and the categorical encoding use statistics taken from
    ``artifacts``.

    Parameters
    ----------
    test : pd.DataFrame
        Raw test data; it is not modified.
    artifacts : dict
        Read with ``.get`` for the keys ``km_median``, ``newprice_artifacts``,
        ``group_stats`` and ``cat_artifacts``; a step whose entry is missing
        or ``None`` is skipped (for ``km_median``, this frame's own median is
        used instead).
    reference_year : int, default 2025
        Year that ``Age`` is measured from (``Age = reference_year - Year``).
        It must match the year used for the training data.

    Returns
    -------
    pd.DataFrame
        The processed test frame.

    Notes
    -----
    Missing ``Age`` and missing/zero ``Mileage`` / ``Engine`` / ``Power`` are
    imputed from this frame's own medians, not from training statistics.
    """
    df = test.copy()

    # Placeholder strings ("NA", "-", ...) -> NaN.
    df = normalize_missing(df)

    # Name -> Brand, Model.
    df = split_name_to_brand_model(df)

    # Year -> Age (missing Age gets this frame's median Age); Year is dropped.
    if "Year" in df.columns:
        age = reference_year - pd.to_numeric(df["Year"], errors="coerce")
        df["Age"] = age.fillna(age.median())
        df = df.drop(columns=["Year"])

    # New_Price: string -> number in Lakh, plus a pre-imputation missing flag.
    if "New_Price" in df.columns:
        df["New_Price"] = clean_new_price_simple(df["New_Price"])
        df["NewPrice_missing"] = df["New_Price"].isna().astype(int)
    else:
        df["NewPrice_missing"] = 1

    # Mileage / Engine / Power: parse to numbers, then impute zeros and NaNs
    # from the Brand/Model group median of this frame.
    if "Mileage" in df.columns:
        df["Mileage"] = clean_mileage_to_kmpl(df["Mileage"])
        df = fill_zero_and_missing_by_brand_model(df, "Mileage")
    for spec_col in ("Engine", "Power"):
        if spec_col in df.columns:
            df[spec_col] = extract_number_series(df[spec_col])
            df = fill_zero_and_missing_by_brand_model(df, spec_col)

    # Kilometers_Driven: numeric, zeros treated as missing, filled with the
    # stored median, plus a log1p feature.
    if "Kilometers_Driven" in df.columns:
        km = pd.to_numeric(df["Kilometers_Driven"], errors="coerce")
        # mask() upcasts an integer column cleanly; writing NaN into it in
        # place relies on deprecated silent upcasting.
        km = km.mask(km == 0)

        # The key can be present with a None value, in which case a .get()
        # default would never be used and fillna(None) would raise. Fall back
        # to this frame's median explicitly.
        km_median = artifacts.get("km_median")
        if km_median is None or pd.isna(km_median):
            km_median = km.median()
        df["Kilometers_Driven"] = km.fillna(km_median)
        df["Kilometers_log"] = np.log1p(df["Kilometers_Driven"])

    # New_Price: hierarchical imputation with the stored medians, followed by
    # the per-Model / per-Brand New_Price median features.
    newprice_art = artifacts.get("newprice_artifacts", None)
    if newprice_art is not None and "New_Price" in df.columns:
        df = fill_new_price_with_hierarchy_test(df, newprice_art)

        overall_price_median = newprice_art["global_median"]
        for level, art_key in (("Model", "model_median"), ("Brand", "brand_median")):
            feature = f"{level.lower()}_newprice_median"
            if level in df.columns:
                df[feature] = df[level].map(newprice_art[art_key])
            else:
                df[feature] = np.nan
            df[feature] = df[feature].fillna(overall_price_median)

    # Per-Model / per-Brand Engine, Power and Mileage medians looked up in the
    # stored tables; unseen groups fall back to the stored overall median.
    group_stats = artifacts.get("group_stats", None)
    spec_cols = ("Engine", "Power", "Mileage")
    if group_stats is not None and all(col in df.columns for col in ("Brand", "Model", *spec_cols)):
        for level in ("Model", "Brand"):
            for spec in spec_cols:
                feature = f"{level.lower()}_{spec.lower()}_median"
                fallback = group_stats[f"global_{spec.lower()}_median"]
                df[feature] = df[level].map(group_stats[feature]).fillna(fallback)

    # log1p of the imputed New_Price.
    if "New_Price" in df.columns:
        df["NewPrice_log"] = np.log1p(df["New_Price"])

    # Encode categoricals with the stored encoder.
    cat_art = artifacts.get("cat_artifacts", None)
    if cat_art is not None:
        df = encode_categoricals_test(df, cat_art)

    return df


In [3]:

# Load the raw train/test splits.
train_raw, test_raw = pd.read_csv("raw_data/train.csv"), pd.read_csv("raw_data/test.csv")

# Fit the preprocessing on train, then replay the fitted artifacts on test.
train_proc, artifacts = preprocess_train(train=train_raw)
test_proc = preprocess_test(test=test_raw, artifacts=artifacts)

print(train_proc.shape, test_proc.shape)
train_proc.head()


(4470, 28) (1491, 27)


Unnamed: 0,ID,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Colour,...,Kilometers_log,model_newprice_median,brand_newprice_median,model_engine_median,model_power_median,model_mileage_median,brand_engine_median,brand_power_median,brand_mileage_median,NewPrice_log
0,G4XLU0,3.0,59138.0,1.0,1.0,0.0,17.0,1405.0,70.0,Others,...,10.987646,11.265,7.63,1396.0,69.01,17.0,1248.0,74.0,19.0,2.155245
1,CRSHOS,7.0,81504.0,1.0,1.0,0.0,21.43,1364.0,87.2,Others,...,11.30842,22.515,23.195,1798.0,138.03,14.28,2494.0,125.0,12.99,3.157639
2,FUJ4X1,5.0,92000.0,4.0,1.0,0.0,13.8,1299.0,70.0,Others,...,11.429555,11.265,11.27,1299.0,70.0,13.8,1498.0,89.84,18.6,2.507157
3,QMVK6E,8.0,33249.0,1.0,1.0,0.0,21.27,1396.0,88.76,Black/Silver,...,10.41181,9.995,8.68,1197.0,81.83,18.6,1197.0,81.86,18.9,2.397441
4,4SWHFC,1.0,65000.0,4.0,1.0,0.0,17.0,1497.0,118.0,White,...,11.082158,15.23,10.94,1497.0,117.3,17.4,1497.0,100.0,17.8,2.786861


In [4]:
# NOTE(review): this repeats the preprocessing calls of the previous cell on
# the same raw frames; it only rebinds train_proc / artifacts / test_proc.
train_proc, artifacts = preprocess_train(train=train_raw)
test_proc = preprocess_test(test=test_raw, artifacts=artifacts)


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
import xgboost as xgb
from catboost import CatBoostRegressor

# ----------------------------------------
# 1) Choose the features to use
# ----------------------------------------
candidate_features = [
    # age, usage and engine specs
    "Age", "Kilometers_log", "Kilometers_Driven", "Engine", "Power", "Mileage",

    # new-car price and the features derived from it
    "New_Price", "NewPrice_missing", "NewPrice_log",
    "model_newprice_median", "brand_newprice_median",

    # newly added group stats
    "model_engine_median", "brand_engine_median",
    "model_power_median", "brand_power_median",
    "model_mileage_median", "brand_mileage_median",

    # encoded categoricals
    "Fuel_Type", "Transmission", "Location", "Brand", "Model",
]

# Keep only the candidates that exist in the processed frame, in list order.
FEATURES = list(filter(lambda name: name in train_proc.columns, candidate_features))
print("ÏÇ¨Ïö© FEATURES:", FEATURES)

X = train_proc[FEATURES]
y = train_proc["Price"].pipe(np.log1p)   # the target is log1p(Price)


ÏÇ¨Ïö© FEATURES: ['Age', 'Kilometers_log', 'Kilometers_Driven', 'Engine', 'Power', 'Mileage', 'New_Price', 'NewPrice_missing', 'NewPrice_log', 'model_newprice_median', 'brand_newprice_median', 'model_engine_median', 'brand_engine_median', 'model_power_median', 'brand_power_median', 'model_mileage_median', 'brand_mileage_median', 'Fuel_Type', 'Transmission', 'Location', 'Brand', 'Model']


In [6]:
# ----------------------------------------
# 2) K-Fold setup: 5 shuffled folds with a fixed seed
# ----------------------------------------
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# ----------------------------------------
# 3) XGBoost / CatBoost parameters
#    (XGB uses the best params found earlier, held fixed)
# ----------------------------------------
BEST_XGB_PARAMS = dict(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.7,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)

CAT_PARAMS = {
    "iterations":    1500,
    "depth":         8,
    "learning_rate": 0.03,
    "loss_function": "RMSE",   # loss is measured on the log(Price) scale
    "eval_metric":   "MAPE",
    "random_seed":   42,
    "verbose":       False,
}

# Ensemble weights (change them and re-run later if desired).
w_xgb = 0.5
w_cat = 0.5

# Per-fold MAPE scores, one independent list per model.
xgb_mapes, cat_mapes, ens_mapes = [], [], []


In [7]:

# ----------------------------------------
# 4) CV loop: MAPE of XGBoost, CatBoost and their ensemble
# ----------------------------------------
# Reset the per-fold score lists here so that re-running this cell starts
# from scratch instead of appending to scores left over from an earlier run.
xgb_mapes, cat_mapes, ens_mapes = [], [], []

for fold, (tr_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # ---- XGBoost ----
    xgb_model = xgb.XGBRegressor(**BEST_XGB_PARAMS)
    xgb_model.fit(X_tr, y_tr)
    pred_log_xgb = xgb_model.predict(X_val)
    pred_xgb = np.expm1(pred_log_xgb)   # back from the log1p scale

    # ---- CatBoost ----
    cat_model = CatBoostRegressor(**CAT_PARAMS)
    cat_model.fit(X_tr, y_tr)
    pred_log_cat = cat_model.predict(X_val)
    pred_cat = np.expm1(pred_log_cat)   # back from the log1p scale

    # Restore the true values to the original scale.
    y_val_real = np.expm1(y_val)

    # MAPE of each model on its own.
    mape_xgb = mean_absolute_percentage_error(y_val_real, pred_xgb)
    mape_cat = mean_absolute_percentage_error(y_val_real, pred_cat)

    # Ensemble: weighted average of the two back-transformed predictions.
    pred_ens = w_xgb * pred_xgb + w_cat * pred_cat
    mape_ens = mean_absolute_percentage_error(y_val_real, pred_ens)

    xgb_mapes.append(mape_xgb)
    cat_mapes.append(mape_cat)
    ens_mapes.append(mape_ens)

    print(f"[Fold {fold}] XGB MAPE: {mape_xgb:.5f} | Cat MAPE: {mape_cat:.5f} | ENS MAPE: {mape_ens:.5f}")

print("===================================")
print("XGB   CV MAPE mean :", np.mean(xgb_mapes))
print("Cat   CV MAPE mean :", np.mean(cat_mapes))
print("Ensem CV MAPE mean :", np.mean(ens_mapes))
print("===================================")


[Fold 1] XGB MAPE: 0.16185 | Cat MAPE: 0.14828 | ENS MAPE: 0.15304
[Fold 2] XGB MAPE: 0.11621 | Cat MAPE: 0.11382 | ENS MAPE: 0.11337
[Fold 3] XGB MAPE: 0.11820 | Cat MAPE: 0.11872 | ENS MAPE: 0.11657
[Fold 4] XGB MAPE: 0.11205 | Cat MAPE: 0.10981 | ENS MAPE: 0.10915
[Fold 5] XGB MAPE: 0.12139 | Cat MAPE: 0.11673 | ENS MAPE: 0.11745
XGB   CV MAPE mean : 0.1259409016117391
Cat   CV MAPE mean : 0.12147252737950336
Ensem CV MAPE mean : 0.12191501944469521


In [9]:
# ----------------------------------------
# 5) Refit on the full training set: both XGB and CatBoost
# ----------------------------------------
final_xgb = xgb.XGBRegressor(**BEST_XGB_PARAMS)
final_xgb.fit(X, y)

final_cat = CatBoostRegressor(**CAT_PARAMS)
final_cat.fit(X, y)

X_test = test_proc[FEATURES]

# XGB predictions, mapped back from the log1p scale.
test_pred_log_xgb = final_xgb.predict(X_test)
test_pred_xgb = np.expm1(test_pred_log_xgb)

# CatBoost predictions, mapped back from the log1p scale.
test_pred_log_cat = final_cat.predict(X_test)
test_pred_cat = np.expm1(test_pred_log_cat)

# Ensemble predictions: weighted average on the price scale.
test_pred_ens = w_xgb * test_pred_xgb + w_cat * test_pred_cat

# ----------------------------------------
# 6) Three submission files (XGB / Cat / Ensemble)
# ----------------------------------------
def save_submission(prices, path):
    """Write an ID/Price submission CSV to ``path`` and return the frame written.

    IDs are taken from ``test_raw["ID"]``; ``prices`` must be in the same row
    order as ``test_raw``.
    """
    submission = pd.DataFrame({
        "ID": test_raw["ID"],
        "Price": prices,
    })
    submission.to_csv(path, index=False)
    print(f"Saved: {path}")
    return submission

sub_xgb = save_submission(test_pred_xgb, "submission_xgb_logprice_groupstats.csv")
sub_cat = save_submission(test_pred_cat, "submission_catboost_logprice_groupstats.csv")
sub_ens = save_submission(test_pred_ens, "submission_xgb_cat_ensemble_groupstats.csv")

Saved: submission_xgb_logprice_groupstats.csv
Saved: submission_catboost_logprice_groupstats.csv
Saved: submission_xgb_cat_ensemble_groupstats.csv
