In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tf-idf-result/tfidf_word.joblib
/kaggle/input/tf-idf-result/tfidf_word_tr.npz
/kaggle/input/tf-idf-result/tfidf_char.joblib
/kaggle/input/tf-idf-result/tfidf_char_te.npz
/kaggle/input/tf-idf-result/tfidf_char_tr.npz
/kaggle/input/tf-idf-result/tfidf_word_te.npz
/kaggle/input/bert-embeddings/bert_embeddings.npz
/kaggle/input/svd-result/tfidf_svd_word.npz
/kaggle/input/svd-result/svd_char.joblib
/kaggle/input/svd-result/svd_word.joblib
/kaggle/input/svd-result/tfidf_svd_char.npz
/kaggle/input/objective-quest-2025-dataset/sample_submission.csv
/kaggle/input/objective-quest-2025-dataset/train.csv
/kaggle/input/objective-quest-2025-dataset/test.csv


KeyboardInterrupt: 

In [None]:
import re
import os 
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Show every dataset folder under /kaggle/input, then the competition dataset's contents.
for listing_dir in ("/kaggle/input", "/kaggle/input/objective-quest-2025-dataset"):
    print(os.listdir(listing_dir))

In [None]:
# Read the three competition CSVs from the dataset folder.
BASE = Path("/kaggle/input/objective-quest-2025-dataset")
train, test, sample_submission = (
    pd.read_csv(BASE / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Quick look at the first rows of each frame.
for frame in (train, test, sample_submission):
    print(frame.head())

In [None]:
# Load the decision texts into a dict keyed by file stem (file name without extension).
texts_dir = BASE / "file_putusan"
id2text = {}
for p in texts_dir.rglob("*"):
    if not p.is_file():
        continue
    try:
        id2text[p.stem] = p.read_text(encoding="utf-8", errors="ignore")
    except (UnicodeError, OSError):
        # Best-effort retry with a single-byte codec. Only decoding/IO errors are
        # caught, so an interrupt (Ctrl-C / kernel stop) is no longer swallowed.
        id2text[p.stem] = p.read_text(encoding="latin-1", errors="ignore")

if not id2text:
    # Without this, a wrong path silently yields empty text for every row.
    print(f"WARNING: no text files found under {texts_dir}; every 'text' value will be empty.")

# The dict keys are strings (file stems); cast ids to str so numeric ids still match.
train["text"] = train["id"].astype(str).map(id2text).fillna("")
test["text"]  = test["id"].astype(str).map(id2text).fillna("")

### Cleaning Data

In [None]:
def clean(s: str) -> str:
    """
    Clean one court-decision document so it is usable for both BERT and TF-IDF.

    Steps, in order:
    - normalise all line endings to LF
    - drop page footers ("Halaman x dari y ...") and "Direktori Putusan Mahkamah Agung ..." lines
    - drop the "Mahkamah Agung ... Indonesia" header, tolerating truncation/typos
    - drop the word "PUTUSAN" (any case, optionally letter-spaced)
    - drop e-mail addresses and long phone numbers
    - normalise "2 (dua)" -> "2"
    - tidy hyphens, repeated spaces/tabs and repeated newlines

    The text is NOT lower-cased here, so the caller can keep a cased copy for
    BERT and lower-case a second copy for TF-IDF.

    Args:
        s: raw document text.

    Returns:
        The cleaned text, or "" when `s` is not a string or is empty.
    """
    if not isinstance(s, str) or not s:
        return ""

    # Normalise newlines
    s = s.replace("\r\n", "\n").replace("\r", "\n")

    # Common header/footer boilerplate.
    # `(?:\n|$)` also removes a footer that sits on the very last line (no trailing newline).
    s = re.sub(r"Halaman\s+\d+\s+dari\s+\d+[^\n]*(?:\n|$)", " ", s, flags=re.I)
    s = re.sub(r"Direktori Putusan Mahkamah Agung[^\n]*(?:\n|$)", " ", s, flags=re.I)
    # "Mahkamah Agung Republik Indonesia" header, even when truncated or misspelled.
    # The gaps are bounded and lazy: a greedy `.*` would swallow everything up to the
    # LAST "Indonesia" on the same line, deleting real body text.
    s = re.sub(r"mah?kamah.{0,20}?agung.{0,30}?indonesi[a-z]*", " ", s, flags=re.I)
    # The word 'PUTUSAN' (often printed in capitals / letter-spaced)
    s = re.sub(r"\bP\s*U\s*T\s*U\s*S\s*A\s*N\b", " ", s, flags=re.I)

    # E-mail addresses and long phone numbers
    s = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", " ", s)
    s = re.sub(r"\b(?:\+?62|0)\d{8,}\b", " ", s)

    # "2 (dua)" -> "2": keep the digits, drop the spelled-out number
    s = re.sub(r"(\d+)\s*\(\s*[A-Za-z\s]+\s*\)", r"\1", s)

    # Tidy hyphens and whitespace
    s = re.sub(r"\s*-\s*", "-", s)     # "laki - laki" -> "laki-laki"
    s = re.sub(r"[ \t]+", " ", s)      # runs of spaces/tabs -> one space
    s = re.sub(r"\n{2,}", "\n", s)     # consecutive newlines -> one newline

    return s.strip()

# Run the cleaner on both splits: a cased copy for BERT, a lower-cased copy for TF-IDF.
for frame in (train, test):
    frame["text_bert"] = frame["text"].map(clean)
    frame["text_tfidf"] = frame["text_bert"].str.lower()

# Persist the cleaned frames in the working directory.
for frame, out_name in ((train, "train_clean.pkl"), (test, "test_clean.pkl")):
    frame.to_pickle(out_name)


### Feature Extraction & Feature Engineering

In [None]:
# Feature Extraction

# Trigger phrases: sentencing ("menjatuhkan pidana/hukuman/kurungan", "menghukum")
# and the prosecutor's demand ("menuntut", "tuntut...").
RE_AMAR_TRIG   = re.compile(r"(menjatuhkan\s+(?:pidana(?:\s+penjara)?|hukuman|kurungan)|menghukum)", re.I)
RE_TUNTUT_TRIG = re.compile(r"(menuntut|tuntut\w*)", re.I)

# Duration units. Group 1 is always the number.
# The negative lookahead on RE_YEARS rejects a number-"tahun" pair that is followed by a
# 4-digit year, so a law citation such as "Nomor 35 Tahun 2009" is not read as "35 years".
RE_YEARS  = re.compile(r"(\d+)\s*(tahun|thn|th)\b(?!\s*\d{4}\b)", re.I)
RE_MONTHS = re.compile(r"(\d+)\s*(bulan|bln|bl)\b", re.I)
RE_DAYS   = re.compile(r"(\d+)\s*hari\b", re.I)

# Article ("pasal") + law ("UU") citation; used with findall to collect every citation.
# Capture groups: (1) article number, (2) law number, (3) optional 4-digit year.
# re.S lets the bounded gap `.{0,120}?` run across line breaks.
# NOTE(review): nothing is allowed between "uu"/"undang-undang" and the number except
# "no."/"nomor", so forms like "UU RI Nomor 35" presumably do not match — confirm on real data.
RE_PASAL_UU = re.compile(
    r"pasal\s+(\d+[A-Za-z]?)"                              # article number, e.g. 127 / 127A
    r"(?:\s+ayat\s*\(\s*\d+\s*\))?"                        # optional "ayat (n)" clause
    r"(?:\s+huruf\s*[a-z])?"                               # optional "huruf x" letter
    r".{0,120}?"
    r"(?:uu|undang-?undang)\s*(?:no\.?|nomor)?\s*(\d+)"    # law number
    r"(?:\s*(?:tahun)?\s*(\d{4}))?",                       # optional year
    re.I | re.S
)

# Fines and scaled amounts
# RE_RP_NUMBER: a run of digits/dots/commas that follows "rp", "rp." or "rupiah".
RE_RP_NUMBER = re.compile(r"(?:rp\.?|rupiah)\s*([0-9\.\,]+)", re.I)
# RE_SCALED_MONEY: a number followed by a scale word (group 1 = number, group 2 = scale word);
# the leading "sebesar"/"senilai" and the trailing currency word are both optional.
RE_SCALED_MONEY = re.compile(
    r"(?:(?:sebesar|senilai)\s*)?([0-9\.\,]+)\s*(ribu|juta|miliar|milyar)\s*(?:rupiah|rp\.?)?",
    re.I
)
# Multiplier for each scale word captured by RE_SCALED_MONEY (both spellings of "billion").
SCALE = {"ribu":1_000, "juta":1_000_000, "miliar":1_000_000_000, "milyar":1_000_000_000}

# Life imprisonment / death penalty phrases
RE_LIFE  = re.compile(r"seumur\s+h?idup", re.I)
RE_DEATH = re.compile(r"(pidana|hukuman)\s+mati", re.I)

# Mitigating / aggravating circumstance patterns (searched case-insensitively).
# The lookbehind on "menyesal" keeps "tidak menyesal" (an aggravating phrase listed in
# AGGRAV) from also being counted as mitigation.
MITIGASI = [r"belum\s+pernah\s+dihukum", r"(?<!tidak\s)menyesal", r"bersikap\s+sopan", r"mengakui\s+perbuatan"]
AGGRAV   = [r"residivis", r"perbuatan\s+meresahkan", r"tidak\s+menyesal"]

# Month value assigned by extract_features_doc when a life-sentence or death-penalty phrase is found.
LIFE_MONTHS    = 600.0
# Upper bound (months) for a parsed duration; _pick_best_duration skips larger values.
MAX_REASONABLE = 720.0
# Number of characters kept after the end of a trigger match when parsing a duration.
WINDOW_AFTER   = 300  # take a short tail after the trigger phrase

def _to_num(s):
    return float(str(s).replace(".","").replace(",", "."))

def _money_id(text: str) -> float:
    """
    Return the largest monetary amount (in Rupiah) mentioned in `text`.

    Two kinds of mentions are collected: plain numbers next to "Rp"/"rupiah"
    (RE_RP_NUMBER) and numbers followed by a scale word such as "juta"
    (RE_SCALED_MONEY, multiplied through SCALE).

    Returns 0.0 when `text` is not a string or no amount can be parsed.
    """
    if not isinstance(text, str):
        return 0.0
    vals = []
    for v in RE_RP_NUMBER.findall(text):
        try:
            vals.append(int(_to_num(v)))
        except (ValueError, OverflowError):
            # Unparseable capture (e.g. a lone "." after "rupiah") — skip this mention only.
            continue
    for v, unit in RE_SCALED_MONEY.findall(text):
        try:
            vals.append(int(_to_num(v) * SCALE[unit.lower()]))
        except (ValueError, OverflowError, KeyError):
            continue
    return float(max(vals)) if vals else 0.0

def _months_from_span(span: str) -> float:
    """
    Convert the first year, month and day mentions found in `span` into months.

    Years count as 12 months each and days as 1/30 of a month. Returns NaN when
    `span` is empty or the total is not positive.
    """
    if not span:
        return np.nan

    year_hit = RE_YEARS.search(span)
    month_hit = RE_MONTHS.search(span)
    day_hit = RE_DAYS.search(span)

    total = 0.0
    total += int(year_hit.group(1)) * 12 if year_hit else 0
    total += int(month_hit.group(1)) if month_hit else 0
    total += int(day_hit.group(1)) / 30.0 if day_hit else 0

    if total > 0:
        return total
    return np.nan

def _pick_best_duration(text: str, trigger: re.Pattern) -> float:
    """
    Return the longest plausible duration (months) found right after any `trigger` match.

    For each match, the window from the match start to WINDOW_AFTER characters past the
    match end is parsed with _months_from_span. Unparseable windows and values above
    MAX_REASONABLE are ignored. Returns NaN when nothing qualifies.
    """
    source = text or ""
    candidates = []
    for hit in trigger.finditer(source):
        window = source[hit.start():hit.end() + WINDOW_AFTER]
        months = _months_from_span(window)
        if not np.isnan(months) and not months > MAX_REASONABLE:
            candidates.append(months)
    return max(candidates) if candidates else np.nan

def _legal_refs_all(text: str):
    """Return every (article, law number, year) tuple RE_PASAL_UU finds; None/empty text gives []."""
    haystack = text if text else ""
    return RE_PASAL_UU.findall(haystack)

def _legal_ref_primary(text: str) -> str:
    """
    Build a single label for the main legal reference cited in `text`.

    Returns "UNKNOWN" when no citation is found. Otherwise the first citation that
    carries a year is used ("UU<no>/<year>_pasal<pasal>"); if none has a year, the
    most frequent (article, law number) pair is used with "XXXX" in place of the year.
    """
    from collections import Counter

    refs = _legal_refs_all(text)
    if not refs:
        return "UNKNOWN"

    dated = next((ref for ref in refs if ref[2]), None)
    if dated is not None:
        pasal, no, yr = dated
        return f"UU{no}/{yr}_pasal{pasal}"

    # No citation has a year: fall back to the most common (article, law number) pair.
    tally = Counter(ref[:2] for ref in refs)
    (pasal, no), _ = tally.most_common(1)[0]
    return f"UU{no}/XXXX_pasal{pasal}"

def _subsider_months(text: str) -> float:
    m = re.search(r"subsidi(?:air|er)\s+(\d+)\s*(bulan|bln|hari)", text or "", flags=re.I)
    if not m: return 0.0
    val, unit = m.groups()
    val = int(val)
    if unit.lower().startswith("hari"): 
        return val/30.0
    return float(val)

def _count_keywords(text: str, patterns):
    t = text or ""
    return int(any(re.search(p, t, flags=re.I) for p in patterns))

def extract_features_doc(raw_text: str, normalized_text: str = None):
    """
    Extract rule-based features from one document and return them as a dict.

    Keys (in this order):
      - amar_bulan, tuntutan_bulan: sentence / prosecutor demand, in months (NaN if not found)
      - denda_rp, subsider_bulan: fine in Rupiah / substitute imprisonment in months
      - legal_ref_primary, legal_ref_count: main legal reference label / number of citations
      - has_mitigasi, has_aggrav: 0/1 flags for mitigating / aggravating keywords

    Durations are parsed from `normalized_text` when it is a string, otherwise from
    `raw_text`; everything else is parsed from `raw_text`. A non-string or empty
    `raw_text` yields the default feature values.
    """
    if not isinstance(raw_text, str) or not raw_text:
        return dict(
            amar_bulan=np.nan,
            tuntutan_bulan=np.nan,
            denda_rp=0.0,
            subsider_bulan=0.0,
            legal_ref_primary="UNKNOWN",
            legal_ref_count=0,
            has_mitigasi=0,
            has_aggrav=0,
        )

    duration_text = normalized_text if isinstance(normalized_text, str) else raw_text

    # A life-sentence or death-penalty phrase anywhere in the raw text overrides the parsed sentence.
    if RE_LIFE.search(raw_text) or RE_DEATH.search(raw_text):
        sentence_months = LIFE_MONTHS
    else:
        sentence_months = _pick_best_duration(duration_text, RE_AMAR_TRIG)

    features = {}
    features["amar_bulan"] = sentence_months
    features["tuntutan_bulan"] = _pick_best_duration(duration_text, RE_TUNTUT_TRIG)
    features["denda_rp"] = _money_id(raw_text)
    features["subsider_bulan"] = _subsider_months(raw_text)
    features["legal_ref_primary"] = _legal_ref_primary(raw_text)
    features["legal_ref_count"] = len(_legal_refs_all(raw_text))
    features["has_mitigasi"] = _count_keywords(raw_text, MITIGASI)
    features["has_aggrav"] = _count_keywords(raw_text, AGGRAV)
    return features

In [None]:
# Apply the rule-based extractor to both splits.
SRC = "text_bert"  # cleaned, case-preserving text
for frame in (train, test):
    if "_norm_text" not in frame.columns:
        frame["_norm_text"] = frame[SRC]

# Give the feature frames the same index as their source frame so the column-wise
# concat below aligns row by row even if the index is not the default 0..n-1.
feat_train = pd.DataFrame(
    [extract_features_doc(rt, nt) for rt, nt in zip(train[SRC], train["_norm_text"])],
    index=train.index,
)
feat_test = pd.DataFrame(
    [extract_features_doc(rt, nt) for rt, nt in zip(test[SRC], test["_norm_text"])],
    index=test.index,
)

# Drop feature columns left over from a previous run first; otherwise re-running this
# cell would create duplicate column names and break the cells below.
train = pd.concat([train.drop(columns=feat_train.columns, errors="ignore"), feat_train], axis=1)
test = pd.concat([test.drop(columns=feat_test.columns, errors="ignore"), feat_test], axis=1)

In [None]:
# Sanity check. Only the last expression of a cell is displayed automatically,
# so the shape and column list are printed explicitly.
print("shape:", train.shape)
print("columns:", list(train.columns))
train.info()

In [None]:
# Feature engineering

# Label-encode the primary legal reference: codes are learned on train; a reference
# unseen in train gets the extra code `ref_vals.size` on test.
for frame in (train, test):
    frame["legal_ref_primary"] = frame["legal_ref_primary"].astype(str)

train_codes, ref_vals = pd.factorize(train["legal_ref_primary"])
train["legal_ref_le"] = train_codes
ref_dtype = pd.CategoricalDtype(categories=ref_vals)
test_codes = test["legal_ref_primary"].astype(ref_dtype).cat.codes
test["legal_ref_le"] = test_codes.replace(-1, ref_vals.size)

# Normalise dtypes of the extracted features
col_dtypes = {
    "amar_bulan": "float32",
    "tuntutan_bulan": "float32",
    "subsider_bulan": "float32",
    "denda_rp": "float64",
    "has_mitigasi": "int32",
    "has_aggrav": "int32",
    "legal_ref_count": "int32",
    "legal_ref_le": "int32",
}
for col, dtype in col_dtypes.items():
    train[col] = train[col].astype(dtype)
    test[col] = test[col].astype(dtype)

feat_cols = [
    "amar_bulan", "tuntutan_bulan", "denda_rp", "subsider_bulan", "legal_ref_primary",
    "legal_ref_count", "legal_ref_le", "has_mitigasi", "has_aggrav"
]

# Pick (or create) an id column to join on later
if "doc_id" in train.columns:
    id_col = "doc_id"
else:
    id_col = train.index.name or "idx"

if id_col not in train.columns:
    # No usable id column: expose the row index as "idx" on both frames.
    train = train.reset_index().rename(columns={"index":"idx"})
    test = test.reset_index().rename(columns={"index":"idx"})
    id_col = "idx"

selected = [id_col, *feat_cols]
train_feats = train[selected].copy()
test_feats = test[selected].copy()

# Save the feature tables
train_feats.to_parquet("train_feats.parquet", index=False)
test_feats.to_parquet("test_feats.parquet", index=False)

### Hybrid Feature Construction

In [None]:
# TF-IDF on the lower-cased text (word uni- and bi-grams)

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack

tfidf_params = dict(
    ngram_range=(1,2),
    max_features=200_000,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_params)

print("Fitting TF-IDF...")
# Fit on train only, then reuse the fitted vocabulary for test; store both as float32 CSR.
Xtr_tfidf = csr_matrix(tfidf.fit_transform(train["text_tfidf"]), dtype=np.float32)
Xte_tfidf = csr_matrix(tfidf.transform(test["text_tfidf"]), dtype=np.float32)

In [None]:
# Domain-specific numeric (tabular) features

# log1p of the fine amount (0.0 everywhere when the fine column is absent)
if "denda_rp" in train.columns:
    for frame in (train, test):
        frame["log1p_denda"] = np.log1p(frame["denda_rp"].fillna(0))
else:
    for frame in (train, test):
        frame["log1p_denda"] = 0.0

# Character length of the cleaned text
for frame in (train, test):
    frame["text_len"] = frame["text_bert"].str.len().fillna(0).astype("float32")

num_cols = [
    "amar_bulan", "tuntutan_bulan", "subsider_bulan", "legal_ref_le",
    "legal_ref_count", "has_mitigasi", "has_aggrav", "log1p_denda",
    "text_len",
]

# Make sure every numeric column exists on both frames and is numeric
for col in num_cols:
    if col not in train.columns:
        train[col] = 0
    if col not in test.columns:
        test[col] = 0
    if train[col].dtype.kind not in ("f", "i"):
        train[col] = pd.to_numeric(train[col], errors="coerce")
        test[col] = pd.to_numeric(test[col], errors="coerce")

# Missing values are encoded as -1
train[num_cols] = train[num_cols].fillna(-1)
test[num_cols] = test[num_cols].fillna(-1)

num_tr = csr_matrix(train[num_cols].to_numpy(np.float32))
num_te = csr_matrix(test[num_cols].to_numpy(np.float32))

# Stack TF-IDF and numeric blocks side by side
Xtr = hstack([Xtr_tfidf, num_tr], dtype=np.float32).tocsr()
Xte = hstack([Xte_tfidf, num_te], dtype=np.float32).tocsr()
print("Shapes:", Xtr.shape, Xte.shape)

In [None]:
# Semantic features -> precomputed BERT embeddings

from sklearn.decomposition import TruncatedSVD

# Load the embeddings; the context manager closes the .npz archive once the arrays are read.
with np.load("/kaggle/input/bert-embeddings/bert_embeddings.npz") as data:
    E_tr, E_te = data["E_tr"], data["E_te"]
print("Embedding shapes:", E_tr.shape, E_te.shape)

# Sanity check: one embedding row per document
assert E_tr.shape[0] == len(train), "Mismatch: n_train vs E_tr"
assert E_te.shape[0] == len(test),  "Mismatch: n_test vs E_te"

# Cast to float32 to reduce RAM
E_tr = E_tr.astype(np.float32, copy=False)
E_te = E_te.astype(np.float32, copy=False)

# Optional SVD to compress the embedding dimension
USE_SVD = True
svd_dim = 256
# Only reduce when the embeddings are wider than the target; TruncatedSVD cannot
# produce more components than there are input features.
if USE_SVD and E_tr.shape[1] > svd_dim:
    print(f"SVD reducing BERT from {E_tr.shape[1]} -> {svd_dim} ...")
    svd = TruncatedSVD(n_components=svd_dim, random_state=42)
    E_tr = svd.fit_transform(E_tr).astype(np.float32)
    E_te = svd.transform(E_te).astype(np.float32)

Etr_sp = csr_matrix(E_tr)
Ete_sp = csr_matrix(E_te)

# Rebuild the design matrices from the TF-IDF and numeric blocks instead of appending
# to Xtr/Xte, so re-running this cell does not stack the embeddings a second time.
Xtr = hstack([Xtr_tfidf, num_tr, Etr_sp], dtype=np.float32).tocsr()
Xte = hstack([Xte_tfidf, num_te, Ete_sp], dtype=np.float32).tocsr()

print("Shapes with BERT:", Xtr.shape, Xte.shape)

### Target-Encoding

In [None]:
from sklearn.model_selection import GroupKFold, KFold
from scipy.sparse import csr_matrix, hstack

# Column holding the primary legal reference; it is the key of the target encoding.
UU_COL = "legal_ref_primary" if "legal_ref_primary" in train.columns else "legal_ref"
# fillna must run BEFORE astype(str): astype turns NaN into the literal string "nan".
uu_train = train[UU_COL].fillna("UNKNOWN").astype(str)
uu_test  = test[UU_COL].fillna("UNKNOWN").astype(str)

# Target: clipped months, plus its log1p version
CAND_TGT = ["lama hukuman (bulan)", "lama_hukuman_bulan"]
TARGET_COL = next(c for c in CAND_TGT if c in train.columns)

CLIP_TRAIN_MAX = 240
y_raw  = train[TARGET_COL].astype(float).values
y_clip = np.clip(y_raw, None, CLIP_TRAIN_MAX).astype(np.float32)
y_log  = np.log1p(y_clip)

# Grouped folds, kept under the same names as before.
groups = uu_train.to_numpy()
gkf = GroupKFold(n_splits=5)
splits = list(gkf.split(Xtr, y_log, groups))

# Target-encoding folds.
# These must NOT be the grouped folds above: the encoding key is the grouping column,
# so with GroupKFold a validation row's key never appears in its fold-train part and
# every out-of-fold value would fall back to the global mean (a constant train feature).
# Plain shuffled K-fold lets each row be encoded from OTHER rows that share its key.
TE_N_SPLITS = 5
te_kf = KFold(n_splits=TE_N_SPLITS, shuffle=True, random_state=42)

global_mean = float(y_clip.mean())
te_oof = np.full(len(train), global_mean, dtype=np.float32)
te_test_blend = np.zeros(len(test), dtype=np.float32)

for fit_idx, enc_idx in te_kf.split(np.zeros(len(train))):
    # Mean target per key, computed on the fitting part of the fold only
    grp_mean = (
        pd.DataFrame({"g": uu_train.iloc[fit_idx].values, "y": y_clip[fit_idx]})
        .groupby("g")["y"].mean()
    )

    # Out-of-fold encoding for the held-out rows (keys unseen in this fold -> global mean)
    te_oof[enc_idx] = (
        uu_train.iloc[enc_idx].map(grp_mean).fillna(global_mean)
        .to_numpy(dtype=np.float32)
    )

    # Test encoding: average of the per-fold encodings
    te_test_blend += (
        uu_test.map(grp_mean).fillna(global_mean)
        .to_numpy(dtype=np.float32)
    ) / TE_N_SPLITS

# Shrink towards the global mean; rarer keys are pulled harder.
counts = uu_train.value_counts()
cnt_tr = uu_train.map(counts).astype(float).to_numpy()
cnt_te = uu_test.map(counts).fillna(0).astype(float).to_numpy()

m = 50.0  # prior strength
te_oof = ((cnt_tr/(cnt_tr+m))*te_oof + (m/(cnt_tr+m))*global_mean).astype(np.float32)
te_test_blend = ((cnt_te/(cnt_te+m))*te_test_blend + (m/(cnt_te+m))*global_mean).astype(np.float32)

# Add the encoding as one extra column. The matrices are rebuilt from their component
# blocks (TF-IDF, numeric, BERT) rather than appended to Xtr/Xte, so re-running this
# cell does not add the column twice; float32 is kept to avoid doubling memory.
Xtr = hstack([Xtr_tfidf, num_tr, Etr_sp, csr_matrix(te_oof.reshape(-1,1))], format="csr", dtype=np.float32)
Xte = hstack([Xte_tfidf, num_te, Ete_sp, csr_matrix(te_test_blend.reshape(-1,1))], format="csr", dtype=np.float32)

print("TE feature added (out-of-fold via shuffled KFold). Shapes:", Xtr.shape, Xte.shape)


In [None]:
# Train LightGBM on the stacked feature matrix (TF-IDF + numeric + BERT + target encoding)
import numpy as np, lightgbm as lgb, pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Target & CV setup
CAND_TGT = ["lama hukuman (bulan)", "lama_hukuman_bulan"]
TARGET_COL = next(c for c in CAND_TGT if c in train.columns)

CLIP_TRAIN_MAX = 240
y_raw  = train[TARGET_COL].astype(float).values
y_clip = np.clip(y_raw, None, CLIP_TRAIN_MAX).astype(np.float32)
y_log  = np.log1p(y_clip)

# Folds are grouped by primary legal reference
groups = train.get("legal_ref_primary", train.get("legal_ref","UNK")).astype(str).values
gkf = GroupKFold(n_splits=5)
splits = list(gkf.split(Xtr, y_log, groups))

# Light extra weight for the upper tail of the target (above the 90th / 95th percentile)
q90, q95 = np.quantile(y_clip, [0.90, 0.95])
w = np.ones_like(y_clip, dtype=np.float32)
w[y_clip>q90] *= 1.25; w[y_clip>q95] *= 1.5

# LightGBM params
lgb_params = dict(
    objective="huber",
    alpha=0.7,
    metric="rmse",
    learning_rate=0.03,
    num_leaves=64,
    max_bin=63,
    min_data_in_leaf=60,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=1,
    lambda_l2=1.0,
    force_col_wise=True,
    num_threads=4,
    verbose=-1,
    seed=42,
)

# Out-of-fold predictions for train, fold-averaged predictions for test (both in months)
oof = np.zeros(len(train), dtype=np.float32)
pred_te = np.zeros(len(test), dtype=np.float32)

print("Training LGBM (dengan fitur BERT)...")
for fold, (tr_idx, va_idx) in enumerate(splits, 1):
    dtr = lgb.Dataset(Xtr[tr_idx], label=y_log[tr_idx], weight=w[tr_idx], free_raw_data=False)
    dva = lgb.Dataset(Xtr[va_idx], label=y_log[va_idx], free_raw_data=False)

    model = lgb.train(
        lgb_params, dtr, num_boost_round=6000,
        valid_sets=[dva], valid_names=["val"],
        callbacks=[lgb.early_stopping(400, verbose=False), lgb.log_evaluation(250)]
    )

    # The model predicts log1p(months); invert and clip to the training range
    va = np.clip(np.expm1(model.predict(Xtr[va_idx], num_iteration=model.best_iteration)), 0, CLIP_TRAIN_MAX)
    te = np.clip(np.expm1(model.predict(Xte,        num_iteration=model.best_iteration)), 0, CLIP_TRAIN_MAX)

    oof[va_idx] = va.astype(np.float32)
    pred_te += (te.astype(np.float32) / gkf.n_splits)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated and absent from newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_clip[va_idx], va)))
    print(f"Fold {fold} RMSE: {rmse:.4f}")

rmse_oof = float(np.sqrt(mean_squared_error(y_clip, oof)))
print(f"OOF RMSE (LGBM + BERT): {rmse_oof:.4f}")

### Calibration

In [None]:
from sklearn.linear_model import LinearRegression

# Fit one global linear calibration on the out-of-fold predictions
cal = LinearRegression().fit(oof.reshape(-1,1), y_clip)
a, b = float(cal.coef_[0]), float(cal.intercept_)
print(f"[Calib] y ≈ {a:.4f} * y_pred + {b:.4f}")

# Apply the calibration to the test predictions and clip to the submission range
SUB_CLIP_MAX = 360
pred_cal = np.clip(a*pred_te + b, 0, SUB_CLIP_MAX).astype(np.float32)

if 'sample_submission' in globals():
    sub = sample_submission.copy()
    sub_col = sub.columns[-1]
    key_col = sub.columns[0]
    # pred_cal follows the row order of `test`. When the submission template and `test`
    # share a unique key column, align on it instead of trusting that both files list
    # the rows in the same order; otherwise fall back to positional assignment.
    can_align = (
        key_col != sub_col
        and key_col in test.columns
        and test[key_col].is_unique
        and sub[key_col].isin(test[key_col]).all()
    )
    if can_align:
        pred_by_key = pd.Series(pred_cal, index=test[key_col].to_numpy())
        sub[sub_col] = sub[key_col].map(pred_by_key).to_numpy()
    else:
        sub[sub_col] = pred_cal
else:
    id_col = 'doc_id' if 'doc_id' in test.columns else ('id' if 'id' in test.columns else test.columns[0])
    sub_col = "lama hukuman (bulan)"
    sub = pd.DataFrame({id_col: test[id_col].values, sub_col: pred_cal})

sub.to_csv("submission.csv", index=False)
print("Saved -> submission.csv | shape:", sub.shape)
print(sub.head())