In [None]:
# === Standardized path configuration added automatically ===
from pathlib import Path
# All dataset locations are derived from the directory the notebook is launched from.
BASE_DIR = Path.cwd()  # treated as the project root
DATA_DIR = BASE_DIR.joinpath('data')
TRAIN_CSV = DATA_DIR.joinpath('train', 'train.csv')
TEST_CSV = DATA_DIR.joinpath('test', 'test.csv')
# You can now use TRAIN_CSV and TEST_CSV instead of hardcoded strings.


In [None]:
import pandas as pd
import numpy as np

# Load test predictions
# One CSV per base model; each is read with an 'id' column plus three class columns.
deberta = pd.read_csv("/kaggle/input/preds-data/deberta_base_test_preds.csv")
roberta = pd.read_csv("/kaggle/input/preds-data/roberta_base_test_preds.csv")
tfidf = pd.read_csv("/kaggle/input/preds-data/submission_tfidf_single (11).csv")

# Class columns of each model, always listed in EAP / HPL / MWS order so the
# weighted sum below adds matching classes together.
deberta_cols = ['deberta_base_EAP', 'deberta_base_HPL', 'deberta_base_MWS']
roberta_cols = ['roberta_base_EAP', 'roberta_base_HPL', 'roberta_base_MWS']
tfidf_cols = ['EAP', 'HPL', 'MWS']

# Blend weights (they sum to 1.0).
# An earlier experiment used 0.49 / 0.20 / 0.30 instead.
DEBERTA_WEIGHT = 0.55
ROBERTA_WEIGHT = 0.15
TFIDF_WEIGHT = 0.30

# Align the three prediction sets on 'id' before mixing them. Adding the raw
# `.values` arrays row-by-row is only correct if every file lists the ids in
# exactly the same order; merging makes the alignment explicit.
# validate='one_to_one' rejects duplicated ids, which would otherwise multiply rows.
aligned = (
    deberta[['id'] + deberta_cols]
    .merge(roberta[['id'] + roberta_cols], on='id', validate='one_to_one')
    .merge(tfidf[['id'] + tfidf_cols], on='id', validate='one_to_one')
)
if not (len(aligned) == len(deberta) == len(roberta) == len(tfidf)):
    raise ValueError(
        "Prediction files do not cover the same set of ids: "
        f"deberta={len(deberta)}, roberta={len(roberta)}, "
        f"tfidf={len(tfidf)}, matched={len(aligned)}"
    )

# Blend predictions
blend = (
    DEBERTA_WEIGHT * aligned[deberta_cols].values +
    ROBERTA_WEIGHT * aligned[roberta_cols].values +
    TFIDF_WEIGHT * aligned[tfidf_cols].values
)

# Create final submission
# The inner merge keeps the row order of the left-hand (DeBERTa) frame.
submission = pd.DataFrame(blend, columns=['EAP', 'HPL', 'MWS'])
submission.insert(0, 'id', aligned['id'].values)
submission.to_csv("submission_blend.csv", index=False)

print("✅ submission_blend.csv created!")


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

# === Load OOF predictions ===
# One out-of-fold prediction file per base model, each carrying an 'id' column.
deberta_oof = pd.read_csv("/kaggle/input/oof-preds/debertav3-base_oof.csv")
roberta_oof = pd.read_csv("/kaggle/input/oof-preds/roberta_oof_preds.csv")
tfidf_oof = pd.read_csv("/kaggle/input/oof-preds/oof_preds.csv")

# === Rename TF-IDF columns to prevent name collision ===
# NOTE(review): the keys imply the TF-IDF OOF file was saved with
# 'roberta_base_*' headers - confirm. rename() silently ignores keys that are
# not present, so a wrong key here would go unnoticed.
tfidf_oof = tfidf_oof.rename(columns={
    'roberta_base_EAP': 'EAP_tfidf',
    'roberta_base_HPL': 'HPL_tfidf',
    'roberta_base_MWS': 'MWS_tfidf'
})

# === Merge on ID ===
meta_df = deberta_oof.merge(roberta_oof, on='id')
meta_df = meta_df.merge(tfidf_oof, on='id')

# === Generate pseudo-labels from DeBERTa OOF (or replace with true labels if available) ===
# NOTE(review): the label is the argmax of three columns that also stay in
# X_meta as features, so the meta-model is trained to reproduce DeBERTa's own
# decision and the log loss printed below is measured against pseudo-labels,
# not ground truth. Replace with the real training labels when they are at hand.
meta_df['label'] = meta_df[['debertav3-base_EAP', 'debertav3-base_HPL', 'debertav3-base_MWS']].values.argmax(axis=1)

# === Prepare train data ===
# Every column except 'id' and 'label' is used as a meta-feature.
X_meta = meta_df.drop(columns=['id', 'label'])
y_meta = meta_df['label']

# === Train Logistic Regression with 5-Fold CV ===
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
meta_preds_oof = np.zeros((len(X_meta), 3))

for train_idx, val_idx in cv.split(X_meta, y_meta):
    # The `multi_class` argument is deprecated in recent scikit-learn releases;
    # with the lbfgs solver a multiclass target is already fitted with the
    # multinomial loss by default, so dropping it keeps the model unchanged.
    clf = LogisticRegression(solver='lbfgs', max_iter=5000)
    clf.fit(X_meta.iloc[train_idx], y_meta.iloc[train_idx])
    # Each row is predicted only by the fold model that did not train on it.
    meta_preds_oof[val_idx] = clf.predict_proba(X_meta.iloc[val_idx])

# === Evaluate
print("✅ Meta-model OOF log loss:", log_loss(y_meta, meta_preds_oof))


In [None]:
from sklearn.linear_model import LogisticRegression
import pandas as pd

# === Load test set base model predictions ===
deberta_test = pd.read_csv("/kaggle/input/preds-data/debertav3-base_best_preds.csv")
roberta_test = pd.read_csv("/kaggle/input/preds-data/roberta_base_test_preds.csv")
tfidf_test = pd.read_csv("/kaggle/input/preds-data/submission_tfidf_single (11).csv")

# ✅ Rename TF-IDF columns to match training set
tfidf_test = tfidf_test.rename(columns={
    'EAP': 'EAP_tfidf',
    'HPL': 'HPL_tfidf',
    'MWS': 'MWS_tfidf'
})

# === Merge test features in same order
meta_test = deberta_test.merge(roberta_test, on='id')
meta_test = meta_test.merge(tfidf_test, on='id')
X_test_meta = meta_test.drop(columns=['id'])

# When the test frame carries exactly the feature names used for fitting, put
# them in the fit-time order so prediction cannot trip over column ordering.
# If the names differ, the frame is left untouched (original behaviour).
if set(X_test_meta.columns) == set(X_meta.columns):
    X_test_meta = X_test_meta[list(X_meta.columns)]

# === Fit final model on full OOF
# X_meta / y_meta come from the OOF stacking cell earlier in the notebook.
# The deprecated `multi_class` argument is omitted: lbfgs already uses the
# multinomial loss for multiclass targets by default.
final_meta_model = LogisticRegression(solver='lbfgs', max_iter=5000)
final_meta_model.fit(X_meta, y_meta)

# === Predict
test_preds_meta = final_meta_model.predict_proba(X_test_meta)

# === Create submission
# predict_proba columns follow the 0/1/2 label encoding, i.e. EAP/HPL/MWS.
submission = pd.DataFrame(test_preds_meta, columns=['EAP', 'HPL', 'MWS'])
submission.insert(0, 'id', meta_test['id'])
submission.to_csv("submission_lgstack.csv", index=False)

print("✅ submission_lgstack.csv created successfully!")


In [None]:
!pip install lightgbm -q


In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from lightgbm import early_stopping

# === LightGBM Stacking Training ===
# Booster settings are the same for every fold, so they are declared once.
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "verbosity": 1,
    "seed": 42,
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Out-of-fold class probabilities: one row per sample, one column per class.
oof_preds_lgb = np.zeros((len(X_meta), 3))

for fold, (train_idx, val_idx) in enumerate(cv.split(X_meta, y_meta)):
    print(f"🟢 Fold {fold+1}")

    # Held-out slice: used for early stopping and then for the OOF prediction.
    X_val_fold = X_meta.iloc[val_idx]
    train_set = lgb.Dataset(X_meta.iloc[train_idx], label=y_meta.iloc[train_idx])
    val_set = lgb.Dataset(X_val_fold, label=y_meta.iloc[val_idx])

    # Up to 1000 rounds; training halts once the monitored metric has not
    # improved for 50 consecutive rounds.
    stop_early = early_stopping(stopping_rounds=50)
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        valid_names=["train", "val"],
        callbacks=[stop_early],
    )

    oof_preds_lgb[val_idx] = model.predict(X_val_fold)

logloss = log_loss(y_meta, oof_preds_lgb)
print(f"✅ LightGBM Meta-Model OOF Log Loss: {logloss:.5f}")


In [None]:
import pandas as pd
import lightgbm as lgb
from lightgbm import early_stopping
import numpy as np

# === Load test predictions from base models
deberta_test = pd.read_csv("/kaggle/input/preds-data/debertav3-base_best_preds.csv")
roberta_test = pd.read_csv("/kaggle/input/preds-data/roberta_base_test_preds.csv")
tfidf_test = pd.read_csv("/kaggle/input/preds-data/submission_tfidf_single (11).csv")

# === Rename TF-IDF columns to match training
# This file exposes plain 'EAP' / 'HPL' / 'MWS' columns (the blend and the
# logistic-regression cells read it that way). The previous mapping used
# 'roberta_base_*' keys, which do not exist here, so rename() was a silent
# no-op and the TF-IDF features kept names that differ from the training ones.
tfidf_test = tfidf_test.rename(columns={
    'EAP': 'EAP_tfidf',
    'HPL': 'HPL_tfidf',
    'MWS': 'MWS_tfidf'
})

# === Merge test features
meta_test = deberta_test.merge(roberta_test, on='id')
meta_test = meta_test.merge(tfidf_test, on='id')
X_test_meta = meta_test.drop(columns=['id'])

# When the test frame carries exactly the training feature names, reorder it to
# the training column order so each feature lines up with the column the
# booster was trained on. If the names differ, the frame is left as-is.
if set(X_test_meta.columns) == set(X_meta.columns):
    X_test_meta = X_test_meta[list(X_meta.columns)]

# === Train LightGBM meta-model on full OOF data
# X_meta / y_meta come from the OOF stacking cell earlier in the notebook.
train_set = lgb.Dataset(X_meta, label=y_meta)

params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "verbosity": 1,
    "seed": 42,
}

# No validation set is passed here, so the booster always runs the full,
# fixed 200 rounds.
final_model = lgb.train(
    params,
    train_set,
    num_boost_round=200,
)

# === Predict on test set
test_preds_lgb = final_model.predict(X_test_meta)

# === Create submission
submission = pd.DataFrame(test_preds_lgb, columns=['EAP', 'HPL', 'MWS'])
submission.insert(0, 'id', meta_test['id'])
submission.to_csv("submission_lgbstack.csv", index=False)

print("✅ submission_lgbstack.csv created successfully!")


In [None]:
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

# Load submission files
deberta = pd.read_csv('/kaggle/input/final-model-data/deberta_final.csv')
# Fixed: this frame used to be read from deberta_final.csv as well, so the
# "RoBERTa" features were a second copy of DeBERTa's predictions.
# NOTE(review): 'roberta_final.csv' is assumed by analogy with the DeBERTa
# filename - confirm the actual name inside the final-model-data dataset.
roberta = pd.read_csv('/kaggle/input/final-model-data/roberta_final.csv')
lr = pd.read_csv('/kaggle/input/final-model-data/LR_submission_tfidf_single.csv')

# Rename columns to avoid clashes
deberta = deberta.rename(columns={"EAP": "EAP_deb", "HPL": "HPL_deb", "MWS": "MWS_deb"})
roberta = roberta.rename(columns={"EAP": "EAP_rob", "HPL": "HPL_rob", "MWS": "MWS_rob"})
lr = lr.rename(columns={"EAP": "EAP_lr", "HPL": "HPL_lr", "MWS": "MWS_lr"})

# Merge on 'id'
merged = deberta.merge(roberta, on="id").merge(lr, on="id")

# Features and pseudo-targets (averaged soft labels)
# Each target is the plain mean of three of the feature columns, so the
# regressors below are fitted to reproduce an average of their own inputs.
X = merged.drop(columns="id")
X_scaled = StandardScaler().fit_transform(X)

y_EAP = merged[["EAP_deb", "EAP_rob", "EAP_lr"]].mean(axis=1)
y_HPL = merged[["HPL_deb", "HPL_rob", "HPL_lr"]].mean(axis=1)
y_MWS = merged[["MWS_deb", "MWS_rob", "MWS_lr"]].mean(axis=1)

# Train RidgeCV regressors
ridge_eap = RidgeCV().fit(X_scaled, y_EAP)
ridge_hpl = RidgeCV().fit(X_scaled, y_HPL)
ridge_mws = RidgeCV().fit(X_scaled, y_MWS)

# Predict and normalize
# Ridge outputs are unbounded, so a prediction can come out at or below zero.
# Flooring at a tiny positive value keeps every column a valid probability and
# guarantees the row total used for normalisation is strictly positive.
MIN_PROB = 1e-15
eap_pred = ridge_eap.predict(X_scaled).clip(min=MIN_PROB)
hpl_pred = ridge_hpl.predict(X_scaled).clip(min=MIN_PROB)
mws_pred = ridge_mws.predict(X_scaled).clip(min=MIN_PROB)

total = eap_pred + hpl_pred + mws_pred
submission = pd.DataFrame({
    "id": merged["id"],
    "EAP": eap_pred / total,
    "HPL": hpl_pred / total,
    "MWS": mws_pred / total
})

# Save final submission
submission.to_csv("submission.csv", index=False)
