In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from hull_tactical.data_loading import load_train_test
from hull_tactical.preprocessing import build_cleaned_data
from hull_tactical.feature_selection import run_feature_selection
from hull_tactical.config import TARGET_COL, NON_FEATURE_COLS, PREFIX_MAP
from hull_tactical.paths import RESULTS_DIR

In [None]:
# Load the data; only the first returned frame is used in this notebook,
# the second return value is deliberately discarded.
train, _ = load_train_test()

# build_cleaned_data returns five objects: the full cleaned frame, three
# partitions of it, and the list of high-NA columns.
cleaned_outputs = build_cleaned_data(train)
full_cleaned, train_clean, val_clean, test_clean, high_na_cols = cleaned_outputs

# Report the shape of every cleaned frame as a quick sanity check.
for label, frame in (
    ("Full cleaned shape :", full_cleaned),
    ("Train cleaned shape:", train_clean),
    ("Val cleaned shape  :", val_clean),
    ("Test cleaned shape :", test_clean),
):
    print(label, frame.shape)

print("High-NA cols (with *_missing flags):", len(high_na_cols))

In [None]:
print("Running feature selection on cleaned data...")

# NOTE(review): selection is run on `full_cleaned`, not `train_clean`. If
# run_feature_selection does not split internally, this may let validation/test
# rows influence the selected features -- confirm against the helper.
fs_res = run_feature_selection(full_cleaned)

# "selected_features" is required; "importance_df" is optional and falls back
# to None when the key is absent, which later cells check for.
selected = fs_res["selected_features"]
importance_df = fs_res.get("importance_df")

n_selected = len(selected)
preview = selected[:20]

print("\nFeature selection completed.")
print("Number of selected features:", n_selected)
print("First 20 selected features:", preview)

In [None]:
# Preview the importance table when the feature-selection result provided one.
if importance_df is None:
    print("No importance_df found in fs_res.")
else:
    print("Importance DataFrame shape:", importance_df.shape)
    display(importance_df.head(10))

In [None]:
# Horizontal bar chart of the highest-gain features, largest bar on top.
if importance_df is not None:
    top_n = 30

    # Rank explicitly by gain instead of trusting the incoming row order, so the
    # chart really shows the highest-gain features. mergesort is stable, so tied
    # rows keep their original relative order.
    top_imp = importance_df.sort_values(
        "gain", ascending=False, kind="mergesort"
    ).head(top_n)

    # The table may hold fewer than `top_n` rows; size and label the figure by
    # what is actually drawn.
    n_shown = len(top_imp)

    # barh draws the first row at the bottom, so reverse to put the top feature
    # at the top of the chart.
    plot_rows = top_imp.iloc[::-1]

    fig, ax = plt.subplots(figsize=(8, 0.3 * n_shown + 2))
    ax.barh(plot_rows["feature"], plot_rows["gain"])
    ax.set_xlabel("Gain")
    ax.set_title(f"Top {n_shown} features by LightGBM gain")
    fig.tight_layout()
    plt.show()
else:
    print("importance_df is None, skip plot.")

In [None]:
# Workbook with correlation statistics, expected under the results directory.
corr_path = RESULTS_DIR / "correlation_stats_trainset.xlsx"

if not corr_path.exists():
    # Nothing to show; tell the reader how to produce the file.
    print("File not found:", corr_path)
    print("Make sure `compute_feature_pair_corr` or FS pipeline has been run.")
else:
    print("Loading correlation stats from:", corr_path)

    # Open the workbook once and read both sheets from the same handle.
    with pd.ExcelFile(corr_path) as xls:
        # Sheet of feature-feature correlation pairs.
        feature_pair_df = pd.read_excel(xls, sheet_name="Feature_Feature_Top")
        # Sheet of feature-vs-target correlations.
        feat_target_df = pd.read_excel(xls, sheet_name="Feature_Target")

    # Show the first 20 rows of each sheet under its own heading.
    for heading, frame in (
        ("\nTop feature-feature correlation pairs:", feature_pair_df),
        ("\nFeature-target correlation (top by |corr|):", feat_target_df),
    ):
        print(heading)
        display(frame.head(20))