In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from hull_tactical.data_loading import load_train_test   # your module
from hull_tactical.config import TARGET_COL, NON_FEATURE_COLS

In [None]:
# Load both splits via the project loader, then give a first look at train:
# shapes of each split, the leading rows, and per-column summary statistics.
train, test = load_train_test()

for label, frame in (("Train shape:", train), ("Test shape :", test)):
    print(label, frame.shape)

# describe(include="all") covers every dtype; transpose so each column is a row.
display(
    train.head(),
    train.describe(include="all").T,
)

In [None]:
# Summarize how train rows are spread across `date_id`: print the min/max id,
# show the per-id row counts, and plot those counts in increasing id order.
# If the column is absent, only a notice is printed.
if "date_id" in train.columns:
    date_ids = train["date_id"]
    print("date_id range in train:",
          int(date_ids.min()),
          "\u2192",  # right arrow, written as an escape so a mis-decoded save cannot garble it
          int(date_ids.max()))

    # value_counts() orders by frequency; re-sort by id so the table and plot
    # run in increasing date_id order.
    date_counts = date_ids.value_counts().sort_index()
    print("\nNumber of rows per date_id (first 10):")
    display(date_counts.head(10))

    fig, ax = plt.subplots(figsize=(10, 4))
    date_counts.plot(ax=ax)
    ax.set_title("Number of rows per date_id (train)")
    ax.set_xlabel("date_id")
    ax.set_ylabel("count")
    fig.tight_layout()
    plt.show()
else:
    print("Column 'date_id' not found in train.")

In [None]:
# Missing-value summary for every train column: absolute NA count and NA rate,
# ordered from most to least missing, with the 30 worst columns shown and plotted.
#
# The frame is built first and sorted once. Sorting the count and rate Series
# separately and recombining them lets the DataFrame constructor realign on the
# index; if the two tie orders differ, pandas can fall back to a sorted union of
# the labels, and head(30) would then no longer be the top 30 by missing rate.
na_mask = train.isna()  # computed once, reused for both statistics
na_df = pd.DataFrame({
    "na_count": na_mask.sum(),
    "na_rate": na_mask.mean(),
}).sort_values("na_rate", ascending=False, kind="stable")

# Keep the standalone Series names available, in the same descending order.
na_count = na_df["na_count"]
na_rate = na_df["na_rate"]

display(na_df.head(30))

fig, ax = plt.subplots(figsize=(8, 4))
na_df["na_rate"].head(30).plot(kind="bar", ax=ax)
ax.set_ylabel("NA rate")
ax.set_title("Top 30 columns by missing rate (train)")
fig.tight_layout()
plt.show()

In [None]:
# Inspect the target column: histogram with a KDE overlay of its non-null
# values, followed by its summary statistics. If the configured target column
# is not present in train, only a notice is printed.
if TARGET_COL not in train.columns:
    print(f"TARGET_COL '{TARGET_COL}' not in train columns.")
else:
    target = train[TARGET_COL]

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(target.dropna(), bins=50, kde=True, ax=ax)
    ax.set_title(f"Distribution of {TARGET_COL} (train)")
    ax.set_xlabel(TARGET_COL)
    fig.tight_layout()
    plt.show()

    print(target.describe())