In [None]:
from pathlib import Path
import os

# Make the project root (the directory containing pyproject.toml) the working
# directory. The marker-file guard keeps this cell idempotent: an unconditional
# chdir to the parent would climb one directory higher on every re-run (or when
# the kernel already starts in the root) and then trip the assertion below.
if not (Path.cwd() / "pyproject.toml").exists():
    os.chdir(Path.cwd().parent)
assert (Path.cwd() / "pyproject.toml").exists(), (
    f"pyproject.toml not found in {Path.cwd()}; "
    "run this notebook from the project root or a directory directly below it"
)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from pmrisk.config import settings
from pmrisk.labeling.labels import build_labeled_df
from pathlib import Path

In [None]:
# Load fd001_train.parquet from the configured processed-data directory.
processed_dir = Path(settings.data_processed_dir)
df = pd.read_parquet(processed_dir / "fd001_train.parquet")

In [None]:
# Use the labeled parquet when it is already on disk; otherwise derive the
# labeled frame from df. Note: a freshly built frame is not written back to
# disk by this cell.
labeled_path = Path(settings.data_processed_dir).joinpath("fd001_train_labeled.parquet")
df_labeled = (
    pd.read_parquet(labeled_path) if labeled_path.exists() else build_labeled_df(df)
)

In [None]:
# Size summary: row count, number of distinct engine ids, and descriptive
# statistics of each engine's maximum cycle value.
n_rows = len(df)
n_engines = df["engine_id"].nunique()
max_cycle_per_engine = df.groupby("engine_id")["cycle"].max()

print(f"Rows: {n_rows}")
print(f"Engines: {n_engines}")
print("\nCycles per engine:")
print(max_cycle_per_engine.describe())

In [None]:
# Fraction of missing values per column, sorted from highest to lowest;
# only the first ten entries are shown.
missing_by_column = df.isna().mean().sort_values(ascending=False)
print("Missingness (top 10):")
print(missing_by_column.head(10))

In [None]:
# Mean of the label column and the count of each label value.
# NOTE(review): the mean is a positive rate only if labels are 0/1 — confirm.
label_values = df_labeled["label"]
print(f"Positive rate: {label_values.mean():.4f}")
print("\nLabel counts:")
print(label_values.value_counts())

In [None]:
# Histogram of each engine's maximum recorded cycle.
lifetimes = df.groupby("engine_id")["cycle"].max()

# Use an explicit Figure/Axes pair rather than pyplot's implicit "current
# axes", so this cell always draws on its own fresh figure regardless of any
# figure left open by another cell.
fig, ax = plt.subplots()
ax.hist(lifetimes, bins=30, edgecolor="black")
ax.set_xlabel("Engine Lifetime (max cycle)")
ax.set_ylabel("Count")
ax.set_title("Distribution of Engine Lifetimes")
plt.show()

In [None]:
# "remaining" values for the rows whose label equals 1, selected in a single
# .loc call (row mask + column) instead of chained indexing.
remaining_positive = df_labeled.loc[df_labeled["label"] == 1, "remaining"]

# Explicit Figure/Axes so the histogram, reference line, and legend are all
# attached to this cell's own axes rather than pyplot's implicit current axes.
fig, ax = plt.subplots()
ax.hist(remaining_positive, bins=30, edgecolor="black")
ax.set_xlabel("Remaining Cycles")
ax.set_ylabel("Count")
ax.set_title("Distribution of Remaining Cycles (label=1)")
# Dashed vertical reference line at the configured horizon value.
ax.axvline(settings.horizon_n, color="r", linestyle="--", label=f"horizon_n={settings.horizon_n}")
ax.legend()
plt.show()

In [None]:
# Range of the "remaining" column, followed by descriptive statistics of the
# per-engine sum of the label column.
remaining_values = df_labeled["remaining"]
print("Remaining min/max:", remaining_values.min(), remaining_values.max())

positives_per_engine = df_labeled.groupby("engine_id")["label"].sum()
print("Positives per engine (sum label) describe:")
print(positives_per_engine.describe())

In [None]:
# Compare the labeled frame's row count with the original frame's row count.
n_labeled, n_total = len(df_labeled), len(df)
print("Rows after window filter:", n_labeled, "of", n_total)