In [1]:
import pandas as pd
# Location of the challenge files inside the Kaggle input mount.
dir_path = '/kaggle/input/target-reply-rul-estimation-of-turbofan-engines/'
train_file, test_file = 'train_challenge.txt', 'test_challenge.txt'

# Column schema: 2 index columns, 3 operational settings, 21 sensors (s_1 .. s_21).
index_names = ['unit_nr', 'time_cycles']
setting_names = ['setting_1', 'setting_2', 'setting_3']
sensor_names = ['s_{}'.format(sensor_id) for sensor_id in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]

# Both files are header-less and whitespace-delimited, so they are read the same way.
train, test = (
    pd.read_csv(dir_path + file_name, sep=r'\s+', header=None, names=col_names)
    for file_name in (train_file, test_file)
)

In [2]:
# Add RUL column in train dataset
def add_remaining_useful_life(df):
    """Return a copy of ``df`` with a ``RUL`` (remaining useful life) column.

    For every row, RUL is the largest ``time_cycles`` value observed for that
    row's ``unit_nr`` minus the row's own ``time_cycles``; the last recorded
    cycle of each unit therefore gets RUL 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``unit_nr`` and ``time_cycles``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same rows, index and columns as ``df`` plus
        ``RUL`` (an existing ``RUL`` column is overwritten). ``df`` itself is
        not modified.
    """
    # transform("max") broadcasts each unit's last cycle back onto its rows.
    # This avoids a merge through a temporary "max_cycle" column, which broke
    # (KeyError) whenever the input already had a column with that name.
    last_cycle = df.groupby("unit_nr")["time_cycles"].transform("max")
    return df.assign(RUL=last_cycle - df["time_cycles"])

# Label every training row with its remaining useful life (rebinds `train`).
train = add_remaining_useful_life(train)
# Show only the index columns next to the new label as a quick visual check.
train[index_names+['RUL']]

Unnamed: 0,unit_nr,time_cycles,RUL
0,1,1,148
1,1,2,147
2,1,3,146
3,1,4,145
4,1,5,144
...,...,...,...
53754,260,312,4
53755,260,313,3
53756,260,314,2
53757,260,315,1


In [3]:
import numpy as np

# Column names shared by the windowing helpers below.
UNIT_COL, TIME_COL, RUL_COL = "unit_nr", "time_cycles", "RUL"

# Sliding-window parameters.
W       = 50      # window length (rows per window)
STRIDE  = 5       # hop between consecutive window starts
MIN_W   = W       # enforce full windows

# Sensors/features to include INSIDE the window, discovered by column-name prefix.
SENSOR_COLS  = [c for c in train.columns if c.startswith("s_")]
SETTING_COLS = [c for c in train.columns if c.startswith("setting_")]
# Stays empty unless `train` carries "regime_*" columns at this point.
REGIME_DUMMIES = [c for c in train.columns if c.startswith("regime_")]

# All per-row columns considered inside a window.
# (A stray trailing token on this line previously made the cell a SyntaxError.)
BASE_IN_WIN = SENSOR_COLS + SETTING_COLS + REGIME_DUMMIES

def extract_window_features(df_win: pd.DataFrame) -> dict:
    """
    Summarise one window of rows as a flat feature dict.

    Only rows inside ``df_win`` are used. For every column in SENSOR_COLS the
    dict gets ``<sensor>_mean``, ``<sensor>_std`` (population std, ddof=0),
    ``<sensor>_last`` and ``<sensor>_slope``. For every column in
    SETTING_COLS + REGIME_DUMMIES it gets the value from the window's last row
    (0.0 if the column is absent). ``cycle_idx`` is the last row's TIME_COL.
    """
    features = {}

    for sensor in SENSOR_COLS:
        values = df_win[sensor].to_numpy()
        n_obs = len(values)

        avg = float(np.nanmean(values))
        spread = float(np.nanstd(values, ddof=0))
        latest = float(values[-1])

        # Least-squares slope against the row position; falls back to 0.0 when
        # the window holds any non-finite value or fewer than two points.
        fit_ok = np.all(np.isfinite(values)) and n_obs >= 2
        trend = np.polyfit(np.arange(n_obs), values, 1)[0] if fit_ok else 0.0

        features.update({
            f"{sensor}_mean": avg,
            f"{sensor}_std": spread,
            f"{sensor}_last": latest,
            f"{sensor}_slope": float(trend),
        })

    # Settings / regime flags are taken as they stand at the end of the window.
    final_row = df_win.iloc[-1]
    available = df_win.columns
    features.update({
        col: (float(final_row[col]) if col in available else 0.0)
        for col in SETTING_COLS + REGIME_DUMMIES
    })

    features["cycle_idx"] = int(final_row[TIME_COL])
    return features

def make_sliding_windows(df: pd.DataFrame,
                         window=W,
                         stride=STRIDE,
                         min_w=MIN_W) -> tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Cut every unit's history into fixed-length windows and featurise each one.

    Args:
      df: frame containing UNIT_COL, TIME_COL, RUL_COL and every column read
          by extract_window_features.
      window: number of rows per window; only full windows are emitted.
      stride: number of rows between consecutive window starts (must be >= 1).
      min_w: minimum number of rows that must remain from a window start to
          the end of the unit; a start is used only if both ``min_w`` and
          ``window`` rows fit.

    Returns:
      X_win: DataFrame with one row per window. Besides the features from
             extract_window_features it also carries UNIT_COL, TIME_COL (the
             window's last cycle) and "rul_end" (the label itself), so these
             three columns must be excluded when selecting model inputs.
      y_win: ndarray (float) of RUL at window end
      groups: ndarray (int) of unit ids, one per window (e.g. for GroupKFold)

    Raises:
      ValueError: if ``stride`` is smaller than 1 (the window start would
          never advance).
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride!r}")

    rows = []
    targets = []
    groups = []

    # ensure each unit's rows are in time order before slicing
    df = df.sort_values([UNIT_COL, TIME_COL])

    # A start index is usable only if both `min_w` and `window` rows fit after it.
    required = max(window, min_w)

    for uid, g in df.groupby(UNIT_COL):
        g = g.reset_index(drop=True)
        for start_idx in range(0, len(g) - required + 1, stride):
            win = g.iloc[start_idx:start_idx + window]

            # label is RUL at window end (causal and well-defined in train)
            rul_end = int(win[RUL_COL].iloc[-1])

            feat = {
                UNIT_COL: int(uid),
                TIME_COL: int(win[TIME_COL].iloc[-1]),
                "rul_end": rul_end
            }
            feat.update(extract_window_features(win))
            rows.append(feat)
            targets.append(rul_end)
            groups.append(uid)

    X_win = pd.DataFrame(rows).reset_index(drop=True)
    y_win = np.asarray(targets, dtype=float)
    groups = np.asarray(groups, dtype=int)
    return X_win, y_win, groups

# Build training windows from TRAIN ONLY (train already has RUL).
# window / stride / min_w fall back to the defaults W / STRIDE / MIN_W.
X_win, y_win, groups = make_sliding_windows(train)
# Sanity check: all three outputs should report the same number of windows.
print(X_win.shape, y_win.shape, groups.shape)


(8298, 91) (8298,) (8298,)


In [15]:
# Ridge solvers included in the search.
safe_solvers = ["auto", "cholesky", "svd", "lsqr"]

# Preprocessing (`pre`) followed by a Ridge regressor; cloned for every config tried.
ridge_base = Pipeline([
    ("pre", pre),
    ("model", Ridge(random_state=0, max_iter=5000))
])

# Search space over the "model" step: 30 log-spaced alphas x solver x intercept x tol.
# (The closing brace was missing here, which made the whole cell a SyntaxError.)
param_grid = {
    "model__alpha": np.logspace(-4, 3, 30),
    "model__solver": safe_solvers,
    "model__fit_intercept": [True, False],
    "model__tol": [1e-4, 1e-3],
}

# Multi-metric grid search, refit on the "RMSE" scorer.
# NOTE(review): `gs` is only configured here; no `gs.fit(...)` call is visible in this cell.
gs = GridSearchCV(
    estimator=ridge_base,
    param_grid=param_grid,
    scoring={"MAE": mae_scorer, "RMSE": rmse_scorer, "R2": r2_scorer},
    refit="RMSE",
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
    verbose=1
)
from tqdm.auto import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone

# Manual grid search with a progress bar: every config is cross-validated on
# its own so a failing config cannot abort the whole search.
param_grid_pb = {
    "model__alpha": np.logspace(-4, 3, 30),
    "model__solver": safe_solvers,  # no 'sparse_cg'
    "model__fit_intercept": [True, False],
    "model__tol": [1e-4, 1e-3]
}

grid = list(ParameterGrid(param_grid_pb))
results = []  # one record per entry of `grid`, in the same order

for params in tqdm(grid, desc="Ridge grid", unit="cfg"):
    pipe = clone(ridge_base).set_params(**params)
    try:
        scores = cross_validate(
            pipe, X_win[feat_cols], y_win, groups=groups, cv=cv,
            scoring={"MAE": mae_scorer, "RMSE": rmse_scorer, "R2": r2_scorer},
            return_train_score=False, n_jobs=-1, error_score=np.nan
        )
    except ValueError:
        # This config failed on all folds; record as NaNs and move on
        results.append({**params, "RMSE": np.nan, "MAE": np.nan, "R2": np.nan})
    else:
        # MAE / RMSE scores are negated back to positive errors here
        # (assumes these scorers follow the "greater is better" sign convention — confirm).
        results.append({
            **params,
            "RMSE": -np.nanmean(scores["test_RMSE"]),
            "MAE": -np.nanmean(scores["test_MAE"]),
            "R2": np.nanmean(scores["test_R2"]),
        })

df_pb = pd.DataFrame(results)
df_pb_valid = df_pb.dropna(subset=["RMSE"]).sort_values("RMSE", ascending=True)
print(df_pb_valid.head(10))

if df_pb_valid.empty:
    raise RuntimeError("No Ridge configuration produced a valid cross-validated RMSE.")

# df_pb keeps the default 0..n-1 index, so its labels index straight into `grid`.
# Taking the params from `grid` avoids a round-trip through a mixed-dtype
# DataFrame row and keeps the original parameter values untouched.
best_params = dict(grid[df_pb_valid.index[0]])
print("Best (tqdm search) params:", best_params)

best_pipe = clone(ridge_base).set_params(**best_params)
eval_model(best_pipe, f"Ridge* (tqdm search) (W={W}, stride={STRIDE})")


Ridge grid:   0%|          | 0/480 [00:00<?, ?cfg/s]

    model__alpha  model__fit_intercept model__solver  model__tol       RMSE  \
51      0.000530                  True      cholesky      0.0010  32.261768   
50      0.000530                  True      cholesky      0.0001  32.261768   
49      0.000530                  True          auto      0.0010  32.261768   
48      0.000530                  True          auto      0.0001  32.261768   
52      0.000530                  True           svd      0.0001  32.261768   
53      0.000530                  True           svd      0.0010  32.261768   
67      0.000924                  True      cholesky      0.0010  32.262735   
66      0.000924                  True      cholesky      0.0001  32.262735   
65      0.000924                  True          auto      0.0010  32.262735   
64      0.000924                  True          auto      0.0001  32.262735   

          MAE        R2  
51  24.097650  0.691261  
50  24.097650  0.691261  
49  24.097650  0.691261  
48  24.097650  0.691261  


In [20]:
def last_full_window_per_engine(df_test: pd.DataFrame, window=W) -> pd.DataFrame:
    """
    Build one feature row per unit from the tail of its history.

    Each unit's rows are ordered by TIME_COL and the last ``window`` rows are
    featurised with extract_window_features. A unit with fewer than ``window``
    rows contributes all of its rows (a partial window) rather than being
    dropped. Every output row also carries UNIT_COL and TIME_COL (the cycle
    of the window's final row).
    """
    ordered = df_test.sort_values([UNIT_COL, TIME_COL])

    records = []
    for engine_id, history in ordered.groupby(UNIT_COL):
        # A negative-start slice already yields the whole frame when it is
        # shorter than `window`, so no separate short-history branch is needed.
        tail = history.reset_index(drop=True).iloc[-window:]
        records.append({
            UNIT_COL: int(engine_id),
            TIME_COL: int(tail[TIME_COL].iloc[-1]),
            **extract_window_features(tail),
        })

    return pd.DataFrame(records).reset_index(drop=True)

# One feature row per test unit, built from its last W rows (fewer if the unit is shorter).
X_test_win = last_full_window_per_engine(test, window=W)
# Every generated column except the unit id and the window-end cycle.
X_test_feat_cols = [c for c in X_test_win.columns if c not in [UNIT_COL, TIME_COL]]

In [21]:
# Standardise the selected feature columns; anything else is dropped.
pre = ColumnTransformer(
    [("scaler", StandardScaler(), feat_cols)],
    remainder="drop",
)

# Ridge configured with fixed hyper-parameters
# (presumably the winning row of the grid search — confirm against its output).
best_ridge = Ridge(
    random_state=0,
    fit_intercept=True,
    solver="cholesky",
    tol=1e-3,
    alpha=0.0005298316906283707,
)

pipe = Pipeline([
    ("pre", pre),
    ("model", best_ridge),
])

# Fit on ALL training windows, then predict on the test last-window features
pipe.fit(X_win[feat_cols], y_win)
y_pred_test = pipe.predict(X_test_win[feat_cols])

# One integer RUL per unit: round to whole cycles, floor negatives at 0, order by unit.
pred_df = pd.DataFrame(
    {
        "unit_nr": X_test_win[UNIT_COL].astype(int),
        "RUL": np.clip(np.round(y_pred_test).astype(int), 0, None),
    }
).sort_values("unit_nr")

pred_df.head()

Unnamed: 0,unit_nr,RUL
0,1,41
1,2,127
2,3,113
3,4,97
4,5,12
