In [1]:
#EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"

In [2]:
# Great it works!

# Can you now provide, in the same way, updated JupyterLab scripts for:
# Script 6 — Genetic Programming Symbolic Regression (GEP-style)
# Script 5 — LASSO Polynomial Regression (sparse closed-form)
# Script 4 — Additive Spline GAM (explicit equation)
# Script 3 — Model Tree (piecewise linear equations per region)
# Script 2 — MARS (py-earth) to get piecewise-linear equations
# Script 1 — Symbolic Regression (PySR) to discover equations

In [3]:
# JUPYTER CELL — Script 3: Model Tree (piecewise linear per region)
# Dataset: ['NoSt','NoSp','LoSp','OP','MWS'] -> 'TFP'  (original units)

import os, json, math, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Silence every warning raised for the rest of the run.
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
# Input workbook, sheet, and the column names used as predictors / response.
EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"
SHEET = 0
FEATURES = ["NoSt", "NoSp", "LoSp", "OP", "MWS"]
TARGET = "TFP"

# Hold-out fraction and the seed shared by the split and the tree.
TEST_SIZE = 0.2
RANDOM_SEED = 42

# Tree shape / regularization (tune for simplicity vs accuracy).
MAX_DEPTH = 4                    # smaller => fewer regions
MIN_SAMPLES_LEAF = 20
MIN_SAMPLES_SPLIT = 40
MIN_IMPURITY_DECREASE = 0.0

# Leaf linear model choice.
USE_RIDGE = True                 # True = RidgeCV for stability, False = OLS LinearRegression
RIDGE_ALPHAS = np.logspace(-6, 3, num=15)  # candidate penalties, only used if USE_RIDGE

# Folder that receives every exported artifact; created on demand.
OUTDIR = "out_model_tree_period"
os.makedirs(OUTDIR, exist_ok=True)

# ---------- LOAD DATA ----------
df = pd.read_excel(EXCEL_PATH, sheet_name=SHEET)

# Fail early, listing what is available, when an expected column is absent.
required_cols = FEATURES + [TARGET]
missing = [col for col in required_cols if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}\nPresent: {list(df.columns)}")

# Coerce to numbers (unparseable cells become NaN), then keep only the rows
# where every feature and the target are finite.
features_numeric = df[FEATURES].apply(pd.to_numeric, errors="coerce")
target_numeric = pd.to_numeric(df[TARGET], errors="coerce")
X = features_numeric.values
y = target_numeric.values
mask = np.isfinite(X).all(axis=1) & np.isfinite(y)
X, y = X[mask], y[mask]

X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# ---------- TRAIN A CART TREE (to define regions) ----------
# The fitted tree supplies the partition of the feature space; the leaf each
# sample lands in decides which region it belongs to.
tree = DecisionTreeRegressor(
    random_state=RANDOM_SEED,
    max_depth=MAX_DEPTH,
    min_samples_split=MIN_SAMPLES_SPLIT,
    min_samples_leaf=MIN_SAMPLES_LEAF,
    min_impurity_decrease=MIN_IMPURITY_DECREASE,
).fit(X_tr, y_tr)

# ---------- EXTRACT LEAF ASSIGNMENTS ----------
# Leaf node id reached by every train / test row.
leaf_id_tr, leaf_id_te = (tree.apply(part) for part in (X_tr, X_te))
# Distinct leaf ids that occur among the training rows.
unique_leaves = np.unique(leaf_id_tr)

# ---------- FIT LINEAR MODEL IN EACH TRAIN LEAF ----------
# For every leaf reached by training rows, fit one linear model on just the
# rows that fall into that leaf.
leaf_models = {}   # leaf_id -> (fitted model, coefficient vector, intercept)
leaf_rows   = {}   # leaf_id -> positional indices of that leaf's rows in X_tr / y_tr
for lid in unique_leaves:
    idx = np.where(leaf_id_tr == lid)[0]
    leaf_rows[lid] = idx
    Xl = X_tr[idx]
    yl = y_tr[idx]
    if USE_RIDGE:
        # `store_cv_values` is intentionally not passed: the keyword was
        # deprecated in scikit-learn 1.5 (renamed `store_cv_results`) and is
        # scheduled for removal in 1.7. Its value here was the default (False),
        # so omitting it behaves the same on every version.
        mdl = RidgeCV(alphas=RIDGE_ALPHAS, fit_intercept=True)
    else:
        mdl = LinearRegression(fit_intercept=True)
    mdl.fit(Xl, yl)
    # Keep plain copies of the parameters so predictions and the printed
    # equations can be formed directly from coef / intercept.
    leaf_models[lid] = (mdl, mdl.coef_.ravel().copy(), float(mdl.intercept_))

# ---------- PREDICTION USING THE MODEL TREE ----------
def predict_model_tree(Xmat, tree, leaf_models):
    """Predict with the piecewise-linear model tree.

    Parameters
    ----------
    Xmat : 2-D array of feature rows.
    tree : object whose ``apply(Xmat)`` returns one leaf id per row.
    leaf_models : mapping leaf id -> ``(model, coef, intercept)``; only the
        coefficient vector and intercept are used here.

    Returns
    -------
    1-D float array with one prediction per row of ``Xmat``.

    Raises
    ------
    KeyError
        If a row lands in a leaf that has no entry in ``leaf_models``.
    """
    assigned = tree.apply(Xmat)
    preds = np.empty(len(Xmat), dtype=float)
    for leaf in np.unique(assigned):
        _, weights, bias = leaf_models[leaf]
        in_leaf = assigned == leaf
        # Evaluate this leaf's linear equation on its own rows only.
        preds[in_leaf] = Xmat[in_leaf] @ weights + bias
    return preds

# Model-tree predictions for both partitions.
yhat_tr, yhat_te = (
    predict_model_tree(part, tree, leaf_models) for part in (X_tr, X_te)
)

def metrics(y_true, y_pred):
    """Return the goodness-of-fit scores as a dict with keys R2, MAE and RMSE."""
    scores = {}
    scores["R2"] = r2_score(y_true, y_pred)
    scores["MAE"] = mean_absolute_error(y_true, y_pred)
    # Root of the mean squared residual.
    scores["RMSE"] = math.sqrt(((y_true - y_pred) ** 2).mean())
    return scores

m_train, m_test = metrics(y_tr, yhat_tr), metrics(y_te, yhat_te)

# ---------- BUILD HUMAN-READABLE REGION CONDITIONS ----------
# Collect, for every leaf, the chain of split tests on the way from the root.
from sklearn.tree import _tree

t = tree.tree_
feature_names = FEATURES
def node_to_rule_paths():
    """Map each leaf node id to its list of ``(feature, op, threshold)`` tests.

    Reads the module-level ``t`` (the fitted tree structure) and
    ``feature_names``. ``op`` is "<=" for a left branch and ">" for a right
    branch; the root-to-leaf order of the tests is preserved.
    """
    rules = {}
    pending = [(0, [])]  # (node id, tests accumulated so far), starting at the root
    while pending:
        node_id, conditions = pending.pop()
        if t.feature[node_id] == _tree.TREE_UNDEFINED:
            # No split feature: this node is a leaf.
            rules[node_id] = conditions
            continue
        fname = feature_names[t.feature[node_id]]
        thresh = t.threshold[node_id]
        # Push right first so the left subtree is popped (visited) first.
        pending.append((t.children_right[node_id], conditions + [(fname, ">", thresh)]))
        pending.append((t.children_left[node_id], conditions + [(fname, "<=", thresh)]))
    return rules

leaf_rules = node_to_rule_paths()

# ---------- PRETTY MATH DISPLAY ----------
from sympy import symbols, latex, Eq
from IPython.display import display, Math

# One real-valued SymPy symbol per feature, keyed by the feature's column name.
sym_vars = dict((name, symbols(name, real=True)) for name in FEATURES)

def rule_to_text(rule):
    """Render a list of ``(feature, op, threshold)`` tests as plain text.

    Tests are joined with " and "; thresholds use 6 significant digits.
    An empty (or otherwise falsy) rule is rendered as "True".
    """
    if rule:
        return " and ".join(f"{feat} {op} {thr:.6g}" for feat, op, thr in rule)
    return "True"

def linear_eq_to_sympy(coef, intercept):
    """Build ``intercept + sum(coef_i * symbol_i)`` from the feature symbols.

    Coefficients are paired with ``FEATURES`` in order and looked up in the
    module-level ``sym_vars``; terms whose coefficient is exactly zero are
    left out.
    """
    expr = intercept
    for name, weight in zip(FEATURES, coef):
        if not abs(weight) > 0:
            continue
        expr = expr + weight * sym_vars[name]
    return expr

# Display each region's equation (limit for readability)
MAX_REGIONS_TO_DISPLAY = 12  # increase if you want all
displayed = 0                # number of regions rendered as math so far
equation_lines = []          # one plain-text "IF ... THEN y = ..." line per region

# LaTeX spelling of the split operators. A raw "<=" inside math mode is typeset
# as two separate glyphs ("<" then "="), so it is mapped to \le for the rendered
# output. The plain-text equations keep the original operators.
LATEX_OPS = {"<=": r"\le", ">": ">"}

print("=== Model Tree — Fundamental Period (TFP) ===")
print(f"Leaves in train: {len(unique_leaves)} | max_depth={MAX_DEPTH} | min_samples_leaf={MIN_SAMPLES_LEAF}")
print("Train:", {k: round(v, 6) for k, v in m_train.items()})
print("Test :", {k: round(v, 6) for k, v in m_test.items()})
print("\nPiecewise linear regions:\n")

regions_summary = []         # one dict per region, later written out as a table
for lid in unique_leaves:
    mdl, coef, intercept = leaf_models[lid]
    rule = leaf_rules[lid]
    n_tr = len(leaf_rows[lid])
    # Local fit quality on the leaf (train subset); R2 is undefined for one sample
    yl = y_tr[leaf_rows[lid]]
    yhat_l = intercept + X_tr[leaf_rows[lid]] @ coef
    r2_local = r2_score(yl, yhat_l) if len(yl) > 1 else float("nan")

    # Build textual equation (zero coefficients are omitted)
    coef_terms = " ".join([f"{c:+.6g}*{name}" for c, name in zip(coef, FEATURES) if abs(c) > 0])
    eq_text = f"Region (leaf {lid}):  IF {rule_to_text(rule)}  THEN  y = {intercept:+.6g} {coef_terms}"
    equation_lines.append(eq_text)

    # Pretty math (SymPy), only for the first MAX_REGIONS_TO_DISPLAY regions
    if displayed < MAX_REGIONS_TO_DISPLAY:
        cond_ltx = " \\;\\wedge\\; ".join(
            [f"{feat} {LATEX_OPS.get(op, op)} {thr:.6g}" for (feat, op, thr) in rule]
        ) if rule else " \\text{(all)}"
        expr = linear_eq_to_sympy(coef, intercept)
        display(Math(r"\text{IF } " + cond_ltx + r"\;\; \text{ THEN }\;\; y = " + latex(expr)))
        displayed += 1

    regions_summary.append({
        "leaf_id": int(lid),
        "n_train_in_leaf": int(n_tr),
        "r2_local_train": float(r2_local),
        "intercept": float(intercept),
        **{f"coef_{name}": float(c) for c, name in zip(coef, FEATURES)}
    })

# If there are more regions than displayed, let the user know
if len(unique_leaves) > MAX_REGIONS_TO_DISPLAY:
    print(f"\n(Displayed first {MAX_REGIONS_TO_DISPLAY} regions. All regions are saved to file.)")

# ---------- SAVE ARTIFACTS ----------
# 1) Every region's equation as plain text, one per line under a short header.
equations_path = os.path.join(OUTDIR, "equations_regions.txt")
with open(equations_path, "w", encoding="utf-8") as fh:
    fh.write("Model Tree piecewise equations\n\n")
    fh.writelines(f"{line}\n" for line in equation_lines)

# 2) Per-region table (leaf id, sample count, local R2, intercept, coefficients).
summary_path = os.path.join(OUTDIR, "regions_summary.csv")
pd.DataFrame(regions_summary).to_csv(summary_path, index=False)

# 3) Run configuration together with the train / test scores.
run_report = {
    "file": EXCEL_PATH,
    "sheet": SHEET,
    "features": FEATURES,
    "target": TARGET,
    "tree": {
        "max_depth": MAX_DEPTH,
        "min_samples_leaf": MIN_SAMPLES_LEAF,
        "min_samples_split": MIN_SAMPLES_SPLIT,
        "min_impurity_decrease": MIN_IMPURITY_DECREASE,
    },
    "leaf_model": "RidgeCV" if USE_RIDGE else "LinearRegression",
    "train": m_train,
    "test": m_test,
    "n_leaves_train": int(len(unique_leaves)),
}
with open(os.path.join(OUTDIR, "metrics.json"), "w", encoding="utf-8") as fh:
    json.dump(run_report, fh, indent=2)

# Echo the first equations in the notebook and point to the file for the rest.
print("\nCompact Python-style equations per region:")
for line in equation_lines[:MAX_REGIONS_TO_DISPLAY]:
    print(line)
if len(equation_lines) > MAX_REGIONS_TO_DISPLAY:
    print(f"... (see {equations_path})")

=== Model Tree — Fundamental Period (TFP) ===
Leaves in train: 16 | max_depth=4 | min_samples_leaf=20
Train: {'R2': 0.996001, 'MAE': 0.036269, 'RMSE': 0.049198}
Test : {'R2': 0.99604, 'MAE': 0.037475, 'RMSE': 0.051122}

Piecewise linear regions:



<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>

<IPython.core.display.Math object>


(Displayed first 12 regions. All regions are saved to file.)

Compact Python-style equations per region:
Region (leaf 4):  IF NoSt <= 11.5 and NoSt <= 6.5 and NoSt <= 4.5 and OP <= 37.5  THEN  y = +0.0667877 +0.0430646*NoSt -0.00577211*NoSp +0.00672284*LoSp +0.00286364*OP -0.00462771*MWS
Region (leaf 5):  IF NoSt <= 11.5 and NoSt <= 6.5 and NoSt <= 4.5 and OP > 37.5  THEN  y = -0.122967 +0.0803131*NoSt +0.00566042*NoSp +0.0210989*LoSp +0.0012841*OP -0.00103785*MWS
Region (leaf 7):  IF NoSt <= 11.5 and NoSt <= 6.5 and NoSt > 4.5 and OP <= 37.5  THEN  y = +0.0823619 +0.0480779*NoSt -0.0132086*NoSp +0.0144039*LoSp +0.00548382*OP -0.00901275*MWS
Region (leaf 8):  IF NoSt <= 11.5 and NoSt <= 6.5 and NoSt > 4.5 and OP > 37.5  THEN  y = -0.595647 +0.118348*NoSt +0.00674362*NoSp +0.0694038*LoSp +0.00244141*OP -0.00177801*MWS
Region (leaf 11):  IF NoSt <= 11.5 and NoSt > 6.5 and OP <= 62.5 and OP <= 37.5  THEN  y = +0.13966 +0.0473787*NoSt -0.0215717*NoSp +0.0204052*LoSp +0.008579*OP -0.014149

In [4]:
# Simpler model: lower MAX_DEPTH, raise MIN_SAMPLES_LEAF, or increase MIN_IMPURITY_DECREASE.

# Stability in small leaves: keep USE_RIDGE=True (it auto-selects α via CV). Set USE_RIDGE=False to get exact OLS.

# You’ll get explicit linear formulas per region with the exact split conditions, suitable for engineering use and documentation.