In [1]:
#EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"

In [5]:
# Great it works!

# Can you now provide directly in the same way to jupyter-lab updated script for: 
# Script 6 — Genetic Programming Symbolic Regression (GEP-style)
# Script 5 — LASSO Polynomial Regression (sparse closed-form)
# Script 4 — Additive Spline GAM (explicit equation)
# Script 3 — Model Tree (piecewise linear equations per region)
# Script 2 — MARS (py-earth) to get piecewise-linear equations
# Script 1 — Symbolic Regression (PySR) to discover equations

SyntaxError: invalid character '—' (U+2014) (3403033769.py, line 4)

In [4]:
# JUPYTER CELL — Script 1: Symbolic Regression with pretty math display
# Dataset: X = ['NoSt','NoSp','LoSp','OP','MWS'], y = 'TFP' (original units)

import os, math, json, warnings, re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"
SHEET = 0
FEATURES = ["NoSt","NoSp","LoSp","OP","MWS"]
TARGET = "TFP"

TEST_SIZE   = 0.20
RANDOM_SEED = 42

# PySR knobs (used only if PySR+Julia available)
PYSR_NITER     = 50
PYSR_MAXSIZE   = 30
PYSR_MAX_EVALS = 800
PYSR_BIN_OPS   = ["+", "-", "*", "/", "^"]
PYSR_UN_OPS    = ["sqrt"]
PYSR_CONSTRAINTS = {"^": (5, 1)}

# gplearn (fallback) knobs
GP_POP_SIZE   = 3000
GP_GENS       = 30
GP_TOURN_SIZE = 20
GP_PARSIMONY  = 0.001
GP_CONST_MIN, GP_CONST_MAX = -5.0, 5.0

OUTDIR = "out_symbolic_period"
os.makedirs(OUTDIR, exist_ok=True)

# ---------- LOAD DATA ----------
df = pd.read_excel(EXCEL_PATH, sheet_name=SHEET)
missing = [c for c in FEATURES + [TARGET] if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in Excel: {missing}\nPresent: {list(df.columns)}")

X = df[FEATURES].apply(pd.to_numeric, errors="coerce").values
y = pd.to_numeric(df[TARGET], errors="coerce").values
mask = np.isfinite(X).all(axis=1) & np.isfinite(y)
X, y = X[mask], y[mask]

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

def metrics(y_true, y_pred):
    return dict(
        R2=r2_score(y_true, y_pred),
        MAE=mean_absolute_error(y_true, y_pred),
        RMSE=math.sqrt(((y_true - y_pred)**2).mean()),
    )

# ---------- Pretty math display (SymPy) ----------
from sympy import sympify, Abs, sqrt, log, exp, latex, simplify, symbols
from IPython.display import Math, display

def show_math_equation(eq_py: str, var_order):
    """
    Render 'y = <expr>' in proper math notation using SymPy.
    Maps:
      pdiv(a,b) -> a/(b + 1e-9)
      psqrt(x)  -> sqrt(|x|)
      plog(x)   -> log(1 + |x|)
      pexp(x)   -> exp(x)
    """
    # Create SymPy symbols for variables
    syms = symbols(" ".join(var_order), real=True)
    loc = {name: sym for name, sym in zip(var_order, syms)}
    # Map protected functions to SymPy expressions
    loc.update({
        "pdiv": lambda a, b: a/(b + 1e-9),
        "psqrt": lambda x: sqrt(Abs(x)),
        "plog": lambda x: log(1 + Abs(x)),
        "pexp": lambda x: exp(x),
    })
    try:
        expr = sympify(eq_py, locals=loc)
        expr = simplify(expr)
        display(Math(r"y = " + latex(expr)))
    except Exception as e:
        print("Could not render pretty math; showing raw expression instead.")
        print("y =", eq_py)

def save_results(name, eq_py, yhat_tr, yhat_te):
    mt = metrics(y_te, yhat_te); mtr = metrics(y_tr, yhat_tr)
    print(f"\n=== {name} — Symbolic Regression (original units) ===")
    print("Train:", {k: round(v, 6) for k, v in mtr.items()})
    print("Test :", {k: round(v, 6) for k, v in mt.items()})
    print("\nClosed-form (Python):\n", "y = " + eq_py)
    # Pretty math:
    show_math_equation(eq_py, FEATURES)

    with open(os.path.join(OUTDIR, f"{name.lower()}_equation_python.txt"), "w", encoding="utf-8") as f:
        f.write("y = " + eq_py + "\n")
    with open(os.path.join(OUTDIR, f"{name.lower()}_metrics.json"), "w", encoding="utf-8") as f:
        json.dump({"train": mtr, "test": mt}, f, indent=2)

used_backend = None

# ---------- Try PySR first ----------
try:
    from pysr import PySRRegressor

    model = PySRRegressor(
        niterations=PYSR_NITER,
        maxsize=PYSR_MAXSIZE,
        populations=30,
        population_size=60,
        parsimony=1e-4,
        progress=True,
        binary_operators=PYSR_BIN_OPS,
        unary_operators=PYSR_UN_OPS,
        loss="L2DistLoss()",
        model_selection="best",
        max_evals=PYSR_MAX_EVALS,
        random_state=RANDOM_SEED,
        constraints=PYSR_CONSTRAINTS,
    ).fit(X_tr, y_tr, variable_names=FEATURES)

    # Pick simplest within 5% of best test MAE
    candidates = []
    for _, row in model.equations_.iterrows():
        # 'equation' is a Python-evaluable string produced by PySR (in terms of variable names)
        eq_py = row["equation"]
        # Evaluate this candidate
        f = model.lambda_format()(row["equation"])
        yhat_tr = np.array(f(X_tr), float)
        yhat_te = np.array(f(X_te), float)
        candidates.append({
            "complexity": int(row["complexity"]),
            "py": eq_py,
            "train": metrics(y_tr, yhat_tr),
            "test":  metrics(y_te, yhat_te),
            "yhat_tr": yhat_tr, "yhat_te": yhat_te
        })
    if not candidates:
        raise RuntimeError("PySR returned no parsable equations.")

    best_mae = min(c["test"]["MAE"] for c in candidates)
    pool = [c for c in candidates if c["test"]["MAE"] <= 1.05*best_mae]
    best = sorted(pool, key=lambda c: (c["complexity"], c["test"]["MAE"]))[0]

    used_backend = "PySR"
    save_results("PySR", best["py"], best["yhat_tr"], best["yhat_te"])

except Exception:
    # ---------- Fallback: gplearn (pure Python) ----------
    from gplearn.genetic import SymbolicRegressor
    from gplearn.functions import make_function

    def _pdiv(x, y):
        return np.divide(x, np.where(np.abs(y) < 1e-9, np.sign(y)*1e-9 + (y==0)*1e-9, y))
    def _psqrt(x):
        return np.sqrt(np.abs(x))
    def _plog(x):
        return np.log1p(np.abs(x))
    def _pexp(x):
        return np.exp(np.clip(x, -20, 20))

    pdiv = make_function(function=_pdiv, name="pdiv", arity=2)
    psqrt = make_function(function=_psqrt, name="psqrt", arity=1)
    plog  = make_function(function=_plog,  name="plog",  arity=1)
    pexp  = make_function(function=_pexp,  name="pexp",  arity=1)

    gp = SymbolicRegressor(
        function_set=("add","sub","mul",pdiv,psqrt,plog,pexp),
        metric="rmse",
        population_size=GP_POP_SIZE,
        generations=GP_GENS,
        tournament_size=GP_TOURN_SIZE,
        const_range=(GP_CONST_MIN, GP_CONST_MAX),
        init_depth=(2,6),
        init_method="half and half",
        p_crossover=0.8,
        p_subtree_mutation=0.01,
        p_hoist_mutation=0.01,
        p_point_mutation=0.08,
        parsimony_coefficient=GP_PARSIMONY,
        max_samples=0.9,
        random_state=RANDOM_SEED,
        n_jobs=1,
        verbose=1,
    )
    gp.fit(X_tr, y_tr)

    # Convert gplearn program string to a Python-evaluable expression with your feature names
    def program_to_python(expr_str, var_names):
        s = expr_str
        for i, n in enumerate(var_names):
            s = re.sub(rf"\bX{i}\b", n, s)
        s = s.replace("add(", "ADD(").replace("sub(", "SUB(").replace("mul(", "MUL(")
        def bin_to_infix(text, token, op):
            while token in text:
                idx = text.find(token + "(")
                if idx == -1: break
                depth, j, comma = 0, idx + len(token) + 1, None
                while j < len(text):
                    if text[j] == "(":
                        depth += 1
                    elif text[j] == ")":
                        if depth == 0: break
                        depth -= 1
                    elif text[j] == "," and depth == 0:
                        comma = j; break
                    j += 1
                depth2, k = 0, comma + 1
                while k < len(text):
                    if text[k] == "(":
                        depth2 += 1
                    elif text[k] == ")":
                        if depth2 == 0: break
                        depth2 -= 1
                    k += 1
                a = text[idx + len(token) + 1:comma]
                b = text[comma + 1:k]
                repl = "(" + a.strip() + f" {op} " + b.strip() + ")"
                text = text[:idx] + repl + text[k+1:]
            return text
        s = bin_to_infix(s, "ADD", "+")
        s = bin_to_infix(s, "SUB", "-")
        s = bin_to_infix(s, "MUL", "*")
        return s

    expr_str = str(gp._program)
    eq_py = program_to_python(expr_str, FEATURES)

    # Evaluate safely
    def safe_eval_py(expr, Xmat, names):
        env = {
            "np": np,
            "pdiv": lambda a,b: a/np.where(np.abs(b)<1e-9, np.sign(b)*1e-9 + (b==0)*1e-9, b),
            "psqrt": lambda x: np.sqrt(np.abs(x)),
            "plog":  lambda x: np.log1p(np.abs(x)),
            "pexp":  lambda x: np.exp(np.clip(x, -20, 20)),
        }
        vals = {n: Xmat[:, i] for i, n in enumerate(names)}
        return np.asarray(eval(expr, env, vals), float)

    yhat_tr = safe_eval_py(eq_py, X_tr, FEATURES)
    yhat_te = safe_eval_py(eq_py, X_te, FEATURES)

    used_backend = "gplearn"
    save_results("GP-Fallback", eq_py, yhat_tr, yhat_te)

print(f"\nArtifacts saved in: {os.path.abspath(OUTDIR)}")
print("Backend used:", used_backend or "unknown")


Compiling Julia backend...
    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.94      2.88224e+24       16         0.527362         0.503087      1.52m
   1     7.15      2.49127e+07       16         0.522699         0.545126      1.32m
   2     7.87      1.99066e+07       13         0.433478         0.421782      1.53m
   3    11.46      4.83744e+07       13          0.42794         0.469934      1.48m
   4    15.95      5.13517e+07       19         0.377906         0.414835      2.13m
   5    17.25      9.17656e+07       20         0.366054         0.349769      1.65m
   6    20.45      6.48924e+07       30         0.320734         0.316857      1.99m
   7    23.14      3.47934e+07       30         0.317944         0.341224      1.91m
   8    24.30      2.46698e+07       24         

<IPython.core.display.Math object>


Artifacts saved in: C:\Users\filip\Documents\POGLAVLJE KNJIGE\out_symbolic_period
Backend used: gplearn
