In [1]:
#EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"

In [2]:
# JUPYTER CELL — LASSO Polynomial (no Julia) for Fundamental Period (TFP)
# Produces a sparse, explicit equation in ORIGINAL UNITS and displays it as math in the notebook.

import os, math, json, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LassoCV, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Silence only deprecation-style noise. A blanket "ignore" would also hide
# warnings emitted while the Lasso models are fitted (e.g. scikit-learn's
# ConvergenceWarning), which are needed to judge whether the reported
# coefficients can be trusted.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# ---------- CONFIG ----------
# NOTE(review): absolute, machine-specific path — point this at your own copy
# of the workbook before running.
EXCEL_PATH = r"D:\FILIP\DOKTORSKE STUDIJE\III GODINA\AIC M21 CASOPIS\MATLAB CODE\1.PRIPREMLJENA BAZA PODATAKA\FUNDAMENTAL PERIOD PYTHON.xlsx"
SHEET = 0                     # 0 = first sheet, or set "SheetName"
FEATURES = ["NoSt","NoSp","LoSp","OP","MWS"]   # input columns read from the sheet
TARGET = "TFP"                                 # column to be predicted

TEST_SIZE = 0.20              # fraction of rows held out by train_test_split
RANDOM_SEED = 42              # random_state of the train/test split

# Polynomial expansion degree (start with 2; try 3 if needed)
POLY_DEGREE = 2
INCLUDE_BIAS = False          # keep intercept separate

# LASSO alpha search range (logspace); adjust to trade accuracy vs sparsity
ALPHA_MIN = 1e-6
ALPHA_MAX = 1e2
N_ALPHAS  = 200               # number of candidate alphas in the grid
MAX_ITER  = 20000             # max_iter passed to both LassoCV and Lasso

# All saved artifacts go into this folder (created if it does not exist).
OUTDIR = "out_lasso_period"
os.makedirs(OUTDIR, exist_ok=True)

# ---------- LOAD DATA ----------
# Read the configured sheet and fail early, listing the columns that are
# present, if any expected column is absent.
df = pd.read_excel(EXCEL_PATH, sheet_name=SHEET)
missing = [col for col in (*FEATURES, TARGET) if col not in df.columns]
if len(missing) > 0:
    raise ValueError(f"Missing columns: {missing}\nPresent: {list(df.columns)}")

# Coerce every used column to numbers; cells that cannot be parsed become NaN.
X_raw = df[FEATURES].apply(pd.to_numeric, errors="coerce").to_numpy()
y = pd.to_numeric(df[TARGET], errors="coerce").to_numpy()

# Retain only the rows where all features and the target are finite.
mask = np.logical_and(np.isfinite(X_raw).all(axis=1), np.isfinite(y))
X_raw = X_raw[mask]
y = y[mask]

# Seeded hold-out split into training and test rows.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_raw,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
)

# ---------- POLYNOMIAL DESIGN (in original units) ----------
# Expand the raw features into polynomial terms up to POLY_DEGREE. The
# expansion is fitted on the training rows and then applied to the test rows.
poly = PolynomialFeatures(degree=POLY_DEGREE, include_bias=INCLUDE_BIAS)
Z_tr = poly.fit_transform(X_tr)
Z_te = poly.transform(X_te)

# Names of the expanded columns (e.g. "NoSt^2", "NoSt NoSp").
# Only a missing get_feature_names_out method triggers the fallback to the
# older get_feature_names; any other failure is allowed to surface instead of
# being swallowed by a bare except.
try:
    term_names = list(poly.get_feature_names_out(FEATURES))
except AttributeError:
    term_names = list(poly.get_feature_names(FEATURES))

# ---------- LASSO CV ----------
# Candidate penalties: N_ALPHAS values, log-spaced from ALPHA_MIN to ALPHA_MAX.
alphas = np.logspace(np.log10(ALPHA_MIN), np.log10(ALPHA_MAX), N_ALPHAS)
# 5-fold cross-validated search over those candidates, using training rows only.
# NOTE(review): Z_tr is passed in original units (no scaling step appears in
# this cell) — confirm that penalizing differently scaled columns with one
# alpha is intended.
lcv = LassoCV(alphas=alphas, cv=5, fit_intercept=True, max_iter=MAX_ITER)
lcv.fit(Z_tr, y_tr)
# Penalty strength selected by the search, stored as a plain float.
alpha_star = float(lcv.alpha_)

# Refit with chosen alpha (for stability)
# `mdl` is the model used for all predictions and for the exported equation.
mdl = Lasso(alpha=alpha_star, fit_intercept=True, max_iter=MAX_ITER)
mdl.fit(Z_tr, y_tr)

# ---------- METRICS ----------
def metrics(y_true, y_pred):
    """Summarize regression fit quality.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"R2"``, ``"MAE"`` and ``"RMSE"``; RMSE is the square root of
        the mean squared error.
    """
    # mean_squared_error accepts any array-like, which keeps RMSE consistent
    # with the R2/MAE calls: plain Python lists work, not only NumPy arrays.
    return dict(
        R2=r2_score(y_true, y_pred),
        MAE=mean_absolute_error(y_true, y_pred),
        RMSE=math.sqrt(mean_squared_error(y_true, y_pred)),
    )

# Predict on both splits with the refitted model and score each split with
# the same metrics() helper, so train and test numbers are directly comparable.
yhat_tr = mdl.predict(Z_tr)
yhat_te = mdl.predict(Z_te)
m_train = metrics(y_tr, yhat_tr)
m_test  = metrics(y_te, yhat_te)

# ---------- BUILD EQUATION (Python string) ----------
# Pull the fitted parameters out of the model: a flat coefficient vector and
# the intercept as a plain float.
coef = mdl.coef_.ravel()
intercept = float(mdl.intercept_)

# A term counts as selected when its coefficient magnitude exceeds 1e-12.
nz = np.abs(coef) > 1e-12
terms_nz = [name for name, keep in zip(term_names, nz) if keep]
coef_nz = coef[nz]

def python_expr(intercept, coef, names):
    """Render the fitted polynomial as a one-line ``y = ...`` string.

    Parameters
    ----------
    intercept : float
        Constant term; written first.
    coef : iterable of float
        Coefficients, paired positionally with ``names``.
    names : iterable of str
        Term names where a space separates factors and ``^`` marks a power.

    Returns
    -------
    str
        ``"y = "`` followed by the signed, space-separated terms, each number
        formatted with 16 significant digits.
    """
    def _render(weight, name):
        # "a b" -> "a * b" and "a^2" -> "a**2"
        factor = name.replace(" ", " * ").replace("^", "**")
        return f"{weight:+.16g}*({factor})"

    pieces = [f"{intercept:+.16g}"]
    pieces.extend(_render(w, n) for w, n in zip(coef, names))
    return "y = " + " ".join(pieces)

# Plain-text equation built from the selected (non-zero) terms only.
expr_py  = python_expr(intercept, coef_nz, terms_nz)

# ---------- PRETTY MATH DISPLAY (SymPy, no files) ----------
# NOTE(review): these imports sit in the middle of the cell; consider moving
# them next to the other imports at the top so dependencies are visible at a glance.
from sympy import symbols, sympify, latex
from IPython.display import Math, display

# make sympy symbols for all base variables
# (one real-valued symbol per entry of FEATURES, keyed by the feature name)
sym_vars = {name: symbols(name, real=True) for name in FEATURES}

def term_to_sympy(term: str):
    """
    Turn one PolynomialFeatures term name into a SymPy expression.

    The name is rewritten to Python syntax before it is parsed:
    - a space between factors becomes ``*`` ('NoSt NoSp' -> NoSt*NoSp)
    - a caret becomes ``**`` ('NoSt^2' -> NoSt**2)
    Variable names are resolved through the module-level ``sym_vars`` mapping.
    """
    with_products = term.replace(" ", "*")
    python_syntax = with_products.replace("^", "**")
    return sympify(python_syntax, locals=sym_vars)

# Symbolic model: start from the intercept and add each selected term, in order.
sym_expr = sum(
    (c * term_to_sympy(t) for c, t in zip(coef_nz, terms_nz)),
    sympify(intercept),
)

# ---------- OUTPUT ----------
# Text summary: chosen penalty, how many expanded terms survived, and the
# train/test metrics rounded to 6 decimals for display only (the unrounded
# values are what gets written to metrics.json).
print("=== LASSO Polynomial (original units) — Fundamental Period (TFP) ===")
print(f"Chosen alpha: {alpha_star:.6g}   |   Degree: {POLY_DEGREE}")
print(f"Selected terms: {len(coef_nz)} / {coef.size}")
print("Train:", {k: round(v, 6) for k, v in m_train.items()})
print("Test :", {k: round(v, 6) for k, v in m_test.items()})
print("\nClosed-form (Python):\n", expr_py)

# Display pretty math:
# the SymPy expression is converted to LaTeX and rendered inline by the notebook.
display(Math(r"y = " + latex(sym_expr)))

# Save artifacts (no LaTeX files)

# 1) The closed-form equation as a single line of text.
with open(os.path.join(OUTDIR, "equation_python.txt"), "w", encoding="utf-8") as f:
    f.write(f"{expr_py}\n")

# 2) Table of selected terms and their coefficients, intercept first.
terms_table = pd.DataFrame(
    {
        "term": ["1 (intercept)", *terms_nz],
        "coefficient": [intercept, *coef_nz],
    }
)
terms_table.to_csv(os.path.join(OUTDIR, "terms_selected.csv"), index=False)

# 3) Run settings together with the train/test metrics.
run_summary = {
    "file": EXCEL_PATH,
    "sheet": SHEET,
    "features": FEATURES,
    "target": TARGET,
    "poly_degree": POLY_DEGREE,
    "alpha_star": alpha_star,
    "n_terms_selected": int(len(coef_nz)),
    "n_terms_total": int(coef.size),
    "train": m_train,
    "test": m_test,
}
with open(os.path.join(OUTDIR, "metrics.json"), "w", encoding="utf-8") as f:
    json.dump(run_summary, f, indent=2)

=== LASSO Polynomial (original units) — Fundamental Period (TFP) ===
Chosen alpha: 4.88025e-05   |   Degree: 2
Selected terms: 20 / 20
Train: {'R2': 0.982619, 'MAE': 0.078616, 'RMSE': 0.102562}
Test : {'R2': 0.984527, 'MAE': 0.0765, 'RMSE': 0.101052}

Closed-form (Python):
 y = +0.9512389922312927 +0.008807229063456202*(NoSt) -0.0834724758243774*(NoSp) -0.1689800628944431*(LoSp) -0.00341621738902367*(OP) -0.01739632464974486*(MWS) +0.0004504184345576174*(NoSt**2) -0.002005591399244831*(NoSt * NoSp) +0.008816531479414315*(NoSt * LoSp) +0.0008061644538708566*(NoSt * OP) -0.0006982410486949265*(NoSt * MWS) +0.006932408619301114*(NoSp**2) +0.005725709297006055*(NoSp * LoSp) +0.0001578463789689262*(NoSp * OP) -0.0005923692870608095*(NoSp * MWS) +0.006325112861404335*(LoSp**2) +0.001383713241471476*(LoSp * OP) -0.0006529731861680111*(LoSp * MWS) -6.717064825533375e-05*(OP**2) +0.0001998763837484754*(OP * MWS) +0.0003934596023083607*(MWS**2)


<IPython.core.display.Math object>