In [None]:
import pandas as pd
import seaborn as sns
from numpy.linalg import eigvals
from src.dataset import load_dataset
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import math


In [None]:
# Raw feature/target tables. The training frames keep their "id" column
# (the later merge of features and targets joins on it); the validation and
# test feature frames have "id" removed right after loading.
df_x_train = load_dataset("../../data/raw/train/X_train.csv")
df_y_train = load_dataset("../../data/raw/train/y_train.csv")
df_x_val = load_dataset("../../data/raw/val/X_validation.csv").drop(columns="id")
df_x_test = load_dataset("../../data/raw/test/X_test.csv").drop(columns="id")

In [None]:
import numpy as np


def generation_feuture(df, degree=4):
    """
    Expand the numeric columns of ``df`` with polynomial terms, then append
    the ratio features produced by ``add_engineered_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. An ``"id"`` column, if present, is carried through
        untouched and excluded from the polynomial expansion.
    degree : int, default 4
        Maximum polynomial degree handed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns, the polynomial terms that are not
        already present as input columns, and the engineered ratio features.
        ``df`` itself is not modified.
    """
    # Only numeric, non-identifier columns take part in the expansion.
    numeric_cols = [
        name for name in df.columns
        if name != "id" and np.issubdtype(df[name].dtype, np.number)
    ]

    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded_values = expander.fit_transform(df[numeric_cols])
    expanded_names = expander.get_feature_names_out(numeric_cols)
    expanded = pd.DataFrame(expanded_values, columns=expanded_names, index=df.index)

    # The degree-1 terms carry the same names as the input columns; drop them
    # so the concatenation below does not create duplicate columns.
    higher_order = expanded.drop(columns=numeric_cols, errors="ignore")

    combined = pd.concat([df.copy(), higher_order], axis=1)
    return add_engineered_features(combined)


In [None]:
def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with eight ratio features appended.

    Columns read: trq_measured, oat, mgt, pa, np, ng (a missing one raises
    ``KeyError``). All other columns, including 'id' and any target, are
    passed through unchanged. The input frame is not modified.

    Columns added, in this order:
        eff_trq_pa        = trq_measured / pa
        thermal_load      = mgt / oat
        comp_eff_np_ng    = np / ng
        power_util_trq_np = trq_measured / np
        np_pa             = np / pa
        ng_pa             = ng / pa
        mgt_pa            = mgt / pa
        trq_mgt           = trq_measured / mgt

    NOTE(review): the physical meaning of the abbreviations (e.g. whether
    'pa' is a power or a pressure altitude, 'np' a power or a shaft speed)
    is not established by this file - confirm before interpreting the ratios.
    """
    out = df.copy()

    # Denominators closer to zero than eps are clamped to +/-eps so the
    # ratios stay finite.
    eps = 1e-9

    def safe_div(a, b):
        # Keep the sign of a tiny denominator; an exact zero maps to +eps.
        # (The previous form, sign(b) * eps + eps, evaluated to exactly 0
        # for tiny negative b and therefore still produced inf.)
        clamped = np.where(b < 0, -eps, eps)
        return a / np.where(np.abs(b) < eps, clamped, b)

    # (new column, numerator column, denominator column) - order defines the
    # order in which columns are appended.
    ratio_specs = [
        ("eff_trq_pa", "trq_measured", "pa"),
        ("thermal_load", "mgt", "oat"),
        ("comp_eff_np_ng", "np", "ng"),
        ("power_util_trq_np", "trq_measured", "np"),
        ("np_pa", "np", "pa"),
        ("ng_pa", "ng", "pa"),
        ("mgt_pa", "mgt", "pa"),
        ("trq_mgt", "trq_measured", "mgt"),
    ]
    for new_col, numerator, denominator in ratio_specs:
        out[new_col] = safe_div(out[numerator], out[denominator])

    return out

In [None]:
# Polynomial + ratio features for the training inputs ("id" is carried through).
new_train_set = generation_feuture(df_x_train)

# Attach the targets by "id", then drop the identifier so it is not treated
# as a feature downstream.
df_train = pd.merge(new_train_set, df_y_train, on="id").drop("id", axis=1)

# Invert trq_measured = trq_target * (1 + trq_margin / 100).
# NOTE(review): the /100 suggests trq_margin is a percentage - confirm.
df_train["trq_target"] = df_train["trq_measured"] / (df_train["trq_margin"] / 100 + 1)

# Preview goes last: only a cell's final expression is rendered, so the
# earlier mid-cell .head() call was silently discarded.
new_train_set.head()

In [None]:
# Compute the correlation matrix once; it feeds both the heatmap and the
# eigenvalue analysis below.
corr_matrix = df_train.corr()

fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, fmt=".2f", ax=ax)
ax.set_title("Matrice di correlazioni")

# Save BEFORE showing: once plt.show() has run, non-interactive/inline
# backends finalise the current figure, and a subsequent plt.savefig()
# writes an empty canvas.
fig.savefig('matrice_correlazione.png', dpi=300, bbox_inches='tight')
plt.show()

# Ratio of the largest to the smallest eigenvalue of the correlation matrix;
# large values indicate strong multicollinearity among the columns.
eigenvalues = eigvals(corr_matrix)
condition_index = max(eigenvalues) / min(eigenvalues)

print(f'Condition Index : {condition_index}')