In [None]:
# import librerie e dataset

from sklearn.linear_model import LinearRegression
import plotly.express as px
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error, make_scorer
import sklearn as sklearn
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_validate, KFold,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import optuna
from sklearn.preprocessing import  TargetEncoder as tg
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from optuna.visualization import plot_optimization_history, plot_param_importances
import optuna.visualization as vis
from category_encoders.binary import BinaryEncoder

# Ask scikit-learn transformers to return pandas DataFrames, then load the raw dataset.
sklearn.set_config(transform_output='pandas')
df_original = pd.read_csv(r'path/to/your/dataset.csv')




In [2]:
# Keep only the columns that are used for modelling (order is preserved).
selected_columns = [
    # Demographic and socio-economic data
    'age',
    'sex',
    'marital_status',
    'Race recode (White, Black, Other)',
    'Race recode (with detailed Asian and Native Hawaiian other PI)',
    'Origin recode NHIA (Hispanic, Non-Hisp)',
    'median_household_income_adj_2023',
    'rural_urban_continuum',

    # Tumor characteristics
    'primary_site',
    'Schema ID (2018+)',
    'ICD-O-3 Hist/behav',
    'clinical_grade',
    'diagnostic_confirmation',
    'tumor_size_summary',

    # Stage
    'eod_t',
    'eod_n',
    'eod_m',
    'eod_stage_group',
    'eod_primary_tumor',
    'eod_regional_nodes',
    'eod_mets',
    'n_sentinel_lymph_nodes',

    # Metastases
    'mets_at_bone',
    'mets_at_brain',
    'mets_at_liver',
    'mets_at_lung',
    'mets_at_dx_distand_ln',
    'mets_at_dx_other',

    # Biomarkers
    'E_R_binary',
    'pr_binary',
    'her2_binary',

    # Treatment
    'days_from_diagnosis_to_treatment',
    'rx_summ_surg_prim_site',
    'rx_summ_scope_reg_ln_sur',
    'rx_summ_surg_oth_reg_dis',
    'rx_summ_surg_rad_seq',
    'reason_no_surgery',
    'radiation',
    'chemo_yes_no',
    'rx_summ_systemic_sur_seq',

    # Clinical history (includes the target, 'survival_months')
    'first_malignant_tumor',
    'n_benign_borderline_tumors',
    'n_in_situ_malignant_tumors',
    'survival_months',

    # Source
    'report_source',
]

df_original = df_original[selected_columns]


In [3]:
# Shuffle the rows: frac=1.0 keeps every row, the seed makes the order reproducible.
df = df_original.sample(frac=1.0, random_state=42)

# Display (n_rows, n_columns) as the cell output.
df.shape

(131974, 45)

In [4]:
# Column groups used by the preprocessing steps.

# Categorical columns (handed to the categorical encoder).
cat_cols = [
    # Demographics
    'sex', 'marital_status',
    'Race recode (White, Black, Other)',
    'Race recode (with detailed Asian and Native Hawaiian other PI)',
    'Origin recode NHIA (Hispanic, Non-Hisp)',
    'rural_urban_continuum',

    # Tumor characteristics
    'primary_site', 'Schema ID (2018+)', 'ICD-O-3 Hist/behav',
    'clinical_grade', 'diagnostic_confirmation', 'tumor_size_summary',

    # Staging
    'eod_t',
    'eod_n',
    'eod_m',
    'eod_stage_group',
    'eod_primary_tumor',
    'eod_regional_nodes',
    'eod_mets',
    'n_sentinel_lymph_nodes',

    # Metastases
    'mets_at_bone',
    'mets_at_brain',
    'mets_at_liver',
    'mets_at_lung',
    'mets_at_dx_distand_ln',
    'mets_at_dx_other',

    # Biomarkers
    'E_R_binary',
    'pr_binary',
    'her2_binary',

    # Treatment
    'rx_summ_surg_prim_site',
    'rx_summ_scope_reg_ln_sur',
    'rx_summ_surg_oth_reg_dis',
    'rx_summ_surg_rad_seq',
    'reason_no_surgery',
    'radiation',
    'chemo_yes_no',
    'rx_summ_systemic_sur_seq',

    # Clinical history
    'first_malignant_tumor',

    # Data source
    'report_source',
]

# Numerical columns (handed to the scaler).
# 'age' is deliberately left out of this list (kept commented out below).
num_cols = [
    #'age',
    'days_from_diagnosis_to_treatment', 'median_household_income_adj_2023',
    'n_in_situ_malignant_tumors', 'n_benign_borderline_tumors',
]

In [5]:
# Hold out 20% of the rows as test set; the target is 'survival_months'.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='survival_months'),
    df['survival_months'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Quick sanity check: print the third row of the test features.
print(x_test.iloc[2])

# 1° regressione lineare

In [68]:
# Linear-regression baseline.
# Categorical columns are one-hot encoded (min_frequency=5 pools the rare levels);
# every other column is passed through unchanged.
encoding = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(
                sparse_output=False,
                min_frequency=5,
                handle_unknown='infrequent_if_exist',
            ),
            cat_cols,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

# with_mean=False: the scaler only rescales the features, it does not centre them.
pipe_rl = Pipeline(
    steps=[
        ('encoder', encoding),
        ('scaler', StandardScaler(with_mean=False)),
        ('linreg', LinearRegression()),
    ]
)

# Fit on the training split and show the MAE on the hold-out split.
y_test_pred = pipe_rl.fit(x_train, y_train).predict(x_test)
mean_absolute_error(y_test, y_test_pred)

6.35276298286698

In [None]:
# Residual plot for the linear regression.
# The plotting data lives in its own frame: writing the target, the predictions and
# the residuals into x_test would leave those columns in the feature frame that all
# later models predict on.
residuals_lr = x_test.assign(
    survival_months=y_test,
    predicted=y_test_pred,
    residuals=y_test - y_test_pred,
)

fig = px.scatter(
    residuals_lr,
    x="survival_months",
    y="residuals",
    hover_data=residuals_lr.columns,
)

# Reference line: a perfect prediction has residual 0.
fig.add_hline(y=0, line_color="red", line_dash="dash")

# Show the plot
fig.show()

# 2° random forest

In [None]:
# First attempt: a small random forest on one-hot encoded features.
encoder = ColumnTransformer(
    transformers=[
        (
            'encoder',
            OneHotEncoder(
                sparse_output=False,
                min_frequency=5,
                handle_unknown='infrequent_if_exist',
            ),
            cat_cols,
        ),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

regressor = RandomForestRegressor(n_estimators=20, max_depth=20, random_state=42)

pipe_rf = Pipeline(
    steps=[
        ('encoder', encoder),
        ('scaler', StandardScaler()),
        ('rf', regressor),
    ]
)

# Fit on the training split and print the MAE on the hold-out split.
y_test_pred = pipe_rf.fit(x_train, y_train).predict(x_test)
print(mean_absolute_error(y_test, y_test_pred))


6.258560916284831


In [None]:
# Exhaustive grid search over the random-forest pipeline.
# Keys follow the "<step>__<param>" convention of sklearn pipelines.
params = dict(
    rf__n_estimators=[50, 150, 300],
    rf__criterion=['squared_error', 'absolute_error'],
    encoder__encoder__min_frequency=[1, 7, 15],
    rf__max_depth=[10, 18, 26],
    rf__min_samples_split=[2, 7],
    rf__min_samples_leaf=[1, 3],
    rf__max_features=['sqrt'],
)

grid_search = GridSearchCV(
    estimator=pipe_rf,
    param_grid=params,
    # MAE is an error, so the scorer is negated (greater_is_better=False).
    scoring=make_scorer(mean_absolute_error, greater_is_better=False),
    # Number of fits run in parallel (-1 = all cores).
    n_jobs=-1,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    # Refit the best configuration on the whole training set once the search is done.
    refit=True,
    verbose=2,
)

# The refitted best model is used for the hold-out evaluation.
y_test_pred = grid_search.fit(x_train, y_train).predict(x_test)
print(mean_absolute_error(y_test, y_test_pred))

grid_search.best_params_

In [None]:
# Randomized search: the full grid has too many combinations to test exhaustively,
# so only n_iter randomly drawn configurations are evaluated.

params = {
    'rf__n_estimators': [50, 150, 300],
    'rf__criterion': ['squared_error', 'absolute_error'],  # 'friedman_mse' removed
    'encoder__encoder__min_frequency': [1, 7, 15],
    'rf__max_depth': [10, 18, 26],
    'rf__min_samples_split': [2, 7],
    'rf__min_samples_leaf': [1, 3],
    'rf__max_features': ['sqrt']
}

grid_search = RandomizedSearchCV(
    estimator=pipe_rf,
    n_iter=20,  # number of sampled configurations (required for a randomized search)
    param_distributions=params,  # the counterpart of GridSearchCV's param_grid
    scoring=make_scorer(mean_absolute_error, greater_is_better=False),
    n_jobs=-1,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    refit=True,
    # Seed the candidate sampling: without it every run draws different
    # configurations and the reported best parameters / MAE are not reproducible.
    random_state=42,
    verbose=4
)

grid_search.fit(x_train, y_train)

# Evaluate the refitted best model on the hold-out split.
y_test_pred = grid_search.predict(x_test)

print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("Best parameters:", grid_search.best_params_)


In [None]:
# Hyperparameter search for the random forest with Optuna.
# load_if_exists=True creates the study on first use and resumes it afterwards, so
# the cell works both on a fresh storage and on re-runs without toggling between
# create_study and load_study by hand.
study = optuna.create_study(
    storage="sqlite:///model_selection.db",
    study_name="rf02_02",
    direction="minimize",
    load_if_exists=True,
)


def objective_func(trial):
    """Return the 5-fold cross-validated MAE of `pipe_rf` for one trial.

    The trial picks the forest size/depth/split settings and the one-hot
    `min_frequency`; the values are set on the shared `pipe_rf` pipeline, which
    is then cross-validated on the training split. Lower is better.
    """
    params = {
        'rf__n_estimators': trial.suggest_categorical("rf__n_estimators", [150, 250, 400]),
        'encoder__encoder__min_frequency': trial.suggest_categorical('encoder__encoder__min_frequency', [7, 15]),
        'rf__max_depth': trial.suggest_categorical("rf__max_depth", [10, 18, 26]),
        'rf__min_samples_split': trial.suggest_categorical('rf__min_samples_split', [2, 7]),
        'rf__min_samples_leaf': trial.suggest_categorical('rf__min_samples_leaf', [1, 3]),
    }

    pipe_rf.set_params(**params)

    cross_v = cross_validate(
        pipe_rf,
        x_train,
        y_train,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=KFold(shuffle=True, random_state=42)
    )
    # The scorer returns negated MAE values; report the positive mean.
    return abs(cross_v['test_score'].mean())


n_trials = 10
with tqdm(total=n_trials) as pbar:
    def tqdm_callback(study, trial):
        # Advance the progress bar once per finished trial.
        pbar.update(1)

    study.optimize(objective_func, n_trials=n_trials, callbacks=[tqdm_callback])

# Refit the pipeline with the best configuration and evaluate it on the hold-out split.
best_params = study.best_params
pipe_rf.set_params(**best_params)
pipe_rf.fit(x_train, y_train)
y_test_pred = pipe_rf.predict(x_test)


mae = mean_absolute_error(y_test, y_test_pred)

print("Best parameters:", best_params)
print("MAE:", mae)

# Optuna's Study has no `visualize()` method; plots come from optuna.visualization.
vis.plot_optimization_history(study).show()

In [None]:
# Visualise the random-forest study: optimisation history, parameter importances
# and parallel-coordinate plot, in this order.
study = optuna.load_study(study_name="rf02_02", storage="sqlite:///model_selection.db")
for plot_fn in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_parallel_coordinate,
):
    plot_fn(study).show()

# 3° XGBoost

Scelgo di usare XGBoost non perché gli altri algoritmi non si siano mostrati validi, ma perché, usando DMatrix, è più veloce.

Supporta la GPU: l'ho testata, ma non è implementata in questo notebook.

In [None]:
# Model selection: each trial picks one categorical encoding (one-hot, binary or
# target) together with a set of XGBoost hyperparameters.
# load_if_exists=True lets the cell be re-run: without it create_study raises when
# a study with this name is already stored in the database.
study = optuna.create_study(
    storage="sqlite:///model_selection.db",
    study_name="xgb_opt_03_01",
    direction="minimize",
    load_if_exists=True,
)


def _suggest_encoder(trial):
    """Pick the categorical encoder for this trial, with its own hyperparameter if any."""
    model_choice = trial.suggest_categorical("model_choice", ["onehot", 'binary', 'target'])

    if model_choice == "onehot":
        min_freq = trial.suggest_int("encoder__encoder__min_frequency", 1, 40)
        return OneHotEncoder(
            sparse_output=False,
            handle_unknown="infrequent_if_exist",
            min_frequency=min_freq,
        )

    if model_choice == "target":
        smoothing = trial.suggest_categorical('encoder__encoder__smoothing', [0.1, 0.3, 1, 3, 10, 30, 100])
        return tg(target_type="continuous", random_state=42, smooth=smoothing)

    return BinaryEncoder(cols=cat_cols, return_df=True)


def _suggest_xgb_params(trial):
    """Sample the XGBoost hyperparameters (same search space for every encoder)."""
    return {
        'n_estimators': trial.suggest_categorical("xgb__n_estimators", [400, 500, 600, 700, 800]),
        'max_depth': trial.suggest_int("xgb__max_depth", 5, 25),
        'learning_rate': trial.suggest_float("xgb__learning_rate", 0.01, 0.08),
        # Fraction of rows sampled for each tree (adds randomness, limits overfitting).
        'subsample': trial.suggest_float("xgb__subsample", 0.6, 1.0),
        'colsample_bytree': trial.suggest_float("xgb__colsample_bytree", 0.4, 0.8),
        # Minimum sum of instance weights (hessians) required in a child node; higher
        # values stop nodes with little data from growing and increase the bias.
        'min_child_weight': trial.suggest_int("xgb__min_child_weight", 3, 10),
        'objective': trial.suggest_categorical("xgb__objective", ['reg:squarederror', 'reg:absoluteerror']),
    }


def objective(trial):
    """Return the 5-fold cross-validated MAE of an encoder + XGBoost pipeline.

    The encoder and the XGBoost hyperparameters are sampled from `trial`; the
    pipeline is cross-validated on the training split. Lower is better.
    """
    # Encoder parameters are suggested first, then the XGBoost ones.
    encoder = _suggest_encoder(trial)
    xgb_params = _suggest_xgb_params(trial)

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', StandardScaler(), num_cols)
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
        force_int_remainder_cols=False
    )

    xgb = XGBRegressor(
        **xgb_params,
        random_state=42,
        tree_method='hist',  # faster
        n_jobs=-1
    )

    pipe = Pipeline([
        ("encoder", preprocessor),
        ("regressor", xgb)
    ])

    cross_v = cross_validate(
        pipe,
        x_train,
        y_train,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=KFold(shuffle=True, random_state=42)
    )

    # The scorer returns negated MAE values; report the positive mean.
    return abs(cross_v['test_score'].mean())


# Run the optimisation
n_trials = 2
with tqdm(total=n_trials) as pbar:
    def tqdm_callback(study, trial):
        # Advance the progress bar once per finished trial.
        pbar.update(1)

    study.optimize(objective, n_trials=n_trials, callbacks=[tqdm_callback])

In [None]:
# Visualise the results of a stored XGBoost study.
# NOTE(review): this loads study "xgb_opt_01_01" from model_selection2.db, while the
# search cell above writes "xgb_opt_03_01" to model_selection.db - confirm that the
# older study is the one meant to be shown here.
study = optuna.load_study(study_name="xgb_opt_01_01", storage="sqlite:///model_selection2.db")

for plot_fn in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_parallel_coordinate,
):
    plot_fn(study).show()


In [None]:
# Model selection restricted to target encoding.
# create_study(..., load_if_exists=True) resumes the stored study when it exists and
# creates it otherwise; a plain load_study fails when the study has not been created.
study = optuna.create_study(
    storage="sqlite:///model_selection.db",
    study_name="xgb_opt_04_4",
    direction="minimize",
    load_if_exists=True,
)


def objective(trial):
    """Return the 5-fold cross-validated MAE of a target-encoding + XGBoost pipeline.

    The trial samples the encoder smoothing and the XGBoost hyperparameters;
    `subsample` and the loss (`reg:absoluteerror`) are fixed. Lower is better.
    """
    #min_samples_leaf= trial.suggest_int("encoder__encoder__min_samples_leaf", 1,15)
    smoothing = trial.suggest_categorical('encoder__encoder__smoothing', [3, 15])
    encoder = tg(
        target_type="continuous",
        random_state=42,
        #min_samples_leaf=min_samples_leaf,
        smooth=smoothing
    )

    n_estimators = trial.suggest_categorical("xgb__n_estimators", [1300, 1800, 2000, 2500, 3000])
    max_depth = trial.suggest_int("xgb__max_depth", 6, 15)
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.01, 0.03)
    colsample_bytree = trial.suggest_float("xgb__colsample_bytree", 0.6, 0.75)
    min_child_weight = trial.suggest_int("xgb__min_child_weight", 6, 7)

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', StandardScaler(), num_cols)
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
        force_int_remainder_cols=False
    )

    xgb = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=0.6005480544522415,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        objective='reg:absoluteerror',
        learning_rate=learning_rate,
        random_state=42,

        tree_method='hist',  # faster
        n_jobs=-1
    )

    pipe = Pipeline([
        ("encoder", preprocessor),
        ("xgb", xgb)
    ])

    cross_v = cross_validate(
        pipe,
        x_train,
        y_train,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=KFold(shuffle=True, random_state=42)
    )

    # The scorer returns negated MAE values; report the positive mean.
    return abs(cross_v['test_score'].mean())


n_trials = 200
with tqdm(total=n_trials) as pbar:
    def tqdm_callback(study, trial):
        # Advance the progress bar once per finished trial.
        pbar.update(1)

    study.optimize(objective, n_trials=n_trials, callbacks=[tqdm_callback])

print(study.best_params)

In [None]:
# After several Optuna studies plus naive and manual attempts, this is the model
# with what appear to be the best hyperparameters.
encoder = tg(smooth=3, target_type='continuous', random_state=42)

# Target-encode the categorical columns, standardise the numerical ones and pass
# every remaining column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', encoder, cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

xgb = XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=3000,
    learning_rate=0.01278,
    max_depth=7,
    min_child_weight=6,
    colsample_bytree=0.65884,
    tree_method='hist',  # faster
    random_state=42,
    n_jobs=-1,
)

pipe_xgb = Pipeline(
    steps=[
        ("encoder", preprocessor),
        ("xgb", xgb),
    ]
)

# Fit on the training split and show the MAE on the hold-out split.
y_test_pred = pipe_xgb.fit(x_train, y_train).predict(x_test)

mean_absolute_error(y_test, y_test_pred)
#5.764026641845703


In [None]:
# Residual analysis of the XGBoost model on the test set.
results = pd.DataFrame(
    {
        'actual_y': y_test,
        'predicted_y': y_test_pred,
    }
).assign(residuals=lambda frame: frame['actual_y'] - frame['predicted_y'])

fig = px.scatter(
    results,
    x="actual_y",
    y="residuals",
    hover_data=results.columns,
)
# Reference line: a perfect prediction has residual 0.
fig.add_hline(y=0, line_color="red", line_dash="dash")
fig.show()

In [None]:
# MAE on the training set (to compare with the test MAE and gauge overfitting).
# The predictions go into y_train_pred: storing them in y_test_pred would overwrite
# the test-set predictions with an array of training-set length.
y_train_pred = pipe_xgb.predict(x_train)
mean_absolute_error(y_train, y_train_pred)

5.48773193359375

# 4° Provo a correggere le previsioni con un modello per prevedere i residui.

L'obiettivo è prevedere il residuo (valore reale − previsione del primo modello) e sommarlo alla previsione vera e propria per correggerla.

In [87]:
# Predict on the training set, compute the residuals, build the dataset for the
# second-stage model and split it into train/test.

y_train_pred = pipe_xgb.predict(x_train)
train_residui = y_train - y_train_pred

# assign() returns a new frame. `df2 = x_train` would only alias x_train, so adding
# the 'residuals' column through df2 would also write this target-derived column
# into the feature frame used by the first-stage model.
df2 = x_train.assign(residuals=train_residui)
df2 = df2.sample(frac=1, random_state=42)

res_x_train, res_x_test, res_y_train, res_y_test = train_test_split(
    df2.drop(columns='residuals'),
    df2['residuals'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


In [None]:
# A model similar to the previous one is tuned with Optuna to predict the residuals.

# load_if_exists=True lets the cell be re-run: without it create_study raises when
# a study with this name is already stored in the database.
study = optuna.create_study(
    storage="sqlite:///model_selection.db",
    study_name="xgb_opt_05_2",
    direction="minimize",
    load_if_exists=True,
)
#study=optuna.load_study(storage="sqlite:///model_selection.db", study_name="xgb_opt_04_4")


def objective(trial):
    """Return the 5-fold cross-validated MAE of the residual model for one trial.

    The trial samples the target-encoder smoothing and the XGBoost
    hyperparameters; the pipeline is cross-validated on the residual training
    split (`res_x_train`, `res_y_train`). Lower is better.
    """
    #min_samples_leaf= trial.suggest_int("encoder__encoder__min_samples_leaf", 1,15)
    smoothing = trial.suggest_categorical('encoder__smoothing', [3, 5, 15])
    encoder = tg(
        target_type="continuous",
        random_state=42,
        #min_samples_leaf=min_samples_leaf,
        smooth=smoothing
    )

    n_estimators = trial.suggest_categorical("xgb__n_estimators", [1300, 1800, 2000, 2500, 3000])
    max_depth = trial.suggest_int("xgb__max_depth", 6, 15)
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.01, 0.03)
    colsample_bytree = trial.suggest_float("xgb__colsample_bytree", 0.2, 0.75)
    min_child_weight = trial.suggest_int("xgb__min_child_weight", 4, 8)
    subsample = trial.suggest_float("xgb__subsample", 0.6, 1.0)
    # Named xgb_objective so the local value does not shadow this function's own name.
    xgb_objective = trial.suggest_categorical('xgb__objective', ['reg:absoluteerror', 'reg:squarederror'])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', StandardScaler(), num_cols)
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
        force_int_remainder_cols=False
    )

    xgb = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        objective=xgb_objective,
        learning_rate=learning_rate,
        random_state=42,

        tree_method='hist',  # faster
        n_jobs=-1
    )

    pipe2 = Pipeline([
        ("preprocessor", preprocessor),
        ("xgb", xgb)
    ])

    cross_v = cross_validate(
        pipe2,
        res_x_train,
        res_y_train,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        cv=KFold(shuffle=True, random_state=42)
    )

    # The scorer returns negated MAE values; report the positive mean.
    return abs(cross_v['test_score'].mean())


n_trials = 200
with tqdm(total=n_trials) as pbar:
    def tqdm_callback(study, trial):
        # Advance the progress bar once per finished trial.
        pbar.update(1)

    study.optimize(objective, n_trials=n_trials, callbacks=[tqdm_callback])

print(study.best_params)

In [None]:
# Visualise the residual-model study: optimisation history, parameter importances
# and parallel-coordinate plot, in this order.
study = optuna.load_study(study_name="xgb_opt_05_2", storage="sqlite:///model_selection.db")
for plot_fn in (
    vis.plot_optimization_history,
    vis.plot_param_importances,
    vis.plot_parallel_coordinate,
):
    plot_fn(study).show()


In [None]:
# Second-stage model: full pipeline fitted with the best hyperparameters found.

# Residuals of the first-stage model on the training set.
y_train_pred = pipe_xgb.predict(x_train)
train_residui = y_train - y_train_pred

# Encoder and pipeline
encoder = tg(smooth=5, target_type='continuous', random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', encoder, cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

xgb = XGBRegressor(
    objective='reg:absoluteerror',
    n_estimators=2500,
    learning_rate=0.012078,
    max_depth=7,
    min_child_weight=6,
    colsample_bytree=0.65884,
    tree_method='hist',  # faster
    random_state=42,
    n_jobs=-1,
)

pipe2 = Pipeline(
    steps=[
        ("encoder", preprocessor),
        ("xgb", xgb),
    ]
)

# The residual model is trained on the residual training split only.
pipe2.fit(res_x_train, res_y_train)


In [106]:
# Corrected predictions on the test set and their MAE.

# First-stage predictions of the target.
y_test_pred = pipe_xgb.predict(x_test)

# Second-stage predictions of the residual (actual - predicted), added back on top.
y_residui_predetti = pipe2.predict(x_test)
test_final = y_test_pred + y_residui_predetti

mean_absolute_error(y_test, test_final)


5.749392509460449

In [None]:
# MAE of the corrected predictions on the training set.

# First-stage predictions on the training features.
y_train_pred = pipe_xgb.predict(x_train)

# Despite its name, this holds the training-set residuals (y_train - predictions).
test_residui = y_train - y_train_pred

# Second-stage residual predictions on the training features, added back on top.
y_residui_predetti = pipe2.predict(x_train)
final = y_train_pred + y_residui_predetti

mean_absolute_error(y_train, final)

5.422630786895752

In [None]:
# Try a linear regressor as the second-stage (residual) model.
encoder = tg(smooth=5, target_type='continuous', random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', encoder, cat_cols),
        ('num', StandardScaler(), num_cols),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

pipe_rl2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('linreg', LinearRegression()),
    ]
)

# Train on the residual training split.
pipe_rl2.fit(res_x_train, res_y_train)

# First-stage predictions of the target on the test set.
y_test_pred = pipe_xgb.predict(x_test)

# Add the residuals predicted by the linear model and show the resulting MAE.
piperl2_residui_predetti = pipe_rl2.predict(x_test)
test_final_lr = y_test_pred + piperl2_residui_predetti
mean_absolute_error(y_test, test_final_lr)

6.1274012720572175

In [None]:
# Residual plot after the correction with the second-stage model.

results = pd.DataFrame(
    {
        'actual_y': y_test,
        'predicted_y': test_final,
    }
).assign(residuals=lambda frame: frame['actual_y'] - frame['predicted_y'])

fig = px.scatter(
    results,
    x="actual_y",
    y="residuals",
    hover_data=results.columns,
)
# Reference line: a perfect prediction has residual 0.
fig.add_hline(y=0, line_color="red", line_dash="dash")
fig.show()

In [None]:
# True residuals vs residuals predicted by the second-stage model, on the TEST set.
# Both series are computed here. Reusing `test_residui` / `y_residui_predetti` from
# the train-MAE cell would plot training-set values (despite the names) and would
# make the figure depend on which cell ran last.
residui_veri_test = y_test - pipe_xgb.predict(x_test)
residui_predetti_test = pipe2.predict(x_test)

df_residui = pd.DataFrame({
    "residui_veri": residui_veri_test,
    "residui_predetti": residui_predetti_test
})

fig = px.scatter(df_residui, x="residui_veri", y="residui_predetti",
                 title="Residui veri vs residui predetti dal secondo modello",
                 labels={"residui_veri": "Residui veri", "residui_predetti": "Residui predetti"})

# Reference axes through the origin.
fig.add_hline(y=0, line_color="red", line_dash="dash")
fig.add_vline(x=0, line_color="red", line_dash="dash")

#fig.update_layout(width=700, height=500)
fig.show()


# naive features engineering 

In [None]:
# Feature-importance chart for the first-stage XGBoost model.

fitted_steps = pipe_xgb.named_steps
booster = fitted_steps['xgb']
# Output column names of the preprocessing step, paired below with the importances.
feature_names = fitted_steps['encoder'].get_feature_names_out()
importances = booster.feature_importances_

df_importance = pd.DataFrame(
    {
        'feature': feature_names,
        'importance': importances,
    }
)

# Ten most important features, most important first.
df_sort = df_importance.sort_values(by='importance', ascending=False).head(10)

# Interactive horizontal bar chart
fig = px.bar(
    df_sort,
    x='importance',
    y='feature',
    orientation='h',
    title="Top 10 Feature Importances",
)
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

print(df_sort)