# Build & Train

**Objetivo:**





## Setu Up

In [14]:
# ============================================================
# Unified Logging Function ‚Äî EXACT format matching results.txt
# ============================================================

import os
from datetime import datetime

def log_training_result(
    log_file,
    model_name,
    round_name,
    train_samples,
    test_samples,
    notes,
    metrics_str,
    model_file,
    preds_file,
    train_phase_counts="-",
    test_phase_counts="-",
    log_type="train"
):
    """
    Writes ONE clean tab-separated line to results.txt,
    following the exact format found in the user's example.
    """

    # Create header if file does not exist
    if not os.path.exists(log_file):
        with open(log_file, "w") as f:
            f.write(
                "timestamp\tmodel\tround\ttype\t"
                "train_samples\ttest_samples\t"
                "train_phase_counts\ttest_phase_counts\t"
                "notes\tmetrics\tmodel_file\tpreds_file\n"
            )

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    line = (
        f"{timestamp}\t"
        f"{model_name}\t"
        f"{round_name}\t"
        f"{log_type}\t"
        f"{train_samples}\t"
        f"{test_samples}\t"
        f"{train_phase_counts}\t"
        f"{test_phase_counts}\t"
        f"{notes}\t"
        f"{metrics_str}\t"
        f"{model_file}\t"
        f"{preds_file}\n"
    )

    with open(log_file, "a") as f:
        f.write(line)

    print(f"üìù Logged in {log_file}")

In [None]:
# ============================================================
# Global Parameters ‚Äî Model Training Notebook (XGBoost Baseline)
# ============================================================

import os

# -------------------------
# Base Project Folder
# -------------------------
BASE_DIR = "/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso"

# -------------------------
# Input Split Files (from Phase 6)
# -------------------------
SPLIT_DIR = os.path.join(BASE_DIR, "data", "splits", ROUND_NAME)

TRAIN_FILE = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_train.txt")
TEST_FILE  = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_test.txt")

# -------------------------
# Output Directories for Models + Results
# -------------------------
MODEL_DIR   = os.path.join(BASE_DIR, "model", ROUND_NAME)
RESULTS_DIR = os.path.join(BASE_DIR, "results")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# -------------------------
# Paths to save artifacts
# -------------------------
MODEL_OUT = os.path.join(MODEL_DIR, f"{MODEL_NAME}.json")
PREDS_OUT = os.path.join(MODEL_DIR, f"{MODEL_NAME}_preds.txt")

# -------------------------
# Central Experiment Log
# -------------------------
LOG_FILE = os.path.join(RESULTS_DIR, "results.txt")

# -------------------------
# Display Parameters
# -------------------------
print("üìå PARAMETERS LOADED FOR TRAINING NOTEBOOK")
print("-----------------------------------------")
print(f"ROUND_NAME  : {ROUND_NAME}")
print(f"MODEL_NAME  : {MODEL_NAME}")
print(f"NOTES       : {NOTES}\n")

print("üì• INPUT FILES:")
print(f"TRAIN_FILE  : {TRAIN_FILE}")
print(f"TEST_FILE   : {TEST_FILE}\n")

print("üíæ OUTPUT DIRECTORIES:")
print(f"MODEL_DIR   : {MODEL_DIR}")
print(f"RESULTS_DIR : {RESULTS_DIR}")
print(f"MODEL_OUT   : {MODEL_OUT}")
print(f"PREDS_OUT   : {PREDS_OUT}\n")

print(f"üìù LOG_FILE  : {LOG_FILE}")


üìå PARAMETERS LOADED FOR TRAINING NOTEBOOK
-----------------------------------------
ROUND_NAME  : round_01
MODEL_NAME  : xgboost_baseline
NOTES       : Baseline training ‚Äî No oversampling, all features

üì• INPUT FILES:
TRAIN_FILE  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_01/round_01_train.txt
TEST_FILE   : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_01/round_01_test.txt

üíæ OUTPUT DIRECTORIES:
MODEL_DIR   : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01
RESULTS_DIR : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/results
MODEL_OUT   : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/xgboost_baseline.json
PREDS_OUT   : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/xgboost_baseline_preds.txt

üìù LOG_FILE  : /Users/edmundobrown/Documen

## Load Train & Test Splits  
This cell loads the preprocessed dataset generated during the Feature Engineering stage.  
Using the paths defined in the global parameters, it:

- Loads the **train** and **test** split files generated in Phase 6  
- Extracts numerical features automatically  
- Separates `X_train`, `y_train`, `X_test`, `y_test`  
- Performs consistency checks (shape, missing values, expected columns)  
- Prints dataset summary to ensure everything is correct before training  

This is the first step of the training pipeline and prepares the data for the XGBoost model.



In [8]:
# ============================================================
# FASE 7 ‚Äî Load Train/Test Splits for Training
# ============================================================

import os
import pandas as pd
import numpy as np

print("üì• Loading train/test data for training...")
print(f"TRAIN_OUT: {TRAIN_FILE}")
print(f"TEST_OUT : {TEST_FILE}")

# ------------------------------------------------------------
# 1. Load DataFrames
# ------------------------------------------------------------
try:
    train_df = pd.read_csv(TRAIN_FILE)
    test_df  = pd.read_csv(TEST_FILE)
except Exception as e:
    raise RuntimeError(f"‚ùå Failed to load split files: {e}")

print("\n‚úÖ Loaded successfully!")
print(f"Train shape: {train_df.shape}")
print(f"Test shape : {test_df.shape}")

# ------------------------------------------------------------
# 2. Basic validation
# ------------------------------------------------------------
required_cols = ["hr_true"]

for col in required_cols:
    if col not in train_df.columns:
        raise ValueError(f"‚ùå Missing column in train_df: {col}")
    if col not in test_df.columns:
        raise ValueError(f"‚ùå Missing column in test_df: {col}")

# Check empty frames
if len(train_df) == 0 or len(test_df) == 0:
    raise ValueError("‚ùå Train or Test split is empty ‚Äî splitting process failed!")

# ------------------------------------------------------------
# 3. Build X/y matrices
# ------------------------------------------------------------
feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
if "hr_true" in feature_cols:
    feature_cols.remove("hr_true")

X_train = train_df[feature_cols].copy()
y_train = train_df["hr_true"].copy()

X_test  = test_df[feature_cols].copy()
y_test  = test_df["hr_true"].copy()

print("\nüîç Feature extraction complete:")
print(f" ‚Üí Number of features: {len(feature_cols)}")
print(f" ‚Üí Features: {feature_cols}")

# ------------------------------------------------------------
# 4. Sanity checks
# ------------------------------------------------------------
if X_train.isna().any().any() or X_test.isna().any().any():
    print("‚ö†Ô∏è Warning: Missing values detected ‚Äî consider imputing")

if np.isinf(X_train.values).any() or np.isinf(X_test.values).any():
    raise ValueError("‚ùå Infinite values detected in features!")

print("\nüìä FINAL SUMMARY:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_test : {y_test.shape}")

print("\n‚úÖ Train/Test successfully loaded and validated!")

üì• Loading train/test data for training...
TRAIN_OUT: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_01/round_01_train.txt
TEST_OUT : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_01/round_01_test.txt

‚úÖ Loaded successfully!
Train shape: (633, 33)
Test shape : (159, 33)

üîç Feature extraction complete:
 ‚Üí Number of features: 32
 ‚Üí Features: ['ppg_mean', 'ppg_std', 'ppg_min', 'ppg_max', 'ppg_range', 'imu_mean', 'imu_std', 'imu_p95', 'imu_energy', 'acc_rms', 'ppg_bp_low', 'ppg_bp_hr', 'ppg_bp_high', 'ppg_bp_hr_norm', 'ppg_f_dom', 'imu_bp_low', 'imu_bp_high', 'imu_jerk_mean', 'imu_jerk_std', 'coherence_ppg_imu', 'ppg_entropy', 'imu_entropy', 'sqi', 'fusion_ppg_imu', 'hr_candidate', 'phase_id', 'sqi_flag', 'motion_weight', 'hr_cand_weighted', 'ppg_hr_smooth', 'artifact_ratio', 'phase']

üìä FINAL SUMMARY:
X_train: (633, 32)
y_train: (633,)
X_test : (159, 32)
y_test : (159,)

‚úÖ Train/Test s

## Round_01 -  Baseline
Sess√£o de par√¢metros

‚úî Treino XGBoost

‚úî Salvamento de:
	‚Ä¢	modelo completo (.json)
	‚Ä¢	best iteration
	‚Ä¢	feature importance (.txt)
	‚Ä¢	predi√ß√µes (.txt)
	‚Ä¢	residuals (.txt)
	‚Ä¢	erro por faixa de HR (.txt)
	‚Ä¢	scatter true vs predicted (.txt CSV-style para NotebookLM)

‚úî Atualiza√ß√£o autom√°tica do results.tx

In [None]:
#

üìå TRAINING PARAMETERS
------------------------------
ROUND_NAME : round_01
MODEL_NAME : xgboost_baseline
MODEL_DIR  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01
RESULTS_DIR: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/results
MODEL_OUT  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/xgboost_baseline.json

üì¶ Data found and ready for training!
Train: (633, 32) | Test: (159, 32)

üöÄ Training XGBoost...

üìä Evaluating model...
MAE  = 3.636
RMSE = 5.728
R¬≤   = 0.550
Corr = 0.742
Best iteration: 178

üíæ Model saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/xgboost_baseline.json
üíæ Predictions saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/xgboost_baseline_preds.txt
üíæ Residuals saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/m

  err_stats = error_df.groupby("hr_bin")["err"].agg(["mean", "std", "count"])


In [16]:
# ============================================================
# FASE 7 ‚Äî Training (XGBoost) with Prefix Support
# ============================================================

import os
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

# -------------------------
# Training Parameters
# -------------------------
ROUND_NAME = "round_01"                 # same round as FE/Split
MODEL_NAME = "xgboost_baseline"         # name of the model
PREFIX     = "r01"                      # <<=== NEW: prefix for saved files
NOTES      = "Baseline training ‚Äî No oversampling, all features"

# -------------------------
# Directory structure
# -------------------------
BASE_DIR = "/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso"

SPLIT_DIR   = os.path.join(BASE_DIR, "data", "splits", ROUND_NAME)
MODEL_DIR   = os.path.join(BASE_DIR, "model", ROUND_NAME)
RESULTS_DIR = os.path.join(BASE_DIR, "results")

TRAIN_FILE = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_train.txt")
TEST_FILE  = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_test.txt")

LOG_FILE   = os.path.join(RESULTS_DIR, "results.txt")

# Output model artifacts (with PREFIX support)
MODEL_OUT  = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}.json")
PREDS_OUT  = os.path.join(MODEL_DIR, f"{PREFIX}_preds.txt")
RESID_OUT  = os.path.join(MODEL_DIR, f"{PREFIX}_residuals.txt")
FI_OUT     = os.path.join(MODEL_DIR, f"{PREFIX}_feature_importance.txt")
ERRBIN_OUT = os.path.join(MODEL_DIR, f"{PREFIX}_error_bins.txt")

os.makedirs(MODEL_DIR, exist_ok=True)

print("üìå TRAINING PARAMETERS")
print("------------------------------")
print(f"ROUND_NAME : {ROUND_NAME}")
print(f"MODEL_NAME : {MODEL_NAME}")
print(f"PREFIX     : {PREFIX}")
print(f"MODEL_DIR  : {MODEL_DIR}")
print(f"MODEL_OUT  : {MODEL_OUT}\n")

# ============================================================
# LOAD TRAIN/TEST SPLITS
# ============================================================

print("üì• Loading train/test split...")

train_df = pd.read_csv(TRAIN_FILE)
test_df  = pd.read_csv(TEST_FILE)

feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols.remove("hr_true")

X_train = train_df[feature_cols]
y_train = train_df["hr_true"]

X_test  = test_df[feature_cols]
y_test  = test_df["hr_true"]

print(f"Train: {X_train.shape} | Test: {X_test.shape}")

# ============================================================
# TRAIN MODEL
# ============================================================

print("\nüöÄ Training XGBoost...")

model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=800,
    learning_rate=0.02,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    early_stopping_rounds=30
)

model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
best_iter = model.best_iteration

# ============================================================
# EVALUATE
# ============================================================

print("\nüìä Evaluating model...")

y_pred = model.predict(X_test)
residuals = y_test - y_pred

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
corr, _ = pearsonr(y_test, y_pred)

print(f"MAE  = {mae:.3f}")
print(f"RMSE = {rmse:.3f}")
print(f"R¬≤   = {r2:.3f}")
print(f"Corr = {corr:.3f}")
print(f"Best iteration: {best_iter}")

# ============================================================
# SAVE MODEL + ARTIFACTS (NOW WITH PREFIX)
# ============================================================

model.save_model(MODEL_OUT)

pd.DataFrame({
    "phase": test_df["phase"],
    "hr_true": y_test,
    "hr_pred": y_pred
}).to_csv(PREDS_OUT, index=False)

pd.DataFrame({"residual": residuals}).to_csv(RESID_OUT, index=False)

# Feature importance
fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)
fi_df.to_csv(FI_OUT, index=False)

# Error bins
err_df = pd.DataFrame({
    "hr_true": y_test,
    "err": abs(residuals)
})
err_df["hr_bin"] = pd.cut(err_df["hr_true"], bins=5)
err_df.groupby("hr_bin")["err"].agg(["mean", "std", "count"]).to_csv(ERRBIN_OUT)

print(f"\nüíæ Model saved to: {MODEL_OUT}")
print(f"üíæ Predictions saved to: {PREDS_OUT}")
print(f"üíæ Residuals saved to: {RESID_OUT}")
print(f"üíæ Feature importance saved to: {FI_OUT}")
print(f"üíæ Error bins saved to: {ERRBIN_OUT}")

# ============================================================
# UPDATE RESULTS.TXT
# ============================================================

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
metrics_str = f"{mae:.4f},{rmse:.4f},{r2:.4f},{corr:.4f}"

log_line = (
    f"{timestamp}\t"
    f"{MODEL_NAME}\t"
    f"{ROUND_NAME}\t"
    f"train\t"
    f"{len(train_df)}\t{len(test_df)}\t"
    f"{NOTES}\t"
    f"{metrics_str}\t"
    f"{os.path.basename(MODEL_OUT)}\t"
    f"{os.path.basename(PREDS_OUT)}\n"
)

with open(LOG_FILE, "a") as f:
    f.write(log_line)

print("\nüìù Training entry appended to results.txt!")
print("üéâ Training pipeline complete!")

üìå TRAINING PARAMETERS
------------------------------
ROUND_NAME : round_01
MODEL_NAME : xgboost_baseline
PREFIX     : r01
MODEL_DIR  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01
MODEL_OUT  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/r01_xgboost_baseline.json

üì• Loading train/test split...
Train: (633, 32) | Test: (159, 32)

üöÄ Training XGBoost...

üìä Evaluating model...
MAE  = 3.671
RMSE = 5.742
R¬≤   = 0.548
Corr = 0.740
Best iteration: 182

üíæ Model saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/r01_xgboost_baseline.json
üíæ Predictions saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/r01_preds.txt
üíæ Residuals saved to: /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_01/r01_residuals.txt
üíæ Feature importance saved to: /Users/edmundobrown/Doc

  err_df.groupby("hr_bin")["err"].agg(["mean", "std", "count"]).to_csv(ERRBIN_OUT)


## Round_02 - com oversampling inteligente 

In [None]:
# ============================================================
# FASE 7 ‚Äî Training Notebook (Round 02 ‚Äì Oversampled Dataset)
# ============================================================

import os
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

# ============================================================
# PARAMETERS
# ============================================================

ROUND_NAME  = "round_02"
MODEL_NAME  = "xgboost_oversampling_v1"
PREFIX      = "r02"                         # <<< NOVO
NOTES       = "HR-aware SMOTE-like oversampling on feature space"

GRAVA_LOG   = True                          # <<< NOVO (default=True)

BASE_DIR = "/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso"

# ------------------------------------------------------------
# Split folder generated by oversampling code
# ------------------------------------------------------------
SPLIT_DIR = os.path.join(BASE_DIR, "data", "splits", ROUND_NAME)

TRAIN_FILE = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_train.txt")
TEST_FILE  = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_test.txt")

# ------------------------------------------------------------
# Model output directory
# ------------------------------------------------------------
MODEL_DIR = os.path.join(BASE_DIR, "model", ROUND_NAME)
os.makedirs(MODEL_DIR, exist_ok=True)

# ------------------------------------------------------------
# Outputs with PREFIX
# ------------------------------------------------------------
MODEL_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}.json")
PREDS_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_preds.txt")
RESID_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_residuals.txt")
FEATIMP_OUT    = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_feature_importance.txt")
ERRORBINS_OUT  = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_error_bins.txt")

# results.txt logger
RESULTS_FILE = os.path.join(BASE_DIR, "results", "results.txt")

print("üìå PARAMETERS:")
print(f"ROUND_NAME : {ROUND_NAME}")
print(f"MODEL_NAME : {MODEL_NAME}")
print(f"PREFIX     : {PREFIX}")
print(f"GRAVA_LOG  : {GRAVA_LOG}")
print(f"TRAIN_FILE : {TRAIN_FILE}")
print(f"TEST_FILE  : {TEST_FILE}")
print(f"MODEL_DIR  : {MODEL_DIR}")

# ============================================================
# LOGGER (optional)
# ============================================================

def log_train(
    log_path,
    round_name,
    model_name,
    train_count,
    test_count,
    notes,
    metrics,
    model_file,
    preds_file
):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    header = (
        "timestamp\tmodel\tround\ttype\ttrain_samples\t"
        "test_samples\tnotes\tmetrics\tmodel_file\tpreds_file\n"
    )

    if not os.path.exists(log_path):
        with open(log_path, "w") as f:
            f.write(header)

    line = (
        f"{timestamp}\t{model_name}\t{round_name}\ttrain\t"
        f"{train_count}\t{test_count}\t"
        f"{notes}\t{metrics}\t"
        f"{model_file}\t{preds_file}\n"
    )

    with open(log_path, "a") as f:
        f.write(line)

    print(f"üìù Logged training entry ‚Üí {log_path}")

# ============================================================
# LOAD DATA
# ============================================================

print("\nüì• Loading train/test data...")
train_df = pd.read_csv(TRAIN_FILE)
test_df  = pd.read_csv(TEST_FILE)

print(f"Train: {train_df.shape} | Test: {test_df.shape}")

feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols.remove("hr_true")

X_train = train_df[feature_cols]
y_train = train_df["hr_true"]

X_test  = test_df[feature_cols]
y_test  = test_df["hr_true"]

print(f"‚úî Using {len(feature_cols)} features")

# ============================================================
# MODEL TRAINING
# ============================================================

print("\nüöÄ Training model...")

model = XGBRegressor(
    objective="reg:squarederror",
    tree_method="auto",
    n_estimators=900,
    learning_rate=0.018,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    early_stopping_rounds=40,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

print("‚úî Model trained!")

# ============================================================
# EVALUATION
# ============================================================

print("\nüìä Evaluating model...")

y_pred = model.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
corr, _ = pearsonr(y_test, y_pred)

metrics_str = f"MAE={mae:.4f},RMSE={rmse:.4f},R2={r2:.4f},Corr={corr:.4f}"

print(f"MAE  = {mae:.3f}")
print(f"RMSE = {rmse:.3f}")
print(f"R¬≤   = {r2:.3f}")
print(f"Corr = {corr:.3f}")
print(f"Best iteration: {model.best_iteration}")

# ============================================================
# SAVE ARTIFACTS
# ============================================================

print("\nüíæ Saving model + artifacts...")

model.save_model(MODEL_OUT)

pd.DataFrame({
    "hr_true": y_test,
    "hr_pred": y_pred,
}).to_csv(PREDS_OUT, index=False)

pd.DataFrame({
    "hr_true": y_test,
    "hr_pred": y_pred,
    "residual": y_test - y_pred
}).to_csv(RESID_OUT, index=False)

pd.DataFrame({
    "feature": feature_cols,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False).to_csv(FEATIMP_OUT, index=False)

error_df = pd.DataFrame({
    "hr_true": y_test,
    "err": np.abs(y_test - y_pred)
})
error_bins = error_df.groupby(pd.cut(error_df["hr_true"], bins=6))["err"].agg(["mean", "std", "count"])
error_bins.to_csv(ERRORBINS_OUT)

print("‚úî Files saved:")
print(MODEL_OUT)
print(PREDS_OUT)
print(RESID_OUT)
print(FEATIMP_OUT)
print(ERRORBINS_OUT)

# ============================================================
# LOG RESULTS (OPTIONAL)
# ============================================================

if GRAVA_LOG:
    log_train(
        log_path=RESULTS_FILE,
        round_name=ROUND_NAME,
        model_name=f"{PREFIX}_{MODEL_NAME}",
        train_count=len(train_df),
        test_count=len(test_df),
        notes=NOTES,
        metrics=metrics_str,
        model_file=os.path.basename(MODEL_OUT),
        preds_file=os.path.basename(PREDS_OUT),
    )
else:
    print("üõë GRAVA_LOG=False ‚Üí results.txt NOT updated")

print("\nüéâ ROUND_02 TRAINING COMPLETE!")

üìå PARAMETERS:
ROUND_NAME : round_02
MODEL_NAME : xgboost_oversampling_v1
PREFIX     : r02
GRAVA_LOG  : True
TRAIN_FILE : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_02/round_02_train.txt
TEST_FILE  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_02/round_02_test.txt
MODEL_DIR  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_02

üì• Loading train/test data...
Train: (1147, 33) | Test: (287, 33)
‚úî Using 32 features

üöÄ Training model...
‚úî Model trained!

üìä Evaluating model...
MAE  = 2.680
RMSE = 4.917
R¬≤   = 0.858
Corr = 0.927
Best iteration: 897

üíæ Saving model + artifacts...
‚úî Files saved:
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_02/r02_xgboost_oversampling_v1.json
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_02/r02_xgboost_oversampling_v1_preds.txt
/Users

  error_bins = error_df.groupby(pd.cut(error_df["hr_true"], bins=6))["err"].agg(["mean", "std", "count"])


## Round_03 - Oversampling focado em hr_true > 115.5 BPM

In [23]:
# ============================================================
# TRAINING NOTEBOOK ‚Äî Round_03 (Extreme HR Refinement)
# ============================================================

import os
import pandas as pd
import numpy as np
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr

# ============================================================
# PARAMETERS
# ============================================================

ROUND_NAME  = "round_03"
MODEL_NAME  = "xgboost_extreme_hr_refinement"
PREFIX      = "r03"
NOTES       = (
    "Round_03 | Extreme HR refinement (>115.5 BPM) | "
    "Targeted oversampling x20 on real samples (feature-space)"
)

GRAVA_LOG   = True   # default=True

BASE_DIR = "/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso"

# ------------------------------------------------------------
# Split folder generated by oversampling code
# ------------------------------------------------------------
SPLIT_DIR = os.path.join(BASE_DIR, "data", "splits", ROUND_NAME)

TRAIN_FILE = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_train.txt")
TEST_FILE  = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_test.txt")

# ------------------------------------------------------------
# Model output directory
# ------------------------------------------------------------
MODEL_DIR = os.path.join(BASE_DIR, "model", ROUND_NAME)
os.makedirs(MODEL_DIR, exist_ok=True)

# ------------------------------------------------------------
# Outputs with PREFIX
# ------------------------------------------------------------
MODEL_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}.json")
PREDS_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_preds.txt")
RESID_OUT      = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_residuals.txt")
FEATIMP_OUT    = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_feature_importance.txt")
ERRORBINS_OUT  = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}_error_bins.txt")

# results.txt logger
RESULTS_FILE = os.path.join(BASE_DIR, "results", "results.txt")

print("üìå PARAMETERS:")
print(f"ROUND_NAME : {ROUND_NAME}")
print(f"MODEL_NAME : {MODEL_NAME}")
print(f"PREFIX     : {PREFIX}")
print(f"GRAVA_LOG  : {GRAVA_LOG}")
print(f"TRAIN_FILE : {TRAIN_FILE}")
print(f"TEST_FILE  : {TEST_FILE}")
print(f"MODEL_DIR  : {MODEL_DIR}")

# ============================================================
# LOGGER (optional)
# ============================================================

def log_train(
    log_path,
    round_name,
    model_name,
    train_count,
    test_count,
    notes,
    metrics,
    model_file,
    preds_file
):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    header = (
        "timestamp\tmodel\tround\ttype\ttrain_samples\t"
        "test_samples\tnotes\tmetrics\tmodel_file\tpreds_file\n"
    )

    if not os.path.exists(log_path):
        with open(log_path, "w") as f:
            f.write(header)

    line = (
        f"{timestamp}\t{model_name}\t{round_name}\ttrain\t"
        f"{train_count}\t{test_count}\t"
        f"{notes}\t{metrics}\t"
        f"{model_file}\t{preds_file}\n"
    )

    with open(log_path, "a") as f:
        f.write(line)

    print(f"üìù Logged training entry ‚Üí {log_path}")

# ============================================================
# LOAD DATA
# ============================================================

print("\nüì• Loading train/test data...")
train_df = pd.read_csv(TRAIN_FILE)
test_df  = pd.read_csv(TEST_FILE)

print(f"Train: {train_df.shape} | Test: {test_df.shape}")

feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols.remove("hr_true")

X_train = train_df[feature_cols]
y_train = train_df["hr_true"]

X_test  = test_df[feature_cols]
y_test  = test_df["hr_true"]

print(f"‚úî Using {len(feature_cols)} features")

# ============================================================
# MODEL TRAINING
# ============================================================

print("\nüöÄ Training model...")

model = XGBRegressor(
    objective="reg:squarederror",
    tree_method="auto",
    n_estimators=900,
    learning_rate=0.018,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    early_stopping_rounds=40,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

print("‚úî Model trained!")

# ============================================================
# EVALUATION
# ============================================================

print("\nüìä Evaluating model...")

y_pred = model.predict(X_test)

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)
corr, _ = pearsonr(y_test, y_pred)

metrics_str = f"MAE={mae:.4f},RMSE={rmse:.4f},R2={r2:.4f},Corr={corr:.4f}"

print(f"MAE  = {mae:.3f}")
print(f"RMSE = {rmse:.3f}")
print(f"R¬≤   = {r2:.3f}")
print(f"Corr = {corr:.3f}")
print(f"Best iteration: {model.best_iteration}")

# ============================================================
# SAVE ARTIFACTS
# ============================================================

print("\nüíæ Saving model + artifacts...")

model.save_model(MODEL_OUT)

pd.DataFrame({
    "hr_true": y_test,
    "hr_pred": y_pred,
}).to_csv(PREDS_OUT, index=False)

pd.DataFrame({
    "hr_true": y_test,
    "hr_pred": y_pred,
    "residual": y_test - y_pred
}).to_csv(RESID_OUT, index=False)

pd.DataFrame({
    "feature": feature_cols,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False).to_csv(FEATIMP_OUT, index=False)

error_df = pd.DataFrame({
    "hr_true": y_test,
    "err": np.abs(y_test - y_pred)
})
error_bins = error_df.groupby(
    pd.cut(error_df["hr_true"], bins=6)
)["err"].agg(["mean", "std", "count"])
error_bins.to_csv(ERRORBINS_OUT)

print("‚úî Files saved:")
print(MODEL_OUT)
print(PREDS_OUT)
print(RESID_OUT)
print(FEATIMP_OUT)
print(ERRORBINS_OUT)

# ============================================================
# LOG RESULTS (OPTIONAL)
# ============================================================

if GRAVA_LOG:
    log_train(
        log_path=RESULTS_FILE,
        round_name=ROUND_NAME,
        model_name=f"{PREFIX}_{MODEL_NAME}",
        train_count=len(train_df),
        test_count=len(test_df),
        notes=NOTES,
        metrics=metrics_str,
        model_file=os.path.basename(MODEL_OUT),
        preds_file=os.path.basename(PREDS_OUT),
    )
else:
    print("üõë GRAVA_LOG=False ‚Üí results.txt NOT updated")

print("\nüéâ ROUND_03 TRAINING COMPLETE!")

üìå PARAMETERS:
ROUND_NAME : round_03
MODEL_NAME : xgboost_extreme_hr_refinement
PREFIX     : r03
GRAVA_LOG  : True
TRAIN_FILE : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_03/round_03_train.txt
TEST_FILE  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_03/round_03_test.txt
MODEL_DIR  : /Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_03

üì• Loading train/test data...
Train: (1259, 33) | Test: (315, 33)
‚úî Using 32 features

üöÄ Training model...
‚úî Model trained!

üìä Evaluating model...
MAE  = 2.268
RMSE = 3.578
R¬≤   = 0.946
Corr = 0.973
Best iteration: 891

üíæ Saving model + artifacts...
‚úî Files saved:
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_03/r03_xgboost_extreme_hr_refinement.json
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_03/r03_xgboost_extreme_hr_refinemen

  error_bins = error_df.groupby(


## Round_04 - Bayesian Hyperparameter Tuning (XGBoost)

In [25]:
# ============================================================
# ROUND FINAL ‚Äî Bayesian Hyperparameter Optimization (XGBoost)
# Target: MAE minimization
# ============================================================

import os
import optuna
import pandas as pd
import numpy as np
from datetime import datetime

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

# ============================================================
# PARAMETERS
# ============================================================

ROUND_NAME   = "round_04"
PREFIX       = "r04"
MODEL_NAME   = "xgboost_bayesian_refinement"
N_TRIALS     = 40            # 30‚Äì50 √© um bom equil√≠brio
TIMEOUT_SEC  = None          # ou ex: 3600

BASE_DIR = "/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso"

SPLIT_DIR  = os.path.join(BASE_DIR, "data", "splits", ROUND_NAME)
MODEL_DIR  = os.path.join(BASE_DIR, "model", ROUND_NAME)
RESULTS_DIR = os.path.join(BASE_DIR, "results", ROUND_NAME)

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

TRAIN_FILE = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_train.txt")
TEST_FILE  = os.path.join(SPLIT_DIR, f"{ROUND_NAME}_test.txt")

BEST_MODEL_OUT = os.path.join(MODEL_DIR, f"{PREFIX}_{MODEL_NAME}.json")
STUDY_OUT      = os.path.join(RESULTS_DIR, f"{PREFIX}_{MODEL_NAME}_optuna_trials.csv")
REPORT_OUT     = os.path.join(RESULTS_DIR, f"{PREFIX}_{MODEL_NAME}_bayesian_report.txt")

# ============================================================
# LOAD DATA
# ============================================================

train_df = pd.read_csv('/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_03/round_03_train.txt')
test_df  = pd.read_csv('/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/data/splits/round_03/round_03_test.txt')

feature_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols.remove("hr_true")

X_train = train_df[feature_cols]
y_train = train_df["hr_true"]

X_test  = test_df[feature_cols]
y_test  = test_df["hr_true"]

print(f"üì• Data loaded ‚Äî Train: {X_train.shape}, Test: {X_test.shape}")

# ============================================================
# OBJECTIVE FUNCTION
# ============================================================

def objective(trial):

    params = {
        "objective": "reg:squarederror",
        "tree_method": "auto",
        "n_estimators": trial.suggest_int("n_estimators", 400, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "min_child_weight": trial.suggest_float("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0),
        "random_state": 42,
        "early_stopping_rounds": 40,
    }

    model = XGBRegressor(**params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=False
    )

    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)

    return mae

# ============================================================
# RUN OPTUNA STUDY
# ============================================================

print("\nüöÄ Starting Bayesian Optimization...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT_SEC)

print("\n‚úÖ Optimization finished!")
print("Best MAE:", study.best_value)
print("Best params:", study.best_params)

# ============================================================
# TRAIN FINAL MODEL WITH BEST PARAMS
# ============================================================

best_params = study.best_params
best_params.update({
    "objective": "reg:squarederror",
    "tree_method": "auto",
    "random_state": 42,
})

final_model = XGBRegressor(**best_params)

final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

final_preds = final_model.predict(X_test)

mae  = mean_absolute_error(y_test, final_preds)
corr, _ = pearsonr(y_test, final_preds)

print(f"\nüéØ FINAL METRICS")
print(f"MAE  : {mae:.4f}")
print(f"Corr : {corr:.4f}")

final_model.save_model(BEST_MODEL_OUT)

# ============================================================
# SAVE STUDY + REPORT
# ============================================================

study_df = study.trials_dataframe()
study_df.to_csv(STUDY_OUT, index=False)

with open(REPORT_OUT, "w") as f:
    f.write("=" * 70 + "\n")
    f.write("BAYESIAN OPTIMIZATION REPORT ‚Äî REPOUSO\n")
    f.write("=" * 70 + "\n\n")

    f.write(f"Round      : {ROUND_NAME}\n")
    f.write(f"Model      : {PREFIX}_{MODEL_NAME}\n")
    f.write(f"Timestamp  : {datetime.now()}\n")
    f.write(f"Trials     : {len(study_df)}\n\n")

    f.write("BEST METRICS\n")
    f.write("-" * 30 + "\n")
    f.write(f"MAE  : {mae:.4f}\n")
    f.write(f"Corr : {corr:.4f}\n\n")

    f.write("BEST PARAMETERS\n")
    f.write("-" * 30 + "\n")
    for k, v in study.best_params.items():
        f.write(f"{k}: {v}\n")

print("\nüíæ Bayesian artifacts saved:")
print(BEST_MODEL_OUT)
print(STUDY_OUT)
print(REPORT_OUT)

print("\nüéâ BAYESIAN OPTIMIZATION COMPLETED ‚Äî REPOUSO CLOSED")

[I 2025-12-13 11:32:05,794] A new study created in memory with name: no-name-16a47a65-e309-451d-9470-f3cbcfa24533


üì• Data loaded ‚Äî Train: (1259, 32), Test: (315, 32)

üöÄ Starting Bayesian Optimization...


[I 2025-12-13 11:32:07,995] Trial 0 finished with value: 2.504495144659716 and parameters: {'n_estimators': 659, 'learning_rate': 0.006354707782955294, 'max_depth': 8, 'min_child_weight': 4.706921622905245, 'subsample': 0.7373059414325992, 'colsample_bytree': 0.941954404282085, 'gamma': 0.6985926897699268, 'reg_alpha': 0.9725143356181946, 'reg_lambda': 1.792011745440565}. Best is trial 0 with value: 2.504495144659716.
[I 2025-12-13 11:32:09,119] Trial 1 finished with value: 2.627502387992136 and parameters: {'n_estimators': 465, 'learning_rate': 0.012966322695226868, 'max_depth': 6, 'min_child_weight': 7.6462246457166625, 'subsample': 0.717044781771506, 'colsample_bytree': 0.9720752092677275, 'gamma': 1.7264452814189368, 'reg_alpha': 0.6782853645797952, 'reg_lambda': 2.3770668968686834}. Best is trial 0 with value: 2.504495144659716.
[I 2025-12-13 11:32:09,717] Trial 2 finished with value: 2.446602166952619 and parameters: {'n_estimators': 427, 'learning_rate': 0.04950564861367119, 'ma


‚úÖ Optimization finished!
Best MAE: 2.091900943959911
Best params: {'n_estimators': 437, 'learning_rate': 0.03131704699699944, 'max_depth': 8, 'min_child_weight': 2.797902463027109, 'subsample': 0.9076917159874688, 'colsample_bytree': 0.7007692990514951, 'gamma': 0.24740913683859356, 'reg_alpha': 0.5850549869029064, 'reg_lambda': 0.9054869160845842}

üéØ FINAL METRICS
MAE  : 2.0907
Corr : 0.9760

üíæ Bayesian artifacts saved:
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/model/round_04/r04_xgboost_bayesian_refinement.json
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/results/round_04/r04_xgboost_bayesian_refinement_optuna_trials.csv
/Users/edmundobrown/Documents/MLGeral/AI-HealthCare/HREstimation/repouso/results/round_04/r04_xgboost_bayesian_refinement_bayesian_report.txt

üéâ BAYESIAN OPTIMIZATION COMPLETED ‚Äî REPOUSO CLOSED
