<a href="https://colab.research.google.com/github/dmdiegoar/Quant-code-t0/blob/main/XGB_Sinsentido.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

COMIENZA EL UMBRAL

**apruebas**

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from google.colab import files
from scipy import stats
import time
from imblearn.over_sampling import SMOTE # Import SMOTE
import matplotlib.pyplot as plt # Import for plotting (optional for this step)
import seaborn as sns # Import for plotting (optional for this step)
import os # Import os for creating directories


# --- Control Flag ---
run_full_model = True # Set to True to run model training, tuning, and prediction; Set to False to only run data/feature analysis
debug_mode = False
# --- End Control Flag ---


# Functions for features
def add_lagged_price_features(df, etiqueta="close_lag", dato="Close"):
    for lag in range(1, 6):
        df[f'{etiqueta}_{lag}'] = df[dato].shift(lag)
    return df

def calculate_RSI(series, period=7):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_ROC(series, period=5):
    return ((series - series.shift(period)) / series.shift(period)) * 100

def calculate_PPO(series, fast_period=5, slow_period=9, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ppo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ppo.ewm(span=signal_period, adjust=False).mean()
    histogram = ppo - signal_line
    return ppo, signal_line, histogram

def calculate_EWO(series, fast_period=5, slow_period=35, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ewo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ewo.ewm(span=signal_period, adjust=False).mean()
    histogram = ewo - signal_line
    return ewo, signal_line, histogram

def calculate_volatility(series, window=20):
    return series.rolling(window).std().round(6)

def calculate_sma5(series, period=5):
    return series.rolling(window=period).mean().round(4)

def calculate_sma13(series, period=13):
    return series.rolling(window=period).mean().round(4)

def calculate_sma26(series, period=26):
    return series.rolling(window=period).mean().round(4)

def calculate_sma50(series, period=50):
    return series.rolling(window=period).mean().round(4)

def calculate_sma200(series, period=200):
    return series.rolling(window=period).mean().round(4)

def calculate_cyclical_price_position(df, price_col, window_size, prefix):
    """Calculates cyclical encoding of price position within a rolling window."""
    # Calculate rolling max and min based on the specific price column
    rolling_max = df[price_col].rolling(window=window_size, min_periods=1).max()
    rolling_min = df[price_col].rolling(window=window_size, min_periods=1).min()

    # Calculate normalized position (handle division by zero if max == min)
    price_range = rolling_max - rolling_min
    # Use epsilon to avoid division by zero, but also consider the case where max == min
    epsilon = 1e-9
    # Add a small value to the denominator only if price_range is effectively zero
    denominator = price_range + np.where(price_range < epsilon, epsilon, 0)
    normalized_price = (df[price_col] - rolling_min) / denominator


    # Apply cyclical transformation
    df[f'{prefix}_{price_col.lower()}_pos_{window_size}d_sin'] = np.sin(2 * np.pi * normalized_price)
    df[f'{prefix}_{price_col.lower()}_pos_{window_size}d_cos'] = np.cos(2 * np.pi * normalized_price)

    return df


def create_features(df, umbral, n_days_high=1):
    df = add_lagged_price_features(df, "close_lag", "Close")
    df = add_lagged_price_features(df, "open_lag", "Open")
    df = add_lagged_price_features(df, "high_lag", "High")
    df['Pct_change'] = df['Close'].pct_change()
    for lag in range(1, 6):
        df[f'lag_change{lag}'] = df['Pct_change'].shift(lag)
    df['RSI'] = calculate_RSI(df['Close'])
    df['ROC'] = calculate_ROC(df['Close'])
    df['PPO'], df['PPO_Signal'], df['PPO_Histogram'] = calculate_PPO(df['Close'])
    df['EWO'], df['EWO_Signal'], df['EWO_Histogram'] = calculate_EWO(df['Close'])
    df['SMA5'] = calculate_sma5(df['Close'])
    df['SMA13'] = calculate_sma13(df['Close'])
    df['SMA26'] = calculate_sma26(df['Close'])
    df['SMA50'] = calculate_sma50(df['Close'])
    df['SMA200'] = calculate_sma200(df['Close'])
    df['Volatility'] = calculate_volatility(df['Close'])

    # --- New Feature: Max Gain from Open over Past N Days ---
    # Calculate the maximum High price over the *next N days* for *each historical day*.
    # Use rolling().max() with min_periods=1 to handle ends of series.
    # Then shift to align with the start of the N-day window (the current day's Open).
    max_high_over_next_n_days_hist = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)


    # Calculate the potential max gain from Open for *each historical day*
    # Using the Open price of that historical day
    epsilon = 1e-9 # To prevent division by zero
    df['Max_Gain_from_Open_Current'] = (max_high_over_next_n_days_hist - df['Open']) / (df['Open'] + epsilon)

    # --- Add lagged versions of the new feature ---
    for lag in range(1, 7): # Create lags from 1 to 6
        df[f'Max_Gain_from_Open_Lag_{lag}'] = df['Max_Gain_from_Open_Current'].shift(lag)

    # --- Add Cyclical Features for Day of Week ---
    # Day of week: Monday=0, Sunday=6
    df['day_of_week'] = df.index.dayofweek
    # Apply sine and cosine transformations
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    # Drop the original day_of_week column
    df.drop('day_of_week', axis=1, inplace=True)

    # --- Add Cyclical Features for Price Position within Rolling Ranges ---
    window_sizes = [5, 20, 252] # 5d, 20d, approx 52 weeks
    price_cols = ['Open', 'High', 'Low', 'Close']

    for window_size in window_sizes:
        for price_col in price_cols:
             df = calculate_cyclical_price_position(df, price_col, window_size, prefix='price')


    # Calculate the target based on tomorrow's Open vs Max High over next n_days_high days
    # Use rolling().max() with min_periods=1 for the target as well.
    # Shift to align with the start of the N-day window for the target (tomorrow's Open).
    max_high_next_n_days_target = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)
    open_next_day = df['Open'].shift(-1)
    df['Label_raw'] = ((max_high_next_n_days_target - open_next_day) / (open_next_day + epsilon) > umbral).astype(int)
    df['Label'] = df['Label_raw'].shift(-1) # Target for the next day



    original_index = df.index


    return df

# Definir fecha de corte manualmente (cambiar diariamente)
end_date = dt.datetime(2025, 9, 14)  # Ejemplo: cambiar a 2025-07-18 mañana

tk =[ "ALUA.BA", "BBAR.BA", "BMA.BA", "COME.BA", "CRES.BA", "EDN.BA", "GGAL.BA", "IRSA.BA", "LOMA.BA", "METR.BA", "PAMP.BA", "SUPV.BA", "TECO2.BA", "TGNO4.BA", "TGSU2.BA", "TRAN.BA", "TXAR.BA", "VALO.BA", "YPFD.BA"]


# results = [] # el del test, para que no lo reinicie - REMOVED
resultsp = [] # las predicciones, para que no lo reinicie

umbral = 0.019
lapso = 1 # Lapso is no longer directly used for the target definition, but keeping it doesn't hurt
n_days_high_target = 1 # Define the number of days for the High target (used for both target and new feature)

# Define clipping bounds - adjust based on feature distributions
lower_bound = -1e9
upper_bound = 1e9


# Select features - Add the new feature and its lags
features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
          [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)] + \
          ['Max_Gain_from_Open_Current'] + [f'Max_Gain_from_Open_Lag_{i}' for i in range(1, 7)] + \
          ['day_of_week_sin', 'day_of_week_cos']

# Add all the new cyclical price position features
window_sizes = [5, 20, 252]
price_cols = ['Open', 'High', 'Low', 'Close']
for window_size in window_sizes:
    for price_col in price_cols:
        features.append(f'price_{price_col.lower()}_pos_{window_size}d_sin')
        features.append(f'price_{price_col.lower()}_pos_{window_size}d_cos')


for papel in tk:

  symbol=papel
  #symbol="COME.BA"
  # Fechas dinámicas
  start_date = dt.datetime(2001, 1, 1)  # Inicio fijo
  train_end = end_date - pd.Timedelta(days=780)  # 6 meses antes de end_date (ajustable)
  next_day = end_date + pd.Timedelta(days=1)  # Predicción para el día siguiente


  # Download data for the current ticker inside the loop
  print(f"\nDownloading data for {symbol}...")
  # Download data up to the day BEFORE the prediction date (end_date)
  # Corrected: Download data up to end_date. The last row of this data will be end_date.
  df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

  # Verify data download
  if df.empty:
      print(f"Warning: No data downloaded for {symbol}. Skipping.")
      continue # Skip to the next ticker


  # Handle MultiIndex columns and ensure standard column names - More robust logic
  required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
  processed_df = None # Initialize processed_df

  if isinstance(df.columns, pd.MultiIndex):
      print(f"MultiIndex columns detected for {symbol}.")
      try:
          # Attempt to extract columns by looking for standard names in ANY level of the MultiIndex tuple
          extracted_data = {}
          for std_name in required_cols:
              matching_col_tuple = None
              # Iterate through all column tuples
              for col_tuple in df.columns:
                  # Check if the standard name exists in ANY level of the current tuple
                  if std_name in col_tuple:
                       matching_col_tuple = col_tuple
                       break # Found a match in this tuple

              if matching_col_tuple:
                  extracted_data[std_name] = df[matching_col_tuple]
              else:
                  print(f"Warning: Could not find standard column '{std_name}' in any level of MultiIndex for {symbol}. Column missing.")
                  # Continue to look for other required columns, processed_df will be checked later

          if len(extracted_data) == len(required_cols):
              processed_df = pd.DataFrame(extracted_data)
              processed_df.index = df.index # Preserve original index
              print(f"Successfully extracted and flattened MultiIndex columns for {symbol}.")
          else:
              missing_cols = [name for name in required_cols if name not in extracted_data]
              print(f"Warning: Could not extract all required columns from MultiIndex for {symbol}. Missing: {missing_cols}. Skipping ticker.")
              continue # Skip to the next ticker

      except Exception as e:
          print(f"Warning: An error occurred while processing MultiIndex columns for {symbol}: {e}. Skipping.")
          #print(f"Original columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  else: # If not MultiIndex columns, assume standard flat DataFrame is already present
      print(f"No MultiIndex columns detected for {symbol}. Checking for standard columns.")
      # Check if the required columns are directly present
      if all(col in df.columns for col in required_cols):
          processed_df = df[required_cols].copy() # Select required columns and make a copy
          print(f"Using existing standard columns for {symbol}.")
      else:
          missing_cols = [col for col in required_cols if col not in df.columns]
          print(f"Warning: Required standard columns not found in flat DataFrame for {symbol}. Missing: {missing_cols}. Skipping ticker.")
          #print(f"Available columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  # Ensure df is set to processed_df if successful
  df = processed_df

  # Handle MultiIndex index if present (less common with single ticker download but possible)
  if isinstance(df.index, pd.MultiIndex):
      print(f"MultiIndex index detected for {symbol}. Attempting to flatten index.")
      try:
          # Assuming the MultiIndex index structure is ('Ticker', 'Date')
          if 'Ticker' in df.index.names:
               df = df.xs(symbol, level='Ticker', axis=0)
               print(f"Índice aplanado para {symbol}.")
          else:
               print(f"Warning: MultiIndex index detected for {symbol} but 'Ticker' level not found. Skipping index flattening.")
               # If 'Ticker' level is not there, maybe it's just a date/time MultiIndex?
               # Or a different structure. For now, proceed without flattening index if Ticker level is missing.


      except KeyError:
          print(f"Warning: Could not select ticker from MultiIndex index for {symbol}. Skipping.")
          continue # Skip to the next ticker
      except Exception as e:
          print(f"Warning: An error occurred while flattening MultiIndex index for {symbol}: {e}. Skipping.")
          continue # Skip to the next ticker


  df.index = pd.to_datetime(df.index)
  if not df.index.is_unique:
      print(f"Advertencia: Índice con fechas duplicadas para {symbol}. Eliminando duplicados...")
      df = df[~df.index.duplicated(keep='first')]

  if df.empty:
      print(f"Warning: DataFrame is empty after initial processing and cleaning for {symbol}. Skipping.")
      continue


  # Ensure numeric types and handle potential non-numeric data
  for col in required_cols:
      if col in df.columns: # Ensure column exists before processing
          df[col] = pd.to_numeric(df[col], errors='coerce')
      else:
           # This should ideally not happen if previous checks passed, but as a safeguard:
           print(f"Error: Required column '{col}' not found in df for {symbol} before numeric conversion. Skipping ticker.")
           df = pd.DataFrame() # Set df to empty to skip further processing
           break # Exit column processing loop


  if df.empty: # Check again if df became empty due to missing columns
       continue # Skip to the next ticker

  # Drop rows where essential price data is missing after coercion
  df.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)


  if df.empty:
      print(f"Warning: DataFrame is empty after dropping rows with missing price data for {symbol}. Skipping.")
      continue


  df['Open']= df['Open'].round(2)
  df['High']= df['High'].round(2)
  df['Low']= df['Low'].round(2)
  df['Close']= df['Close'].round(2)
  df['Adj Close']= df['Adj Close'].round(2)

  print("Últimas filas del DataFrame antes de crear features:")
  print(df.tail())


  # Crear features with the new target definition
  df = create_features(df, umbral=umbral, n_days_high=n_days_high_target) # Pass n_days_high_target, removed lapso

  # Verify data is not empty after feature creation and dropna
  if df.empty:
      print(f"Warning: DataFrame is empty after feature creation and dropna for {symbol}. Skipping.")
      continue





  # The last row of the DataFrame is used for the final prediction
  # This row contains features calculated using data up to the last available date (end_date)
  # Corrected: Ensure X_predict_latest is taken from the *end* of the DataFrame AFTER feature creation and dropna
  X_predict_latest = df[features].iloc[-1:] # Get the very last row for prediction

  print(X_predict_latest)
  df.dropna(inplace=True)
  # Ensure y_train_full and y_test are aligned with the DataFrame index after dropping NaNs
  # This alignment will be done later when splitting the data
# Verify data after creating features
  print(f"\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  print(df.columns)

  time.sleep(2)

  # --- Add Historical Target Distribution Check ---
  print(f"\nOverall historical target distribution for {symbol} (before train/test split):")
  historical_target_distribution = df["Label"].value_counts(normalize=True)
  print(historical_target_distribution)




  if debug_mode:

    print(f"\nDEBUG: Reached feature distribution table analysis section for {symbol}.")
    print(f"Generating feature distribution analysis table for {symbol} by target class...")

    # Use the full historical data (after feature creation and dropna) for this analysis
    analysis_df = df[features + ['Label']].copy() # Use all features for table analysis

    if analysis_df.empty:
        print(f"Warning: DataFrame is empty for feature distribution analysis for {symbol}.")
    elif 'Label' not in analysis_df.columns:
        print(f"Error: 'Label' column not found in DataFrame for feature distribution analysis for {symbol}.")
    elif len(analysis_df['Label'].unique()) < 2:
        print(f"Warning: Only one class exists in 'Label' for feature distribution analysis for {symbol}. Cannot group by class.")
        # Even if only one class, we can still show the describe table for that class
        print(f"\nFeature Distribution Analysis Table for {symbol} (Only one class):")
        display(analysis_df.describe().transpose()) # Show describe for the single class
    else: # This else should be aligned with the if/elif above
        # Group by Label and calculate descriptive statistics for all features
        feature_distribution_table = analysis_df.groupby('Label').describe().transpose()

        print(f"\nFeature Distribution Analysis Table for {symbol} by Target Class:")
        display(feature_distribution_table) # Use display for better formatting

    print(f"Finished generating feature distribution analysis table for {symbol}.") # This print should be aligned with the if/elif/else block
    # --- End Feature Distribution Analysis Table ---


  # --- Start of block for full model run (training, tuning, prediction, results table) ---
  if run_full_model:
      # Dividir datos en entrenamiento y prueba (moved inside the if run_full_model block)
      # X and y now include the latest data up to end_date + 1 day before splitting
      X = df[features]
      y = df['Label']

      # Train data is up to train_end (exclusive of train_end + 1)
      X_train_full = X[df.index <= train_end]
      y_train_full = y[df.index <= train_end]

      # Test data is from train_end + 1 day up to end_date (inclusive)
      X_test = X[(df.index > train_end) & (df.index <= end_date)]
      y_test = y[(df.index > train_end) & (df.index <= end_date)]




      # Check if original training data is sufficient BEFORE proceeding
      if not X_train_full.empty and not y_train_full.empty and len(y_train_full.unique()) > 1:

          # Ensure all features are present in X_train_full before calculating correlation or training
          missing_features_train = [f for f in features if f not in X_train_full.columns]
          if missing_features_train:
              print(f"ERROR: Missing features in training data for {symbol}: {missing_features_train}. Skipping model training and prediction.")
              # Append entry indicating skip due to missing features
              last_data_date = df.index[-1] if not df.empty else None
              last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
              resultsp.append({
                   'Papel': symbol,
                   'Fecha Predicción': next_day,
                   'Fecha Datos': last_data_date,
                   'Predicción': f'Skipped (Missing Features in Train Data: {", ".join(missing_features_train)})',
                   'Precio actual': last_close,
                   'Probabilidad Alcista (Modelo)': None,
                   'Umbral de Clasificación': None,
                   'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Missing Features in Train Data)',
                   'Precision Test (Alcista)': None,
                   'Recall Test (Alcista)': None,
                   'F1 Test (Alcista)': None,
                   'ROC-AUC Test': None,
                   'clase 1 en test (cleaned)': None,
                   'Features Limpias (Predicción)': None,
                   'Features Escaladas (Predicción)': None
              })
              continue # Skip to the next ticker


          correlation = df[features + ["Label"]].corr()["Label"].sort_values(ascending=False)
          print(f"Correlacion con label para {symbol}:")
          print(correlation)

          # Initialize test metrics before evaluation (only if model runs)
          precision_test_alcista = None
          recall_test_alcista = None
          f1_test_alcista = None
          roc_auc_test = None
          ratio_1_test = None
          best_model = None # Initialize best_model to None
          best_threshold = 0.5 # Initialize best_threshold to default


          # Optimizar hiperparámetros con RandomizedSearchCV
          print(f"Optimizar hiperparámetros con RandomizedSearchCV para {symbol}")
          param_dist = {
              'learning_rate': [0.01, 0.05, 0.1, 0.2],
              'max_depth': [3, 5, 7, 9],
              'n_estimators': [100, 500, 900],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'gamma': [0, 0.1, 0.2],
              'scale_pos_weight': [0.5, 1, 2, 5, 10, 20, 50, 100] # Incluir scale_pos_weight en la búsqueda
          }

          # Inicializar el clasificador XGBoost sin scale_pos_weight fijo (se tuneará)
          xgb = XGBClassifier(objective='binary:logistic', random_state=42)

          # Usar TimeSeriesSplit para cross-validation
          n_splits = 5  # Puedes ajustar el número de splits
          tscv = TimeSeriesSplit(n_splits=n_splits)

          # Definir scorer para maximizar Precision de la Clase 1 (for RandomizedSearchCV)
          precision_scorer = make_scorer(precision_score, pos_label=1, zero_division=0) # zero_division=0 para manejar casos sin predicciones positivas

          # Clean X_train_full and y_train_full before fitting RandomizedSearchCV and SMOTE
          X_train_full_cleaned_for_tuning = X_train_full.replace([np.inf, -np.inf], np.nan)
          X_train_full_cleaned_for_tuning.dropna(inplace=True)
          y_train_full_cleaned_for_tuning = y_train_full.loc[X_train_full_cleaned_for_tuning.index] # Ensure y matches cleaned X

          # Explicit check, conversion, and fallback for non-finite values before fitting RandomizedSearchCV
          X_train_full_cleaned_for_tuning = X_train_full_cleaned_for_tuning.astype(np.float64) # Ensure dtype
          if not np.isfinite(X_train_full_cleaned_for_tuning).all().all():
              print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_tuning for {symbol} before RandomizedSearchCV fit. Attempting to fill with median.")
              for col in X_train_full_cleaned_for_tuning.columns:
                  finite_values = X_train_full_cleaned_for_tuning[col][np.isfinite(X_train_full_cleaned_for_tuning[col])]
                  if not finite_values.empty:
                      median_val = finite_values.median()
                      X_train_full_cleaned_for_tuning[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                      X_train_full_cleaned_for_tuning[col].fillna(median_val, inplace=True)
                  else:
                      print(f"Warning: Column '{col}' in X_train_full_cleaned_for_tuning is all non-finite. Filling with 0.")
                      X_train_full_cleaned_for_tuning[col].fillna(0, inplace=True)

          # --- Apply SMOTE to the training data ---
          X_train_res, y_train_res = X_train_full_cleaned_for_tuning.copy(), y_train_full_cleaned_for_tuning.copy() # Initialize with cleaned data
          # Check if resampling is needed and possible
          if len(y_train_full_cleaned_for_tuning.unique()) > 1 and y_train_full_cleaned_for_tuning.value_counts().min() > 1: # SMOTE needs at least 2 samples of the minority class
              try:
                  print(f"\nApplying SMOTE to training data for {symbol}...")
                  sm = SMOTE(random_state=42)
                  X_train_res, y_train_res = sm.fit_resample(X_train_full_cleaned_for_tuning, y_train_full_cleaned_for_tuning)
                  print(f"Original training data shape: {X_train_full_cleaned_for_tuning.shape}, Resampled shape: {X_train_res.shape}")
                  print(f"Original training target distribution: {y_train_full_cleaned_for_tuning.value_counts()}, Resampled target distribution: {y_train_res.value_counts()}")
              except Exception as e:
                  print(f"\nWarning: Could not apply SMOTE to training data for {symbol}: {e}. Proceeding with original imbalanced training data.")
                  # X_train_res and y_train_res remain the original cleaned training data
          elif len(y_train_full_cleaned_for_tuning.unique()) == 1:
               print(f"\nWarning: Only one class in training data for {symbol} after cleaning. Cannot apply SMOTE.")
               # X_train_res and y_train_res remain the original cleaned training data
          else: # Not enough samples for SMOTE (e.g., only 1 minority sample)
              print(f"\nWarning: Not enough samples in minority class for SMOTE for {symbol}. Proceeding with original imbalanced training data.")
              # X_train_res and y_train_res remain the original cleaned training data


          # Check if resampled data is sufficient for tuning and evaluation
          if not X_train_res.empty and not y_train_res.empty and len(y_train_res.unique()) > 1:
              # Perform tuning, evaluation, and prediction within a general try-except block
              # to prevent script crash on problematic tickers
              try:
                  # Fit RandomizedSearchCV on the resampled training data
                  random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=tscv, scoring=precision_scorer, n_jobs=-1, random_state=42) # Usar precision_scorer
                  random_search.fit(X_train_res, y_train_res) # Fit on resampled data
                  print(f"Mejores hiperparámetros para {symbol}:", random_search.best_params_)

                  # Usar el mejor modelo encontrado por RandomizedSearchCV
                  best_model = random_search.best_estimator_

                  # Ensure all features are present in the fitted model before calculating importance
                  model_features = list(best_model.get_booster().feature_names)
                  missing_features_model = [f for f in features if f not in model_features]
                  if missing_features_model:
                       print(f"Warning: Missing features in fitted model for {symbol}: {missing_features_model}. Feature importance will only be shown for available features.")
                       # Update the features list to only include features the model was trained on
                       # This prevents errors later when trying to select features for prediction/evaluation
                       features_for_prediction = model_features
                  else:
                       features_for_prediction = features # Use the full list if all were trained


                  # Get and print feature importance (fitted on resampled data) - Use available features
                  feature_importance_dict = best_model.get_booster().get_score(importance_type='weight')
                  feature_importance = pd.Series(feature_importance_dict).sort_values(ascending=False)
                  print(f"\nFeature Importance for {symbol}:")
                  print(feature_importance)


                  # Optimize the threshold for maximum F1-score on the full original training set (NOT resampled)
                  # Threshold optimization should reflect real-world data distribution
                  X_train_full_cleaned_for_threshold = X_train_full.replace([np.inf, -np.inf], np.nan)
                  X_train_full_cleaned_for_threshold.dropna(inplace=True)
                  y_train_full_cleaned_for_threshold = y_train_full.loc[X_train_full_cleaned_for_threshold.index]

                  # Ensure features used for threshold optimization match model features
                  X_train_full_cleaned_for_threshold = X_train_full_cleaned_for_threshold[features_for_prediction]


                  if not X_train_full_cleaned_for_threshold.empty and not y_train_full_cleaned_for_threshold.empty and len(y_train_full_cleaned_for_threshold.unique()) > 1:
                      # Predict probabilities on the ORIGINAL cleaned training data for threshold optimization
                      y_train_prob = best_model.predict_proba(X_train_full_cleaned_for_threshold)[:, 1]
                      thresholds = np.arange(0.5, 0.9, 0.1)
                      best_threshold = 0.5
                      best_f1 = 0 # Changed from best_precision to best_f1

                      print(f"Optimizing threshold for maximum F1-score on training data (ORIGINAL distribution) for {symbol}...") # Updated message
                      # Calculate F1-score for each threshold
                      for threshold in thresholds:
                          y_pred_threshold = (y_train_prob >= threshold).astype(int)
                          # Calculate f1_score (handles zero_division internally based on scikit-learn version)
                          f1 = f1_score(y_train_full_cleaned_for_threshold, y_pred_threshold, zero_division=0)
                          if f1 > best_f1:
                              best_f1 = f1
                              best_threshold = threshold

                      print(f"Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para {symbol}: {best_threshold:.4f} (F1-score: {best_f1:.4f})") # Updated message and metric
                  else:
                      print(f"\nWarning: Original training set for {symbol} contains no samples or only one class after cleaning for threshold optimization. Cannot optimize threshold for F1-score. Using default threshold 0.5.") # Updated message
                      best_threshold = 0.5


                  # Evaluar el modelo en el conjunto de prueba con el best threshold (Test set is NOT resampled)
                  if not X_test.empty and not y_test.empty and len(y_test.unique()) > 1:
                      print(f"\nEvaluating best model on test set for {symbol} with best threshold ({best_threshold:.4f}):")

                      # Clean X_test before evaluation
                      X_test_cleaned = X_test.replace([np.inf, -np.inf], np.nan).dropna()
                      y_test_cleaned = y_test.loc[X_test_cleaned.index]

                      # Ensure features used for evaluation match model features
                      missing_features_test = [f for f in features_for_prediction if f not in X_test_cleaned.columns]
                      if missing_features_test:
                           print(f"ERROR: Missing features in test data for {symbol} that are present in the model: {missing_features_test}. Skipping test evaluation.")
                           precision_test_alcista = None
                           recall_test_alcista = None
                           f1_test_alcista = None
                           roc_auc_test = None
                           ratio_1_test = None
                           # Skip the rest of the evaluation for this ticker
                           continue # Skip to the next ticker

                      X_test_cleaned = X_test_cleaned[features_for_prediction] # Select only the features the model was trained on


                      # Scale X_test using the scaler fitted on the ORIGINAL training data
                      scaler = RobustScaler()
                      # Fit scaler on the ORIGINAL cleaned training data (NOT resampled)
                      X_train_full_cleaned_for_scaler_eval = X_train_full.replace([np.inf, -np.inf], np.nan)
                      X_train_full_cleaned_for_scaler_eval.dropna(inplace=True)
                      X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.clip(lower=lower_bound, upper=upper_bound)
                      X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.astype(np.float64)
                      # Ensure features for scaler fitting match model features
                      X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval[features_for_prediction]


                      if not X_train_full_cleaned_for_scaler_eval.empty and np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                          scaler.fit(X_train_full_cleaned_for_scaler_eval)

                          X_test_cleaned = X_test_cleaned.astype(np.float64)
                          if not np.isfinite(X_test_cleaned).all().all():
                               print(f"\nWarning: Non-finite values detected in X_test_cleaned for {symbol} before scaler transform. Attempting to fill with median (from train data).")
                               train_medians = X_train_full_cleaned_for_scaler_eval.median()
                               for col in X_test_cleaned.columns:
                                   median_val = train_medians.get(col, 0)
                                   X_test_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                                   X_test_cleaned[col].fillna(median_val, inplace=True)
                               if not np.isfinite(X_test_cleaned).all().all():
                                    print(f"\nERROR: Non-finite values STILL detected in X_test_cleaned for {symbol} after filling with median!")


                          if not X_test_cleaned.empty and np.isfinite(X_test_cleaned).all().all():
                              X_test_scaled = scaler.transform(X_test_cleaned)

                              y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
                              y_test_pred = (y_test_pred_prob >= best_threshold).astype(int)

                              if len(y_test_cleaned.unique()) > 1:
                                  print(f"\nClassification Report (Test Set) for {symbol}:")
                                  print(classification_report(y_test_cleaned, y_test_pred, zero_division=0))
                                  precision_test_alcista = precision_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                                  recall_test_alcista = recall_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                                  f1_test_alcista = f1_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)

                                  print(f"Tamaño de y_test (cleaned): {y_test_cleaned.size}")
                                  print(f"Distribución de clases en y_test (cleaned) para {symbol}:")
                                  print(y_test_cleaned.value_counts())
                                  if 1 in y_test_cleaned.value_counts():
                                      ratio_1_test=(y_test_cleaned.value_counts()[1]/y_test_cleaned.size).round(4)
                                  else:
                                      ratio_1_test = 0
                                  print(f"% clase 1 test para {symbol}: {ratio_1_test} ")

                                  if len(y_test_cleaned.unique()) > 1:
                                       roc_auc_test = roc_auc_score(y_test_cleaned, y_test_pred_prob).round(6)
                                       print(f"\nROC-AUC (Test Set) para {symbol}: {roc_auc_test:.4f}")
                                  else:
                                       roc_auc_test = None
                                       print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot calculate ROC-AUC.")

                              else:
                                  print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot generate full classification report.")
                                  precision_test_alcista = None
                                  recall_test_alcista = None
                                  f1_test_alcista = None
                                  roc_auc_test = None
                                  ratio_1_test = None


                          else:
                              print(f"\nWarning: X_test became empty after cleaning or contains non-finite values for {symbol}. Skipping test evaluation.")
                              precision_test_alcista = None
                              recall_test_alcista = None
                              f1_test_alcista = None
                              roc_auc_test = None
                              ratio_1_test = None





                      else:
                          print(f"\nAdvertencia: Conjunto de prueba insuficiente o con una sola clase para evaluación para {symbol}.")
                          precision_test_alcista = None
                          recall_test_alcista = None
                          f1_test_alcista = None
                          roc_auc_test = None
                          ratio_1_test = None

                      # Prediction for the next day is done only if model was trained successfully
                      # Use the single last row from the full DataFrame for prediction
                      last_features = X_predict_latest.copy() # Use the pre-selected last row

                      last_features_cleaned = None
                      last_features_scaled = None
                      future_pred_prob = None
                      future_pred = None

                      if not last_features.empty:
                          # Ensure last_features is a single row DataFrame before cleaning
                          if not isinstance(last_features, pd.DataFrame) or len(last_features) != 1:
                               print(f"\nError: last_features is not a single row DataFrame for {symbol}. Skipping prediction.")
                               # Set prediction results to skipped
                               last_data_date = df.index[-1] if not df.empty else None
                               last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                               resultsp.append({
                                   'Papel': symbol,
                                   'Fecha Predicción': next_day,
                                   'Fecha Datos': last_data_date,
                                   'Predicción': 'Skipped (Prediction Data Error)',
                                   'Precio actual': last_close,
                                   'Probabilidad Alcista (Modelo)': None,
                                   'Umbral de Clasificación': None,
                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                   'Precision Test (Alcista)': precision_test_alcista,
                                   'Recall Test (Alcista)': recall_test_alcista,
                                   'F1 Test (Alcista)': f1_test_alcista,
                                   'ROC-AUC Test': roc_auc_test,
                                   'clase 1 en test (cleaned)': ratio_1_test,
                                   'Features Limpias (Predicción)': None,
                                   'Features Escaladas (Predicción)': None
                               })
                               # No need to continue here, the append is done and we move to the next ticker
                               continue # Skip to the next ticker


                          last_features_cleaned = last_features.replace([np.inf, -np.inf], np.nan).dropna()
                          last_features_cleaned = last_features_cleaned.clip(lower=lower_bound, upper=upper_bound)

                          # Ensure features for prediction match model features
                          missing_features_pred = [f for f in features_for_prediction if f not in last_features_cleaned.columns]
                          if missing_features_pred:
                               print(f"ERROR: Missing features in prediction data for {symbol} that are present in the model: {missing_features_pred}. Skipping prediction.")
                               last_data_date = df.index[-1] if not df.empty else None
                               last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                               # Explicitly use None for feature columns if prediction skipped due to data issues
                               resultsp.append({
                                   'Papel': symbol,
                                   'Fecha Predicción': next_day,
                                   'Fecha Datos': last_data_date,
                                   'Predicción': f'Skipped (Missing Features in Prediction Data: {", ".join(missing_features_pred)})',
                                   'Precio actual': last_close,
                                   'Probabilidad Alcista (Modelo)': None,
                                   'Umbral de Clasificación': None,
                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                   'Precision Test (Alcista)': precision_test_alcista,
                                   'Recall Test (Alcista)': recall_test_alcista,
                                   'F1 Test (Alcista)': f1_test_alcista,
                                   'ROC-AUC Test': roc_auc_test,
                                   'clase 1 en test (cleaned)': ratio_1_test,
                                    'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                                    'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                               })
                               continue # Skip to the next ticker

                          last_features_cleaned = last_features_cleaned[features_for_prediction] # Select only the features the model was trained on


                          # Add checks for NaN, Inf, and Zero in cleaned features BEFORE scaling
                          has_nan_cleaned = last_features_cleaned.isna().any().any()
                          has_inf_cleaned = np.isinf(last_features_cleaned).any().any()
                          has_zero_cleaned = (last_features_cleaned == 0).any().any()

                          if has_nan_cleaned:
                              print(f"\nDEBUG: NaN values detected in last_features_cleaned for {symbol}.")
                          if has_inf_cleaned:
                              print(f"\nDEBUG: Inf values detected in last_features_cleaned for {symbol}.")
                          if has_zero_cleaned:
                               print(f"\nDEBUG: Zero values detected in last_features_cleaned for {symbol}.")


                          if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                              # Ensure scaler is fitted on the *cleaned* full training data
                              scaler = RobustScaler()
                              # Fit scaler on the ORIGINAL cleaned training data (NOT resampled)
                              X_train_full_cleaned_for_scaler_pred = X_train_full.replace([np.inf, -np.inf], np.nan)
                              X_train_full_cleaned_for_scaler_pred.dropna(inplace=True)
                              X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.clip(lower=lower_bound, upper=upper_bound)
                              X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.astype(np.float64)
                              # Ensure features for scaler fitting match model features
                              X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred[features_for_prediction]


                              if not X_train_full_cleaned_for_scaler_pred.empty and np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                                   scaler.fit(X_train_full_cleaned_for_scaler_pred)

                                   last_features_cleaned = last_features_cleaned.astype(np.float64)
                                   # Add explicit fallback for non-finite values AFTER dropna for prediction data
                                   # This fallback is actually redundant if dropna() was called just above and np.isfinite checked,
                                   # but keeping it for safety if flow changes. The main checks should be BEFORE scaling.
                                   # Let's rely on the check and skip if still non-finite after cleaning.

                                   if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                                       last_features_scaled = scaler.transform(last_features_cleaned)

                                       # Add checks for NaN, Inf, and Zero in scaled features BEFORE prediction
                                       has_nan_scaled = np.isnan(last_features_scaled).any()
                                       has_inf_scaled = np.isinf(last_features_scaled).any()
                                       has_zero_scaled = (last_features_scaled == 0).any()

                                       if has_nan_scaled:
                                           print(f"\nDEBUG: NaN values detected in last_features_scaled for {symbol}.")
                                       if has_inf_scaled:
                                           print(f"\nDEBUG: Inf values detected in last_features_scaled for {symbol}.")
                                       if has_zero_scaled:
                                            print(f"\nDEBUG: Zero values detected in last_features_scaled for {symbol}.")


                                       future_pred_prob = best_model.predict_proba(last_features_scaled)[:, 1][0].round(4)
                                       future_pred = 1 if future_pred_prob >= best_threshold else 0

                                       last_close = None
                                       last_open = None
                                       last_max = None
                                       last_data_date = None
                                       # Get the actual date of the last data point used for prediction
                                       if not df.empty:
                                           last_data_date = df.index[-1]
                                           if 'Close' in df.columns:
                                               last_close = df['Close'].iloc[-1]
                                           if 'Open' in df.columns:
                                               last_open = df['Open'].iloc[-1]
                                           if 'High' in df.columns:
                                               last_max = df['High'].iloc[-1]
                                       else:
                                           print(f"Warning: DataFrame 'df' is empty for {symbol}. Cannot get last prices.")


                                       action = 'BUY' if future_pred == 1 else 'SELL'
                                       direction = 1 if future_pred == 1 else -1

                                       # Explicitly use to_dict() and tolist()
                                       cleaned_features_dict = last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None
                                       # Convert numpy array to list for JSON serialization if needed later
                                       scaled_features_list = last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None


                                       resultsp.append({
                                                   'Papel': symbol,
                                                   'Fecha Predicción': next_day,
                                                   'Fecha Datos': last_data_date, # Use the date of the last data point
                                                   'Predicción': 'Alcista' if future_pred == 1 else 'Bajista',
                                                   'Probabilidad Alcista (Modelo)': future_pred_prob,
                                                   'Umbral de Clasificación': best_threshold.round(4),
                                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)', # Add check for best_model
                                                   'Precision Test (Alcista)': precision_test_alcista,
                                                   'Recall Test (Alcista)': recall_test_alcista,
                                                   'F1 Test (Alcista)': f1_test_alcista,
                                                   'ROC-AUC Test': roc_auc_test,
                                                   'clase 1 en test (cleaned)': ratio_1_test,
                                                   'Features Limpias (Predicción)': cleaned_features_dict, # Add cleaned features using to_dict
                                                   'Features Escaladas (Predicción)': scaled_features_list # Add scaled features as a list
                                           })



                          else:
                               print(f"Warning: Training data (X_train_full) became empty or contains non-finite values after cleaning for scaler fitting for prediction. Skipping prediction for {symbol}.")
                               last_data_date = df.index[-1] if not df.empty else None
                               last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                               # Explicitly use None for feature columns if prediction skipped due to data issues
                               resultsp.append({
                                   'Papel': symbol,
                                   'Fecha Predicción': next_day,
                                   'Fecha Datos': last_data_date,
                                   'Predicción': 'Skipped (Prediction Data Issue)',
                                   'Precio actual': last_close,
                                   'Probabilidad Alcista (Modelo)': None,
                                   'Umbral de Clasificación': None,
                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                   'Precision Test (Alcista)': precision_test_alcista,
                                   'Recall Test (Alcista)': recall_test_alcista,
                                   'F1 Test (Alcista)': f1_test_alcista,
                                   'ROC-AUC Test': roc_auc_test,
                                   'clase 1 en test (cleaned)': ratio_1_test,
                                    'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                                    'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None # Ensure list conversion
                               })


                      else:
                          print(f"Warning: Could not make prediction for {symbol} as last_features was initially empty.")
                          last_data_date = df.index[-1] if not df.empty else None
                          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                          # Explicitly use None for feature columns if prediction skipped due to data issues
                          resultsp.append({
                              'Papel': symbol,
                              'Fecha Predicción': next_day,
                              'Fecha Datos': last_data_date,
                              'Predicción': 'Skipped (Prediction Data Issue)',
                              'Precio actual': last_close,
                              'Probabilidad Alcista (Modelo)': None,
                              'Umbral de Clasificación': None,
                              'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                              'Precision Test (Alcista)': precision_test_alcista,
                              'Recall Test (Alcista)': recall_test_alcista,
                              'F1 Test (Alcista)': f1_test_alcista,
                              'ROC-AUC Test': roc_auc_test,
                              'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None # Ensure list conversion
                          })


              # Catch a general Exception during tuning/evaluation/prediction to prevent script crash
              except Exception as e:
                  print(f"ERROR: An error occurred during tuning, evaluation, or prediction for {symbol}: {e}. Skipping this ticker.")
                  best_model = None
                  best_threshold = 0.5
                  precision_test_alcista = None
                  recall_test_alcista = None
                  f1_test_alcista = None
                  roc_auc_test = None
                  ratio_1_test = None

                  last_data_date = df.index[-1] if not df.empty else None
                  last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

                  # Append entry indicating skip due to general error, explicitly add None for feature columns
                  resultsp.append({
                        'Papel': symbol,
                        'Fecha Predicción': next_day,
                        'Fecha Datos': last_data_date,
                        'Predicción': 'Skipped (Error)',
                        'Precio actual': last_close,
                        'Probabilidad Alcista (Modelo)': None,
                        'Umbral de Clasificación': None,
                        'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                        'Precision Test (Alcista)': precision_test_alcista,
                        'Recall Test (Alcista)': recall_test_alcista,
                        'F1 Test (Alcista)': f1_test_alcista,
                        'ROC-AUC Test': roc_auc_test,
                        'clase 1 en test (cleaned)': ratio_1_test,
                        'Features Limpias (Predicción)': None,
                        'Features Escaladas (Predicción)': None
                  })

          else: # This else belongs to the check for sufficient RESAMPLED data
              print(f"Warning: Resampled training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
              last_data_date = df.index[-1] if not df.empty else None
              last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

              # Append entry indicating skip due to insufficient training data (resampled)
              resultsp.append({
                    'Papel': symbol,
                    'Fecha Predicción': next_day,
                    'Fecha Datos': last_data_date,
                    'Predicción': 'Skipped (Insufficient Resampled Training Data)', # More specific message
                    'Precio actual': last_close,
                    'Probabilidad Alcista (Modelo)': None,
                    'Umbral de Clasificación': None,
                    'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Resampled Training Data)',
                    'Precision Test (Alcista)': None, # Set to None as no model ran
                    'Recall Test (Alcista)': None, # Set to None as no model ran
                    'F1 Test (Alcista)': None, # Set to None as no model ran
                    'ROC-AUC Test': None, # Set to None as no model ran
                    'clase 1 en test (cleaned)': None, # Set to None as no model ran
                    'Features Limpias (Predicción)': None,
                    'Features Escaladas (Predicción)': None
              })


      else: # This else belongs to the initial check for sufficient ORIGINAL training data
          print(f"Warning: Original training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
          last_data_date = df.index[-1] if not df.empty else None
          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

          # Append entry indicating skip due to insufficient training data (original)
          resultsp.append({
                'Papel': symbol,
                'Fecha Predicción': next_day,
                'Fecha Datos': last_data_date,
                'Predicción': 'Skipped (Insufficient Original Training Data)', # More specific message
                'Precio actual': last_close,
                'Probabilidad Alcista (Modelo)': None,
                'Umbral de Clasificación': None,
                'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Original Training Data)',
                'Precision Test (Alcista)': None, # Set to None as no model ran
                'Recall Test (Alcista)': None, # Set to None as no model ran
                'F1 Test (Alcista)': None, # Set to None as no model ran
                'ROC-AUC Test': None, # Set to None as no model ran
                'clase 1 en test (cleaned)': None, # Set to None as no model ran
                'Features Limpias (Predicción)': None,
                'Features Escaladas (Predicción)': None
          })


  # --- End of block for full model run ---


# Create prediction results table ONLY if the full model was run
if run_full_model: # Moved this entire block inside the if condition
    resultsp_df = pd.DataFrame(resultsp)
    print(resultsp_df)
    if not resultsp_df.empty:
        resultsp_df.set_index('Fecha Predicción', inplace=True)

        # Mostrar resultados de predicción
        pd.set_option('display.max_columns', None)
        #pd.set_option('display.max_rows', None) # Optional: display all rows
        pd.set_option('display.max_colwidth', None) # Optional: display full content of columns

        print(f"\nPrediccion para el proximo dia (hasta {next_day.strftime('%Y-%m-%d')}):")
        print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
        display(resultsp_df) # Use display for better formatting

        # Guardar y descargar el CSV de predicciones
        resultsp_df.to_csv(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
        files.download(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv")
        print(f"\nArchivo 'Predic_results_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")
    else:
        print("\nNo hay resultados de predicción para mostrar.")
else: # Add a message when skipping the full model run
    print("\n'run_full_model' is set to False. Skipping model training, tuning, evaluation, and prediction.")
    print("Feature distribution analysis tables for all tickers are displayed above.")


Downloading data for ALUA.BA...


[*********************100%***********************]  1 of 1 completed


MultiIndex columns detected for ALUA.BA.
Successfully extracted and flattened MultiIndex columns for ALUA.BA.
Últimas filas del DataFrame antes de crear features:
             Open   High    Low  Close   Volume  Adj Close
Date                                                      
2025-09-08  670.0  705.0  600.0  696.5  1572445      696.5
2025-09-09  700.0  713.0  678.5  694.0  1217950      694.0
2025-09-10  700.0  716.5  691.5  702.0   641358      702.0
2025-09-11  702.0  713.0  676.0  682.0  1443135      682.0
2025-09-12  686.0  702.0  660.0  666.0   420245      666.0
                  RSI       ROC       PPO  PPO_Signal  PPO_Histogram  \
Date                                                                   
2025-09-12  26.056338 -2.915452 -1.130074    -0.73869      -0.391383   

                 EWO  EWO_Signal  EWO_Histogram  Volatility   SMA5     SMA13  \
Date                                                                           
2025-09-12 -3.056848   -1.972626      -1.084222

# Task
Analyze the reasons for the poor performance of the XGBoost model in predicting downward movements (class 0) in the stock data, specifically addressing the class imbalance and the potential impact of features, and propose steps to improve the model's ability to identify these instances.

## Analyze class imbalance impact

### Subtask:
Re-examine the class distribution in the training data to understand the extent of the imbalance and how `scale_pos_weight` was tuned.


**Reasoning**:
Re-examine the class distribution in the training data and how `scale_pos_weight` was tuned.



In [14]:
# 1. Print original training data class distribution
print("Original training data class distribution (y_train_full):")
print(y_train_full.value_counts())
print(y_train_full.value_counts(normalize=True))

# 2. Print resampled training data class distribution
print("\nResampled training data class distribution (y_train_res):")
print(y_train_res.value_counts())
print(y_train_res.value_counts(normalize=True))

# 3. Review the tuned scale_pos_weight
if 'random_search' in locals():
    print("\nBest hyperparameters from RandomizedSearchCV:")
    print(random_search.best_params_)
    tuned_scale_pos_weight = random_search.best_params_.get('scale_pos_weight')
    print(f"\nTuned scale_pos_weight: {tuned_scale_pos_weight}")

    # 4. Compare original class ratio with tuned scale_pos_weight
    original_counts = y_train_full.value_counts()
    if 0 in original_counts and 1 in original_counts:
        original_ratio = original_counts[0] / original_counts[1]
        print(f"\nOriginal training data ratio (Class 0 / Class 1): {original_ratio:.4f}")
        print(f"Is tuned scale_pos_weight close to the original ratio? {'Yes' if abs(tuned_scale_pos_weight - original_ratio) / original_ratio < 0.2 else 'No'} (within 20%)") # Simple check
    else:
        print("\nCannot calculate original class ratio (one or both classes missing).")

else:
    print("\nRandomizedSearchCV object not found. Cannot retrieve best hyperparameters or tuned scale_pos_weight.")


Original training data class distribution (y_train_full):
Label
0.0    2188
1.0    1583
Name: count, dtype: int64
Label
0.0    0.580217
1.0    0.419783
Name: proportion, dtype: float64

Resampled training data class distribution (y_train_res):
Label
1.0    2188
0.0    2188
Name: count, dtype: int64
Label
1.0    0.5
0.0    0.5
Name: proportion, dtype: float64

Best hyperparameters from RandomizedSearchCV:


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

**Reasoning**:
The previous attempt failed because the `random_search` object apparently does not have the `best_params_` attribute after execution in the current state. I need to find the best parameters, including `scale_pos_weight`, from the `random_search` object. I will assume the object exists and try accessing the `best_params_` attribute again, perhaps there was a temporary issue. I will also ensure the code is robust to the attribute not existing by adding a check.



In [11]:
# 1. Print original training data class distribution
print("Original training data class distribution (y_train_full):")
print(y_train_full.value_counts())
print(y_train_full.value_counts(normalize=True))

# 2. Print resampled training data class distribution
print("\nResampled training data class distribution (y_train_res):")
print(y_train_res.value_counts())
print(y_train_res.value_counts(normalize=True))

# 3. Review the tuned scale_pos_weight
if 'random_search' in locals() and hasattr(random_search, 'best_params_'):
    print("\nBest hyperparameters from RandomizedSearchCV:")
    print(random_search.best_params_)
    tuned_scale_pos_weight = random_search.best_params_.get('scale_pos_weight')
    print(f"\nTuned scale_pos_weight: {tuned_scale_pos_weight}")

    # 4. Compare original class ratio with tuned scale_pos_weight
    original_counts = y_train_full.value_counts()
    if 0 in original_counts and 1 in original_counts:
        original_ratio = original_counts[0] / original_counts[1]
        print(f"\nOriginal training data ratio (Class 0 / Class 1): {original_ratio:.4f}")
        # Add check to prevent division by zero if tuned_scale_pos_weight is None or 0
        if tuned_scale_pos_weight is not None and tuned_scale_pos_weight != 0:
            print(f"Is tuned scale_pos_weight close to the original ratio? {'Yes' if abs(tuned_scale_pos_weight - original_ratio) / original_ratio < 0.2 else 'No'} (within 20%)") # Simple check
        else:
            print("Cannot compare tuned scale_pos_weight as it is None or zero.")
    else:
        print("\nCannot calculate original class ratio (one or both classes missing).")

else:
    print("\nRandomizedSearchCV object or best_params_ attribute not found. Cannot retrieve best hyperparameters or tuned scale_pos_weight.")

Original training data class distribution (y_train_full):
Label
0.0    2188
1.0    1583
Name: count, dtype: int64
Label
0.0    0.580217
1.0    0.419783
Name: proportion, dtype: float64

Resampled training data class distribution (y_train_res):
Label
1.0    2188
0.0    2188
Name: count, dtype: int64
Label
1.0    0.5
0.0    0.5
Name: proportion, dtype: float64

RandomizedSearchCV object or best_params_ attribute not found. Cannot retrieve best hyperparameters or tuned scale_pos_weight.


## Review feature importance

### Subtask:
Look closely at the feature importance scores from the trained model to see which features it is relying on and whether they seem relevant or if there are unexpected results.


:**Reasoning**:
Check if feature_importance exists and print it for analysis.



In [15]:
# 1. Check if feature_importance variable exists and is not empty.
if 'feature_importance' in locals() and not feature_importance.empty:
    # 2. Print the feature_importance Series.
    print("Feature Importance Scores:")
    print(feature_importance)
else:
    print("Feature importance data not available or is empty.")

# 3. Analysis will be done manually after the output is displayed.

Feature Importance Scores:
lag_change2    657.0
lag_change3    653.0
lag_change1    632.0
lag_change4    613.0
lag_change5    612.0
               ...  
high_lag_3      52.0
high_lag_1      52.0
high_lag_5      52.0
close_lag_2     49.0
high_lag_2      33.0
Length: 67, dtype: float64
