<a href="https://colab.research.google.com/github/dmdiegoar/Quant-code-t0/blob/main/XGB_Sentido.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from google.colab import files
from scipy import stats
import time


# Functions for features
def add_lagged_price_features(df, etiqueta="close_lag", dato="Close"):
    for lag in range(1, 6):
        df[f'{etiqueta}_{lag}'] = df[dato].shift(lag)
    return df

def calculate_RSI(series, period=7):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_ROC(series, period=5):
    return ((series - series.shift(period)) / series.shift(period)) * 100

def calculate_PPO(series, fast_period=5, slow_period=9, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ppo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ppo.ewm(span=signal_period, adjust=False).mean()
    histogram = ppo - signal_line
    return ppo, signal_line, histogram

def calculate_EWO(series, fast_period=5, slow_period=35, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ewo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ewo.ewm(span=signal_period, adjust=False).mean()
    histogram = ewo - signal_line
    return ewo, signal_line, histogram

def calculate_volatility(series, window=20):
    return series.rolling(window).std().round(6)

def calculate_sma5(series, period=5):
    return series.rolling(window=period).mean().round(4)

def calculate_sma13(series, period=13):
    return series.rolling(window=period).mean().round(4)

def calculate_sma26(series, period=26):
    return series.rolling(window=period).mean().round(4)

def calculate_sma50(series, period=50):
    return series.rolling(window=period).mean().round(4)

def calculate_sma200(series, period=200):
    return series.rolling(window=period).mean().round(4)


def create_features(df, umbral, n_days_high=1):
    df = add_lagged_price_features(df, "close_lag", "Close")
    df = add_lagged_price_features(df, "open_lag", "Open")
    df = add_lagged_price_features(df, "high_lag", "High")
    df['Pct_change'] = df['Close'].pct_change()
    for lag in range(1, 6):
        df[f'lag_change{lag}'] = df['Pct_change'].shift(lag)
    df['RSI'] = calculate_RSI(df['Close'])
    df['ROC'] = calculate_ROC(df['Close'])
    df['PPO'], df['PPO_Signal'], df['PPO_Histogram'] = calculate_PPO(df['Close'])
    df['EWO'], df['EWO_Signal'], df['EWO_Histogram'] = calculate_EWO(df['Close'])
    df['SMA5'] = calculate_sma5(df['Close'])
    df['SMA13'] = calculate_sma13(df['Close'])
    df['SMA26'] = calculate_sma26(df['Close'])
    df['SMA50'] = calculate_sma50(df['Close'])
    df['SMA200'] = calculate_sma200(df['Close'])
    df['Volatility'] = calculate_volatility(df['Close'])

    # --- New Feature: Max Gain from Open over Past N Days ---
    # Calculate the maximum High price over the *next N days* for *each historical day*.
    # Use rolling().max() with min_periods=1 to handle ends of series.
    # Then shift to align with the start of the N-day window (the current day's Open).
    max_high_over_next_n_days_hist = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)


    # Calculate the potential max gain from Open for *each historical day*
    # Using the Open price of that historical day
    epsilon = 1e-9 # To prevent division by zero
    df['Max_Gain_from_Open_Current'] = (max_high_over_next_n_days_hist - df['Open']) / (df['Open'] + epsilon)

    # --- Add lagged versions of the new feature ---
    for lag in range(1, 7): # Create lags from 1 to 6
        df[f'Max_Gain_from_Open_Lag_{lag}'] = df['Max_Gain_from_Open_Current'].shift(lag)


    # Calculate the target based on tomorrow's Open vs Max High over next n_days_high days
    # Use rolling().max() with min_periods=1 for the target as well.
    # Shift to align with the start of the N-day window for the target (tomorrow's Open).
    max_high_next_n_days_target = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)
    open_next_day = df['Open'].shift(-1)
    df['Label_raw'] = ((max_high_next_n_days_target - open_next_day) / (open_next_day + epsilon) > umbral).astype(int)
    df['Label'] = df['Label_raw'].shift(-1) # Target for the next day


    # Replace inf values with NaN before dropping
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Dropping NaNs here will remove rows where features or the target are still NaN (e.g., due to initial lags,
    # or if min_periods=1 still results in NaN for very short series, although less likely now for the rolling max).
    df.dropna(inplace=True)

    return df

# Definir fecha de corte manualmente (cambiar diariamente)
end_date = dt.datetime(2025, 9, 14)  # Ejemplo: cambiar a 2025-07-18 mañana

tk =[ "ALUA.BA", "BBAR.BA", "BMA.BA", "COME.BA", "CRES.BA", "EDN.BA", "GGAL.BA", "IRSA.BA", "LOMA.BA", "METR.BA", "PAMP.BA", "SUPV.BA", "TECO2.BA", "TGNO4.BA", "TGSU2.BA", "TRAN.BA", "TXAR.BA", "VALO.BA", "YPFD.BA"]


# results = [] # el del test, para que no lo reinicie - REMOVED
resultsp = [] # las predicciones, para que no lo reinicie

umbral = 0.029
lapso = 1 # Lapso is no longer directly used for the target definition, but keeping it doesn't hurt
n_days_high_target = 3 # Define the number of days for the High target (used for both target and new feature)

# Define clipping bounds - adjust based on feature distributions
lower_bound = -1e9
upper_bound = 1e9


for papel in tk:

  symbol=papel
  #symbol="COME.BA"
  # Fechas dinámicas
  start_date = dt.datetime(2001, 1, 1)  # Inicio fijo
  train_end = end_date - pd.Timedelta(days=780)  # 6 meses antes de end_date (ajustable)
  next_day = end_date + pd.Timedelta(days=1)  # Predicción para el día siguiente


  # Select features - Add the new feature and its lags
  features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
            [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)] + \
            ['Max_Gain_from_Open_Current'] + [f'Max_Gain_from_Open_Lag_{i}' for i in range(1, 7)]


  # Download data for the current ticker inside the loop
  print(f"\nDownloading data for {symbol}...")
  df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

  # Verify data download
  if df.empty:
      print(f"Warning: No data downloaded for {symbol}. Skipping.")
      continue # Skip to the next ticker


  # Handle MultiIndex columns and ensure standard column names - More robust logic
  required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
  processed_df = None # Initialize processed_df

  if isinstance(df.columns, pd.MultiIndex):
      print(f"MultiIndex columns detected for {symbol}.")
      try:
          # Attempt to extract columns by looking for standard names in the last level
          extracted_data = {}
          for std_name in required_cols:
              # Find the MultiIndex column tuple whose last level matches the standard name
              matching_col_tuple = None
              for col_tuple in df.columns:
                  if col_tuple[-1] == std_name:
                      matching_col_tuple = col_tuple
                      break # Found the first match

              if matching_col_tuple:
                  extracted_data[std_name] = df[matching_col_tuple]
              else:
                  print(f"Warning: Could not find standard column '{std_name}' in the last level of MultiIndex for {symbol}. Column missing.")
                  # Continue to look for other required columns, processed_df will be checked later

          if len(extracted_data) == len(required_cols):
              processed_df = pd.DataFrame(extracted_data)
              processed_df.index = df.index # Preserve original index
              print(f"Successfully extracted and flattened MultiIndex columns for {symbol}.")
          else:
              missing_cols = [name for name in required_cols if name not in extracted_data]
              print(f"Warning: Could not extract all required columns from MultiIndex for {symbol}. Missing: {missing_cols}. Skipping ticker.")
              continue # Skip to the next ticker

      except Exception as e:
          print(f"Warning: An error occurred while processing MultiIndex columns for {symbol}: {e}. Skipping.")
          #print(f"Original columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  else: # If not MultiIndex columns, assume standard flat DataFrame is already present
      print(f"No MultiIndex columns detected for {symbol}. Checking for standard columns.")
      # Check if the required columns are directly present
      if all(col in df.columns for col in required_cols):
          processed_df = df[required_cols].copy() # Select required columns and make a copy
          print(f"Using existing standard columns for {symbol}.")
      else:
          missing_cols = [col for col in required_cols if col not in df.columns]
          print(f"Warning: Required standard columns not found in flat DataFrame for {symbol}. Missing: {missing_cols}. Skipping ticker.")
          #print(f"Available columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  # Ensure df is set to processed_df if successful
  df = processed_df

  # Handle MultiIndex index if present (less common with single ticker download but possible)
  if isinstance(df.index, pd.MultiIndex):
      print(f"MultiIndex index detected for {symbol}. Attempting to flatten index.")
      try:
          # Assuming the MultiIndex index structure is ('Ticker', 'Date')
          if 'Ticker' in df.index.names:
               df = df.xs(symbol, level='Ticker', axis=0)
               print(f"Índice aplanado para {symbol}.")
          else:
               print(f"Warning: MultiIndex index detected for {symbol} but 'Ticker' level not found. Skipping index flattening.")
               # If 'Ticker' level is not there, maybe it's just a date/time MultiIndex?
               # Or a different structure. For now, proceed without flattening index if Ticker level is missing.


      except KeyError:
          print(f"Warning: Could not select ticker from MultiIndex index for {symbol}. Skipping.")
          continue # Skip to the next ticker
      except Exception as e:
          print(f"Warning: An error occurred while flattening MultiIndex index for {symbol}: {e}. Skipping.")
          continue # Skip to the next ticker


  df.index = pd.to_datetime(df.index)
  if not df.index.is_unique:
      print(f"Advertencia: Índice con fechas duplicadas para {symbol}. Eliminando duplicados...")
      df = df[~df.index.duplicated(keep='first')]

  if df.empty:
      print(f"Warning: DataFrame is empty after initial processing and cleaning for {symbol}. Skipping.")
      continue


  # Ensure numeric types and handle potential non-numeric data
  for col in required_cols:
      if col in df.columns: # Ensure column exists before processing
          df[col] = pd.to_numeric(df[col], errors='coerce')
      else:
           # This should ideally not happen if previous checks passed, but as a safeguard:
           print(f"Error: Required column '{col}' not found in df for {symbol} before numeric conversion. Skipping ticker.")
           df = pd.DataFrame() # Set df to empty to skip further processing
           break # Exit column processing loop


  if df.empty: # Check again if df became empty due to missing columns
       continue # Skip to the next ticker

  # Drop rows where essential price data is missing after coercion
  df.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)


  if df.empty:
      print(f"Warning: DataFrame is empty after dropping rows with missing price data for {symbol}. Skipping.")
      continue


  df['Open']= df['Open'].round(2)
  df['High']= df['High'].round(2)
  df['Low']= df['Low'].round(2)
  df['Close']= df['Close'].round(2)
  df['Adj Close']= df['Adj Close'].round(2)

  print("Últimas filas del DataFrame antes de crear features:")
  print(df.tail())


  # Crear features with the new target definition
  df = create_features(df, umbral=umbral, n_days_high=n_days_high_target) # Pass n_days_high_target, removed lapso

  # Verify data is not empty after feature creation and dropna
  if df.empty:
      print(f"Warning: DataFrame is empty after feature creation and dropna for {symbol}. Skipping.")
      continue


  # Verify data after creating features
  print("\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  print(df.columns)

  print(f"Distribucion de etiquetas para {symbol}:")
  print(df["Label"].value_counts(normalize=True))

  # Check if there are samples from both classes in the target variable
  if len(df["Label"].unique()) < 2:
      print(f"Warning: Target variable 'Label' contains only one class for {symbol} after feature creation. Cannot train a classifier. Skipping.")
      continue


  correlation = df[features + ["Label"]].corr()["Label"].sort_values(ascending=False)
  print(f"Correlacion con label para {symbol}:")
  print(correlation)


  # Dividir datos en entrenamiento y prueba
  X = df[features]
  y = df['Label']
  X_train_full = X[df.index <= train_end]
  y_train_full = y[df.index <= train_end]
  X_test = X[(df.index > train_end) & (df.index <= end_date)]  # Hasta end_date
  y_test = y[(df.index > train_end) & (df.index <= end_date)]

  # Verify training and test sets are not empty and have both classes
  if X_train_full.empty or y_train_full.empty or len(y_train_full.unique()) < 2:
      print(f"Warning: Training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
      continue

  # Initialize test metrics before evaluation
  precision_test_alcista = None
  recall_test_alcista = None
  f1_test_alcista = None
  roc_auc_test = None
  ratio_1_test = None
  best_model = None # Initialize best_model to None
  best_threshold = 0.5 # Initialize best_threshold to default


  # Optimizar hiperparámetros con RandomizedSearchCV
  print(f"Optimizar hiperparámetros con RandomizedSearchCV para {symbol}")
  param_dist = {
      'learning_rate': [0.01, 0.05, 0.1, 0.2],
      'max_depth': [3, 5, 7, 9],
      'n_estimators': [100, 500, 900],
      'subsample': [0.6, 0.8, 1.0],
      'colsample_bytree': [0.6, 0.8, 1.0],
      'gamma': [0, 0.1, 0.2],
      'scale_pos_weight': [0.5, 1, 2, 5, 10, 20, 50, 100] # Incluir scale_pos_weight en la búsqueda
  }

  # Inicializar el clasificador XGBoost sin scale_pos_weight fijo (se tuneará)
  xgb = XGBClassifier(objective='binary:logistic', random_state=42)

  # Usar TimeSeriesSplit para cross-validation
  n_splits = 5  # Puedes ajustar el número de splits
  tscv = TimeSeriesSplit(n_splits=n_splits)

  # Definir scorer para maximizar Precision de la Clase 1
  precision_scorer = make_scorer(precision_score, pos_label=1, zero_division=0) # zero_division=0 para manejar casos sin predicciones positivas

  # Clean X_train_full and y_train_full before fitting RandomizedSearchCV
  X_train_full_cleaned_for_tuning = X_train_full.replace([np.inf, -np.inf], np.nan)
  X_train_full_cleaned_for_tuning.dropna(inplace=True)
  y_train_full_cleaned_for_tuning = y_train_full.loc[X_train_full_cleaned_for_tuning.index] # Ensure y matches cleaned X

  # Explicit check, conversion, and fallback for non-finite values before fitting RandomizedSearchCV
  X_train_full_cleaned_for_tuning = X_train_full_cleaned_for_tuning.astype(np.float64) # Ensure dtype
  if not np.isfinite(X_train_full_cleaned_for_tuning).all().all():
      print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_tuning for {symbol} before RandomizedSearchCV fit. Attempting to fill with median.")
      for col in X_train_full_cleaned_for_tuning.columns:
          finite_values = X_train_full_cleaned_for_tuning[col][np.isfinite(X_train_full_cleaned_for_tuning[col])]
          if not finite_values.empty:
              median_val = finite_values.median()
              X_train_full_cleaned_for_tuning[col].replace([np.inf, -np.inf], np.nan, inplace=True)
              X_train_full_cleaned_for_tuning[col].fillna(median_val, inplace=True)
          else:
              print(f"Warning: Column '{col}' in X_train_full_cleaned_for_tuning is all non-finite. Filling with 0.")
              X_train_full_cleaned_for_tuning[col].fillna(0, inplace=True)


  # Perform RandomizedSearchCV - Add try-except block
  # Ensure cleaned data for tuning is not empty and has more than one class
  if not X_train_full_cleaned_for_tuning.empty and not y_train_full_cleaned_for_tuning.empty and len(y_train_full_cleaned_for_tuning.unique()) > 1:
      try:
          random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=tscv, scoring=precision_scorer, n_jobs=-1, random_state=42) # Usar precision_scorer
          random_search.fit(X_train_full_cleaned_for_tuning, y_train_full_cleaned_for_tuning)
          print(f"Mejores hiperparámetros para {symbol}:", random_search.best_params_)

          # Usar el mejor modelo encontrado por RandomizedSearchCV
          best_model = random_search.best_estimator_

          # Get and print feature importance
          feature_importance = pd.Series(best_model.feature_importances_, index=features)
          print(f"\nFeature Importance for {symbol}:")
          print(feature_importance.sort_values(ascending=False))

          # Optimize the threshold for maximum Precision (Clase 1) on the full training set
          X_train_full_cleaned_for_threshold = X_train_full.replace([np.inf, -np.inf], np.nan)
          X_train_full_cleaned_for_threshold.dropna(inplace=True)
          y_train_full_cleaned_for_threshold = y_train_full.loc[X_train_full_cleaned_for_threshold.index]

          if not X_train_full_cleaned_for_threshold.empty and not y_train_full_cleaned_for_threshold.empty and len(y_train_full_cleaned_for_threshold.unique()) > 1:
              y_train_prob = best_model.predict_proba(X_train_full_cleaned_for_threshold)[:, 1]
              thresholds = np.arange(0.01, 1.0, 0.01)
              best_threshold = 0.5
              best_precision = 0

              print(f"Optimizing threshold for maximum Precision (Clase 1) on training data for {symbol}...")
              if 1 in y_train_full_cleaned_for_threshold.unique():
                  for threshold in thresholds:
                      y_pred_threshold = (y_train_prob >= threshold).astype(int)
                      if np.sum(y_pred_threshold) > 0:
                           precision = precision_score(y_train_full_cleaned_for_threshold, y_pred_threshold, pos_label=1, zero_division=0)
                           if precision > best_precision:
                               best_precision = precision
                               best_threshold = threshold
                  print(f"Mejor umbral para maximizar Precision (Clase 1) en entrenamiento para {symbol}: {best_threshold:.4f} (Precision: {best_precision:.4f})")
              else:
                  print(f"\nWarning: Training set for {symbol} contains no positive samples after cleaning for threshold optimization. Cannot optimize threshold for Precision (Clase 1). Using default threshold 0.5.")
                  best_threshold = 0.5
          else:
              print(f"\nWarning: Training data for threshold optimization is insufficient for {symbol}. Using default threshold 0.5.")
              best_threshold = 0.5


          # Evaluar el modelo en el conjunto de prueba con el best threshold
          if not X_test.empty and not y_test.empty and len(y_test.unique()) > 1:
              print(f"\nEvaluating best model on test set for {symbol} with best threshold ({best_threshold:.4f}):")

              X_test_cleaned = X_test.replace([np.inf, -np.inf], np.nan).dropna()
              y_test_cleaned = y_test.loc[X_test_cleaned.index]

              scaler = RobustScaler()
              X_train_full_cleaned_for_scaler_eval = X_train_full.replace([np.inf, -np.inf], np.nan)
              X_train_full_cleaned_for_scaler_eval.dropna(inplace=True)
              X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.clip(lower=lower_bound, upper=upper_bound)
              X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.astype(np.float64)

              if not np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                  print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_scaler_eval for {symbol} before scaler fit. Attempting to fill with median.")
                  for col in X_train_full_cleaned_for_scaler_eval.columns:
                      finite_values = X_train_full_cleaned_for_scaler_eval[col][np.isfinite(X_train_full_cleaned_for_scaler_eval[col])]
                      if not finite_values.empty:
                          median_val = finite_values.median()
                          X_train_full_cleaned_for_scaler_eval[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                          X_train_full_cleaned_for_scaler_eval[col].fillna(median_val, inplace=True)
                      else:
                          print(f"Warning: Column '{col}' in X_train_full_cleaned_for_scaler_eval is all non-finite. Filling with 0.")
                          X_train_full_cleaned_for_scaler_eval[col].fillna(0, inplace=True)


              if not X_train_full_cleaned_for_scaler_eval.empty and np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                  scaler.fit(X_train_full_cleaned_for_scaler_eval)

                  X_test_cleaned = X_test_cleaned.astype(np.float64)
                  if not np.isfinite(X_test_cleaned).all().all():
                       print(f"\nWarning: Non-finite values detected in X_test_cleaned for {symbol} before scaler transform. Attempting to fill with median (from train data).")
                       train_medians = X_train_full_cleaned_for_scaler_eval.median()
                       for col in X_test_cleaned.columns:
                           median_val = train_medians.get(col, 0)
                           X_test_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                           X_test_cleaned[col].fillna(median_val, inplace=True)
                       if not np.isfinite(X_test_cleaned).all().all():
                            print(f"\nERROR: Non-finite values STILL detected in X_test_cleaned for {symbol} after filling with median!")


                  if not X_test_cleaned.empty and np.isfinite(X_test_cleaned).all().all():
                      X_test_scaled = scaler.transform(X_test_cleaned)

                      y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
                      y_test_pred = (y_test_pred_prob >= best_threshold).astype(int)

                      if len(y_test_cleaned.unique()) > 1:
                          print("\nClassification Report (Test Set):")
                          print(classification_report(y_test_cleaned, y_test_pred, zero_division=0))
                          precision_test_alcista = precision_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                          recall_test_alcista = recall_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                          f1_test_alcista = f1_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)

                          print(f"Tamaño de y_test (cleaned): {y_test_cleaned.size}")
                          print(f"Distribución de clases en y_test (cleaned) para {symbol}:")
                          print(y_test_cleaned.value_counts())
                          if 1 in y_test_cleaned.value_counts():
                              ratio_1_test=(y_test_cleaned.value_counts()[1]/y_test_cleaned.size).round(4)
                          else:
                              ratio_1_test = 0
                          print(f"% clase 1 test para {symbol}: {ratio_1_test} ")

                          if len(y_test_cleaned.unique()) > 1:
                               roc_auc_test = roc_auc_score(y_test_cleaned, y_test_pred_prob).round(6)
                               print(f"\nROC-AUC (Test Set) para {symbol}: {roc_auc_test:.4f}")
                          else:
                               roc_auc_test = None
                               print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot calculate ROC-AUC.")

                      else:
                          print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot generate full classification report.")
                          precision_test_alcista = None
                          recall_test_alcista = None
                          f1_test_alcista = None
                          roc_auc_test = None
                          ratio_1_test = None


                  else:
                      print(f"\nWarning: X_test became empty after cleaning or contains non-finite values for {symbol}. Skipping test evaluation.")
                      precision_test_alcista = None
                      recall_test_alcista = None
                      f1_test_alcista = None
                      roc_auc_test = None
                      ratio_1_test = None


              else:
                  print(f"\nWarning: Training data (X_train_full) became empty or contains non-finite values after cleaning for scaler fitting for evaluation for {symbol}. Skipping test evaluation.")
                  precision_test_alcista = None
                  recall_test_alcista = None
                  f1_test_alcista = None
                  roc_auc_test = None
                  ratio_1_test = None


          else:
              print(f"\nAdvertencia: Conjunto de prueba insuficiente o con una sola clase para evaluación para {symbol}.")
              precision_test_alcista = None
              recall_test_alcista = None
              f1_test_alcista = None
              roc_auc_test = None
              ratio_1_test = None

          # Prediction for the next day is done only if model was trained successfully
          last_features = df[features].iloc[-1:]
          last_features_cleaned = None
          last_features_scaled = None
          future_pred_prob = None
          future_pred = None

          if not last_features.empty:
              # Ensure last_features is a single row DataFrame before cleaning
              if not isinstance(last_features, pd.DataFrame) or len(last_features) != 1:
                   print(f"\nError: last_features is not a single row DataFrame for {symbol}. Skipping prediction.")
                   # Set prediction results to skipped
                   last_data_date = df.index[-1] if not df.empty else None
                   last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                   resultsp.append({
                       'Papel': symbol,
                       'Fecha Predicción': next_day,
                       'Fecha Datos': last_data_date,
                       'Predicción': 'Skipped (Prediction Data Error)',
                       'Precio actual': last_close,
                       'Probabilidad Alcista (Modelo)': None,
                       'Umbral de Clasificación': None,
                       'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Prediction Data Error)',
                       'Precision Test (Alcista)': precision_test_alcista,
                       'Recall Test (Alcista)': recall_test_alcista,
                       'F1 Test (Alcista)': f1_test_alcista,
                       'ROC-AUC Test': roc_auc_test,
                       'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': None,
                       'Features Escaladas (Predicción)': None
                   })
                   continue # Skip to the next ticker


              last_features_cleaned = last_features.replace([np.inf, -np.inf], np.nan).dropna()
              last_features_cleaned = last_features_cleaned.clip(lower=lower_bound, upper=upper_bound)

              if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                  # Ensure scaler is fitted on the *cleaned* full training data
                  scaler = RobustScaler()
                  X_train_full_cleaned_for_scaler_pred = X_train_full.replace([np.inf, -np.inf], np.nan)
                  X_train_full_cleaned_for_scaler_pred.dropna(inplace=True)
                  X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.clip(lower=lower_bound, upper=upper_bound)
                  X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.astype(np.float64)

                  if not np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                       print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_scaler_pred for {symbol} before scaler fit. Attempting to fill with median.")
                       for col in X_train_full_cleaned_for_scaler_pred.columns:
                           finite_values = X_train_full_cleaned_for_scaler_pred[col][np.isfinite(X_train_full_cleaned_for_scaler_pred[col])]
                           if not finite_values.empty:
                               median_val = finite_values.median()
                               X_train_full_cleaned_for_scaler_pred[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                               X_train_full_cleaned_for_scaler_pred[col].fillna(median_val, inplace=True)
                           else:
                               print(f"Warning: Column '{col}' in X_train_full_cleaned_for_scaler_pred is all non-finite. Filling with 0.")
                               X_train_full_cleaned_for_scaler_pred[col].fillna(0, inplace=True)


                  if not X_train_full_cleaned_for_scaler_pred.empty and np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                       scaler.fit(X_train_full_cleaned_for_scaler_pred)

                       last_features_cleaned = last_features_cleaned.astype(np.float64)
                       # Add explicit fallback for non-finite values AFTER dropna for prediction data
                       if not np.isfinite(last_features_cleaned).all().all():
                            print(f"\nWarning: Non-finite values STILL detected in last_features_cleaned for {symbol} AFTER dropna. Attempting to fill with median (from train data).")
                            train_medians = X_train_full_cleaned_for_scaler_pred.median()
                            for col in last_features_cleaned.columns:
                                median_val = train_medians.get(col, 0)
                                last_features_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                                last_features_cleaned[col].fillna(median_val, inplace=True)
                            if not np.isfinite(last_features_cleaned).all().all():
                                 print(f"\nERROR: Non-finite values STILL detected in last_features_cleaned for {symbol} after filling with median!")


                       if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                           last_features_scaled = scaler.transform(last_features_cleaned)

                           future_pred_prob = best_model.predict_proba(last_features_scaled)[:, 1][0].round(4)
                           future_pred = 1 if future_pred_prob >= best_threshold else 0

                           last_close = None
                           last_open = None
                           last_max = None
                           if not df.empty:
                               last_close = df['Close'].iloc[-1]
                               last_open = df['Open'].iloc[-1]
                               last_max = df['High'].iloc[-1]
                               last_data_date = df.index[-1]
                        else:
                               print(f"Warning: DataFrame 'df' is empty for {symbol}. Cannot get last prices.")
                               last_data_date = None

                           action = 'BUY' if future_pred == 1 else 'SELL'
                           direction = 1 if future_pred == 1 else -1

                           # Ensure output uses to_dict() and tolist() explicitly
                           resultsp.append({
                                       'Papel': symbol,
                                       'Fecha Predicción': next_day,
                                       'Fecha Datos': last_data_date,
                                       'Predicción': 'Alcista' if future_pred == 1 else 'Bajista',
                                       'Probabilidad Alcista (Modelo)': future_pred_prob,
                                       'Umbral de Clasificación': best_threshold.round(4),
                                       'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)', # Add check for best_model
                                       'Precision Test (Alcista)': precision_test_alcista,
                                       'Recall Test (Alcista)': recall_test_alcista,
                                       'F1 Test (Alcista)': f1_test_alcista,
                                       'ROC-AUC Test': roc_auc_test,
                                       'clase 1 en test (cleaned)': ratio_1_test,
                                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None, # Add cleaned features using to_dict
                                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None # Add scaled features as a list
                               })
                       else:
                            print(f"\nWarning: last_features became empty or contains non-finite values after final filling for scaler transform. Skipping prediction for {symbol}.")
                            last_data_date = df.index[-1] if not df.empty else None
                            last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                            resultsp.append({
                                'Papel': symbol,
                                'Fecha Predicción': next_day,
                                'Fecha Datos': last_data_date,
                                'Predicción': 'Skipped (Prediction Data Issue)',
                                'Precio actual': last_close,
                                'Probabilidad Alcista (Modelo)': None,
                                'Umbral de Clasificación': None,
                                'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                'Precision Test (Alcista)': precision_test_alcista,
                                'Recall Test (Alcista)': recall_test_alcista,
                                'F1 Test (Alcista)': f1_test_alcista,
                                'ROC-AUC Test': roc_auc_test,
                                'clase 1 en test (cleaned)': ratio_1_test,
                                 'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                                 'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                            })


                  else:
                       print(f"\nWarning: Training data (X_train_full) became empty or contains non-finite values after final filling for scaler fitting for prediction. Skipping prediction for {symbol}.")
                       last_data_date = df.index[-1] if not df.empty else None
                       last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                       resultsp.append({
                           'Papel': symbol,
                           'Fecha Predicción': next_day,
                           'Fecha Datos': last_data_date,
                           'Predicción': 'Skipped (Prediction Data Issue)',
                           'Precio actual': last_close,
                           'Probabilidad Alcista (Modelo)': None,
                           'Umbral de Clasificación': None,
                           'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                           'Precision Test (Alcista)': precision_test_alcista,
                           'Recall Test (Alcista)': recall_test_alcista,
                           'F1 Test (Alcista)': f1_test_alcista,
                           'ROC-AUC Test': roc_auc_test,
                           'clase 1 en test (cleaned)': ratio_1_test,
                            'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                            'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                       })


              else:
                  print(f"\nWarning: Could not make prediction for {symbol} as last_features became empty or contains non-finite values after cleaning.")
                  last_data_date = df.index[-1] if not df.empty else None
                  last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                  resultsp.append({
                      'Papel': symbol,
                      'Fecha Predicción': next_day,
                      'Fecha Datos': last_data_date,
                      'Predicción': 'Skipped (Prediction Data Issue)',
                      'Precio actual': last_close,
                      'Probabilidad Alcista (Modelo)': None,
                      'Umbral de Clasificación': None,
                      'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                      'Precision Test (Alcista)': precision_test_alcista,
                      'Recall Test (Alcista)': recall_test_alcista,
                      'F1 Test (Alcista)': f1_test_alcista,
                      'ROC-AUC Test': roc_auc_test,
                      'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                  })


          else:
              print(f"\nWarning: Could not make prediction for {symbol} as last_features was initially empty.")
              last_data_date = df.index[-1] if not df.empty else None
              last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
              resultsp.append({
                  'Papel': symbol,
                  'Fecha Predicción': next_day,
                  'Fecha Datos': last_data_date,
                  'Predicción': 'Skipped (Prediction Data Issue)',
                  'Precio actual': last_close,
                  'Probabilidad Alcista (Modelo)': None,
                  'Umbral de Clasificación': None,
                  'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                  'Precision Test (Alcista)': precision_test_alcista,
                  'Recall Test (Alcista)': recall_test_alcista,
                  'F1 Test (Alcista)': f1_test_alcista,
                  'ROC-AUC Test': roc_auc_test,
                  'clase 1 en test (cleaned)': ratio_1_test,
                   'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                   'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
              })


      except ValueError as e:
          # Catch the specific ValueError during RandomizedSearchCV fit
          print(f"\nERROR: Failed to fit RandomizedSearchCV for {symbol} due to data issues: {e}. Skipping tuning, evaluation, and prediction for this ticker.")
          # Set results for this ticker to None or default values if the fit fails
          best_model = None
          best_threshold = 0.5 # Use default threshold if model not trained
          precision_test_alcista = None
          recall_test_alcista = None
          f1_test_alcista = None
          roc_auc_test = None
          ratio_1_test = None

          # Append a result entry indicating failure, if you still want a record
          last_data_date = df.index[-1] if not df.empty else None
          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

          # Explicitly add None for feature columns if tuning failed
          resultsp.append({
                'Papel': symbol,
                'Fecha Predicción': next_day,
                'Fecha Datos': last_data_date,
                'Predicción': 'Skipped (Tuning Error)', # More specific error message
                'Precio actual': last_close,
                'Probabilidad Alcista (Modelo)': None,
                'Umbral de Clasificación': None,
                'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Tuning Error)',
                'Precision Test (Alcista)': precision_test_alcista,
                'Recall Test (Alcista)': recall_test_alcista,
                'F1 Test (Alcista)': f1_test_alcista,
                'ROC-AUC Test': roc_auc_test,
                'clase 1 en test (cleaned)': ratio_1_test,
                 'Features Limpias (Predicción)': None, # No prediction data if tuning failed
                 'Features Escaladas (Predicción)': None # No prediction data if tuning failed
          })


      except Exception as e:
          # Catch any other unexpected errors during tuning or evaluation
          print(f"\nERROR: An unexpected error occurred during tuning or evaluation for {symbol}: {e}. Skipping evaluation and prediction for this ticker.")
          best_model = None
          best_threshold = 0.5
          precision_test_alcista = None
          recall_test_alcista = None
          f1_test_alcista = None
          roc_auc_test = None
          ratio_1_test = None

          last_data_date = df.index[-1] if not df.empty else None
          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

          # Explicitly add None for feature columns if an unexpected error occurred
          resultsp.append({
                'Papel': symbol,
                'Fecha Predicción': next_day,
                'Fecha Datos': last_data_date,
                'Predicción': 'Skipped (Error)',
                'Precio actual': last_close,
                'Probabilidad Alcista (Modelo)': None,
                'Umbral de Clasificación': None,
                'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Error)',
                'Precision Test (Alcista)': precision_test_alcista,
                'Recall Test (Alcista)': recall_test_alcista,
                'F1 Test (Alcista)': f1_test_alcista,
                'ROC-AUC Test': roc_auc_test,
                'clase 1 en test (cleaned)': ratio_1_test,
                'Features Limpias (Predicción)': None, # No prediction data if error occurred
                'Features Escaladas (Predicción)': None # No prediction data if error occurred
          })


  else:
      print(f"\nWarning: Training data (X_train_full or y_train_full) is empty or has only one class after cleaning for tuning for {symbol}. Skipping model training and prediction.")
      # Append a result entry indicating skip due to insufficient training data
      last_data_date = df.index[-1] if not df.empty else None
      last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

      # Explicitly add None for feature columns if training data is insufficient
      resultsp.append({
            'Papel': symbol,
            'Fecha Predicción': next_day,
            'Fecha Datos': last_data_date,
            'Predicción': 'Skipped (Insufficient Training Data)',
            'Precio actual': last_close,
            'Probabilidad Alcista (Modelo)': None,
            'Umbral de Clasificación': None,
            'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Training Data)',
            'Precision Test (Alcista)': precision_test_alcista,
            'Recall Test (Alcista)': recall_test_alcista,
            'F1 Test (Alcista)': f1_test_alcista,
            'ROC-AUC Test': roc_auc_test,
            'clase 1 en test (cleaned)': ratio_1_test,
            'Features Limpias (Predicción)': None, # No prediction data if training data is insufficient
            'Features Escaladas (Predicción)': None # No prediction data if training data is insufficient
      })


# Crear tabla de resultados de predicción
resultsp_df = pd.DataFrame(resultsp)
print(resultsp_df)
if not resultsp_df.empty:
    resultsp_df.set_index('Fecha Predicción', inplace=True)

    # Mostrar resultados de predicción
    pd.set_option('display.max_columns', None)
    #pd.set_option('display.max_rows', None) # Optional: display all rows
    pd.set_option('display.max_colwidth', None) # Optional: display full content of columns

    print("\nPrediccion para el proximo dia (hasta", next_day.strftime('%Y-%m-%d'), "):")
    print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
    display(resultsp_df) # Use display for better formatting

    # Guardar y descargar el CSV de predicciones
    resultsp_df.to_csv(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
    files.download(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv")
    print(f"\nArchivo 'Predic_results_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")
else:
    print("\nNo hay resultados de predicción para mostrar.")

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 609)

LA MEJORA DE GK

mas listo que ayer

In [4]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from google.colab import files  # Para descargar el CSV en Colab
from scipy import stats


# Funciones de features
def add_lagged_price_features(df):
    for lag in range(1, 6):
        df[f'close_lag_{lag}'] = df['Close'].shift(lag)
    return df

def add_lagged_price_features(df, etiqueta="close_lag", dato="Close"):
    for lag in range(1, 6):
        df[f'{etiqueta}_{lag}'] = df[dato].shift(lag)
    return df


def calculate_RSI(series, period=7):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_ROC(series, period=5):
    return ((series - series.shift(period)) / series.shift(period)) * 100

def calculate_PPO(series, fast_period=5, slow_period=9, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ppo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ppo.ewm(span=signal_period, adjust=False).mean()
    histogram = ppo - signal_line
    return ppo, signal_line, histogram

def calculate_EWO(series, fast_period=5, slow_period=35, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ewo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ewo.ewm(span=signal_period, adjust=False).mean()
    histogram = ewo - signal_line
    return ewo, signal_line, histogram

def calculate_volatility(series, window=20):
    return series.rolling(window).std().round(6)

def calculate_sma5(series, period=5):
    return series.rolling(window=period).mean().round(4)

def calculate_sma13(series, period=13):
    return series.rolling(window=period).mean().round(4)

def calculate_sma26(series, period=26):
    return series.rolling(window=period).mean().round(4)

def calculate_sma50(series, period=50):
    return series.rolling(window=period).mean().round(4)

def calculate_sma200(series, period=200):
    return series.rolling(window=period).mean().round(4)


#def calculate_earnings_season(df):
#    df['Is_Earnings_Season'] = df.index.month.isin([1, 4, 7, 10])
#    return df

#def calculate_christmas_rally(df):
#    df['Is_Christmas_Rally'] = df.index.month.isin([11, 12])
#    return df

def create_features(df):
    df = add_lagged_price_features(df, "close_lag", "Close")
    df = add_lagged_price_features(df, "open_lag", "Open")
    df = add_lagged_price_features(df, "high_lag", "High")
    df['Pct_change'] = df['Close'].pct_change()
    for lag in range(1, 6):
        df[f'lag_change{lag}'] = df['Pct_change'].shift(lag)
    df['RSI'] = calculate_RSI(df['Close'])
    df['ROC'] = calculate_ROC(df['Close'])
    df['PPO'], df['PPO_Signal'], df['PPO_Histogram'] = calculate_PPO(df['Close'])
    df['EWO'], df['EWO_Signal'], df['EWO_Histogram'] = calculate_EWO(df['Close'])
    df['SMA5'] = calculate_sma5(df['Close'])
    df['SMA13'] = calculate_sma13(df['Close'])
    df['SMA26'] = calculate_sma26(df['Close'])
    df['SMA50'] = calculate_sma50(df['Close'])
    df['SMA200'] = calculate_sma200(df['Close'])
    df['Volatility'] = calculate_volatility(df['Close'])
    df['Label'] = (df['Pct_change'] > 0).astype(int)
    df['Return'] = np.log(df['Close'] / df['Close'].shift())
    #df = calculate_earnings_season(df)
    #df = calculate_christmas_rally(df)
    df.dropna(inplace=True)
    return df

# Definir fecha de corte manualmente (cambiar diariamente)
end_date = dt.datetime(2025, 7, 23)  # Ejemplo: cambiar a 2025-07-18 mañana

tk =[ "ALUA.BA", "BBAR.BA", "BMA.BA", "COME.BA", "CRES.BA", "EDN.BA", "GGAL.BA", "IRSA.BA", "LOMA.BA", "METR.BA", "PAMP.BA", "SUPV.BA", "TECO2.BA", "TGNO4.BA", "TGSU2.BA", "TRAN.BA", "TXAR.BA", "VALO.BA", "YPFD.BA"]

tk =[ "ALUA.BA", "BBAR.BA",  "PAMP.BA",  "YPFD.BA"]

# pinchas = "BYMA.BA" start_date dt.datetime(2021, 1, 1),  "CEPU.BA" start_date  dt.datetime(2018, 1, 1)

results = [] # el del test, para que no lo reinicie
resultsp = [] # las predicciones, para que no lo reinicie

for papel in tk:

  symbol=papel
  #symbol="COME.BA"
  # Fechas dinámicas
  start_date = dt.datetime(2001, 1, 1)  # Inicio fijo
  train_end = end_date - pd.Timedelta(days=780)  # 6 meses antes de end_date (ajustable)
  next_day = end_date + pd.Timedelta(days=1)  # Predicción para el día siguiente
  backtest_start = end_date - pd.Timedelta(days=2)  # Inicio del backtesting 6 meses antes de end_date (ajustable)

  # Descargar datos
  df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

  # Verificar datos

  for intentos in range(1, 20):
    if df.empty:
      print("No se pudieron descargar datos. Verifica el símbolo, las fechas o la conexión.")
      print(f"\n Reintentando {symbol}: {intentos} de 20")
      df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)
      intentos +=1
      time.sleep(8)
    else:
      print(f"\n Datos descargados en {intentos} vueltas: {symbol} , seguimos")


  # Aplanar MultiIndex si existe
  if isinstance(df.columns, pd.MultiIndex):
      print("MultiIndex detectado en columnas. Aplanando...")
      df.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
      print("Columnas asignadas después de aplanamiento:", df.columns.tolist())
      print("Últimas filas antes de corrección:", df.tail())

      # Corregir el orden de las columnas según tu mapeo
      columns = df.columns.tolist()
      df = df[[columns[4], columns[2], columns[3], columns[0], columns[5], columns[1]]]  # Reordenar
      df.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']  # Asignar nombres correctos
      print("Últimas filas después de corrección:", df.tail())

  if isinstance(df.index, pd.MultiIndex):
      print("MultiIndex detectado en índice. Seleccionando ticker...")
      df = df.xs(symbol, level='Ticker', axis=0)
  df.index = pd.to_datetime(df.index)  # Asegurar índice datetime
  if not df.index.is_unique:
      print("Advertencia: Índice con fechas duplicadas. Eliminando duplicados...")
      df = df[~df.index.duplicated(keep='first')]

  # Verificar columnas
  print("Columnas del DataFrame después de descargar y corregir:")
  print(df.columns)

  df['Open']= df['Open'].round(2)
  df['High']= df['High'].round(2)
  df['Low']= df['Low'].round(2)
  df['Close']= df['Close'].round(2)
  df['Adj Close']= df['Adj Close'].round(2)

  print("Últimas filas del DataFrame antes de crear features:")
  print(df.tail())

  # Crear features
  df = create_features(df)

  # Verificar datos después de crear features
  print("\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  # Verificar datos después de crear features
  print("\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  print(df.columns)

  print("Distribucion de etiquetas")
  print(df["Label"].value_counts(normalize=True))
  correlation = df[features + ["Label"]].corr()["Label"].sort_values(ascending=False)
  print("Correlacion con label")
  print(correlation)

  # Seleccionar features
  features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
            [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)]

  #features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
  #          [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)]
  X = df[features]
  y = df['Label']

  # Dividir datos en entrenamiento y prueba
  X_train_full = X[df.index <= train_end]
  y_train_full = y[df.index <= train_end]
  X_test = X[(df.index > train_end) & (df.index <= end_date)]  # Hasta end_date
  y_test = y[(df.index > train_end) & (df.index <= end_date)]

  # Optimizar hiperparámetros con RandomizedSearchCV
  print("Optimizar hiperparámetros con RandomizedSearchCV")
  param_dist = {
      'learning_rate': [0.01, 0.05, 0.1, 0.2],
      'max_depth': [3, 5, 7, 9],
      'n_estimators': [100, 500, 900],
      'subsample': [0.6, 0.8, 1.0],
      'colsample_bytree': [0.6, 0.8, 1.0],
      'gamma': [0, 0.1, 0.2]
  }
  xgb = XGBClassifier(objective='binary:logistic', random_state=42)
  random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=5, scoring='f1', n_jobs=-1, random_state=42)
  random_search.fit(X_train_full, y_train_full)
  print("Mejores hiperparámetros:", random_search.best_params_)

  # Usar el mejor modelo
  best_model = random_search.best_estimator_

  # Optimizar el umbral
  y_train_prob = best_model.predict_proba(X_train_full)[:, 1]
  thresholds = np.arange(0.48, 0.80, 0.01) # de 0.48 a 0.80  con pasos de 0.01????
  best_threshold = 0.5
  best_f1 = 0
  for threshold in thresholds:
      y_pred_threshold = (y_train_prob >= threshold).astype(int)
      f1 = f1_score(y_train_full, y_pred_threshold)
      if f1 > best_f1:
          best_f1 = f1
          best_threshold = threshold
  best_threshold.round(2)
  print("Mejor umbral:", best_threshold)



  # Backtesting con escalado dinámico para evitar data leak
  dates = df[(df.index >= backtest_start) & (df.index <= end_date)].index
  #results = []

  for test_date in dates:
      train_data = df[df.index < test_date].copy()
      if train_data.empty or train_data['Label'].isna().all():
          continue
      train_data = train_data.dropna()

      X_train_loop = train_data[features]
      y_train_loop = train_data['Label']

      scaler = StandardScaler()  # Reinicio dinámico del escalador
      X_train_scaled = scaler.fit_transform(X_train_loop)
      print(f"Escalando datos hasta {train_data.index[-1]} para predecir {test_date}")

      best_model.fit(X_train_scaled, y_train_loop)

      if test_date in df.index:
          test_row = df.loc[[test_date]][features]
          if test_row.empty:
              continue
          test_row_scaled = scaler.transform(test_row)

          prediction_prob = best_model.predict_proba(test_row_scaled)[0][1].round(4)
          prediction = 1 if prediction_prob >= best_threshold else 0

          real_direction = df.loc[test_date, 'Label'] if pd.notna(df.loc[test_date, 'Label']) else None
          close_price = df.loc[test_date, 'Close']

          is_correct = int(prediction == real_direction) if real_direction is not None else None
          data_date = df.index[df.index < test_date][-1] if not df[df.index < test_date].empty else None

          results.append({
              'Papel': papel,
              'Fecha Predicción': test_date,
              'Fecha Datos': data_date,
              'Predicción': 'Alcista' if prediction == 1 else 'Bajista',
              'Resultado Real': 'Alcista' if real_direction == 1 else 'Bajista' if real_direction == 0 else None,
              'Precio Cierre': close_price,
              'Probabilidad Alcista': prediction_prob,
              'Correcta': 'Sí' if is_correct == 1 else 'No' if is_correct == 0 else None
          })

  # Crear tabla de resultados
  results_df = pd.DataFrame(results)
  results_df.set_index('Fecha Predicción', inplace=True)

  # Mostrar resultados
  pd.set_option('display.max_columns', None)
  print("\nResultados del backtesting (hasta", end_date.strftime('%Y-%m-%d'), "):")
  print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
  print(results_df)

  # Guardar y descargar el CSV
  #results_df.to_csv(f"backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
  #files.download(f"backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv")
  #print(f"\nArchivo 'backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")

  # Métricas del backtesting
  #if results_df['Correcta'].notna().sum() > 0:
  #    accuracy = (results_df['Correcta'] == 'Sí').sum() / results_df['Correcta'].notna().sum()
  #    print(f"\nAccuracy del backtesting: {accuracy:.2%}")

  #    valid_results = results_df[results_df['Correcta'].notna()]
  #    y_true = [1 if r == 'Alcista' else 0 for r in valid_results['Resultado Real']]
  #    y_pred = [1 if p == 'Alcista' else 0 for p in valid_results['Predicción']]
  #    print("\nMatriz de Confusión:")
  #    print(confusion_matrix(y_true, y_pred))
  #    print("\nInforme de Clasificación:")
  #    print(classification_report(y_true, y_pred))

  # ROC-AUC para entrenamiento y prueba
  if not X_train_full.empty and not X_test.empty:
      X_train_scaled = StandardScaler().fit_transform(X_train_full)
      X_test_scaled = StandardScaler().fit_transform(X_test)
      best_model.fit(X_train_scaled, y_train_full)
      train_pred_proba = best_model.predict_proba(X_train_scaled)[:, 1]
      test_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
      roc_auc_train = roc_auc_score(y_train_full, train_pred_proba)
      roc_auc_test = roc_auc_score(y_test, test_pred_proba).round(6)
      # Verificar tamaños de las muestras
      print(f"Tamaño de y_train_full: {y_train_full.size}")
      print(f"Tamaño de y_test: {y_test.size}")
      print("Distribución de clases en y_train_full:")
      print(y_train_full.value_counts())
      ratio_1_train=(y_train_full.value_counts()[1]/y_train_full.size).round(4)
      print(f"% clase 1 train: {ratio_1_train} ")

      print("Distribución de clases en y_test:")
      print(y_test.value_counts())
      ratio_1_test=(y_test.value_counts()[1]/y_test.size).round(4)
      print(f"% clase 1 test: {ratio_1_test} ")

      print(f"\nROC-AUC en el conjunto de entrenamiento: {roc_auc_train:.4f}")
      print(roc_auc_train)
      print(f"ROC-AUC en el conjunto de prueba: {roc_auc_test:.4f}")
      y_pred_test = (best_model.predict_proba(X_test_scaled)[:, 1] >= 0.3).astype(int)
      print("\nMatriz de Confusión (prueba):")
      print(confusion_matrix(y_test, y_pred_test))
      print("\nInforme de Clasificación (prueba):")
      print(classification_report(y_test, y_pred_test))
  else:
      print("\nAdvertencia: Conjunto de prueba o entrenamiento insuficiente. No se calculó ROC-AUC.")

  print("Mejores hiperparámetros:", random_search.best_params_)
  print("Mejor umbral:", best_threshold)


  # Predicción para el día siguiente
  last_features = df[features].iloc[-1:]
  scaler = StandardScaler()
  last_features_scaled = scaler.fit_transform(last_features)
  future_pred_prob = best_model.predict_proba(last_features_scaled)[0][1].round(4)
  future_pred = 1 if future_pred_prob >= best_threshold else 0

  returns = df['Return'].dropna()
  mean_return = returns.mean()
  std_return = returns.std()
  last_close = df['Close'].iloc[-1]
  expected_price = last_close * np.exp(mean_return + 0.5 * std_return**2)
  price_prob = future_pred_prob if future_pred == 1 else 1 - future_pred_prob

  action = 'BUY' if future_pred == 1 else 'SELL'
  direction = 1 if future_pred == 1 else -1

  resultsp.append({
              'Papel': papel,
              'Fecha Predicción': next_day,
              'Fecha Datos': df.index[-1],
              'Predicción': 'Alcista' if future_pred == 1 else 'Bajista',
              'Resultado Real': "Veremos",
              'Precio actual': last_close,
              'Precio Cierre': "futuro",
              'Correcta': "verificar",
              'Probabilidad Alcista': future_pred_prob,
              'Umbral': best_threshold,
              'ROC-AUC prueba': roc_auc_test ,
              'clase 1 en train': str(ratio_1_train),
              'clase 1 en test': str(ratio_1_test),
              'Mejores hiperparámetros': str(random_search.best_params_),
              'Matrix prueba': str(classification_report(y_test, y_pred_test))
              #si la pasas a tolist() perdes los titulos
              #'Matrix prueba': str((classification_report(y_test, y_pred_test)).tolist())

          })
  # Crear tabla de resultados
  resultsp_df = pd.DataFrame(resultsp)
  resultsp_df.set_index('Fecha Predicción', inplace=True)

  # Mostrar resultados
  pd.set_option('display.max_columns', None)
  print("\nPrediccion para el proximo dia (hasta", next_day.strftime('%Y-%m-%d'), "):")
  print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
  print(resultsp_df)

  print(f"\nPredicción para {next_day.strftime('%Y-%m-%d')} (basada en datos hasta {df.index[-1].strftime('%Y-%m-%d')}):")
  print(f"Papel: {symbol}")
  print(f"Precio actual: [{last_close:.4f}]")
  print(f"Precio esperado para el siguiente día (distribución log-normal): [{expected_price:.4f}]")
  print(f"Probabilidad de que el precio predicho sea correcto: [{price_prob:.4f}]")
  print(f"Corte: {df.index[-1]}")
  print(f"\nPredicción para {next_day.strftime('%Y-%m-%d')}: {'Alcista' if future_pred == 1 else 'Bajista'} (Probabilidad Alcista: {future_pred_prob:.2%})")
  print(f"Pronóstico de dirección del activo (1: subida, -1: bajada): {direction}")
  print(f"Acción sugerida por la estrategia de trading: {action}")

  print(f"\n {symbol}, {next_day.strftime('%d-%m-%y')}, {df.index[-1].strftime('%d-%m-%y')}, {action}, , ,{future_pred_prob:.4}")


# Guardar y descargar el CSV
results_df.to_csv(f"backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
files.download(f"backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv")
print(f"\nArchivo 'backtesting_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")
import time
time.sleep(6)

resultsp_df.to_csv(f"Predic_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
files.download(f"Predic_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv")
print(f"\nArchivo 'Predic_results_{symbol}_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")

[*********************100%***********************]  1 of 1 completed


 Datos descargados en 1 vueltas: ALUA.BA , seguimos

 Datos descargados en 2 vueltas: ALUA.BA , seguimos

 Datos descargados en 3 vueltas: ALUA.BA , seguimos

 Datos descargados en 4 vueltas: ALUA.BA , seguimos

 Datos descargados en 5 vueltas: ALUA.BA , seguimos

 Datos descargados en 6 vueltas: ALUA.BA , seguimos

 Datos descargados en 7 vueltas: ALUA.BA , seguimos

 Datos descargados en 8 vueltas: ALUA.BA , seguimos

 Datos descargados en 9 vueltas: ALUA.BA , seguimos

 Datos descargados en 10 vueltas: ALUA.BA , seguimos

 Datos descargados en 11 vueltas: ALUA.BA , seguimos

 Datos descargados en 12 vueltas: ALUA.BA , seguimos

 Datos descargados en 13 vueltas: ALUA.BA , seguimos

 Datos descargados en 14 vueltas: ALUA.BA , seguimos

 Datos descargados en 15 vueltas: ALUA.BA , seguimos

 Datos descargados en 16 vueltas: ALUA.BA , seguimos

 Datos descargados en 17 vueltas: ALUA.BA , seguimos

 Datos descargados en 18 vueltas: ALUA.BA , seguimos

 Datos descargados en 19 vueltas: AL




KeyError: "['Max_Gain_from_Open_Current', 'Max_Gain_from_Open_Lag_1', 'Max_Gain_from_Open_Lag_2', 'Max_Gain_from_Open_Lag_3', 'Max_Gain_from_Open_Lag_4', 'Max_Gain_from_Open_Lag_5', 'Max_Gain_from_Open_Lag_6'] not in index"

COMIENZA EL UMBRAL

In [None]:
import pandas as pd
import yfinance as yf
import datetime as dt

# Descargar datos (si no lo hiciste aún)
end_date = dt.datetime(2025, 7, 20)
df = yf.download("COME.BA", start=dt.datetime(2001, 1, 1), end=end_date)

# Definir umbral (por ejemplo, 2%)
umbral = 0.02

# Calcular la diferencia relativa y etiquetar
df['Label'] = ((df['High'] - df['Open']) / df['Open'] > umbral).astype(int) * 2 - 1

# Contar etiquetas y calcular porcentajes
label_counts = df['Label'].value_counts()
total_dias = len(df)
percentages = (label_counts / total_dias) * 100

# Imprimir resultados
print("Conteo de etiquetas:")
print(label_counts)
print("\nPorcentajes de cada clase (%):")
print(percentages.round(2))

# Opcional: Ver los primeros días etiquetados
print("\nPrimeros días etiquetados:")
print(df[['Open', 'High', 'Label']].tail(20))
print("mambral")
umbral = df['High'].sub(df['Open']).div(df['Open']).quantile(0.75)
print(umbral)
df['Label'] = ((df['High'] - df['Open']) / df['Open'] > umbral).astype(int) * 2 - 1
print(df['Label'].value_counts(normalize=True) * 100)

# Contar etiquetas y calcular porcentajes
label_counts = df['Label'].value_counts()
total_dias = len(df)
percentages = (label_counts / total_dias) * 100

# Imprimir resultados
print("Conteo de etiquetas:")
print(label_counts)
print("\nPorcentajes de cada clase (%):")
print(percentages.round(2))

In [5]:
import pandas as pd
import yfinance as yf
import datetime as dt

# Descargar datos
end_date = dt.datetime(2025, 7, 30)
df = yf.download("GGAL", start=dt.datetime(2001, 1, 1), end=end_date)

# Aplanar el MultiIndex a columnas simples
df.columns = df.columns.map(lambda x: x[0])

# Limpiar datos
df = df.dropna(subset=['Open', 'High'])

# Calcular umbral dinámico
#differences = (df['High'] - df['Open']) / df['Open']
#umbral = differences.quantile(0.6).item()
#print(f"Umbral calculado: {umbral:.4f}")
umbral = 0.019
lapso = 2

# Calcular etiqueta sin desfase (para verificar)
df['Label_raw'] = ((df['High'] - df['Open']) / df['Open'] > umbral).astype(int) * 2 - 1

# Desplazar la etiqueta un día hacia atrás (target del día "lapso")
df['Label'] = df['Label_raw'].shift(-lapso)

# Eliminar la última fila (no tiene etiqueta para predecir)
df = df.dropna(subset=['Label'])
print(df.tail(33))
# Contar etiquetas y calcular porcentajes
label_counts = df['Label'].value_counts()
total_dias = len(df)
percentages = (label_counts / total_dias) * 100

print("\nConteo de etiquetas (desfasadas):")
print(label_counts)
print("\nPorcentajes de cada clase (%):")
print(percentages.round(2))

# Opcional: Ver los primeros días etiquetados
print("\nPrimeros días etiquetados (features del día anterior, label del día siguiente):")
print(df[['Open', 'High', 'Label']].tail(22))

  df = yf.download("GGAL", start=dt.datetime(2001, 1, 1), end=end_date)
[*********************100%***********************]  1 of 1 completed

                Close       High        Low       Open   Volume  Label_raw  \
Date                                                                         
2025-06-09  53.657017  55.156869  53.508023  54.997944   973800         -1   
2025-06-10  56.944775  57.530807  53.885469  54.153655  1459600          1   
2025-06-11  55.822369  55.901829  52.743197  54.630431  7670800          1   
2025-06-12  55.762772  56.885180  54.382112  55.027746  2270100          1   
2025-06-13  54.302647  55.623712  53.716615  55.176735  1772100         -1   
2025-06-16  54.133789  55.216465  53.885469  54.709894  1057500         -1   
2025-06-17  53.984795  54.600631  52.465077  53.418625  1329000          1   
2025-06-18  53.786144  54.948281  53.329232  53.925203   979700         -1   
2025-06-20  51.650589  53.716614  51.581060  53.716614  1124700         -1   
2025-06-23  50.031540  51.968438  49.723625  51.134082  1593000         -1   
2025-06-24  52.326019  53.090846  50.657308  50.657308  1284100 




hasta aca llegaste

**apruebas**

In [8]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from google.colab import files
from scipy import stats
import time


# Functions for features
def add_lagged_price_features(df, etiqueta="close_lag", dato="Close"):
    for lag in range(1, 6):
        df[f'{etiqueta}_{lag}'] = df[dato].shift(lag)
    return df

def calculate_RSI(series, period=7):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_ROC(series, period=5):
    return ((series - series.shift(period)) / series.shift(period)) * 100

def calculate_PPO(series, fast_period=5, slow_period=9, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ppo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ppo.ewm(span=signal_period, adjust=False).mean()
    histogram = ppo - signal_line
    return ppo, signal_line, histogram

def calculate_EWO(series, fast_period=5, slow_period=35, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ewo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ewo.ewm(span=signal_period, adjust=False).mean()
    histogram = ewo - signal_line
    return ewo, signal_line, histogram

def calculate_volatility(series, window=20):
    return series.rolling(window).std().round(6)

def calculate_sma5(series, period=5):
    return series.rolling(window=period).mean().round(4)

def calculate_sma13(series, period=13):
    return series.rolling(window=period).mean().round(4)

def calculate_sma26(series, period=26):
    return series.rolling(window=period).mean().round(4)

def calculate_sma50(series, period=50):
    return series.rolling(window=period).mean().round(4)

def calculate_sma200(series, period=200):
    return series.rolling(window=period).mean().round(4)


def create_features(df, umbral, n_days_high=1):
    df = add_lagged_price_features(df, "close_lag", "Close")
    df = add_lagged_price_features(df, "open_lag", "Open")
    df = add_lagged_price_features(df, "high_lag", "High")
    df['Pct_change'] = df['Close'].pct_change()
    for lag in range(1, 6):
        df[f'lag_change{lag}'] = df['Pct_change'].shift(lag)
    df['RSI'] = calculate_RSI(df['Close'])
    df['ROC'] = calculate_ROC(df['Close'])
    df['PPO'], df['PPO_Signal'], df['PPO_Histogram'] = calculate_PPO(df['Close'])
    df['EWO'], df['EWO_Signal'], df['EWO_Histogram'] = calculate_EWO(df['Close'])
    df['SMA5'] = calculate_sma5(df['Close'])
    df['SMA13'] = calculate_sma13(df['Close'])
    df['SMA26'] = calculate_sma26(df['Close'])
    df['SMA50'] = calculate_sma50(df['Close'])
    df['SMA200'] = calculate_sma200(df['Close'])
    df['Volatility'] = calculate_volatility(df['Close'])

    # --- New Feature: Max Gain from Open over Past N Days ---
    # Calculate the maximum High price over the *next N days* for *each historical day*.
    # Use rolling().max() with min_periods=1 to handle ends of series.
    # Then shift to align with the start of the N-day window (the current day's Open).
    max_high_over_next_n_days_hist = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)


    # Calculate the potential max gain from Open for *each historical day*
    # Using the Open price of that historical day
    epsilon = 1e-9 # To prevent division by zero
    df['Max_Gain_from_Open_Current'] = (max_high_over_next_n_days_hist - df['Open']) / (df['Open'] + epsilon)

    # --- Add lagged versions of the new feature ---
    for lag in range(1, 7): # Create lags from 1 to 6
        df[f'Max_Gain_from_Open_Lag_{lag}'] = df['Max_Gain_from_Open_Current'].shift(lag)


    # Calculate the target based on tomorrow's Open vs Max High over next n_days_high days
    # Use rolling().max() with min_periods=1 for the target as well.
    # Shift to align with the start of the N-day window for the target (tomorrow's Open).
    max_high_next_n_days_target = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)
    open_next_day = df['Open'].shift(-1)
    df['Label_raw'] = ((max_high_next_n_days_target - open_next_day) / (open_next_day + epsilon) > umbral).astype(int)
    df['Label'] = df['Label_raw'].shift(-1) # Target for the next day


    # Replace inf values with NaN before dropping
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Dropping NaNs here will remove rows where features or the target are still NaN (e.g., due to initial lags,
    # or if min_periods=1 still results in NaN for very short series, although less likely now for the rolling max).
    df.dropna(inplace=True)

    return df

# Definir fecha de corte manualmente (cambiar diariamente)
end_date = dt.datetime(2025, 9, 14)  # Ejemplo: cambiar a 2025-07-18 mañana

tk =[ "ALUA.BA", "BBAR.BA", "BMA.BA", "COME.BA", "CRES.BA", "EDN.BA", "GGAL.BA", "IRSA.BA", "LOMA.BA", "METR.BA", "PAMP.BA", "SUPV.BA", "TECO2.BA", "TGNO4.BA", "TGSU2.BA", "TRAN.BA", "TXAR.BA", "VALO.BA", "YPFD.BA"]

tk =[ "ALUA.BA", "BBAR.BA", "METR.BA", "PAMP.BA", "TRAN.BA",  "YPFD.BA"]


# results = [] # el del test, para que no lo reinicie - REMOVED
resultsp = [] # las predicciones, para que no lo reinicie

umbral = 0.019
lapso = 1 # Lapso is no longer directly used for the target definition, but keeping it doesn't hurt
n_days_high_target = 3 # Define the number of days for the High target (used for both target and new feature)

# Define clipping bounds - adjust based on feature distributions
lower_bound = -1e9
upper_bound = 1e9


for papel in tk:

  symbol=papel
  #symbol="COME.BA"
  # Fechas dinámicas
  start_date = dt.datetime(2001, 1, 1)  # Inicio fijo
  train_end = end_date - pd.Timedelta(days=780)  # 6 meses antes de end_date (ajustable)
  next_day = end_date + pd.Timedelta(days=1)  # Predicción para el día siguiente


  # Select features - Add the new feature and its lags
  features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
            [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)] + \
            ['Max_Gain_from_Open_Current'] + [f'Max_Gain_from_Open_Lag_{i}' for i in range(1, 7)]


  # Download data for the current ticker inside the loop
  print(f"\nDownloading data for {symbol}...")
  df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

  # Verify data download
  if df.empty:
      print(f"Warning: No data downloaded for {symbol}. Skipping.")
      continue # Skip to the next ticker


  # Handle MultiIndex columns and ensure standard column names - More robust logic
  required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
  processed_df = None # Initialize processed_df

  if isinstance(df.columns, pd.MultiIndex):
      print(f"MultiIndex columns detected for {symbol}.")
      try:
          # Attempt to extract columns by looking for standard names in ANY level of the MultiIndex tuple
          extracted_data = {}
          for std_name in required_cols:
              matching_col_tuple = None
              # Iterate through all column tuples
              for col_tuple in df.columns:
                  # Check if the standard name exists in ANY level of the current tuple
                  if std_name in col_tuple:
                       matching_col_tuple = col_tuple
                       break # Found a match in this tuple

              if matching_col_tuple:
                  extracted_data[std_name] = df[matching_col_tuple]
              else:
                  print(f"Warning: Could not find standard column '{std_name}' in any level of MultiIndex for {symbol}. Column missing.")
                  # Continue to look for other required columns, processed_df will be checked later

          if len(extracted_data) == len(required_cols):
              processed_df = pd.DataFrame(extracted_data)
              processed_df.index = df.index # Preserve original index
              print(f"Successfully extracted and flattened MultiIndex columns for {symbol}.")
          else:
              missing_cols = [name for name in required_cols if name not in extracted_data]
              print(f"Warning: Could not extract all required columns from MultiIndex for {symbol}. Missing: {missing_cols}. Skipping ticker.")
              continue # Skip to the next ticker

      except Exception as e:
          print(f"Warning: An error occurred while processing MultiIndex columns for {symbol}: {e}. Skipping.")
          #print(f"Original columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  else: # If not MultiIndex columns, assume standard flat DataFrame is already present
      print(f"No MultiIndex columns detected for {symbol}. Checking for standard columns.")
      # Check if the required columns are directly present
      if all(col in df.columns for col in required_cols):
          processed_df = df[required_cols].copy() # Select required columns and make a copy
          print(f"Using existing standard columns for {symbol}.")
      else:
          missing_cols = [col for col in required_cols if col not in df.columns]
          print(f"Warning: Required standard columns not found in flat DataFrame for {symbol}. Missing: {missing_cols}. Skipping ticker.")
          #print(f"Available columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  # Ensure df is set to processed_df if successful
  df = processed_df

  # Handle MultiIndex index if present (less common with single ticker download but possible)
  if isinstance(df.index, pd.MultiIndex):
      print(f"MultiIndex index detected for {symbol}. Attempting to flatten index.")
      try:
          # Assuming the MultiIndex index structure is ('Ticker', 'Date')
          if 'Ticker' in df.index.names:
               df = df.xs(symbol, level='Ticker', axis=0)
               print(f"Índice aplanado para {symbol}.")
          else:
               print(f"Warning: MultiIndex index detected for {symbol} but 'Ticker' level not found. Skipping index flattening.")
               # If 'Ticker' level is not there, maybe it's just a date/time MultiIndex?
               # Or a different structure. For now, proceed without flattening index if Ticker level is missing.


      except KeyError:
          print(f"Warning: Could not select ticker from MultiIndex index for {symbol}. Skipping.")
          continue # Skip to the next ticker
      except Exception as e:
          print(f"Warning: An error occurred while flattening MultiIndex index for {symbol}: {e}. Skipping.")
          continue # Skip to the next ticker


  df.index = pd.to_datetime(df.index)
  if not df.index.is_unique:
      print(f"Advertencia: Índice con fechas duplicadas para {symbol}. Eliminando duplicados...")
      df = df[~df.index.duplicated(keep='first')]

  if df.empty:
      print(f"Warning: DataFrame is empty after initial processing and cleaning for {symbol}. Skipping.")
      continue


  # Ensure numeric types and handle potential non-numeric data
  for col in required_cols:
      if col in df.columns: # Ensure column exists before processing
          df[col] = pd.to_numeric(df[col], errors='coerce')
      else:
           # This should ideally not happen if previous checks passed, but as a safeguard:
           print(f"Error: Required column '{col}' not found in df for {symbol} before numeric conversion. Skipping ticker.")
           df = pd.DataFrame() # Set df to empty to skip further processing
           break # Exit column processing loop


  if df.empty: # Check again if df became empty due to missing columns
       continue # Skip to the next ticker

  # Drop rows where essential price data is missing after coercion
  df.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)


  if df.empty:
      print(f"Warning: DataFrame is empty after dropping rows with missing price data for {symbol}. Skipping.")
      continue


  df['Open']= df['Open'].round(2)
  df['High']= df['High'].round(2)
  df['Low']= df['Low'].round(2)
  df['Close']= df['Close'].round(2)
  df['Adj Close']= df['Adj Close'].round(2)

  print("Últimas filas del DataFrame antes de crear features:")
  print(df.tail())


  # Crear features with the new target definition
  df = create_features(df, umbral=umbral, n_days_high=n_days_high_target) # Pass n_days_high_target, removed lapso

  # Verify data is not empty after feature creation and dropna
  if df.empty:
      print(f"Warning: DataFrame is empty after feature creation and dropna for {symbol}. Skipping.")
      continue


  # Verify data after creating features
  print("\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  print(df.columns)

  print(f"Distribucion de etiquetas para {symbol}:")
  print(df["Label"].value_counts(normalize=True))

  # Check if there are samples from both classes in the target variable
  if len(df["Label"].unique()) < 2:
      print(f"Warning: Target variable 'Label' contains only one class for {symbol} after feature creation. Cannot train a classifier. Skipping.")
      continue


  correlation = df[features + ["Label"]].corr()["Label"].sort_values(ascending=False)
  print(f"Correlacion con label para {symbol}:")
  print(correlation)


  # Dividir datos en entrenamiento y prueba
  X = df[features]
  y = df['Label']
  X_train_full = X[df.index <= train_end]
  y_train_full = y[df.index <= train_end]
  X_test = X[(df.index > train_end) & (df.index <= end_date)]  # Hasta end_date
  y_test = y[(df.index > train_end) & (df.index <= end_date)]

  # Verify training and test sets are not empty and have both classes
  if X_train_full.empty or y_train_full.empty or len(y_train_full.unique()) < 2:
      print(f"Warning: Training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
      continue

  # Initialize test metrics before evaluation
  precision_test_alcista = None
  recall_test_alcista = None
  f1_test_alcista = None
  roc_auc_test = None
  ratio_1_test = None
  best_model = None # Initialize best_model to None
  best_threshold = 0.5 # Initialize best_threshold to default


  # Optimizar hiperparámetros con RandomizedSearchCV
  print(f"Optimizar hiperparámetros con RandomizedSearchCV para {symbol}")
  param_dist = {
      'learning_rate': [0.01, 0.05, 0.1, 0.2],
      'max_depth': [3, 5, 7, 9],
      'n_estimators': [100, 500, 900],
      'subsample': [0.6, 0.8, 1.0],
      'colsample_bytree': [0.6, 0.8, 1.0],
      'gamma': [0, 0.1, 0.2],
      'scale_pos_weight': [0.5, 1, 2, 5, 10, 20, 50, 100] # Incluir scale_pos_weight en la búsqueda
  }

  # Inicializar el clasificador XGBoost sin scale_pos_weight fijo (se tuneará)
  xgb = XGBClassifier(objective='binary:logistic', random_state=42)

  # Usar TimeSeriesSplit para cross-validation
  n_splits = 5  # Puedes ajustar el número de splits
  tscv = TimeSeriesSplit(n_splits=n_splits)

  # Definir scorer para maximizar Precision de la Clase 1
  precision_scorer = make_scorer(precision_score, pos_label=1, zero_division=0) # zero_division=0 para manejar casos sin predicciones positivas

  # Clean X_train_full and y_train_full before fitting RandomizedSearchCV
  X_train_full_cleaned_for_tuning = X_train_full.replace([np.inf, -np.inf], np.nan)
  X_train_full_cleaned_for_tuning.dropna(inplace=True)
  y_train_full_cleaned_for_tuning = y_train_full.loc[X_train_full_cleaned_for_tuning.index] # Ensure y matches cleaned X

  # Explicit check, conversion, and fallback for non-finite values before fitting RandomizedSearchCV
  X_train_full_cleaned_for_tuning = X_train_full_cleaned_for_tuning.astype(np.float64) # Ensure dtype
  if not np.isfinite(X_train_full_cleaned_for_tuning).all().all():
      print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_tuning for {symbol} before RandomizedSearchCV fit. Attempting to fill with median.")
      for col in X_train_full_cleaned_for_tuning.columns:
          finite_values = X_train_full_cleaned_for_tuning[col][np.isfinite(X_train_full_cleaned_for_tuning[col])]
          if not finite_values.empty:
              median_val = finite_values.median()
              X_train_full_cleaned_for_tuning[col].replace([np.inf, -np.inf], np.nan, inplace=True)
              X_train_full_cleaned_for_tuning[col].fillna(median_val, inplace=True)
          else:
              print(f"Warning: Column '{col}' in X_train_full_cleaned_for_tuning is all non-finite. Filling with 0.")
              X_train_full_cleaned_for_tuning[col].fillna(0, inplace=True)


  # Perform tuning, evaluation, and prediction within a general try-except block
  # to prevent script crash on problematic tickers
  if not X_train_full_cleaned_for_tuning.empty and not y_train_full_cleaned_for_tuning.empty and len(y_train_full_cleaned_for_tuning.unique()) > 1:
      try:
          random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=tscv, scoring=precision_scorer, n_jobs=-1, random_state=42) # Usar precision_scorer
          random_search.fit(X_train_full_cleaned_for_tuning, y_train_full_cleaned_for_tuning)
          print(f"Mejores hiperparámetros para {symbol}:", random_search.best_params_)

          # Usar el mejor modelo encontrado por RandomizedSearchCV
          best_model = random_search.best_estimator_

          # Get and print feature importance
          feature_importance = pd.Series(best_model.feature_importances_, index=features)
          print(f"\nFeature Importance for {symbol}:")
          print(feature_importance.sort_values(ascending=False))

          # Optimize the threshold for maximum Precision (Clase 1) on the full training set
          X_train_full_cleaned_for_threshold = X_train_full.replace([np.inf, -np.inf], np.nan)
          X_train_full_cleaned_for_threshold.dropna(inplace=True)
          y_train_full_cleaned_for_threshold = y_train_full.loc[X_train_full_cleaned_for_threshold.index]

          if not X_train_full_cleaned_for_threshold.empty and not y_train_full_cleaned_for_threshold.empty and len(y_train_full_cleaned_for_threshold.unique()) > 1:
              y_train_prob = best_model.predict_proba(X_train_full_cleaned_for_threshold)[:, 1]
              thresholds = np.arange(0.01, 1.0, 0.01)
              best_threshold = 0.5
              best_precision = 0

              print(f"Optimizing threshold for maximum Precision (Clase 1) on training data for {symbol}...")
              if 1 in y_train_full_cleaned_for_threshold.unique():
                  for threshold in thresholds:
                      y_pred_threshold = (y_train_prob >= threshold).astype(int)
                      if np.sum(y_pred_threshold) > 0:
                           precision = precision_score(y_train_full_cleaned_for_threshold, y_pred_threshold, pos_label=1, zero_division=0)
                           if precision > best_precision:
                               best_precision = precision
                               best_threshold = threshold
                  print(f"Mejor umbral para maximizar Precision (Clase 1) en entrenamiento para {symbol}: {best_threshold:.4f} (Precision: {best_precision:.4f})")
              else:
                  print(f"\nWarning: Training set for {symbol} contains no positive samples after cleaning for threshold optimization. Cannot optimize threshold for Precision (Clase 1). Using default threshold 0.5.")
                  best_threshold = 0.5
          else:
              print(f"\nWarning: Training data for threshold optimization is insufficient for {symbol}. Using default threshold 0.5.")
              best_threshold = 0.5


          # Evaluar el modelo en el conjunto de prueba con el best threshold
          if not X_test.empty and not y_test.empty and len(y_test.unique()) > 1:
              print(f"\nEvaluating best model on test set for {symbol} with best threshold ({best_threshold:.4f}):")

              X_test_cleaned = X_test.replace([np.inf, -np.inf], np.nan).dropna()
              y_test_cleaned = y_test.loc[X_test_cleaned.index]

              scaler = RobustScaler()
              X_train_full_cleaned_for_scaler_eval = X_train_full.replace([np.inf, -np.inf], np.nan)
              X_train_full_cleaned_for_scaler_eval.dropna(inplace=True)
              X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.clip(lower=lower_bound, upper=upper_bound)
              X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.astype(np.float64)

              if not np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                  print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_scaler_eval for {symbol} before scaler fit. Attempting to fill with median.")
                  for col in X_train_full_cleaned_for_scaler_eval.columns:
                      finite_values = X_train_full_cleaned_for_scaler_eval[col][np.isfinite(X_train_full_cleaned_for_scaler_eval[col])]
                      if not finite_values.empty:
                          median_val = finite_values.median()
                          X_train_full_cleaned_for_scaler_eval[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                          X_train_full_cleaned_for_scaler_eval[col].fillna(median_val, inplace=True)
                      else:
                          print(f"Warning: Column '{col}' in X_train_full_cleaned_for_scaler_eval is all non-finite. Filling with 0.")
                          X_train_full_cleaned_for_scaler_eval[col].fillna(0, inplace=True)


              if not X_train_full_cleaned_for_scaler_eval.empty and np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                  scaler.fit(X_train_full_cleaned_for_scaler_eval)

                  X_test_cleaned = X_test_cleaned.astype(np.float64)
                  if not np.isfinite(X_test_cleaned).all().all():
                       print(f"\nWarning: Non-finite values detected in X_test_cleaned for {symbol} before scaler transform. Attempting to fill with median (from train data).")
                       train_medians = X_train_full_cleaned_for_scaler_eval.median()
                       for col in X_test_cleaned.columns:
                           median_val = train_medians.get(col, 0)
                           X_test_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                           X_test_cleaned[col].fillna(median_val, inplace=True)
                       if not np.isfinite(X_test_cleaned).all().all():
                            print(f"\nERROR: Non-finite values STILL detected in X_test_cleaned for {symbol} after filling with median!")


                  if not X_test_cleaned.empty and np.isfinite(X_test_cleaned).all().all():
                      X_test_scaled = scaler.transform(X_test_cleaned)

                      y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
                      y_test_pred = (y_test_pred_prob >= best_threshold).astype(int)

                      if len(y_test_cleaned.unique()) > 1:
                          print("\nClassification Report (Test Set):")
                          print(classification_report(y_test_cleaned, y_test_pred, zero_division=0))
                          precision_test_alcista = precision_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                          recall_test_alcista = recall_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                          f1_test_alcista = f1_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)

                          print(f"Tamaño de y_test (cleaned): {y_test_cleaned.size}")
                          print(f"Distribución de clases en y_test (cleaned) para {symbol}:")
                          print(y_test_cleaned.value_counts())
                          if 1 in y_test_cleaned.value_counts():
                              ratio_1_test=(y_test_cleaned.value_counts()[1]/y_test_cleaned.size).round(4)
                          else:
                              ratio_1_test = 0
                          print(f"% clase 1 test para {symbol}: {ratio_1_test} ")

                          if len(y_test_cleaned.unique()) > 1:
                               roc_auc_test = roc_auc_score(y_test_cleaned, y_test_pred_prob).round(6)
                               print(f"\nROC-AUC (Test Set) para {symbol}: {roc_auc_test:.4f}")
                          else:
                               roc_auc_test = None
                               print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot calculate ROC-AUC.")

                      else:
                          print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot generate full classification report.")
                          precision_test_alcista = None
                          recall_test_alcista = None
                          f1_test_alcista = None
                          roc_auc_test = None
                          ratio_1_test = None


                  else:
                      print(f"\nWarning: X_test became empty after cleaning or contains non-finite values for {symbol}. Skipping test evaluation.")
                      precision_test_alcista = None
                      recall_test_alcista = None
                      f1_test_alcista = None
                      roc_auc_test = None
                      ratio_1_test = None


              else:
                  print(f"\nWarning: Training data (X_train_full) became empty or contains non-finite values after cleaning for scaler fitting for evaluation for {symbol}. Skipping test evaluation.")
                  precision_test_alcista = None
                  recall_test_alcista = None
                  f1_test_alcista = None
                  roc_auc_test = None
                  ratio_1_test = None


          else:
              print(f"\nAdvertencia: Conjunto de prueba insuficiente o con una sola clase para evaluación para {symbol}.")
              precision_test_alcista = None
              recall_test_alcista = None
              f1_test_alcista = None
              roc_auc_test = None
              ratio_1_test = None

          # Prediction for the next day is done only if model was trained successfully
          last_features = df[features].iloc[-1:]
          last_features_cleaned = None
          last_features_scaled = None
          future_pred_prob = None
          future_pred = None

          if not last_features.empty:
              # Ensure last_features is a single row DataFrame before cleaning
              if not isinstance(last_features, pd.DataFrame) or len(last_features) != 1:
                   print(f"\nError: last_features is not a single row DataFrame for {symbol}. Skipping prediction.")
                   # Set prediction results to skipped
                   last_data_date = df.index[-1] if not df.empty else None
                   last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                   resultsp.append({
                       'Papel': symbol,
                       'Fecha Predicción': next_day,
                       'Fecha Datos': last_data_date,
                       'Predicción': 'Skipped (Prediction Data Error)',
                       'Precio actual': last_close,
                       'Probabilidad Alcista (Modelo)': None,
                       'Umbral de Clasificación': None,
                       'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Prediction Data Error)',
                       'Precision Test (Alcista)': precision_test_alcista,
                       'Recall Test (Alcista)': recall_test_alcista,
                       'F1 Test (Alcista)': f1_test_alcista,
                       'ROC-AUC Test': roc_auc_test,
                       'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': None,
                       'Features Escaladas (Predicción)': None
                   })
                   # No need to continue here, the append is done and we move to the next ticker
                   continue # Skip to the next ticker


              last_features_cleaned = last_features.replace([np.inf, -np.inf], np.nan).dropna()
              last_features_cleaned = last_features_cleaned.clip(lower=lower_bound, upper=upper_bound)

              # Add checks for NaN, Inf, and Zero in cleaned features BEFORE scaling
              has_nan_cleaned = last_features_cleaned.isna().any().any()
              has_inf_cleaned = np.isinf(last_features_cleaned).any().any()
              has_zero_cleaned = (last_features_cleaned == 0).any().any()

              if has_nan_cleaned:
                  print(f"\nDEBUG: NaN values detected in last_features_cleaned for {symbol}.")
              if has_inf_cleaned:
                  print(f"\nDEBUG: Inf values detected in last_features_cleaned for {symbol}.")
              if has_zero_cleaned:
                   print(f"\nDEBUG: Zero values detected in last_features_cleaned for {symbol}.")


              if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                  # Ensure scaler is fitted on the *cleaned* full training data
                  scaler = RobustScaler()
                  X_train_full_cleaned_for_scaler_pred = X_train_full.replace([np.inf, -np.inf], np.nan)
                  X_train_full_cleaned_for_scaler_pred.dropna(inplace=True)
                  X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.clip(lower=lower_bound, upper=upper_bound)
                  X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.astype(np.float64)

                  if not np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                       print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_scaler_pred for {symbol} before scaler fit. Attempting to fill with median.")
                       for col in X_train_full_cleaned_for_scaler_pred.columns:
                           finite_values = X_train_full_cleaned_for_scaler_pred[col][np.isfinite(X_train_full_cleaned_for_scaler_pred[col])]
                           if not finite_values.empty:
                               median_val = finite_values.median()
                               X_train_full_cleaned_for_scaler_pred[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                               X_train_full_cleaned_for_scaler_pred[col].fillna(median_val, inplace=True)
                           else:
                               print(f"Warning: Column '{col}' in X_train_full_cleaned_for_scaler_pred is all non-finite. Filling with 0.")
                               X_train_full_cleaned_for_scaler_pred[col].fillna(0, inplace=True)


                  if not X_train_full_cleaned_for_scaler_pred.empty and np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                       scaler.fit(X_train_full_cleaned_for_scaler_pred)

                       last_features_cleaned = last_features_cleaned.astype(np.float64)
                       # Add explicit fallback for non-finite values AFTER dropna for prediction data
                       # This fallback is actually redundant if dropna() was called just above and np.isfinite checked,
                       # but keeping it for safety if flow changes. The main checks should be BEFORE scaling.
                       # Let's keep the checks BEFORE scaling.
                       # if not np.isfinite(last_features_cleaned).all().all():
                       #      print(f"\nWarning: Non-finite values STILL detected in last_features_cleaned for {symbol} AFTER dropna. Attempting to fill with median (from train data).")
                       #      train_medians = X_train_full_cleaned_for_scaler_pred.median()
                       #      for col in last_features_cleaned.columns:
                       #          median_val = train_medians.get(col, 0)
                       #          last_features_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                       #          last_features_cleaned[col].fillna(median_val, inplace=True)
                       #      if not np.isfinite(last_features_cleaned).all().all():
                       #           print(f"\nERROR: Non-finite values STILL detected in last_features_cleaned for {symbol} after filling with median!")


                       if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                           last_features_scaled = scaler.transform(last_features_cleaned)

                           # Add checks for NaN, Inf, and Zero in scaled features BEFORE prediction
                           has_nan_scaled = np.isnan(last_features_scaled).any()
                           has_inf_scaled = np.isinf(last_features_scaled).any()
                           has_zero_scaled = (last_features_scaled == 0).any()

                           if has_nan_scaled:
                               print(f"\nDEBUG: NaN values detected in last_features_scaled for {symbol} BEFORE prediction.")
                           if has_inf_scaled:
                               print(f"\nDEBUG: Inf values detected in last_features_scaled for {symbol} BEFORE prediction.")
                           if has_zero_scaled:
                                print(f"\nDEBUG: Zero values detected in last_features_scaled for {symbol} BEFORE prediction.")


                           future_pred_prob = best_model.predict_proba(last_features_scaled)[:, 1][0].round(4)
                           future_pred = 1 if future_pred_prob >= best_threshold else 0

                           last_close = None
                           last_open = None
                           last_max = None
                           if not df.empty:
                               last_close = df['Close'].iloc[-1]
                               last_open = df['Open'].iloc[-1]
                               last_max = df['High'].iloc[-1]
                               last_data_date = df.index[-1]
                           else:
                               print(f"Warning: DataFrame 'df' is empty for {symbol}. Cannot get last prices.")
                               last_data_date = None

                           action = 'BUY' if future_pred == 1 else 'SELL'
                           direction = 1 if future_pred == 1 else -1

                           # Explicitly use to_dict() and tolist()
                           cleaned_features_dict = last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None
                           scaled_features_list = last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None


                           resultsp.append({
                                       'Papel': symbol,
                                       'Fecha Predicción': next_day,
                                       'Fecha Datos': last_data_date,
                                       'Predicción': 'Alcista' if future_pred == 1 else 'Bajista',
                                       'Probabilidad Alcista (Modelo)': future_pred_prob,
                                       'Umbral de Clasificación': best_threshold.round(4),
                                       'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)', # Add check for best_model
                                       'Precision Test (Alcista)': precision_test_alcista,
                                       'Recall Test (Alcista)': recall_test_alcista,
                                       'F1 Test (Alcista)': f1_test_alcista,
                                       'ROC-AUC Test': roc_auc_test,
                                       'clase 1 en test (cleaned)': ratio_1_test,
                                       'Features Limpias (Predicción)': cleaned_features_dict, # Add cleaned features using to_dict
                                       'Features Escaladas (Predicción)': scaled_features_list # Add scaled features as a list
                               })
                       else:
                            print(f"\nWarning: last_features became empty or contains non-finite values after final filling for scaler transform. Skipping prediction for {symbol}.")
                            last_data_date = df.index[-1] if not df.empty else None
                            last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                            # Explicitly use None for feature columns if prediction skipped due to data issues
                            resultsp.append({
                                'Papel': symbol,
                                'Fecha Predicción': next_day,
                                'Fecha Datos': last_data_date,
                                'Predicción': 'Skipped (Prediction Data Issue)',
                                'Precio actual': last_close,
                                'Probabilidad Alcista (Modelo)': None,
                                'Umbral de Clasificación': None,
                                'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                'Precision Test (Alcista)': precision_test_alcista,
                                'Recall Test (Alcista)': recall_test_alcista,
                                'F1 Test (Alcista)': f1_test_alcista,
                                'ROC-AUC Test': roc_auc_test,
                                'clase 1 en test (cleaned)': ratio_1_test,
                                 'Features Limpias (Predicción)': None,
                                 'Features Escaladas (Predicción)': None
                            })


                  else:
                       print(f"\nWarning: Training data (X_train_full) became empty or contains non-finite values after final filling for scaler fitting for prediction. Skipping prediction for {symbol}.")
                       last_data_date = df.index[-1] if not df.empty else None
                       last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                       # Explicitly use None for feature columns if prediction skipped due to data issues
                       resultsp.append({
                           'Papel': symbol,
                           'Fecha Predicción': next_day,
                           'Fecha Datos': last_data_date,
                           'Predicción': 'Skipped (Prediction Data Issue)',
                           'Precio actual': last_close,
                           'Probabilidad Alcista (Modelo)': None,
                           'Umbral de Clasificación': None,
                           'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                           'Precision Test (Alcista)': precision_test_alcista,
                           'Recall Test (Alcista)': recall_test_alcista,
                           'F1 Test (Alcista)': f1_test_alcista,
                           'ROC-AUC Test': roc_auc_test,
                           'clase 1 en test (cleaned)': ratio_1_test,
                            'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                            'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                       })


              else:
                  print(f"\nWarning: Could not make prediction for {symbol} as last_features became empty or contains non-finite values after cleaning.")
                  last_data_date = df.index[-1] if not df.empty else None
                  last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                  # Explicitly use None for feature columns if prediction skipped due to data issues
                  resultsp.append({
                      'Papel': symbol,
                      'Fecha Predicción': next_day,
                      'Fecha Datos': last_data_date,
                      'Predicción': 'Skipped (Prediction Data Issue)',
                      'Precio actual': last_close,
                      'Probabilidad Alcista (Modelo)': None,
                      'Umbral de Clasificación': None,
                      'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                      'Precision Test (Alcista)': precision_test_alcista,
                      'Recall Test (Alcista)': recall_test_alcista,
                      'F1 Test (Alcista)': f1_test_alcista,
                      'ROC-AUC Test': roc_auc_test,
                      'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                  })


          else:
              print(f"\nWarning: Could not make prediction for {symbol} as last_features was initially empty.")
              last_data_date = df.index[-1] if not df.empty else None
              last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
              # Explicitly use None for feature columns if prediction skipped due to data issues
              resultsp.append({
                  'Papel': symbol,
                  'Fecha Predicción': next_day,
                  'Fecha Datos': last_data_date,
                  'Predicción': 'Skipped (Prediction Data Issue)',
                  'Precio actual': last_close,
                  'Probabilidad Alcista (Modelo)': None,
                  'Umbral de Clasificación': None,
                  'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                  'Precision Test (Alcista)': precision_test_alcista,
                  'Recall Test (Alcista)': recall_test_alcista,
                  'F1 Test (Alcista)': f1_test_alcista,
                  'ROC-AUC Test': roc_auc_test,
                  'clase 1 en test (cleaned)': ratio_1_test,
                   'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                   'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
              })


      # Catch a general Exception during tuning/evaluation/prediction to prevent script crash
      except Exception as e:
          print(f"\nERROR: An error occurred during tuning, evaluation, or prediction for {symbol}: {e}. Skipping this ticker.")
          best_model = None
          best_threshold = 0.5
          precision_test_alcista = None
          recall_test_alcista = None
          f1_test_alcista = None
          roc_auc_test = None
          ratio_1_test = None

          last_data_date = df.index[-1] if not df.empty else None
          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

          # Append entry indicating skip due to general error, explicitly add None for feature columns
          resultsp.append({
                'Papel': symbol,
                'Fecha Predicción': next_day,
                'Fecha Datos': last_data_date,
                'Predicción': 'Skipped (Error)',
                'Precio actual': last_close,
                'Probabilidad Alcista (Modelo)': None,
                'Umbral de Clasificación': None,
                'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Error)',
                'Precision Test (Alcista)': precision_test_alcista,
                'Recall Test (Alcista)': recall_test_alcista,
                'F1 Test (Alcista)': f1_test_alcista,
                'ROC-AUC Test': roc_auc_test,
                'clase 1 en test (cleaned)': ratio_1_test,
                'Features Limpias (Predicción)': None,
                'Features Escaladas (Predicción)': None
          })


  else:
      print(f"\nWarning: Training data (X_train_full or y_train_full) is empty or has only one class after cleaning for tuning for {symbol}. Skipping model training and prediction.")
      # Append a result entry indicating skip due to insufficient training data, explicitly add None for feature columns
      last_data_date = df.index[-1] if not df.empty else None
      last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

      resultsp.append({
            'Papel': symbol,
            'Fecha Predicción': next_day,
            'Fecha Datos': last_data_date,
            'Predicción': 'Skipped (Insufficient Training Data)',
            'Precio actual': last_close,
            'Probabilidad Alcista (Modelo)': None,
            'Umbral de Clasificación': None,
            'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Training Data)',
            'Precision Test (Alcista)': precision_test_alcista,
            'Recall Test (Alcista)': recall_test_alcista,
            'F1 Test (Alcista)': f1_test_alcista,
            'ROC-AUC Test': roc_auc_test,
            'clase 1 en test (cleaned)': ratio_1_test,
            'Features Limpias (Predicción)': None,
            'Features Escaladas (Predicción)': None
      })


# Crear tabla de resultados de predicción
resultsp_df = pd.DataFrame(resultsp)
print(resultsp_df)
if not resultsp_df.empty:
    resultsp_df.set_index('Fecha Predicción', inplace=True)

    # Mostrar resultados de predicción
    pd.set_option('display.max_columns', None)
    #pd.set_option('display.max_rows', None) # Optional: display all rows
    pd.set_option('display.max_colwidth', None) # Optional: display full content of columns

    print("\nPrediccion para el proximo dia (hasta", next_day.strftime('%Y-%m-%d'), "):")
    print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
    display(resultsp_df) # Use display for better formatting

    # Guardar y descargar el CSV de predicciones
    resultsp_df.to_csv(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
    files.download(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv")
    print(f"\nArchivo 'Predic_results_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")
else:
    print("\nNo hay resultados de predicción para mostrar.")


Downloading data for ALUA.BA...


[*********************100%***********************]  1 of 1 completed

MultiIndex columns detected for ALUA.BA.
Successfully extracted and flattened MultiIndex columns for ALUA.BA.
Últimas filas del DataFrame antes de crear features:
             Open   High    Low  Close   Volume  Adj Close
Date                                                      
2025-09-08  670.0  705.0  600.0  696.5  1572445      696.5
2025-09-09  700.0  713.0  678.5  694.0  1217950      694.0
2025-09-10  700.0  716.5  691.5  702.0   641358      702.0
2025-09-11  702.0  713.0  676.0  682.0  1443135      682.0
2025-09-12  686.0  702.0  660.0  666.0   420245      666.0

Últimas filas del DataFrame después de crear features:
             Open   High    Low  Close   Volume  Adj Close  close_lag_1  \
Date                                                                      
2025-09-04  710.0  717.0  685.0  696.0   949242      696.0        700.0   
2025-09-05  680.0  696.0  664.0  686.0  2029661      686.0        696.0   
2025-09-08  670.0  705.0  600.0  696.5  1572445      696.5        68




Mejores hiperparámetros para ALUA.BA: {'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}

Feature Importance for ALUA.BA:
Volatility                    0.041578
open_lag_2                    0.039338
close_lag_4                   0.035814
high_lag_4                    0.032640
Max_Gain_from_Open_Current    0.032418
open_lag_4                    0.031670
high_lag_1                    0.028877
close_lag_2                   0.027192
EWO_Signal                    0.025644
SMA13                         0.025469
close_lag_3                   0.025339
high_lag_5                    0.025300
SMA50                         0.024932
open_lag_5                    0.024742
SMA26                         0.024704
high_lag_3                    0.024433
open_lag_3                    0.024176
close_lag_1                   0.023360
close_lag_5                   0.023272
open_lag_1                    0.023239
PPO_Hi

[*********************100%***********************]  1 of 1 completed


MultiIndex columns detected for BBAR.BA.
Successfully extracted and flattened MultiIndex columns for BBAR.BA.
Últimas filas del DataFrame antes de crear features:
              Open    High     Low   Close   Volume  Adj Close
Date                                                          
2025-09-08  5175.0  5175.0  4482.5  4507.5  1397504     4507.5
2025-09-09  4600.0  4650.0  4385.0  4400.0   761818     4400.0
2025-09-10  4550.0  4710.0  4475.0  4655.0   206731     4655.0
2025-09-11  4695.0  4750.0  4542.5  4592.5   266476     4592.5
2025-09-12  4680.0  4680.0  4310.0  4447.5   368480     4447.5

Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close   Volume  Adj Close  close_lag_1  \
Date                                                                          
2025-09-04  5580.0  5850.0  5510.0  5720.0   276714     5720.0       5540.0   
2025-09-05  5840.0  5840.0  5410.0  5640.0   596771     5640.0       5720.0   
2025-09-08  5175.0  5175

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar Precision (Clase 1) en entrenamiento para BBAR.BA: 0.5700 (Precision: 1.0000)

Evaluating best model on test set for BBAR.BA with best threshold (0.5700):

Classification Report (Test Set):
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        70
         1.0       0.86      1.00      0.93       448

    accuracy                           0.86       518
   macro avg       0.43      0.50      0.46       518
weighted avg       0.75      0.86      0.80       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para BBAR.BA:
Label
1.0    448
0.0     70
Name: count, dtype: int64
% clase 1 test para BBAR.BA: 0.8649 

ROC-AUC (Test Set) para BBAR.BA: 0.4410

DEBUG: Zero values detected in last_features_cleaned for BBAR.BA.

Downloading data for METR.BA...
MultiIndex columns detected for METR.BA.
Successfully extracted and flattened MultiIndex columns for METR.BA.
Últimas filas del DataFr




Correlacion con label para METR.BA:
Label                         1.000000
high_lag_1                    0.116599
open_lag_1                    0.116282
close_lag_1                   0.116013
close_lag_2                   0.115993
high_lag_2                    0.115975
Volatility                    0.115687
SMA5                          0.115661
open_lag_2                    0.115555
close_lag_3                   0.115381
high_lag_3                    0.115134
open_lag_3                    0.114637
close_lag_4                   0.114507
SMA13                         0.114335
high_lag_4                    0.114288
close_lag_5                   0.113902
open_lag_4                    0.113855
high_lag_5                    0.113644
SMA26                         0.113356
open_lag_5                    0.113030
SMA50                         0.112896
SMA200                        0.107125
EWO                           0.093745
EWO_Signal                    0.089646
PPO_Signal                  

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar Precision (Clase 1) en entrenamiento para METR.BA: 0.6300 (Precision: 1.0000)

Evaluating best model on test set for METR.BA with best threshold (0.6300):

Classification Report (Test Set):
              precision    recall  f1-score   support

         0.0       0.26      0.18      0.21        71
         1.0       0.88      0.92      0.90       447

    accuracy                           0.82       518
   macro avg       0.57      0.55      0.56       518
weighted avg       0.79      0.82      0.80       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para METR.BA:
Label
1.0    447
0.0     71
Name: count, dtype: int64
% clase 1 test para METR.BA: 0.8629 

ROC-AUC (Test Set) para METR.BA: 0.5490

Downloading data for PAMP.BA...
MultiIndex columns detected for PAMP.BA.
Successfully extracted and flattened MultiIndex columns for PAMP.BA.
Últimas filas del DataFrame antes de crear features:
              Open    High     Low   C





Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close   Volume  Adj Close  close_lag_1  \
Date                                                                          
2025-09-04  3610.0  3755.0  3565.0  3725.0  1803914     3725.0       3610.0   
2025-09-05  3680.0  3850.0  3630.0  3760.0  2310446     3760.0       3725.0   
2025-09-08  3452.5  3452.5  3205.0  3267.5  6312048     3267.5       3760.0   
2025-09-09  3320.0  3440.0  3297.5  3372.5  3954211     3372.5       3267.5   
2025-09-10  3370.0  3625.0  3370.0  3580.0  2738737     3580.0       3372.5   

            close_lag_2  close_lag_3  close_lag_4  close_lag_5  open_lag_1  \
Date                                                                         
2025-09-04       3610.0       3535.0       3645.0       3675.0      3520.0   
2025-09-05       3610.0       3610.0       3535.0       3645.0      3610.0   
2025-09-08       3725.0       3610.0       3610.0       3535.0      3680.0   


[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar Precision (Clase 1) en entrenamiento para PAMP.BA: 0.5000 (Precision: 1.0000)

Evaluating best model on test set for PAMP.BA with best threshold (0.5000):

Classification Report (Test Set):
              precision    recall  f1-score   support

         0.0       0.17      0.06      0.09       104
         1.0       0.80      0.93      0.86       414

    accuracy                           0.75       518
   macro avg       0.48      0.49      0.47       518
weighted avg       0.67      0.75      0.70       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para PAMP.BA:
Label
1.0    414
0.0    104
Name: count, dtype: int64
% clase 1 test para PAMP.BA: 0.7992 

ROC-AUC (Test Set) para PAMP.BA: 0.5046

DEBUG: Zero values detected in last_features_cleaned for PAMP.BA.

DEBUG: Zero values detected in last_features_scaled for PAMP.BA BEFORE prediction.

Downloading data for TRAN.BA...
MultiIndex columns detected for TRAN.BA.
Successfu





Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close   Volume  Adj Close  close_lag_1  \
Date                                                                          
2025-09-04  1990.0  2030.0  1945.0  1970.0   570580     1970.0       1965.0   
2025-09-05  2000.0  2000.0  1900.0  1920.0   791867     1920.0       1970.0   
2025-09-08  1770.0  1910.0  1620.0  1883.0  1527140     1883.0       1920.0   
2025-09-09  1940.0  1940.0  1800.0  1936.0  1217507     1936.0       1883.0   
2025-09-10  1980.0  2050.0  1951.0  2037.0  1240119     2037.0       1936.0   

            close_lag_2  close_lag_3  close_lag_4  close_lag_5  open_lag_1  \
Date                                                                         
2025-09-04       2290.0       2340.0       2380.0       2365.0      2050.0   
2025-09-05       1965.0       2290.0       2340.0       2380.0      1990.0   
2025-09-08       1970.0       1965.0       2290.0       2340.0      2000.0   


[*********************100%***********************]  1 of 1 completed


MultiIndex columns detected for YPFD.BA.
Successfully extracted and flattened MultiIndex columns for YPFD.BA.
Últimas filas del DataFrame antes de crear features:
               Open     High      Low    Close   Volume  Adj Close
Date                                                              
2025-09-08  37500.0  39740.0  36400.0  37860.0  1377767    37860.0
2025-09-09  38200.0  40060.0  38000.0  38600.0   808166    38600.0
2025-09-10  38800.0  41600.0  38800.0  41120.0   590144    41120.0
2025-09-11  41200.0  41700.0  40600.0  40880.0   416272    40880.0
2025-09-12  41000.0  41600.0  39620.0  40200.0   520388    40200.0

Últimas filas del DataFrame después de crear features:
               Open     High      Low    Close   Volume  Adj Close  \
Date                                                                 
2025-09-04  41000.0  42700.0  40425.0  42525.0   683759    42525.0   
2025-09-05  42200.0  43725.0  41600.0  42875.0   584925    42875.0   
2025-09-08  37500.0  39740.0  36

Unnamed: 0_level_0,Papel,Fecha Datos,Predicción,Probabilidad Alcista (Modelo),Umbral de Clasificación,Mejores hiperparámetros (Incluye scale_pos_weight),Precision Test (Alcista),Recall Test (Alcista),F1 Test (Alcista),ROC-AUC Test,clase 1 en test (cleaned),Features Limpias (Predicción),Features Escaladas (Predicción)
Fecha Predicción,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2025-09-15,ALUA.BA,2025-09-10,Alcista,0.6907,0.53,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.777083,0.937186,0.849658,0.573765,0.7683,"{'RSI': {2025-09-10 00:00:00: 22.02380952380952}, 'ROC': {2025-09-10 00:00:00: 0.2857142857142857}, 'PPO': {2025-09-10 00:00:00: -0.4910590194243401}, 'PPO_Signal': {2025-09-10 00:00:00: -0.44710124137936935}, 'PPO_Histogram': {2025-09-10 00:00:00: -0.04395777804497075}, 'EWO': {2025-09-10 00:00:00: -1.4486191078044606}, 'EWO_Signal': {2025-09-10 00:00:00: -1.111244869861243}, 'EWO_Histogram': {2025-09-10 00:00:00: -0.33737423794321764}, 'Volatility': {2025-09-10 00:00:00: 20.12283}, 'SMA5': {2025-09-10 00:00:00: 694.9}, 'SMA13': {2025-09-10 00:00:00: 711.9615}, 'SMA26': {2025-09-10 00:00:00: 710.0577}, 'SMA50': {2025-09-10 00:00:00: 713.85}, 'SMA200': {2025-09-10 00:00:00: 784.9925}, 'lag_change1': {2025-09-10 00:00:00: -0.003589375448671883}, 'lag_change2': {2025-09-10 00:00:00: 0.015306122448979664}, 'lag_change3': {2025-09-10 00:00:00: -0.014367816091954033}, 'lag_change4': {2025-09-10 00:00:00: -0.005714285714285672}, 'lag_change5': {2025-09-10 00:00:00: -0.051490514905149}, 'close_lag_1': {2025-09-10 00:00:00: 694.0}, 'close_lag_2': {2025-09-10 00:00:00: 696.5}, 'close_lag_3': {2025-09-10 00:00:00: 686.0}, 'close_lag_4': {2025-09-10 00:00:00: 696.0}, 'close_lag_5': {2025-09-10 00:00:00: 700.0}, 'open_lag_1': {2025-09-10 00:00:00: 700.0}, 'open_lag_2': {2025-09-10 00:00:00: 670.0}, 'open_lag_3': {2025-09-10 00:00:00: 680.0}, 'open_lag_4': {2025-09-10 00:00:00: 710.0}, 'open_lag_5': {2025-09-10 00:00:00: 735.0}, 'high_lag_1': {2025-09-10 00:00:00: 713.0}, 'high_lag_2': {2025-09-10 00:00:00: 705.0}, 'high_lag_3': {2025-09-10 00:00:00: 696.0}, 'high_lag_4': {2025-09-10 00:00:00: 717.0}, 'high_lag_5': {2025-09-10 00:00:00: 742.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.0235714285713949}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.0235714285713949}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.06940298507452328}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.04852941176463452}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.009859154929563579}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.009523809523796566}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.003994673768303602}}","[-0.7818634853518561, 0.044757316601609594, -0.3832495566899804, -0.3924117529798455, -0.06086068095672429, -0.31706707436920106, -0.2703206224154513, -0.19779355102866564, 30.043158689979204, 52.73595848595849, 54.12918563364025, 54.0415509285829, 55.258611297025546, 67.69170730862187, -0.14348649757668872, 0.6112720219450265, -0.5736788385325617, -0.22783784833429088, -2.056347795594639, 52.64302059496568, 53.03598774885146, 52.23200612557427, 53.09934790947449, 53.61185983827493, 53.40621403912543, 51.202920830130665, 51.971560338201385, 54.277478862413524, 56.307277628032345, 52.50721954831545, 52.011127596439174, 51.63148079074972, 53.497374343585896, 55.3728432108027, 0.09437239351118115, 0.09441297184839482, 1.4271685027261352, 0.8201706870964343, -0.30434982888093587, -0.31410158176771164, -0.4751226744059963]"
2025-09-15,BBAR.BA,2025-09-10,Alcista,0.8137,0.57,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.864603,0.997768,0.926425,0.44104,0.8649,"{'RSI': {2025-09-10 00:00:00: 32.53012048192771}, 'ROC': {2025-09-10 00:00:00: -15.974729241877256}, 'PPO': {2025-09-10 00:00:00: -5.3389766580011075}, 'PPO_Signal': {2025-09-10 00:00:00: -4.3361629455140625}, 'PPO_Histogram': {2025-09-10 00:00:00: -1.002813712487045}, 'EWO': {2025-09-10 00:00:00: -19.51935321956679}, 'EWO_Signal': {2025-09-10 00:00:00: -16.528431983648822}, 'EWO_Histogram': {2025-09-10 00:00:00: -2.990921235917966}, 'Volatility': {2025-09-10 00:00:00: 773.324792}, 'SMA5': {2025-09-10 00:00:00: 4984.5}, 'SMA13': {2025-09-10 00:00:00: 5360.9615}, 'SMA26': {2025-09-10 00:00:00: 6177.7885}, 'SMA50': {2025-09-10 00:00:00: 6404.85}, 'SMA200': {2025-09-10 00:00:00: 7363.8125}, 'lag_change1': {2025-09-10 00:00:00: -0.023849140321686058}, 'lag_change2': {2025-09-10 00:00:00: -0.20079787234042556}, 'lag_change3': {2025-09-10 00:00:00: -0.013986013986013957}, 'lag_change4': {2025-09-10 00:00:00: 0.032490974729241895}, 'lag_change5': {2025-09-10 00:00:00: -0.014234875444839812}, 'close_lag_1': {2025-09-10 00:00:00: 4400.0}, 'close_lag_2': {2025-09-10 00:00:00: 4507.5}, 'close_lag_3': {2025-09-10 00:00:00: 5640.0}, 'close_lag_4': {2025-09-10 00:00:00: 5720.0}, 'close_lag_5': {2025-09-10 00:00:00: 5540.0}, 'open_lag_1': {2025-09-10 00:00:00: 4600.0}, 'open_lag_2': {2025-09-10 00:00:00: 5175.0}, 'open_lag_3': {2025-09-10 00:00:00: 5840.0}, 'open_lag_4': {2025-09-10 00:00:00: 5580.0}, 'open_lag_5': {2025-09-10 00:00:00: 5650.0}, 'high_lag_1': {2025-09-10 00:00:00: 4650.0}, 'high_lag_2': {2025-09-10 00:00:00: 5175.0}, 'high_lag_3': {2025-09-10 00:00:00: 5840.0}, 'high_lag_4': {2025-09-10 00:00:00: 5850.0}, 'high_lag_5': {2025-09-10 00:00:00: 5690.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.043956043956034294}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.03260869565216682}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.0}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.0}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.04838709677418487}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.03539823008848931}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.08534322820035521}}","[-0.5563744682079752, -2.007197656994868, -2.76833937912835, -2.476760723952092, -1.4372688784040728, -2.33958696404983, -2.070297733948386, -1.6887037198218429, 155.00641285592303, 51.59512741761435, 56.131931210490535, 64.69844242022965, 67.99798653435019, 72.7667965570392, -0.7265931849854965, -6.115559915879347, -0.4255501722409742, 0.9894397549657818, -0.4335416140785042, 45.28714812808009, 46.432824644305015, 58.163131378374885, 59.11360811930406, 57.42790359443175, 47.10431211498973, 53.08001130901894, 60.01091489471245, 57.37798845836768, 58.13378702343206, 46.95259547973982, 52.30331273427211, 59.044705047997766, 59.177496198682206, 57.57456840824396, 0.3623632910310479, 0.11047432367939544, -0.6138382151584646, -0.6138382151584646, 0.45969386862171524, 0.17163887669039912, 1.2792725431247385]"
2025-09-15,METR.BA,2025-09-10,Alcista,0.9124,0.63,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.876068,0.917226,0.896175,0.548981,0.8629,"{'RSI': {2025-09-10 00:00:00: 35.102739726027394}, 'ROC': {2025-09-10 00:00:00: -5.429553264604811}, 'PPO': {2025-09-10 00:00:00: -5.1254883136060885}, 'PPO_Signal': {2025-09-10 00:00:00: -5.245569085162645}, 'PPO_Histogram': {2025-09-10 00:00:00: 0.12008077155655617}, 'EWO': {2025-09-10 00:00:00: -21.51889874854952}, 'EWO_Signal': {2025-09-10 00:00:00: -19.964037042024817}, 'EWO_Histogram': {2025-09-10 00:00:00: -1.5548617065247043}, 'Volatility': {2025-09-10 00:00:00: 249.501376}, 'SMA5': {2025-09-10 00:00:00: 1377.4}, 'SMA13': {2025-09-10 00:00:00: 1519.7692}, 'SMA26': {2025-09-10 00:00:00: 1799.3077}, 'SMA50': {2025-09-10 00:00:00: 1889.84}, 'SMA200': {2025-09-10 00:00:00: 2188.235}, 'lag_change1': {2025-09-10 00:00:00: -0.034375000000000044}, 'lag_change2': {2025-09-10 00:00:00: -0.1578947368421053}, 'lag_change3': {2025-09-10 00:00:00: 0.03050847457627115}, 'lag_change4': {2025-09-10 00:00:00: 0.013745704467353903}, 'lag_change5': {2025-09-10 00:00:00: -0.03642384105960261}, 'close_lag_1': {2025-09-10 00:00:00: 1236.0}, 'close_lag_2': {2025-09-10 00:00:00: 1280.0}, 'close_lag_3': {2025-09-10 00:00:00: 1520.0}, 'close_lag_4': {2025-09-10 00:00:00: 1475.0}, 'close_lag_5': {2025-09-10 00:00:00: 1455.0}, 'open_lag_1': {2025-09-10 00:00:00: 1310.0}, 'open_lag_2': {2025-09-10 00:00:00: 1360.0}, 'open_lag_3': {2025-09-10 00:00:00: 1500.0}, 'open_lag_4': {2025-09-10 00:00:00: 1455.0}, 'open_lag_5': {2025-09-10 00:00:00: 1520.0}, 'high_lag_1': {2025-09-10 00:00:00: 1330.0}, 'high_lag_2': {2025-09-10 00:00:00: 1405.0}, 'high_lag_3': {2025-09-10 00:00:00: 1550.0}, 'high_lag_4': {2025-09-10 00:00:00: 1515.0}, 'high_lag_5': {2025-09-10 00:00:00: 1550.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.11653543307077438}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.08244274809154012}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.03308823529409332}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.033333333333311115}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.0652920962198864}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.019736842105250174}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.012820512820504603}}","[-0.3745054107419694, -0.6281336396776755, -2.4760340510803056, -2.7628097855284848, 0.21161290719391843, -2.046821128313282, -1.9426993515867141, -0.7967262027254606, 336.6923973355626, 82.17211107793372, 91.01752208994233, 108.03069587737272, 110.79358737422626, 137.21438229600977, -1.030963541666669, -4.735526315789479, 0.9149999999999996, 0.4122565864833895, -1.0924116997792492, 73.58545671285948, 76.37873357228196, 90.71565113500598, 88.09325960245106, 87.09286998202518, 77.65994065281897, 80.62729970326409, 88.93590504451038, 86.26528189910978, 90.12284866468842, 77.55691768826618, 81.99503432160068, 90.66451990632318, 88.61533957845432, 90.73092134173135, 1.5858042500195857, 0.9882026375871523, 0.12351052107326538, 0.1287878789129126, 0.6880662296960662, -0.10817109358234392, -0.22920685607230776]"
2025-09-15,PAMP.BA,2025-09-10,Alcista,0.7236,0.5,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.797101,0.929952,0.858417,0.504575,0.7992,"{'RSI': {2025-09-10 00:00:00: 52.18446601941748}, 'ROC': {2025-09-10 00:00:00: -0.8310249307479225}, 'PPO': {2025-09-10 00:00:00: -1.147398945157694}, 'PPO_Signal': {2025-09-10 00:00:00: -1.1962556510418119}, 'PPO_Histogram': {2025-09-10 00:00:00: 0.048856705884117835}, 'EWO': {2025-09-10 00:00:00: -4.905880769374763}, 'EWO_Signal': {2025-09-10 00:00:00: -4.489529457552637}, 'EWO_Histogram': {2025-09-10 00:00:00: -0.41635131182212604}, 'Volatility': {2025-09-10 00:00:00: 172.085696}, 'SMA5': {2025-09-10 00:00:00: 3541.0}, 'SMA13': {2025-09-10 00:00:00: 3589.6154}, 'SMA26': {2025-09-10 00:00:00: 3775.0}, 'SMA50': {2025-09-10 00:00:00: 3754.2}, 'SMA200': {2025-09-10 00:00:00: 3819.95}, 'lag_change1': {2025-09-10 00:00:00: 0.032134659525631326}, 'lag_change2': {2025-09-10 00:00:00: -0.13098404255319152}, 'lag_change3': {2025-09-10 00:00:00: 0.00939597315436247}, 'lag_change4': {2025-09-10 00:00:00: 0.03185595567867039}, 'lag_change5': {2025-09-10 00:00:00: 0.0}, 'close_lag_1': {2025-09-10 00:00:00: 3372.5}, 'close_lag_2': {2025-09-10 00:00:00: 3267.5}, 'close_lag_3': {2025-09-10 00:00:00: 3760.0}, 'close_lag_4': {2025-09-10 00:00:00: 3725.0}, 'close_lag_5': {2025-09-10 00:00:00: 3610.0}, 'open_lag_1': {2025-09-10 00:00:00: 3320.0}, 'open_lag_2': {2025-09-10 00:00:00: 3452.5}, 'open_lag_3': {2025-09-10 00:00:00: 3680.0}, 'open_lag_4': {2025-09-10 00:00:00: 3610.0}, 'open_lag_5': {2025-09-10 00:00:00: 3520.0}, 'high_lag_1': {2025-09-10 00:00:00: 3440.0}, 'high_lag_2': {2025-09-10 00:00:00: 3452.5}, 'high_lag_3': {2025-09-10 00:00:00: 3850.0}, 'high_lag_4': {2025-09-10 00:00:00: 3755.0}, 'high_lag_5': {2025-09-10 00:00:00: 3680.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.07863501483677192}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.09487951807226058}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.04996379435190443}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.04619565217390049}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.06648199445981538}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.09374999999997337}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.05329593267880693}}","[-0.04414008902657611, -0.17823592746776742, -0.6855155997616817, -0.780649419113242, 0.0780532225507881, -0.8009510325615006, -0.7884736687961438, -0.21971376040656387, 77.2307268349907, 77.73587477465593, 78.73939154310818, 83.15648332160637, 83.33579025751025, 84.14454141684055, 1.0506682476692275, -4.282627433852966, 0.3072088142513833, 1.0415558037588895, 0.0, 74.19654070728214, 71.92283950617283, 82.87199293754139, 82.19067609368095, 79.65046398585949, 73.09592061742006, 76.00948076287068, 81.01675485008819, 79.47420634920634, 77.57559037740013, 73.69981761613562, 74.04897433143593, 82.63163550397594, 80.59015688802924, 79.06411359724613, 1.1622238090724453, 1.5087478734552227, 0.523221421139541, 0.4401811750513594, 0.8793396784865425, 1.4682448768184122, 0.592417013183931]"
2025-09-15,TRAN.BA,2025-09-10,Alcista,0.6291,0.57,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.843552,0.913043,0.876923,0.49685,0.8436,"{'RSI': {2025-09-10 00:00:00: 25.60386473429952}, 'ROC': {2025-09-10 00:00:00: 3.6641221374045805}, 'PPO': {2025-09-10 00:00:00: -3.726008383735165}, 'PPO_Signal': {2025-09-10 00:00:00: -4.4009672295842925}, 'PPO_Histogram': {2025-09-10 00:00:00: 0.6749588458491274}, 'EWO': {2025-09-10 00:00:00: -13.971037469616455}, 'EWO_Signal': {2025-09-10 00:00:00: -13.05087452217676}, 'EWO_Histogram': {2025-09-10 00:00:00: -0.9201629474396942}, 'Volatility': {2025-09-10 00:00:00: 274.809499}, 'SMA5': {2025-09-10 00:00:00: 1949.2}, 'SMA13': {2025-09-10 00:00:00: 2193.5385}, 'SMA26': {2025-09-10 00:00:00: 2406.1923}, 'SMA50': {2025-09-10 00:00:00: 2387.92}, 'SMA200': {2025-09-10 00:00:00: 2395.905}, 'lag_change1': {2025-09-10 00:00:00: 0.02814657461497605}, 'lag_change2': {2025-09-10 00:00:00: -0.019270833333333348}, 'lag_change3': {2025-09-10 00:00:00: -0.025380710659898442}, 'lag_change4': {2025-09-10 00:00:00: 0.0025445292620864812}, 'lag_change5': {2025-09-10 00:00:00: -0.14192139737991272}, 'close_lag_1': {2025-09-10 00:00:00: 1936.0}, 'close_lag_2': {2025-09-10 00:00:00: 1883.0}, 'close_lag_3': {2025-09-10 00:00:00: 1920.0}, 'close_lag_4': {2025-09-10 00:00:00: 1970.0}, 'close_lag_5': {2025-09-10 00:00:00: 1965.0}, 'open_lag_1': {2025-09-10 00:00:00: 1940.0}, 'open_lag_2': {2025-09-10 00:00:00: 1770.0}, 'open_lag_3': {2025-09-10 00:00:00: 2000.0}, 'open_lag_4': {2025-09-10 00:00:00: 1990.0}, 'open_lag_5': {2025-09-10 00:00:00: 2050.0}, 'high_lag_1': {2025-09-10 00:00:00: 1940.0}, 'high_lag_2': {2025-09-10 00:00:00: 1910.0}, 'high_lag_3': {2025-09-10 00:00:00: 2000.0}, 'high_lag_4': {2025-09-10 00:00:00: 2030.0}, 'high_lag_5': {2025-09-10 00:00:00: 2050.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.047979797979773746}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.06958762886594351}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.15819209039539087}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.0}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.020100502512552715}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.0}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.0}}","[-0.6147590916174369, 0.408684110413545, -1.6823539333465318, -2.1347188360569342, 0.9451253132079582, -1.3991407534907576, -1.3157831341465105, -0.4192381922057084, 255.05107098244065, 85.59871659634321, 95.96703152364275, 103.82942010868511, 102.21913137344772, 101.89573358867963, 0.8724283345423732, -0.5973167698103562, -0.7866978995993149, 0.07886996754252583, -4.394355239670768, 85.02593406593407, 82.69626373626373, 84.32263736263737, 86.52043956043956, 86.30065934065934, 84.7917760279965, 77.35520559930009, 87.416447944007, 87.07422815852857, 89.80008768084173, 83.31442080378248, 82.11316695352839, 85.98580034423406, 87.27667814113596, 88.13726333907056, 0.42428934062640505, 0.8505118477899479, 2.5946018206728714, -0.5254051561276185, -0.12896493007326018, -0.5256687099526277, -0.5261359703453173]"
2025-09-15,YPFD.BA,2025-09-10,Alcista,0.7976,0.61,"{'subsample': 0.8, 'scale_pos_weight': 0.5, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}",0.785714,0.661058,0.718016,0.430147,0.8031,"{'RSI': {2025-09-10 00:00:00: 52.68}, 'ROC': {2025-09-10 00:00:00: 1.156211562115621}, 'PPO': {2025-09-10 00:00:00: -0.8772394184206476}, 'PPO_Signal': {2025-09-10 00:00:00: -0.9259591218197915}, 'PPO_Histogram': {2025-09-10 00:00:00: 0.04871970339914389}, 'EWO': {2025-09-10 00:00:00: -3.672196515494028}, 'EWO_Signal': {2025-09-10 00:00:00: -3.3490853662592004}, 'EWO_Histogram': {2025-09-10 00:00:00: -0.32311114923482753}, 'Volatility': {2025-09-10 00:00:00: 1577.238155}, 'SMA5': {2025-09-10 00:00:00: 40596.0}, 'SMA13': {2025-09-10 00:00:00: 40965.7692}, 'SMA26': {2025-09-10 00:00:00: 42721.3462}, 'SMA50': {2025-09-10 00:00:00: 42052.1}, 'SMA200': {2025-09-10 00:00:00: 43559.025}, 'lag_change1': {2025-09-10 00:00:00: 0.01954569466455358}, 'lag_change2': {2025-09-10 00:00:00: -0.11696793002915451}, 'lag_change3': {2025-09-10 00:00:00: 0.008230452674897082}, 'lag_change4': {2025-09-10 00:00:00: 0.046125461254612476}, 'lag_change5': {2025-09-10 00:00:00: -0.02166064981949456}, 'close_lag_1': {2025-09-10 00:00:00: 38600.0}, 'close_lag_2': {2025-09-10 00:00:00: 37860.0}, 'close_lag_3': {2025-09-10 00:00:00: 42875.0}, 'close_lag_4': {2025-09-10 00:00:00: 42525.0}, 'close_lag_5': {2025-09-10 00:00:00: 40650.0}, 'open_lag_1': {2025-09-10 00:00:00: 38200.0}, 'open_lag_2': {2025-09-10 00:00:00: 37500.0}, 'open_lag_3': {2025-09-10 00:00:00: 42200.0}, 'open_lag_4': {2025-09-10 00:00:00: 41000.0}, 'open_lag_5': {2025-09-10 00:00:00: 41000.0}, 'high_lag_1': {2025-09-10 00:00:00: 40060.0}, 'high_lag_2': {2025-09-10 00:00:00: 39740.0}, 'high_lag_3': {2025-09-10 00:00:00: 43725.0}, 'high_lag_4': {2025-09-10 00:00:00: 42700.0}, 'high_lag_5': {2025-09-10 00:00:00: 42050.0}, 'Max_Gain_from_Open_Current': {2025-09-10 00:00:00: 0.0747422680412352}, 'Max_Gain_from_Open_Lag_1': {2025-09-10 00:00:00: 0.09162303664921227}, 'Max_Gain_from_Open_Lag_2': {2025-09-10 00:00:00: 0.10933333333333042}, 'Max_Gain_from_Open_Lag_3': {2025-09-10 00:00:00: 0.036137440758292984}, 'Max_Gain_from_Open_Lag_4': {2025-09-10 00:00:00: 0.06646341463414472}, 'Max_Gain_from_Open_Lag_5': {2025-09-10 00:00:00: 0.06646341463414472}, 'Max_Gain_from_Open_Lag_6': {2025-09-10 00:00:00: 0.05627705627705489}}","[-0.029614419944876184, 0.09918919765174586, -0.6781460952282776, -0.7559840974273326, 0.09703139220566527, -0.6861214233390261, -0.6601505490191427, -0.21067204240307297, 91.51343809283078, 132.67664533070737, 133.60011417233736, 138.77738202887585, 134.0727156607819, 166.83783792357391, 0.7717961991695983, -4.618684849556809, 0.3249939283790559, 1.8213451244490504, -0.8556014077880957, 125.92028184015403, 123.52044579201834, 140.0508405084051, 139.07960096884108, 132.9490022172949, 124.29903563255967, 122.14129100875398, 137.765110847027, 133.84827868852457, 133.8488524590164, 127.52038369304556, 126.80128205128206, 139.57371794871796, 136.38681741640607, 134.60419512979183, 1.4887356084071173, 1.9454401748438521, 2.4264586733465277, 0.4347708688522255, 1.260602487950169, 1.26212669729396, 0.9854584453082402]"


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Archivo 'Predic_results_2025-09-14.csv' generado y descargado.


In [9]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0


SyntaxError: invalid syntax (ipython-input-1754429140.py, line 664)

In [8]:

import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBClassifier
from google.colab import files
from scipy import stats
import time
from imblearn.over_sampling import SMOTE # Import SMOTE
import matplotlib.pyplot as plt # Import for plotting (optional for this step)
import seaborn as sns # Import for plotting (optional for this step)
import os # Import os for creating directories


# --- Control Flag ---
run_full_model = True # Set to True to run model training, tuning, and prediction; Set to False to only run data/feature analysis
# --- End Control Flag ---


# Functions for features
def add_lagged_price_features(df, etiqueta="close_lag", dato="Close"):
    for lag in range(1, 6):
        df[f'{etiqueta}_{lag}'] = df[dato].shift(lag)
    return df

def calculate_RSI(series, period=7):
    delta = series.diff(1)
    gain = delta.where(delta > 0, 0).rolling(window=period).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_ROC(series, period=5):
    return ((series - series.shift(period)) / series.shift(period)) * 100

def calculate_PPO(series, fast_period=5, slow_period=9, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ppo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ppo.ewm(span=signal_period, adjust=False).mean()
    histogram = ppo - signal_line
    return ppo, signal_line, histogram

def calculate_EWO(series, fast_period=5, slow_period=35, signal_period=5):
    ema_fast = series.ewm(span=fast_period, adjust=False).mean()
    ema_slow = series.ewm(span=slow_period, adjust=False).mean()
    ewo = (ema_fast - ema_slow) / ema_slow * 100
    signal_line = ewo.ewm(span=signal_period, adjust=False).mean()
    histogram = ewo - signal_line
    return ewo, signal_line, histogram

def calculate_volatility(series, window=20):
    return series.rolling(window).std().round(6)

def calculate_sma5(series, period=5):
    return series.rolling(window=period).mean().round(4)

def calculate_sma13(series, period=13):
    return series.rolling(window=period).mean().round(4)

def calculate_sma26(series, period=26):
    return series.rolling(window=period).mean().round(4)

def calculate_sma50(series, period=50):
    return series.rolling(window=period).mean().round(4)

def calculate_sma200(series, period=200):
    return series.rolling(window=period).mean().round(4)


def create_features(df, umbral, n_days_high=1):
    df = add_lagged_price_features(df, "close_lag", "Close")
    df = add_lagged_price_features(df, "open_lag", "Open")
    df = add_lagged_price_features(df, "high_lag", "High")
    df['Pct_change'] = df['Close'].pct_change()
    for lag in range(1, 6):
        df[f'lag_change{lag}'] = df['Pct_change'].shift(lag)
    df['RSI'] = calculate_RSI(df['Close'])
    df['ROC'] = calculate_ROC(df['Close'])
    df['PPO'], df['PPO_Signal'], df['PPO_Histogram'] = calculate_PPO(df['Close'])
    df['EWO'], df['EWO_Signal'], df['EWO_Histogram'] = calculate_EWO(df['Close'])
    df['SMA5'] = calculate_sma5(df['Close'])
    df['SMA13'] = calculate_sma13(df['Close'])
    df['SMA26'] = calculate_sma26(df['Close'])
    df['SMA50'] = calculate_sma50(df['Close'])
    df['SMA200'] = calculate_sma200(df['Close'])
    df['Volatility'] = calculate_volatility(df['Close'])

    # --- New Feature: Max Gain from Open over Past N Days ---
    # Calculate the maximum High price over the *next N days* for *each historical day*.
    # Use rolling().max() with min_periods=1 to handle ends of series.
    # Then shift to align with the start of the N-day window (the current day's Open).
    max_high_over_next_n_days_hist = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)


    # Calculate the potential max gain from Open for *each historical day*
    # Using the Open price of that historical day
    epsilon = 1e-9 # To prevent division by zero
    df['Max_Gain_from_Open_Current'] = (max_high_over_next_n_days_hist - df['Open']) / (df['Open'] + epsilon)

    # --- Add lagged versions of the new feature ---
    for lag in range(1, 7): # Create lags from 1 to 6
        df[f'Max_Gain_from_Open_Lag_{lag}'] = df['Max_Gain_from_Open_Current'].shift(lag)


    # Calculate the target based on tomorrow's Open vs Max High over next n_days_high days
    # Use rolling().max() with min_periods=1 for the target as well.
    # Shift to align with the start of the N-day window for the target (tomorrow's Open).
    max_high_next_n_days_target = df['High'].rolling(window=n_days_high, min_periods=1).max().shift(-n_days_high + 1)
    open_next_day = df['Open'].shift(-1)
    df['Label_raw'] = ((max_high_next_n_days_target - open_next_day) / (open_next_day + epsilon) > umbral).astype(int)
    df['Label'] = df['Label_raw'].shift(-1) # Target for the next day


    # Replace inf values with NaN before dropping
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Dropping NaNs here will remove rows where features or the target are still NaN (e.g., due to initial lags,
    # or if min_periods=1 still results in NaN for very short series, although less likely now for the rolling max).
    df.dropna(inplace=True)

    return df

# Definir fecha de corte manualmente (cambiar diariamente)
end_date = dt.datetime(2025, 9, 14)  # Ejemplo: cambiar a 2025-07-18 mañana

tk =[ "ALUA.BA", "BBAR.BA", "BMA.BA", "COME.BA", "CRES.BA", "EDN.BA", "GGAL.BA", "IRSA.BA", "LOMA.BA", "METR.BA", "PAMP.BA", "SUPV.BA", "TECO2.BA", "TGNO4.BA", "TGSU2.BA", "TRAN.BA", "TXAR.BA", "VALO.BA", "YPFD.BA"]


# results = [] # el del test, para que no lo reinicie - REMOVED
resultsp = [] # las predicciones, para que no lo reinicie

umbral = 0.019
lapso = 1 # Lapso is no longer directly used for the target definition, but keeping it doesn't hurt
n_days_high_target = 3 # Define the number of days for the High target (used for both target and new feature)

# Define clipping bounds - adjust based on feature distributions
lower_bound = -1e9
upper_bound = 1e9


for papel in tk:

  symbol=papel
  #symbol="COME.BA"
  # Fechas dinámicas
  start_date = dt.datetime(2001, 1, 1)  # Inicio fijo
  train_end = end_date - pd.Timedelta(days=780)  # 6 meses antes de end_date (ajustable)
  next_day = end_date + pd.Timedelta(days=1)  # Predicción para el día siguiente


  # Select features - Add the new feature and its lags
  features = ['RSI', 'ROC', 'PPO', 'PPO_Signal', 'PPO_Histogram', 'EWO', 'EWO_Signal', 'EWO_Histogram', 'Volatility', 'SMA5', 'SMA13', 'SMA26', 'SMA50', 'SMA200' ] + [f'lag_change{i}' for i in range(1, 6)] + \
            [f'close_lag_{i}' for i in range(1, 6)] + [f'open_lag_{i}' for i in range(1, 6)]+ [f'high_lag_{i}' for i in range(1, 6)] + \
            ['Max_Gain_from_Open_Current'] + [f'Max_Gain_from_Open_Lag_{i}' for i in range(1, 7)]


  # Download data for the current ticker inside the loop
  print(f"\nDownloading data for {symbol}...")
  df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

  # Verify data download
  if df.empty:
      print(f"Warning: No data downloaded for {symbol}. Skipping.")
      continue # Skip to the next ticker


  # Handle MultiIndex columns and ensure standard column names - More robust logic
  required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
  processed_df = None # Initialize processed_df

  if isinstance(df.columns, pd.MultiIndex):
      print(f"MultiIndex columns detected for {symbol}.")
      try:
          # Attempt to extract columns by looking for standard names in ANY level of the MultiIndex tuple
          extracted_data = {}
          for std_name in required_cols:
              matching_col_tuple = None
              # Iterate through all column tuples
              for col_tuple in df.columns:
                  # Check if the standard name exists in ANY level of the current tuple
                  if std_name in col_tuple:
                       matching_col_tuple = col_tuple
                       break # Found a match in this tuple

              if matching_col_tuple:
                  extracted_data[std_name] = df[matching_col_tuple]
              else:
                  print(f"Warning: Could not find standard column '{std_name}' in any level of MultiIndex for {symbol}. Column missing.")
                  # Continue to look for other required columns, processed_df will be checked later

          if len(extracted_data) == len(required_cols):
              processed_df = pd.DataFrame(extracted_data)
              processed_df.index = df.index # Preserve original index
              print(f"Successfully extracted and flattened MultiIndex columns for {symbol}.")
          else:
              missing_cols = [name for name in required_cols if name not in extracted_data]
              print(f"Warning: Could not extract all required columns from MultiIndex for {symbol}. Missing: {missing_cols}. Skipping ticker.")
              continue # Skip to the next ticker

      except Exception as e:
          print(f"Warning: An error occurred while processing MultiIndex columns for {symbol}: {e}. Skipping.")
          #print(f"Original columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  else: # If not MultiIndex columns, assume standard flat DataFrame is already present
      print(f"No MultiIndex columns detected for {symbol}. Checking for standard columns.")
      # Check if the required columns are directly present
      if all(col in df.columns for col in required_cols):
          processed_df = df[required_cols].copy() # Select required columns and make a copy
          print(f"Using existing standard columns for {symbol}.")
      else:
          missing_cols = [col for col in required_cols if col not in df.columns]
          print(f"Warning: Required standard columns not found in flat DataFrame for {symbol}. Missing: {missing_cols}. Skipping ticker.")
          #print(f"Available columns: {df.columns.tolist()}")
          continue # Skip to the next ticker

  # Ensure df is set to processed_df if successful
  df = processed_df

  # Handle MultiIndex index if present (less common with single ticker download but possible)
  if isinstance(df.index, pd.MultiIndex):
      print(f"MultiIndex index detected for {symbol}. Attempting to flatten index.")
      try:
          # Assuming the MultiIndex index structure is ('Ticker', 'Date')
          if 'Ticker' in df.index.names:
               df = df.xs(symbol, level='Ticker', axis=0)
               print(f"Índice aplanado para {symbol}.")
          else:
               print(f"Warning: MultiIndex index detected for {symbol} but 'Ticker' level not found. Skipping index flattening.")
               # If 'Ticker' level is not there, maybe it's just a date/time MultiIndex?
               # Or a different structure. For now, proceed without flattening index if Ticker level is missing.


      except KeyError:
          print(f"Warning: Could not select ticker from MultiIndex index for {symbol}. Skipping.")
          continue # Skip to the next ticker
      except Exception as e:
          print(f"Warning: An error occurred while flattening MultiIndex index for {symbol}: {e}. Skipping.")
          continue # Skip to the next ticker


  df.index = pd.to_datetime(df.index)
  if not df.index.is_unique:
      print(f"Advertencia: Índice con fechas duplicadas para {symbol}. Eliminando duplicados...")
      df = df[~df.index.duplicated(keep='first')]

  if df.empty:
      print(f"Warning: DataFrame is empty after initial processing and cleaning for {symbol}. Skipping.")
      continue


  # Ensure numeric types and handle potential non-numeric data
  for col in required_cols:
      if col in df.columns: # Ensure column exists before processing
          df[col] = pd.to_numeric(df[col], errors='coerce')
      else:
           # This should ideally not happen if previous checks passed, but as a safeguard:
           print(f"Error: Required column '{col}' not found in df for {symbol} before numeric conversion. Skipping ticker.")
           df = pd.DataFrame() # Set df to empty to skip further processing
           break # Exit column processing loop


  if df.empty: # Check again if df became empty due to missing columns
       continue # Skip to the next ticker

  # Drop rows where essential price data is missing after coercion
  df.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)


  if df.empty:
      print(f"Warning: DataFrame is empty after dropping rows with missing price data for {symbol}. Skipping.")
      continue


  df['Open']= df['Open'].round(2)
  df['High']= df['High'].round(2)
  df['Low']= df['Low'].round(2)
  df['Close']= df['Close'].round(2)
  df['Adj Close']= df['Adj Close'].round(2)

  print("Últimas filas del DataFrame antes de crear features:")
  print(df.tail())


  # Crear features with the new target definition
  df = create_features(df, umbral=umbral, n_days_high=n_days_high_target) # Pass n_days_high_target, removed lapso

  # Verify data is not empty after feature creation and dropna
  if df.empty:
      print(f"Warning: DataFrame is empty after feature creation and dropna for {symbol}. Skipping.")
      continue


  # Verify data after creating features
  print(f"\nÚltimas filas del DataFrame después de crear features:")
  print(df.tail())
  print(df.columns)

  # --- Add Historical Target Distribution Check ---
  print(f"\nOverall historical target distribution for {symbol} (before train/test split):")
  historical_target_distribution = df["Label"].value_counts(normalize=True)
  print(historical_target_distribution)
  if 0 not in historical_target_distribution.index or historical_target_distribution.loc[0] < 0.01: # Threshold for very low minority class percentage
       print(f"Warning: Historical 'Bajista' (Class 0) instances are very rare (<1%) or non-existent for {symbol}.")
  # --- End Historical Target Distribution Check ---


  # --- Plot Feature Distributions for BBAR.BA by Class (Deep Dive) ---
  # Keep plotting code commented out or removed if user cannot see them
  # if symbol == "BBAR.BA":
  #     print(f"\nDEBUG: Reached plotting section for {symbol}.") # Debug print
  #     print(f"Generating feature distribution plots for {symbol} by target class...")

  #     # Create a directory to save plots
  #     plot_dir = f"{symbol}_feature_plots"
  #     if not os.path.exists(plot_dir):
  #         os.makedirs(plot_dir)
  #         print(f"Created directory: {plot_dir}")


  #     # Select a subset of features to plot - choose some representative ones
  #     features_to_plot = [
  #         'RSI', 'ROC', 'PPO_Histogram', 'EWO_Histogram', 'Volatility',
  #         'SMA5', 'lag_change1', 'Max_Gain_from_Open_Current', 'Max_Gain_from_Open_Lag_2'
  #     ]

  #     # Ensure selected features exist in the DataFrame
  #     features_to_plot_existing = [f for f in features_to_plot if f in df.columns]

  #     if not features_to_plot_existing:
  #         print(f"Warning: None of the selected features for plotting exist in the DataFrame for {symbol}.")
  #     else:
  #         # Use the full historical data (after feature creation and dropna) for these plots
  #         plot_df = df[features_to_plot_existing + ['Label']].copy()
  #         plot_df['Label'] = plot_df['Label'].astype(str) # Convert label to string for plotting hue

  #         # Set up the plotting style
  #         sns.set_style("whitegrid")

  #         for feature in features_to_plot_existing:
  #             plt.figure(figsize=(10, 5))
  #             # Use histplot for distributions
  #             sns.histplot(data=plot_df, x=feature, hue='Label', kde=True, palette='viridis', common_norm=False, stat='density') # common_norm=False & stat='density' for comparing shapes regardless of class counts
  #             plt.title(f'Distribution of {feature} for {symbol} by Target Class')
  #             plt.xlabel(feature)
  #             plt.ylabel('Density') # Changed from Frequency to Density
  #             plt.legend(title='Target', labels=['Alcista (1)', 'Bajista (0)']) # Customize legend labels

  #             # Save the plot to a file
  #             plot_filename = os.path.join(plot_dir, f"{symbol}_{feature}_distribution.png")
  #             plt.savefig(plot_filename)
  #             print(f"Saved plot: {plot_filename}")

  #             plt.close() # Close the plot figure to free up memory

  #     print("Finished generating and saving feature distribution plots for BBAR.BA.")
  # --- End Plot Feature Distributions ---

  # --- Add Feature Distribution Analysis Table for ALL Tickers by Class ---
  # This block should run regardless of run_full_model flag
  print(f"\nDEBUG: Reached feature distribution table analysis section for {symbol}.")
  print(f"Generating feature distribution analysis table for {symbol} by target class...")

  # Use the full historical data (after feature creation and dropna) for this analysis
  analysis_df = df[features + ['Label']].copy() # Use all features for table analysis

  if analysis_df.empty:
       print(f"Warning: DataFrame is empty for feature distribution analysis for {symbol}.")
  elif 'Label' not in analysis_df.columns:
       print(f"Error: 'Label' column not found in DataFrame for feature distribution analysis for {symbol}.")
  elif len(analysis_df['Label'].unique()) < 2:
       print(f"Warning: Only one class exists in 'Label' for feature distribution analysis for {symbol}. Cannot group by class.")
       # Even if only one class, we can still show the describe table for that class
       print(f"\nFeature Distribution Analysis Table for {symbol} (Only one class):")
       display(analysis_df.describe().transpose()) # Show describe for the single class
  else: # This else should be aligned with the if/elif above
      # Group by Label and calculate descriptive statistics for all features
      feature_distribution_table = analysis_df.groupby('Label').describe().transpose()

      print(f"\nFeature Distribution Analysis Table for {symbol} by Target Class:")
      display(feature_distribution_table) # Use display for better formatting

  print(f"Finished generating feature distribution analysis table for {symbol}.") # This print should be aligned with the if/elif/else block
  # --- End Feature Distribution Analysis Table ---


  # --- Start of block for full model run (training, tuning, prediction, results table) ---
  if run_full_model:
      # Dividir datos en entrenamiento y prueba (moved inside the if run_full_model block)
      X = df[features]
      y = df['Label']
      X_train_full = X[df.index <= train_end]
      y_train_full = y[df.index <= train_end]
      X_test = X[(df.index > train_end) & (df.index <= end_date)]  # Hasta end_date
      y_test = y[(df.index > train_end) & (df.index <= end_date)]

      # Check if original training data is sufficient BEFORE proceeding
      if not X_train_full.empty and not y_train_full.empty and len(y_train_full.unique()) > 1:

          correlation = df[features + ["Label"]].corr()["Label"].sort_values(ascending=False)
          print(f"Correlacion con label para {symbol}:")
          print(correlation)

          # Initialize test metrics before evaluation (only if model runs)
          precision_test_alcista = None
          recall_test_alcista = None
          f1_test_alcista = None
          roc_auc_test = None
          ratio_1_test = None
          best_model = None # Initialize best_model to None
          best_threshold = 0.5 # Initialize best_threshold to default


          # Optimizar hiperparámetros con RandomizedSearchCV
          print(f"Optimizar hiperparámetros con RandomizedSearchCV para {symbol}")
          param_dist = {
              'learning_rate': [0.01, 0.05, 0.1, 0.2],
              'max_depth': [3, 5, 7, 9],
              'n_estimators': [100, 500, 900],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'gamma': [0, 0.1, 0.2],
              'scale_pos_weight': [0.5, 1, 2, 5, 10, 20, 50, 100] # Incluir scale_pos_weight en la búsqueda
          }

          # Inicializar el clasificador XGBoost sin scale_pos_weight fijo (se tuneará)
          xgb = XGBClassifier(objective='binary:logistic', random_state=42)

          # Usar TimeSeriesSplit para cross-validation
          n_splits = 5  # Puedes ajustar el número de splits
          tscv = TimeSeriesSplit(n_splits=n_splits)

          # Definir scorer para maximizar Precision de la Clase 1 (for RandomizedSearchCV)
          precision_scorer = make_scorer(precision_score, pos_label=1, zero_division=0) # zero_division=0 para manejar casos sin predicciones positivas

          # Clean X_train_full and y_train_full before fitting RandomizedSearchCV and SMOTE
          X_train_full_cleaned_for_tuning = X_train_full.replace([np.inf, -np.inf], np.nan)
          X_train_full_cleaned_for_tuning.dropna(inplace=True)
          y_train_full_cleaned_for_tuning = y_train_full.loc[X_train_full_cleaned_for_tuning.index] # Ensure y matches cleaned X

          # Explicit check, conversion, and fallback for non-finite values before fitting RandomizedSearchCV
          X_train_full_cleaned_for_tuning = X_train_full_cleaned_for_tuning.astype(np.float64) # Ensure dtype
          if not np.isfinite(X_train_full_cleaned_for_tuning).all().all():
              print(f"\nWarning: Non-finite values detected in X_train_full_cleaned_for_tuning for {symbol} before RandomizedSearchCV fit. Attempting to fill with median.")
              for col in X_train_full_cleaned_for_tuning.columns:
                  finite_values = X_train_full_cleaned_for_tuning[col][np.isfinite(X_train_full_cleaned_for_tuning[col])]
                  if not finite_values.empty:
                      median_val = finite_values.median()
                      X_train_full_cleaned_for_tuning[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                      X_train_full_cleaned_for_tuning[col].fillna(median_val, inplace=True)
                  else:
                      print(f"Warning: Column '{col}' in X_train_full_cleaned_for_tuning is all non-finite. Filling with 0.")
                      X_train_full_cleaned_for_tuning[col].fillna(0, inplace=True)

          # --- Apply SMOTE to the training data ---
          X_train_res, y_train_res = X_train_full_cleaned_for_tuning.copy(), y_train_full_cleaned_for_tuning.copy() # Initialize with cleaned data
          # Check if resampling is needed and possible
          if len(y_train_full_cleaned_for_tuning.unique()) > 1 and y_train_full_cleaned_for_tuning.value_counts().min() > 1: # SMOTE needs at least 2 samples of the minority class
              try:
                  print(f"\nApplying SMOTE to training data for {symbol}...")
                  sm = SMOTE(random_state=42)
                  X_train_res, y_train_res = sm.fit_resample(X_train_full_cleaned_for_tuning, y_train_full_cleaned_for_tuning)
                  print(f"Original training data shape: {X_train_full_cleaned_for_tuning.shape}, Resampled shape: {X_train_res.shape}")
                  print(f"Original training target distribution: {y_train_full_cleaned_for_tuning.value_counts()}, Resampled target distribution: {y_train_res.value_counts()}")
              except Exception as e:
                  print(f"\nWarning: Could not apply SMOTE to training data for {symbol}: {e}. Proceeding with original imbalanced training data.")
                  # X_train_res and y_train_res remain the original cleaned training data
          elif len(y_train_full_cleaned_for_tuning.unique()) == 1:
               print(f"\nWarning: Only one class in training data for {symbol} after cleaning. Cannot apply SMOTE.")
               # X_train_res and y_train_res remain the original cleaned training data
          else: # Not enough samples for SMOTE (e.g., only 1 minority sample)
              print(f"\nWarning: Not enough samples in minority class for SMOTE for {symbol}. Proceeding with original imbalanced training data.")
              # X_train_res and y_train_res remain the original cleaned training data


          # Check if resampled data is sufficient for tuning and evaluation
          if not X_train_res.empty and not y_train_res.empty and len(y_train_res.unique()) > 1:
              # Perform tuning, evaluation, and prediction within a general try-except block
              # to prevent script crash on problematic tickers
              try:
                  # Fit RandomizedSearchCV on the resampled training data
                  random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=20, cv=tscv, scoring=precision_scorer, n_jobs=-1, random_state=42) # Usar precision_scorer
                  random_search.fit(X_train_res, y_train_res) # Fit on resampled data
                  print(f"Mejores hiperparámetros para {symbol}:", random_search.best_params_)

                  # Usar el mejor modelo encontrado por RandomizedSearchCV
                  best_model = random_search.best_estimator_

                  # Get and print feature importance (fitted on resampled data)
                  feature_importance = pd.Series(best_model.feature_importances_, index=features)
                  print(f"\nFeature Importance for {symbol}:")
                  print(feature_importance.sort_values(ascending=False))

                  # Optimize the threshold for maximum F1-score on the full original training set (NOT resampled)
                  # Threshold optimization should reflect real-world data distribution
                  X_train_full_cleaned_for_threshold = X_train_full.replace([np.inf, -np.inf], np.nan)
                  X_train_full_cleaned_for_threshold.dropna(inplace=True)
                  y_train_full_cleaned_for_threshold = y_train_full.loc[X_train_full_cleaned_for_threshold.index]

                  if not X_train_full_cleaned_for_threshold.empty and not y_train_full_cleaned_for_threshold.empty and len(y_train_full_cleaned_for_threshold.unique()) > 1:
                      # Predict probabilities on the ORIGINAL cleaned training data for threshold optimization
                      y_train_prob = best_model.predict_proba(X_train_full_cleaned_for_threshold)[:, 1]
                      thresholds = np.arange(0.01, 1.0, 0.01)
                      best_threshold = 0.5
                      best_f1 = 0 # Changed from best_precision to best_f1

                      print(f"Optimizing threshold for maximum F1-score on training data (ORIGINAL distribution) for {symbol}...") # Updated message
                      # Calculate F1-score for each threshold
                      for threshold in thresholds:
                          y_pred_threshold = (y_train_prob >= threshold).astype(int)
                          # Calculate f1_score (handles zero_division internally based on scikit-learn version)
                          f1 = f1_score(y_train_full_cleaned_for_threshold, y_pred_threshold, zero_division=0)
                          if f1 > best_f1:
                              best_f1 = f1
                              best_threshold = threshold

                      print(f"Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para {symbol}: {best_threshold:.4f} (F1-score: {best_f1:.4f})") # Updated message and metric
                  else:
                      print(f"\nWarning: Original training set for {symbol} contains no samples or only one class after cleaning for threshold optimization. Cannot optimize threshold for F1-score. Using default threshold 0.5.") # Updated message
                      best_threshold = 0.5


                  # Evaluar el modelo en el conjunto de prueba con el best threshold (Test set is NOT resampled)
                  if not X_test.empty and not y_test.empty and len(y_test.unique()) > 1:
                      print(f"\nEvaluating best model on test set for {symbol} with best threshold ({best_threshold:.4f}):")

                      # Clean X_test before evaluation
                      X_test_cleaned = X_test.replace([np.inf, -np.inf], np.nan).dropna()
                      y_test_cleaned = y_test.loc[X_test_cleaned.index]

                      # Scale X_test using the scaler fitted on the ORIGINAL training data
                      scaler = RobustScaler()
                      # Fit scaler on the ORIGINAL cleaned training data (NOT resampled)
                      X_train_full_cleaned_for_scaler_eval = X_train_full.replace([np.inf, -np.inf], np.nan)
                      X_train_full_cleaned_for_scaler_eval.dropna(inplace=True)
                      X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.clip(lower=lower_bound, upper=upper_bound)
                      X_train_full_cleaned_for_scaler_eval = X_train_full_cleaned_for_scaler_eval.astype(np.float64)

                      if not X_train_full_cleaned_for_scaler_eval.empty and np.isfinite(X_train_full_cleaned_for_scaler_eval).all().all():
                          scaler.fit(X_train_full_cleaned_for_scaler_eval)

                          X_test_cleaned = X_test_cleaned.astype(np.float64)
                          if not np.isfinite(X_test_cleaned).all().all():
                               print(f"\nWarning: Non-finite values detected in X_test_cleaned for {symbol} before scaler transform. Attempting to fill with median (from train data).")
                               train_medians = X_train_full_cleaned_for_scaler_eval.median()
                               for col in X_test_cleaned.columns:
                                   median_val = train_medians.get(col, 0)
                                   X_test_cleaned[col].replace([np.inf, -np.inf], np.nan, inplace=True)
                                   X_test_cleaned[col].fillna(median_val, inplace=True)
                               if not np.isfinite(X_test_cleaned).all().all():
                                    print(f"\nERROR: Non-finite values STILL detected in X_test_cleaned for {symbol} after filling with median!")


                          if not X_test_cleaned.empty and np.isfinite(X_test_cleaned).all().all():
                              X_test_scaled = scaler.transform(X_test_cleaned)

                              y_test_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
                              y_test_pred = (y_test_pred_prob >= best_threshold).astype(int)

                              if len(y_test_cleaned.unique()) > 1:
                                  print(f"\nClassification Report (Test Set) for {symbol}:")
                                  print(classification_report(y_test_cleaned, y_test_pred, zero_division=0))
                                  precision_test_alcista = precision_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                                  recall_test_alcista = recall_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)
                                  f1_test_alcista = f1_score(y_test_cleaned, y_test_pred, pos_label=1, zero_division=0)

                                  print(f"Tamaño de y_test (cleaned): {y_test_cleaned.size}")
                                  print(f"Distribución de clases en y_test (cleaned) para {symbol}:")
                                  print(y_test_cleaned.value_counts())
                                  if 1 in y_test_cleaned.value_counts():
                                      ratio_1_test=(y_test_cleaned.value_counts()[1]/y_test_cleaned.size).round(4)
                                  else:
                                      ratio_1_test = 0
                                  print(f"% clase 1 test para {symbol}: {ratio_1_test} ")

                                  if len(y_test_cleaned.unique()) > 1:
                                       roc_auc_test = roc_auc_score(y_test_cleaned, y_test_pred_prob).round(6)
                                       print(f"\nROC-AUC (Test Set) para {symbol}: {roc_auc_test:.4f}")
                                  else:
                                       roc_auc_test = None
                                       print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot calculate ROC-AUC.")

                              else:
                                  print(f"\nWarning: Test set for {symbol} contains only one class after cleaning. Cannot generate full classification report.")
                                  precision_test_alcista = None
                                  recall_test_alcista = None
                                  f1_test_alcista = None
                                  roc_auc_test = None
                                  ratio_1_test = None


                          else:
                              print(f"\nWarning: X_test became empty after cleaning or contains non-finite values for {symbol}. Skipping test evaluation.")
                              precision_test_alcista = None
                              recall_test_alcista = None
                              f1_test_alcista = None
                              roc_auc_test = None
                              ratio_1_test = None





                      else:
                          print(f"\nAdvertencia: Conjunto de prueba insuficiente o con una sola clase para evaluación para {symbol}.")
                          precision_test_alcista = None
                          recall_test_alcista = None
                          f1_test_alcista = None
                          roc_auc_test = None
                          ratio_1_test = None

                      # Prediction for the next day is done only if model was trained successfully
                      last_features = df[features].iloc[-1:]
                      last_features_cleaned = None
                      last_features_scaled = None
                      future_pred_prob = None
                      future_pred = None

                      if not last_features.empty:
                          # Ensure last_features is a single row DataFrame before cleaning
                          if not isinstance(last_features, pd.DataFrame) or len(last_features) != 1:
                               print(f"\nError: last_features is not a single row DataFrame for {symbol}. Skipping prediction.")
                               # Set prediction results to skipped
                               last_data_date = df.index[-1] if not df.empty else None
                               last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                               resultsp.append({
                                   'Papel': symbol,
                                   'Fecha Predicción': next_day,
                                   'Fecha Datos': last_data_date,
                                   'Predicción': 'Skipped (Prediction Data Error)',
                                   'Precio actual': last_close,
                                   'Probabilidad Alcista (Modelo)': None,
                                   'Umbral de Clasificación': None,
                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                   'Precision Test (Alcista)': precision_test_alcista,
                                   'Recall Test (Alcista)': recall_test_alcista,
                                   'F1 Test (Alcista)': f1_test_alcista,
                                   'ROC-AUC Test': roc_auc_test,
                                   'clase 1 en test (cleaned)': ratio_1_test,
                                   'Features Limpias (Predicción)': None,
                                   'Features Escaladas (Predicción)': None
                               })
                               # No need to continue here, the append is done and we move to the next ticker
                               continue # Skip to the next ticker


                          last_features_cleaned = last_features.replace([np.inf, -np.inf], np.nan).dropna()
                          last_features_cleaned = last_features_cleaned.clip(lower=lower_bound, upper=upper_bound)

                          # Add checks for NaN, Inf, and Zero in cleaned features BEFORE scaling
                          has_nan_cleaned = last_features_cleaned.isna().any().any()
                          has_inf_cleaned = np.isinf(last_features_cleaned).any().any()
                          has_zero_cleaned = (last_features_cleaned == 0).any().any()

                          if has_nan_cleaned:
                              print(f"\nDEBUG: NaN values detected in last_features_cleaned for {symbol}.")
                          if has_inf_cleaned:
                              print(f"\nDEBUG: Inf values detected in last_features_cleaned for {symbol}.")
                          if has_zero_cleaned:
                               print(f"\nDEBUG: Zero values detected in last_features_cleaned for {symbol}.")


                          if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                              # Ensure scaler is fitted on the *cleaned* full training data
                              scaler = RobustScaler()
                              # Fit scaler on the ORIGINAL cleaned training data (NOT resampled)
                              X_train_full_cleaned_for_scaler_pred = X_train_full.replace([np.inf, -np.inf], np.nan)
                              X_train_full_cleaned_for_scaler_pred.dropna(inplace=True)
                              X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.clip(lower=lower_bound, upper=upper_bound)
                              X_train_full_cleaned_for_scaler_pred = X_train_full_cleaned_for_scaler_pred.astype(np.float64)

                              if not X_train_full_cleaned_for_scaler_pred.empty and np.isfinite(X_train_full_cleaned_for_scaler_pred).all().all():
                                   scaler.fit(X_train_full_cleaned_for_scaler_pred)

                                   last_features_cleaned = last_features_cleaned.astype(np.float64)
                                   # Add explicit fallback for non-finite values AFTER dropna for prediction data
                                   # This fallback is actually redundant if dropna() was called just above and np.isfinite checked,
                                   # but keeping it for safety if flow changes. The main checks should be BEFORE scaling.
                                   # Let's rely on the check and skip if still non-finite after cleaning.

                                   if not last_features_cleaned.empty and np.isfinite(last_features_cleaned).all().all():
                                       last_features_scaled = scaler.transform(last_features_cleaned)

                                       # Add checks for NaN, Inf, and Zero in scaled features BEFORE prediction
                                       has_nan_scaled = np.isnan(last_features_scaled).any()
                                       has_inf_scaled = np.isinf(last_features_scaled).any()
                                       has_zero_scaled = (last_features_scaled == 0).any()

                                       if has_nan_scaled:
                                           print(f"\nDEBUG: NaN values detected in last_features_scaled for {symbol}.")
                                       if has_inf_scaled:
                                           print(f"\nDEBUG: Inf values detected in last_features_scaled for {symbol}.")
                                       if has_zero_scaled:
                                            print(f"\nDEBUG: Zero values detected in last_features_scaled for {symbol}.")


                                       future_pred_prob = best_model.predict_proba(last_features_scaled)[:, 1][0].round(4)
                                       future_pred = 1 if future_pred_prob >= best_threshold else 0

                                       last_close = None
                                       last_open = None
                                       last_max = None
                                       if not df.empty:
                                           last_close = df['Close'].iloc[-1]
                                           last_open = df['Open'].iloc[-1]
                                           last_max = df['High'].iloc[-1]
                                           last_data_date = df.index[-1]
                                       else:
                                           print(f"Warning: DataFrame 'df' is empty for {symbol}. Cannot get last prices.")
                                           last_data_date = None

                                       action = 'BUY' if future_pred == 1 else 'SELL'
                                       direction = 1 if future_pred == 1 else -1

                                       # Explicitly use to_dict() and tolist()
                                       cleaned_features_dict = last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None
                                       scaled_features_list = last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None


                                       resultsp.append({
                                                   'Papel': symbol,
                                                   'Fecha Predicción': next_day,
                                                   'Fecha Datos': last_data_date,
                                                   'Predicción': 'Alcista' if future_pred == 1 else 'Bajista',
                                                   'Probabilidad Alcista (Modelo)': future_pred_prob,
                                                   'Umbral de Clasificación': best_threshold.round(4),
                                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)', # Add check for best_model
                                                   'Precision Test (Alcista)': precision_test_alcista,
                                                   'Recall Test (Alcista)': recall_test_alcista,
                                                   'F1 Test (Alcista)': f1_test_alcista,
                                                   'ROC-AUC Test': roc_auc_test,
                                                   'clase 1 en test (cleaned)': ratio_1_test,
                                                   'Features Limpias (Predicción)': cleaned_features_dict, # Add cleaned features using to_dict
                                                   'Features Escaladas (Predicción)': scaled_features_list # Add scaled features as a list
                                           })



                          else:
                               print(f"Warning: Training data (X_train_full) became empty or contains non-finite values after cleaning for scaler fitting for prediction. Skipping prediction for {symbol}.")
                               last_data_date = df.index[-1] if not df.empty else None
                               last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                               # Explicitly use None for feature columns if prediction skipped due to data issues
                               resultsp.append({
                                   'Papel': symbol,
                                   'Fecha Predicción': next_day,
                                   'Fecha Datos': last_data_date,
                                   'Predicción': 'Skipped (Prediction Data Issue)',
                                   'Precio actual': last_close,
                                   'Probabilidad Alcista (Modelo)': None,
                                   'Umbral de Clasificación': None,
                                   'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                                   'Precision Test (Alcista)': precision_test_alcista,
                                   'Recall Test (Alcista)': recall_test_alcista,
                                   'F1 Test (Alcista)': f1_test_alcista,
                                   'ROC-AUC Test': roc_auc_test,
                                   'clase 1 en test (cleaned)': ratio_1_test,
                                    'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                                    'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                               })


                      else:
                          print(f"Warning: Could not make prediction for {symbol} as last_features became empty or contains non-finite values after cleaning.")
                          last_data_date = df.index[-1] if not df.empty else None
                          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                          # Explicitly use None for feature columns if prediction skipped due to data issues
                          resultsp.append({
                              'Papel': symbol,
                              'Fecha Predicción': next_day,
                              'Fecha Datos': last_data_date,
                              'Predicción': 'Skipped (Prediction Data Issue)',
                              'Precio actual': last_close,
                              'Probabilidad Alcista (Modelo)': None,
                              'Umbral de Clasificación': None,
                              'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                              'Precision Test (Alcista)': precision_test_alcista,
                              'Recall Test (Alcista)': recall_test_alcista,
                              'F1 Test (Alcista)': f1_test_alcista,
                              'ROC-AUC Test': roc_auc_test,
                              'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                          })


                  else:
                      print(f"Warning: Could not make prediction for {symbol} as last_features was initially empty.")
                      last_data_date = df.index[-1] if not df.empty else None
                      last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None
                      # Explicitly use None for feature columns if prediction skipped due to data issues
                      resultsp.append({
                          'Papel': symbol,
                          'Fecha Predicción': next_day,
                          'Fecha Datos': last_data_date,
                          'Predicción': 'Skipped (Prediction Data Issue)',
                          'Precio actual': last_close,
                          'Probabilidad Alcista (Modelo)': None,
                          'Umbral de Clasificación': None,
                          'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                          'Precision Test (Alcista)': precision_test_alcista,
                          'Recall Test (Alcista)': recall_test_alcista,
                          'F1 Test (Alcista)': f1_test_alcista,
                          'ROC-AUC Test': roc_auc_test,
                          'clase 1 en test (cleaned)': ratio_1_test,
                       'Features Limpias (Predicción)': last_features_cleaned.to_dict() if last_features_cleaned is not None and not last_features_cleaned.empty else None,
                       'Features Escaladas (Predicción)': last_features_scaled.tolist()[0] if last_features_scaled is not None and last_features_scaled.size > 0 else None
                      })


              # Catch a general Exception during tuning/evaluation/prediction to prevent script crash
              except Exception as e:
                  print(f"ERROR: An error occurred during tuning, evaluation, or prediction for {symbol}: {e}. Skipping this ticker.")
                  best_model = None
                  best_threshold = 0.5
                  precision_test_alcista = None
                  recall_test_alcista = None
                  f1_test_alcista = None
                  roc_auc_test = None
                  ratio_1_test = None

                  last_data_date = df.index[-1] if not df.empty else None
                  last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

                  # Append entry indicating skip due to general error, explicitly add None for feature columns
                  resultsp.append({
                        'Papel': symbol,
                        'Fecha Predicción': next_day,
                        'Fecha Datos': last_data_date,
                        'Predicción': 'Skipped (Error)',
                        'Precio actual': last_close,
                        'Probabilidad Alcista (Modelo)': None,
                        'Umbral de Clasificación': None,
                        'Mejores hiperparámetros (Incluye scale_pos_weight)': str(random_search.best_params_) if best_model is not None else 'Skipped (Tuning Error)',
                        'Precision Test (Alcista)': precision_test_alcista,
                        'Recall Test (Alcista)': recall_test_alcista,
                        'F1 Test (Alcista)': f1_test_alcista,
                        'ROC-AUC Test': roc_auc_test,
                        'clase 1 en test (cleaned)': ratio_1_test,
                        'Features Limpias (Predicción)': None,
                        'Features Escaladas (Predicción)': None
                  })

          else: # This else belongs to the check for sufficient RESAMPLED data
              print(f"Warning: Resampled training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
              last_data_date = df.index[-1] if not df.empty else None
              last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

              # Append entry indicating skip due to insufficient training data (resampled)
              resultsp.append({
                    'Papel': symbol,
                    'Fecha Predicción': next_day,
                    'Fecha Datos': last_data_date,
                    'Predicción': 'Skipped (Insufficient Resampled Training Data)', # More specific message
                    'Precio actual': last_close,
                    'Probabilidad Alcista (Modelo)': None,
                    'Umbral de Clasificación': None,
                    'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Resampled Training Data)',
                    'Precision Test (Alcista)': None, # Set to None as no model ran
                    'Recall Test (Alcista)': None, # Set to None as no model ran
                    'F1 Test (Alcista)': None, # Set to None as no model ran
                    'ROC-AUC Test': None, # Set to None as no model ran
                    'clase 1 en test (cleaned)': None, # Set to None as no model ran
                    'Features Limpias (Predicción)': None,
                    'Features Escaladas (Predicción)': None
              })


      else: # This else belongs to the initial check for sufficient ORIGINAL training data
          print(f"Warning: Original training data is insufficient or has only one class for {symbol}. Skipping model training and prediction.")
          last_data_date = df.index[-1] if not df.empty else None
          last_close = df['Close'].iloc[-1] if not df.empty and 'Close' in df.columns else None

          # Append entry indicating skip due to insufficient training data (original)
          resultsp.append({
                'Papel': symbol,
                'Fecha Predicción': next_day,
                'Fecha Datos': last_data_date,
                'Predicción': 'Skipped (Insufficient Original Training Data)', # More specific message
                'Precio actual': last_close,
                'Probabilidad Alcista (Modelo)': None,
                'Umbral de Clasificación': None,
                'Mejores hiperparámetros (Incluye scale_pos_weight)': 'Skipped (Insufficient Original Training Data)',
                'Precision Test (Alcista)': None, # Set to None as no model ran
                'Recall Test (Alcista)': None, # Set to None as no model ran
                'F1 Test (Alcista)': None, # Set to None as no model ran
                'ROC-AUC Test': None, # Set to None as no model ran
                'clase 1 en test (cleaned)': None, # Set to None as no model ran
                'Features Limpias (Predicción)': None,
                'Features Escaladas (Predicción)': None
          })


  # --- End of block for full model run ---


# Create prediction results table ONLY if the full model was run
if run_full_model: # Moved this entire block inside the if condition
    resultsp_df = pd.DataFrame(resultsp)
    print(resultsp_df)
    if not resultsp_df.empty:
        resultsp_df.set_index('Fecha Predicción', inplace=True)

        # Mostrar resultados de predicción
        pd.set_option('display.max_columns', None)
        #pd.set_option('display.max_rows', None) # Optional: display all rows
        pd.set_option('display.max_colwidth', None) # Optional: display full content of columns

        print(f"\nPrediccion para el proximo dia (hasta {next_day.strftime('%Y-%m-%d')}):")
        print("Nota: 'Fecha Predicción' es la fecha predicha; 'Fecha Datos' es la fecha de los datos usados.")
        display(resultsp_df) # Use display for better formatting

        # Guardar y descargar el CSV de predicciones
        resultsp_df.to_csv(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv", sep=";")
        files.download(f"Predic_results_{end_date.strftime('%Y-%m-%d')}.csv")
        print(f"\nArchivo 'Predic_results_{end_date.strftime('%Y-%m-%d')}.csv' generado y descargado.")
    else:
        print("\nNo hay resultados de predicción para mostrar.")
else: # Add a message when skipping the full model run
    print("\n'run_full_model' is set to False. Skipping model training, tuning, evaluation, and prediction.")
    print("Feature distribution analysis tables for all tickers are displayed above.")


Downloading data for ALUA.BA...


[*********************100%***********************]  1 of 1 completed


MultiIndex columns detected for ALUA.BA.
Successfully extracted and flattened MultiIndex columns for ALUA.BA.
Últimas filas del DataFrame antes de crear features:
             Open   High    Low  Close   Volume  Adj Close
Date                                                      
2025-09-08  670.0  705.0  600.0  696.5  1572445      696.5
2025-09-09  700.0  713.0  678.5  694.0  1217950      694.0
2025-09-10  700.0  716.5  691.5  702.0   641358      702.0
2025-09-11  702.0  713.0  676.0  682.0  1443135      682.0
2025-09-12  686.0  702.0  660.0  666.0   420245      666.0

Últimas filas del DataFrame después de crear features:
             Open   High    Low  Close   Volume  Adj Close  close_lag_1  \
Date                                                                      
2025-09-04  710.0  717.0  685.0  696.0   949242      696.0        700.0   
2025-09-05  680.0  696.0  664.0  686.0  2029661      686.0        696.0   
2025-09-08  670.0  705.0  600.0  696.5  1572445      696.5        68

Unnamed: 0,Label,0.0,1.0
RSI,count,2272.000000,3253.000000
RSI,mean,51.347561,51.600978
RSI,std,24.330060,25.578069
RSI,min,0.000000,0.000000
RSI,25%,33.333333,32.558140
...,...,...,...
Max_Gain_from_Open_Lag_6,min,0.000000,0.000000
Max_Gain_from_Open_Lag_6,25%,0.007240,0.009434
Max_Gain_from_Open_Lag_6,50%,0.017961,0.024064
Max_Gain_from_Open_Lag_6,75%,0.036085,0.049724


Finished generating feature distribution analysis table for ALUA.BA.
Correlacion con label para ALUA.BA:
Label                         1.000000
Max_Gain_from_Open_Current    0.176044
Volatility                    0.137457
high_lag_1                    0.133648
high_lag_2                    0.133553
open_lag_1                    0.133132
close_lag_2                   0.133121
open_lag_2                    0.133050
close_lag_1                   0.132892
close_lag_3                   0.132868
high_lag_3                    0.132730
high_lag_4                    0.132608
high_lag_5                    0.132543
SMA5                          0.132482
open_lag_4                    0.132300
SMA13                         0.132219
close_lag_5                   0.132119
open_lag_3                    0.132067
open_lag_5                    0.131868
close_lag_4                   0.131611
Max_Gain_from_Open_Lag_3      0.131285
SMA26                         0.131199
SMA50                         0.12891

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para ALUA.BA: 0.3300 (F1-score: 0.9610)

Evaluating best model on test set for ALUA.BA with best threshold (0.3300):

Classification Report (Test Set) for ALUA.BA:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       120
         1.0       0.77      1.00      0.87       398

    accuracy                           0.77       518
   macro avg       0.38      0.50      0.43       518
weighted avg       0.59      0.77      0.67       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para ALUA.BA:
Label
1.0    398
0.0    120
Name: count, dtype: int64
% clase 1 test para ALUA.BA: 0.7683 

ROC-AUC (Test Set) para ALUA.BA: 0.5747

Downloading data for BBAR.BA...
MultiIndex columns detected for BBAR.BA.
Successfully extracted and flattened MultiIndex columns for BBAR.BA.
Últimas filas del DataFrame antes de crear features:
              





Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close   Volume  Adj Close  close_lag_1  \
Date                                                                          
2025-09-04  5580.0  5850.0  5510.0  5720.0   276714     5720.0       5540.0   
2025-09-05  5840.0  5840.0  5410.0  5640.0   596771     5640.0       5720.0   
2025-09-08  5175.0  5175.0  4482.5  4507.5  1397504     4507.5       5640.0   
2025-09-09  4600.0  4650.0  4385.0  4400.0   761818     4400.0       4507.5   
2025-09-10  4550.0  4710.0  4475.0  4655.0   206731     4655.0       4400.0   

            close_lag_2  close_lag_3  close_lag_4  close_lag_5  open_lag_1  \
Date                                                                         
2025-09-04       5620.0       5380.0       5520.0       5610.0      5650.0   
2025-09-05       5540.0       5620.0       5380.0       5520.0      5580.0   
2025-09-08       5720.0       5540.0       5620.0       5380.0      5840.0   


Unnamed: 0,Label,0.0,1.0
RSI,count,1826.000000,4102.000000
RSI,mean,52.339050,52.790074
RSI,std,24.332178,24.894616
RSI,min,0.000000,0.000000
RSI,25%,35.294118,34.161588
...,...,...,...
Max_Gain_from_Open_Lag_6,min,-0.027778,0.000000
Max_Gain_from_Open_Lag_6,25%,0.009609,0.012315
Max_Gain_from_Open_Lag_6,50%,0.023962,0.031799
Max_Gain_from_Open_Lag_6,75%,0.048536,0.062500


Finished generating feature distribution analysis table for BBAR.BA.
Correlacion con label para BBAR.BA:
Label                         1.000000
high_lag_1                    0.102475
close_lag_1                   0.102424
open_lag_1                    0.102082
high_lag_2                    0.101677
close_lag_2                   0.101671
SMA5                          0.101554
SMA13                         0.101335
open_lag_5                    0.101233
high_lag_3                    0.101142
close_lag_5                   0.101098
high_lag_4                    0.101062
open_lag_2                    0.101042
open_lag_3                    0.101000
high_lag_5                    0.100923
close_lag_4                   0.100823
open_lag_4                    0.100819
close_lag_3                   0.100795
SMA26                         0.100699
SMA50                         0.100206
SMA200                        0.097080
Volatility                    0.091043
Max_Gain_from_Open_Lag_4      0.07205

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para BBAR.BA: 0.3200 (F1-score: 0.9467)

Evaluating best model on test set for BBAR.BA with best threshold (0.3200):

Classification Report (Test Set) for BBAR.BA:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        70
         1.0       0.86      1.00      0.93       448

    accuracy                           0.86       518
   macro avg       0.43      0.50      0.46       518
weighted avg       0.75      0.86      0.80       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para BBAR.BA:
Label
1.0    448
0.0     70
Name: count, dtype: int64
% clase 1 test para BBAR.BA: 0.8649 

ROC-AUC (Test Set) para BBAR.BA: 0.4321

DEBUG: Zero values detected in last_features_cleaned for BBAR.BA.

Downloading data for BMA.BA...
MultiIndex columns detected for BMA.BA.
Successfully extracted and flattened MultiIndex columns for BMA.BA.
Últ




Unnamed: 0,Label,0.0,1.0
RSI,count,1832.000000,4026.000000
RSI,mean,53.655398,54.155427
RSI,std,25.252266,25.116178
RSI,min,0.000000,0.000000
RSI,25%,34.740803,34.744816
...,...,...,...
Max_Gain_from_Open_Lag_6,min,0.000000,-0.022727
Max_Gain_from_Open_Lag_6,25%,0.008000,0.012379
Max_Gain_from_Open_Lag_6,50%,0.024327,0.031017
Max_Gain_from_Open_Lag_6,75%,0.046479,0.059850


Finished generating feature distribution analysis table for BMA.BA.
Correlacion con label para BMA.BA:
Label                         1.000000
high_lag_1                    0.103232
high_lag_2                    0.103039
open_lag_1                    0.103035
close_lag_1                   0.102994
close_lag_2                   0.102914
open_lag_2                    0.102767
SMA5                          0.102722
Volatility                    0.102646
high_lag_3                    0.102602
close_lag_3                   0.102529
open_lag_3                    0.102265
close_lag_4                   0.102110
high_lag_4                    0.102033
close_lag_5                   0.101911
high_lag_5                    0.101822
open_lag_4                    0.101793
SMA13                         0.101749
open_lag_5                    0.101375
SMA26                         0.101047
SMA50                         0.100193
SMA200                        0.093373
Max_Gain_from_Open_Lag_3      0.061427


[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para BMA.BA: 0.3200 (F1-score: 0.9521)

Evaluating best model on test set for BMA.BA with best threshold (0.3200):

Classification Report (Test Set) for BMA.BA:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        73
         1.0       0.86      1.00      0.92       445

    accuracy                           0.86       518
   macro avg       0.43      0.50      0.46       518
weighted avg       0.74      0.86      0.79       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para BMA.BA:
Label
1.0    445
0.0     73
Name: count, dtype: int64
% clase 1 test para BMA.BA: 0.8591 

ROC-AUC (Test Set) para BMA.BA: 0.4840

DEBUG: Zero values detected in last_features_cleaned for BMA.BA.

Downloading data for COME.BA...
MultiIndex columns detected for COME.BA.
Successfully extracted and flattened MultiIndex columns for COME.BA.
Últimas




Unnamed: 0,Label,0.0,1.0
RSI,count,2096.000000,2787.000000
RSI,mean,50.878266,53.236464
RSI,std,33.025019,30.029602
RSI,min,0.000000,0.000000
RSI,25%,29.411765,33.333333
...,...,...,...
Max_Gain_from_Open_Lag_6,min,0.000000,0.000000
Max_Gain_from_Open_Lag_6,25%,0.000000,0.000000
Max_Gain_from_Open_Lag_6,50%,0.016632,0.029851
Max_Gain_from_Open_Lag_6,75%,0.083333,0.071429


Finished generating feature distribution analysis table for COME.BA.
Correlacion con label para COME.BA:
Label                         1.000000
Volatility                    0.171419
high_lag_1                    0.171386
open_lag_1                    0.171181
close_lag_1                   0.171041
high_lag_2                    0.171028
high_lag_3                    0.170970
close_lag_2                   0.170963
SMA5                          0.170942
open_lag_2                    0.170849
high_lag_4                    0.170803
close_lag_3                   0.170715
open_lag_3                    0.170605
open_lag_4                    0.170575
close_lag_4                   0.170529
SMA13                         0.170464
high_lag_5                    0.170405
close_lag_5                   0.170390
open_lag_5                    0.169937
SMA26                         0.169821
SMA200                        0.169495
SMA50                         0.169389
Max_Gain_from_Open_Current    0.06704

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para COME.BA: 0.3600 (F1-score: 0.9520)

Evaluating best model on test set for COME.BA with best threshold (0.3600):

Classification Report (Test Set) for COME.BA:
              precision    recall  f1-score   support

         0.0       0.50      0.01      0.02        81
         1.0       0.84      1.00      0.92       437

    accuracy                           0.84       518
   macro avg       0.67      0.51      0.47       518
weighted avg       0.79      0.84      0.78       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para COME.BA:
Label
1.0    437
0.0     81
Name: count, dtype: int64
% clase 1 test para COME.BA: 0.8436 

ROC-AUC (Test Set) para COME.BA: 0.5479

Downloading data for CRES.BA...
MultiIndex columns detected for CRES.BA.
Successfully extracted and flattened MultiIndex columns for CRES.BA.
Últimas filas del DataFrame antes de crear features:
              





Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close  Volume  Adj Close  close_lag_1  \
Date                                                                         
2025-09-04  1300.0  1360.0  1265.0  1325.0  664275     1325.0       1305.0   
2025-09-05  1325.0  1355.0  1300.0  1335.0  503476     1335.0       1325.0   
2025-09-08  1270.0  1300.0  1217.0  1271.0  707563     1271.0       1335.0   
2025-09-09  1267.0  1301.0  1237.0  1239.0  381354     1239.0       1271.0   
2025-09-10  1270.0  1284.0  1228.0  1278.0  304007     1278.0       1239.0   

            close_lag_2  close_lag_3  close_lag_4  close_lag_5  open_lag_1  \
Date                                                                         
2025-09-04       1350.0       1325.0       1355.0       1395.0      1340.0   
2025-09-05       1305.0       1350.0       1325.0       1355.0      1300.0   
2025-09-08       1325.0       1305.0       1350.0       1325.0      1325.0   
2025-09

Unnamed: 0,Label,0.0,1.0
RSI,count,2246.000000,3670.000000
RSI,mean,52.933957,52.317664
RSI,std,26.445104,25.455665
RSI,min,0.000000,0.000000
RSI,25%,33.969156,33.333333
...,...,...,...
Max_Gain_from_Open_Lag_6,min,0.000000,0.000000
Max_Gain_from_Open_Lag_6,25%,0.003687,0.008828
Max_Gain_from_Open_Lag_6,50%,0.019540,0.027237
Max_Gain_from_Open_Lag_6,75%,0.042816,0.054197


Finished generating feature distribution analysis table for CRES.BA.
Correlacion con label para CRES.BA:
Label                         1.000000
Volatility                    0.160670
high_lag_2                    0.154254
high_lag_3                    0.154199
high_lag_4                    0.154151
high_lag_1                    0.153946
open_lag_3                    0.153906
open_lag_1                    0.153852
close_lag_2                   0.153846
open_lag_4                    0.153823
open_lag_2                    0.153796
close_lag_5                   0.153657
high_lag_5                    0.153645
SMA5                          0.153611
close_lag_4                   0.153597
close_lag_3                   0.153584
close_lag_1                   0.153364
open_lag_5                    0.153022
SMA13                         0.152940
SMA26                         0.152026
SMA50                         0.150828
SMA200                        0.142999
Max_Gain_from_Open_Current    0.12107

[*********************100%***********************]  1 of 1 completed

Mejor umbral para maximizar F1-score en entrenamiento (ORIGINAL distribution) para CRES.BA: 0.3100 (F1-score: 0.9543)

Evaluating best model on test set for CRES.BA with best threshold (0.3100):

Classification Report (Test Set) for CRES.BA:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        84
         1.0       0.84      1.00      0.91       434

    accuracy                           0.84       518
   macro avg       0.42      0.50      0.46       518
weighted avg       0.70      0.84      0.76       518

Tamaño de y_test (cleaned): 518
Distribución de clases en y_test (cleaned) para CRES.BA:
Label
1.0    434
0.0     84
Name: count, dtype: int64
% clase 1 test para CRES.BA: 0.8378 

ROC-AUC (Test Set) para CRES.BA: 0.4519

Downloading data for EDN.BA...





MultiIndex columns detected for EDN.BA.
Successfully extracted and flattened MultiIndex columns for EDN.BA.
Últimas filas del DataFrame antes de crear features:
              Open    High     Low   Close   Volume  Adj Close
Date                                                          
2025-09-08  1380.0  1380.0  1255.0  1258.0  1505729     1258.0
2025-09-09  1280.0  1322.0  1245.0  1259.0  1488830     1259.0
2025-09-10  1300.0  1358.0  1268.0  1351.0  1052553     1351.0
2025-09-11  1358.0  1388.0  1291.0  1297.0   769554     1297.0
2025-09-12  1310.0  1310.0  1210.0  1248.0   773007     1248.0

Últimas filas del DataFrame después de crear features:
              Open    High     Low   Close   Volume  Adj Close  close_lag_1  \
Date                                                                          
2025-09-04  1420.0  1495.0  1415.0  1480.0   759997     1480.0       1415.0   
2025-09-05  1450.0  1520.0  1435.0  1515.0   984492     1515.0       1480.0   
2025-09-08  1380.0  1380.0

Unnamed: 0,Label,0.0,1.0
RSI,count,1177.000000,3112.000000
RSI,mean,51.239581,52.243195
RSI,std,25.934395,25.488300
RSI,min,0.000000,0.000000
RSI,25%,31.343284,33.129098
...,...,...,...
Max_Gain_from_Open_Lag_6,min,0.000000,-0.014286
Max_Gain_from_Open_Lag_6,25%,0.009662,0.011791
Max_Gain_from_Open_Lag_6,50%,0.026316,0.031536
Max_Gain_from_Open_Lag_6,75%,0.055160,0.065789


Finished generating feature distribution analysis table for EDN.BA.
Correlacion con label para EDN.BA:
Label                         1.000000
Volatility                    0.092790
high_lag_2                    0.092041
high_lag_1                    0.091952
open_lag_2                    0.091774
close_lag_2                   0.091624
close_lag_1                   0.091549
open_lag_1                    0.091467
close_lag_3                   0.091342
SMA5                          0.091264
high_lag_3                    0.091061
SMA13                         0.090891
open_lag_3                    0.090391
close_lag_4                   0.090370
SMA26                         0.090336
open_lag_5                    0.090291
close_lag_5                   0.090262
high_lag_5                    0.090148
high_lag_4                    0.090137
open_lag_4                    0.090055
SMA50                         0.089287
SMA200                        0.087495
Max_Gain_from_Open_Current    0.072709


KeyboardInterrupt: 