In [222]:
import joblib
import json
import os
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import TimeSeriesSplit
import yfinance
import ta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score, f1_score, cohen_kappa_score
from sklearn.preprocessing import MinMaxScaler

In [223]:
# Download price bars and derive calendar / candle-colour features
def get_stock_data(ticker, period, interval):
    """Download OHLCV bars with yfinance and add date and candle-trend columns.

    Args:
        ticker: Symbol forwarded to ``yfinance.download``.
        period: Look-back period forwarded to ``yfinance.download``.
        interval: Bar size forwarded to ``yfinance.download``.

    Returns:
        DataFrame with the downloaded columns plus ``Date``, ``Day``, ``Month``,
        ``Hour``, ``Year``, ``Dia_de_la_Semana``, ``Dia_del_Año``,
        ``Volumen_Color_Num``, ``Porcentaje_Velas_Verdes`` and
        ``Tendencia_Semanal``.
    """
    data = yfinance.download(ticker, period=period, interval=interval)

    # Flatten the (field, ticker) column MultiIndex only when there is one.
    # An unconditional droplevel(1) raises on flat columns.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)

    # Intraday downloads name the index 'Datetime'; normalise it to 'Date'.
    data = data.reset_index().rename(columns={'Datetime': 'Date'})

    # Convert and extract the date features
    data['Date'] = pd.to_datetime(data['Date'])
    data['Day'] = data['Date'].dt.day
    data['Month'] = data['Date'].dt.month
    data['Hour'] = data['Date'].dt.hour
    data['Year'] = data['Date'].dt.year
    data['Dia_de_la_Semana'] = data['Date'].dt.weekday
    data['Dia_del_Año'] = data['Date'].dt.dayofyear

    # 1 when the close is above the open (green candle), 0 otherwise (red)
    data["Volumen_Color_Num"] = (data["Close"] > data["Open"]).astype(int)

    # Rolling window of 70 bars. The rest of the notebook treats 35 bars as
    # one week and 70 bars as two weeks, so despite its name this window
    # spans roughly two weeks of hourly bars.
    ventana_semanal = 70

    # Share of green candles inside the rolling window
    data["Porcentaje_Velas_Verdes"] = (
        data["Volumen_Color_Num"].rolling(window=ventana_semanal, min_periods=1).mean()
    )

    # Trend category: 1 = neutral (default), 2 = bullish, 0 = bearish
    data["Tendencia_Semanal"] = 1
    data.loc[data["Porcentaje_Velas_Verdes"] > 0.65, "Tendencia_Semanal"] = 2
    data.loc[data["Porcentaje_Velas_Verdes"] < 0.35, "Tendencia_Semanal"] = 0

    return data

In [224]:
# Pipeline that derives the technical indicators
def create_technical_indicators_pipeline():
    """Build a one-step Pipeline that appends technical-indicator columns.

    The step wraps ``calculate_indicators`` in a ``FunctionTransformer``; the
    transform works on a copy of its input and reads the ``Close``, ``High``,
    ``Low`` and ``Volume`` columns.
    """
    def calculate_indicators(df):
        # Work on a copy so the caller's frame is left untouched
        data = df.copy()
        close = data['Close']
        high = data['High']
        low = data['Low']
        volume = data['Volume']

        # Exponential and simple moving averages of the close
        for span in (12, 26, 20, 50):
            data[f'EMA{span}'] = close.ewm(span=span, adjust=False).mean()
        for width in (50, 20):
            data[f'SMA{width}'] = close.rolling(window=width).mean()

        # Bollinger bands: 20-bar mean +/- two 20-bar standard deviations
        rolling_close_20 = close.rolling(window=20)
        band_centre = rolling_close_20.mean()
        band_spread = rolling_close_20.std()
        data['BB_upper'] = band_centre + 2 * band_spread
        data['BB_lower'] = band_centre - 2 * band_spread
        data['BBW'] = (data['BB_upper'] - data['BB_lower']) / close

        # Volume features
        data['Volume_MA'] = volume.rolling(window=20).mean()
        data['Vol_Ratio_10h'] = volume / volume.rolling(window=10).mean()

        # Percentage returns over 1 bar and over 5 bars
        data['Hourly_Return'] = close.pct_change() * 100
        data['Cumulative_Return'] = close.pct_change(periods=5) * 100

        # MACD line and its 9-span EMA signal
        macd_line = data['EMA12'] - data['EMA26']
        data['MACD'] = macd_line
        data['MACD_Signal'] = macd_line.ewm(span=9, adjust=False).mean()

        # Indicators delegated to the `ta` package
        data['RSI'] = ta.momentum.rsi(close, window=14)
        data['ATR'] = ta.volatility.average_true_range(high, low, close, window=14)
        data['ADX'] = ta.trend.adx(high, low, close, window=14)
        data['Stoch_K'] = ta.momentum.stoch(high, low, close, window=14, smooth_window=3)
        data['Stoch_D'] = ta.momentum.stoch_signal(high, low, close, window=14, smooth_window=3)
        data['MOM'] = ta.momentum.roc(close, window=10)
        data['OBV'] = ta.volume.on_balance_volume(close, volume)
        data['CMF'] = ta.volume.chaikin_money_flow(high, low, close, volume, window=20)

        return data

    indicator_step = FunctionTransformer(calculate_indicators)
    return Pipeline([('technical_indicators', indicator_step)])

In [225]:
def classify_change(percentage, percentiles):
    """Map a percentage change to an ordinal class using four cut points.

    Args:
        percentage: The percentage change to classify.
        percentiles: Indexable with four ascending cut points
            (the callers pass the 15th/40th/58th/80th percentiles).

    Returns:
        0 (very strong sell), 1 (sell), 2 (neutral), 3 (buy) or 4 (above the
        last cut point; presumably "strong buy"). Missing input returns NaN
        instead of a label: NaN fails every ``<=`` comparison, so it used to
        fall through to class 4 and mislabel rows that have no data.
    """
    if pd.isna(percentage):
        return np.nan

    if percentage <= percentiles[0]:
        return 0  # Very strong sell
    elif percentage <= percentiles[1]:
        return 1  # Sell
    elif percentage <= percentiles[2]:
        return 2  # Neutral
    elif percentage <= percentiles[3]:
        return 3  # Buy
    else:
        return 4  # Above the highest cut point

In [226]:
def create_2w_indicators_pipeline():
    """Build a one-step Pipeline that adds past/future change columns and classes.

    The transform works on a copy of its input, reads ``Close`` and relies on
    the module-level ``classify_change`` helper.
    """
    percentile_levels = [15, 40, 58, 80]

    def create_2w_indicators(df):
        data = df.copy()
        close = data['Close']

        # Percentage change versus the close `lag` bars earlier
        for suffix, lag in (('1d', 7), ('2d', 14), ('1w', 35), ('2w', 70)):
            earlier_close = close.shift(lag)
            data[f'Past_Change_{suffix}'] = (close - earlier_close) / earlier_close * 100

        # Percentage change towards the close 70 bars ahead
        data['Future_Change_2w'] = (close.shift(-70) - close) / close * 100

        # Cut points are taken over the whole column (NaNs excluded)
        cut_points = {
            column: np.percentile(data[column].dropna(), percentile_levels)
            for column in ('Past_Change_1w', 'Past_Change_2w', 'Future_Change_2w')
        }

        # Ordinal class per change column
        for change_column, class_column in (
            ('Past_Change_1w', 'Past_Class_1w'),
            ('Past_Change_2w', 'Past_Class_2w'),
            ('Future_Change_2w', 'Future_Class_2w'),
        ):
            limits = cut_points[change_column]
            data[class_column] = data[change_column].apply(
                lambda value, limits=limits: classify_change(value, limits)
            )

        # Flags for a past two-week change at or below the lower cut points
        past_2w = data['Past_Change_2w']
        limits_2w = cut_points['Past_Change_2w']
        data['Volatility_Spike_VH'] = (past_2w <= limits_2w[0]).astype(int)
        data['Volatility_Spike_H'] = (past_2w <= limits_2w[1]).astype(int)
        # NOTE(review): uses the same cut point as Volatility_Spike_H, so the
        # two columns are identical. Confirm which threshold was intended.
        data['Volatility_Spike_I'] = (past_2w <= limits_2w[1]).astype(int)
        return data

    past_step = FunctionTransformer(create_2w_indicators)
    return Pipeline([('indicators_past', past_step)])


In [227]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Builds the temporal-difference features (RSI, ADX and volume average)
def create_temporal_features_pipeline(horizon):
    """Build a one-step Pipeline that adds lagged differences of indicators.

    Args:
        horizon: ``'2m'`` adds the longer-lag columns on top of the base set;
            any other value yields only the base one-week / two-week columns.

    The transform works on a copy of its input and reads the ``RSI``, ``ADX``
    and ``Volume_MA`` columns.
    """
    base_specs = (
        ('RSI', 35, '1w'),
        ('ADX', 35, '1w'),
        ('Volume_MA', 35, '1w'),
        ('RSI', 70, '2w'),
        ('ADX', 70, '2w'),
    )
    two_month_specs = (
        ('RSI', 280, '2m'),
        ('ADX', 280, '2m'),
        ('Volume_MA', 280, '2m'),
        ('Volume_MA', 70, '2w'),
        ('RSI', 140, '1m'),
        ('ADX', 140, '1m'),
        ('Volume_MA', 140, '1m'),
    )

    def add_temporal_features(df):
        data = df.copy()

        specs = base_specs + two_month_specs if horizon == '2m' else base_specs
        for source, lag, label in specs:
            # Current value minus the value `lag` bars earlier
            series = data[source]
            data[f'{source}_change_{label}'] = series - series.shift(lag)

        return data

    temporal_step = FunctionTransformer(add_temporal_features)
    return Pipeline([('temporal_features', temporal_step)])


In [228]:
def calculate_changes(df, horizon):
    """Add past/future percentage-change columns and their classes to ``df``.

    The frame is modified in place and also returned. ``horizon == '2w'``
    adds ``Future_Class_2w``; ``horizon == '2m'`` adds the one- and two-month
    columns instead; any other value returns after the common columns.
    """
    levels = [15, 40, 58, 80]
    close = df['Close']

    def change_since(lag):
        # Percentage change versus the close `lag` bars earlier
        earlier_close = close.shift(lag)
        return (close - earlier_close) / earlier_close * 100

    def change_until(lead):
        # Percentage change towards the close `lead` bars ahead
        return (close.shift(-lead) - close) / close * 100

    def cut_points_of(column):
        return np.percentile(df[column].dropna(), levels)

    def label(column, limits):
        return df[column].apply(lambda value: classify_change(value, limits))

    # Columns shared by every horizon
    for suffix, lag in (('1d', 7), ('2d', 14), ('1w', 35), ('2w', 70)):
        df[f'Past_Change_{suffix}'] = change_since(lag)
    df['Future_Change_2w'] = change_until(70)

    limits_1w = cut_points_of('Past_Change_1w')
    limits_2w = cut_points_of('Past_Change_2w')
    df['Past_Class_1w'] = label('Past_Change_1w', limits_1w)
    df['Past_Class_2w'] = label('Past_Change_2w', limits_2w)

    if horizon == '2w':
        limits_future_2w = cut_points_of('Future_Change_2w')
        df['Future_Class_2w'] = label('Future_Change_2w', limits_future_2w)
        return df

    if horizon == '2m':
        df['Past_Change_1m'] = change_since(140)
        df['Past_Change_2m'] = change_since(280)

        df['Future_Change_2m'] = change_until(280)
        # Same formula as the assignment above; kept to mirror the original flow
        df['Future_Change_2w'] = change_until(70)

        limits_1m = cut_points_of('Past_Change_1m')
        limits_2m = cut_points_of('Past_Change_2m')
        limits_future_2m = cut_points_of('Future_Change_2m')

        df['Past_Class_1m'] = label('Past_Change_1m', limits_1m)
        df['Past_Class_2m'] = label('Past_Change_2m', limits_2m)
        df['Future_Class_2m'] = label('Future_Change_2m', limits_future_2m)

        past_2m = df['Past_Change_2m']
        df['Volatility_Spike_VH_m'] = (past_2m <= limits_2m[0]).astype(int)
        df['Volatility_Spike_H_m'] = (past_2m <= limits_2m[1]).astype(int)
        # NOTE(review): same cut point as Volatility_Spike_H_m, so the two
        # columns are identical. Confirm which threshold was intended.
        df['Volatility_Spike_I_m'] = (past_2m <= limits_2m[1]).astype(int)

    return df

In [229]:
def mark_previous_hours(data, column_name, percentiles, horas_a_marcar=84):
    """Flag the rows that precede a strong move in ``column_name``.

    A row at position ``i >= horas_a_marcar`` is an "event" when its value
    crosses one of the cut points; the ``horas_a_marcar`` rows immediately
    before each event are flagged with 1 (the event row itself is not).
    Rows in the first ``horas_a_marcar`` positions never count as events.
    NaN values never trigger an event.

    Args:
        data: Frame to annotate. It is modified in place and returned.
        column_name: Column holding the change values to test.
        percentiles: Four cut points; index 0 and 1 are the sell thresholds
            (``<=``), index 3 and 2 the buy thresholds (``>=``).
        horas_a_marcar: Number of preceding rows to flag per event.

    Returns:
        ``data`` with the integer columns ``previousd_strongsell``,
        ``previousd_sell``, ``previousd_strongbuy`` and ``previousd_buy``.

    NOTE(review): each flag looks at rows *after* the flagged one, so these
    columns carry future information when used as model features.
    """
    values = data[column_name].to_numpy(dtype=float)
    row_count = len(values)

    # NaN comparisons are False by definition; silence the warning some
    # numpy versions emit for them.
    with np.errstate(invalid='ignore'):
        events_by_flag = {
            'previousd_strongsell': values <= percentiles[0],
            'previousd_sell': values <= percentiles[1],
            'previousd_strongbuy': values >= percentiles[3],
            'previousd_buy': values >= percentiles[2],
        }

    # Row j is flagged when any event lies in positions j+1 .. j+horas_a_marcar.
    # With running[k] = number of events in positions [0, k), that count is
    # running[window_end] - running[window_start].
    positions = np.arange(row_count)
    window_start = np.minimum(positions + 1, row_count)
    window_end = np.minimum(positions + horas_a_marcar + 1, row_count)

    for flag_name, is_event in events_by_flag.items():
        is_event = is_event.copy()
        # The scan starts at position horas_a_marcar, so earlier rows are
        # never treated as events.
        is_event[:horas_a_marcar] = False
        running = np.concatenate(([0], np.cumsum(is_event)))
        events_ahead = running[window_end] - running[window_start]
        data[flag_name] = (events_ahead > 0).astype('int64')

    return data

In [230]:
def important_features(df, horizon, threshold=0.03):
    """Drop columns weakly correlated with the target, then drop NaN rows.

    Args:
        df: Input frame; it is copied, not modified.
        horizon: Suffix of the target column ``Future_Class_{horizon}``.
        threshold: Columns whose correlation with the target lies within
            ``[-threshold, threshold]`` are removed. Defaults to 0.03.

    Returns:
        A new frame without the weakly correlated columns and without rows
        containing NaN. ``Date``, ``Close`` and the target are always kept.

    Raises:
        KeyError: If the target column is missing or not numeric.
    """
    target_col = f'Future_Class_{horizon}'
    data = df.copy()

    # Correlate numeric columns only: the frame also carries the datetime
    # 'Date' column, and how corr() treats it depends on the pandas version.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    if target_col not in numeric_data.columns:
        raise KeyError(f"Numeric target column '{target_col}' not found")
    target_corr = numeric_data.corr()[target_col]

    # Columns with a NaN correlation (e.g. constant columns) are not "weak"
    # under between() and are therefore kept.
    weak_columns = target_corr.index[target_corr.between(-threshold, threshold)]
    columns_to_drop = weak_columns.difference(["Date", "Close", target_col])

    return data.drop(columns=columns_to_drop).dropna()

In [231]:
def create_data_processing_pipeline(horizon):
    """Chain the three feature-engineering pipelines into one Pipeline.

    Args:
        horizon: Forwarded to ``create_temporal_features_pipeline`` only.
    """
    stages = []
    stages.append(('technical_indicators', create_technical_indicators_pipeline()))
    stages.append(('indicators_past', create_2w_indicators_pipeline()))
    stages.append(('temporal_features', create_temporal_features_pipeline(horizon)))
    return Pipeline(stages)

In [232]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test, model_configs):
    """Fit every configured model on the training split and score it on the test split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        model_configs: Mapping of model name to a dict with a ``'constructor'``
            callable and a ``'params'`` dict of keyword arguments for it.

    Returns:
        ``(results, models)``: per-model metric dicts and the fitted
        scaler+model pipelines. A model that raises during construction,
        fitting or scoring is reported on stdout and left out of both dicts.
    """
    def fit_and_score(spec):
        # Min-max scaling followed by the configured estimator
        estimator = Pipeline([
            ('scaler', MinMaxScaler(feature_range=(0, 1))),
            ('model', spec['constructor'](**spec['params'])),
        ])
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        # Only JSON-serialisable values go into the summary
        summary = {
            'accuracy': accuracy_score(y_test, predicted),
            'f1_score': f1_score(y_test, predicted, average='weighted'),
            'kappa': cohen_kappa_score(y_test, predicted),
            'predictions': predicted.tolist(),
            'classification_report': classification_report(y_test, predicted, output_dict=True),
        }
        return summary, estimator

    results, models = {}, {}
    for name, config in model_configs.items():
        try:
            summary, fitted = fit_and_score(config)
        except Exception as e:
            # Best effort: report the failure and move on to the next model
            print(f"Error entrenando {name}: {str(e)}")
        else:
            results[name] = summary
            models[name] = fitted

    return results, models

In [233]:
# Model configuration.
# Each entry maps a model name to the estimator class ('constructor') and the
# keyword arguments ('params') that train_and_evaluate_models passes to it.
# Every model uses random_state 42 and 500 estimators.
MODEL_CONFIGS = {
    # Random forest
    'RandomForest': {
        'constructor': RandomForestClassifier,
        'params': {
            'n_estimators': 500,
            'max_depth': 10,
            'min_samples_split': 10,
            'min_samples_leaf': 5,
            'max_features': 'sqrt',
            'random_state': 42,
            'n_jobs': -1
        }
    },
    # XGBoost classifier
    'XGBoost': {
        'constructor': xgb.XGBClassifier,
        'params': {
            'eval_metric': 'mlogloss',
            'n_estimators': 500,
            'learning_rate': 0.01,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'gamma': 0.1,
            'min_child_weight': 5,
            'random_state': 42,
            'n_jobs': -1
        }
    },
    # scikit-learn gradient boosting
    'GradientBoosting': {
        'constructor': GradientBoostingClassifier,
        'params': {
            'n_estimators': 500,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_samples_split': 10,
            'min_samples_leaf': 5,
            'subsample': 0.8,
            'random_state': 42
        }
    },
    # LightGBM classifier
    'LightGBM': {
        'constructor': lgb.LGBMClassifier,
        'params': {
            'objective': 'multiclass',
            'num_class': 5,  # classify_change emits the five labels 0-4
            'n_estimators': 500,
            'learning_rate': 0.01,
            'max_depth': -1,
            'num_leaves': 31,
            'min_child_samples': 20,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }
    }
}

In [234]:
# Main entry point that runs the complete workflow
def run_analysis(ticker, period, interval, horizon='2w'):
    """Download data, build features, and train/evaluate every model per fold.

    Args:
        ticker: Symbol to download.
        period: Look-back period for the download.
        interval: Bar size for the download.
        horizon: ``'2w'`` (70-bar target) or ``'2m'`` (280-bar target).

    Returns:
        ``(results, models)``. ``results[name]`` holds the metrics of the
        latest fold in which that model trained successfully, plus
        ``mean_accuracy``, ``mean_f1`` and ``mean_kappa`` averaged over all
        successful folds. ``models[name]`` is the matching fitted pipeline.

    Raises:
        ValueError: If ``horizon`` is not ``'2w'`` or ``'2m'``.

    Side effects:
        Creates ``models/`` and ``metrics/`` if needed and writes one pickle
        per model and one JSON metrics file per fold.

    NOTE(review): the percentile cut points, the correlation filter and the
    ``previousd_*`` flags are all computed on the full dataset before the
    split, so some look-ahead remains in the features.
    """
    horizon_windows = {'2w': 70, '2m': 280}
    if horizon not in horizon_windows:
        raise ValueError(
            f"Unsupported horizon {horizon!r}; expected one of {sorted(horizon_windows)}"
        )
    # Number of bars the target looks ahead
    window = horizon_windows[horizon]

    # 1. Fetch data
    data = get_stock_data(ticker, period, interval)
    percentiles_2w = np.percentile(data['Close'].pct_change(70).dropna() * 100, [15, 40, 58, 80])

    # 2. Build features
    processing_pipeline = create_data_processing_pipeline(horizon)
    processed_data = processing_pipeline.fit_transform(data)

    processed_data = calculate_changes(processed_data, horizon)
    print(processed_data.columns)
    print("--------------------------------------------------------------------------")
    processed_data = mark_previous_hours(processed_data, 'Past_Change_2w', percentiles_2w, horas_a_marcar=84)
    processed_data = important_features(processed_data, horizon)

    # 3. Split into features and target.
    # Every 'Future_*' column is derived from prices after the current bar.
    # All of them are removed, not just the ones for this horizon, otherwise
    # Future_Class_2w stays in the 2m feature matrix and leaks the outcome.
    target_col = f'Future_Class_{horizon}'
    future_columns = [col for col in processed_data.columns if str(col).startswith('Future_')]
    features = processed_data.drop(columns=['Date', 'Close'] + future_columns, errors='ignore')
    target = processed_data[target_col]

    # Output locations used below; create them so the function works standalone
    os.makedirs("models", exist_ok=True)
    os.makedirs("metrics", exist_ok=True)

    # 4. Time-ordered validation. The label of a training row depends on the
    # close `window` bars later, so a gap of `window` rows keeps training
    # labels from overlapping the test period.
    tscv = TimeSeriesSplit(n_splits=3, gap=window)
    results, models = {}, {}
    scores_by_model = {}
    for fold, (train_index, test_index) in enumerate(tscv.split(features), start=1):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        # 5. Train and evaluate
        fold_results, fold_models = train_and_evaluate_models(
            X_train, y_train, X_test, y_test, MODEL_CONFIGS
        )

        # 6. Persist models and metrics; the fold number keeps file names
        # unique even when two folds finish within the same second.
        timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
        for name, model in fold_models.items():
            model_filename = f"models/{ticker}_{horizon}_{name}_{timestamp}_fold{fold}.pkl"
            joblib.dump(model, model_filename)

        metrics_filename = f"metrics/{ticker}_{horizon}_metrics_{timestamp}_fold{fold}.json"
        with open(metrics_filename, 'w') as f:
            json.dump(fold_results, f)

        for name, fold_result in fold_results.items():
            scores_by_model.setdefault(name, []).append(
                (fold_result['accuracy'], fold_result['f1_score'], fold_result['kappa'])
            )
        results.update(fold_results)
        models.update(fold_models)

    # 7. Average the per-fold metrics of each model
    for name, fold_scores in scores_by_model.items():
        accuracies, f1_scores, kappas = zip(*fold_scores)
        results[name] = dict(
            results[name],
            mean_accuracy=float(np.mean(accuracies)),
            mean_f1=float(np.mean(f1_scores)),
            mean_kappa=float(np.mean(kappas)),
        )

    return results, models


In [None]:
if __name__ == "__main__":
    ticker = "AMZN"
    period = "730d"
    interval = "1h"

    # Create the output directories if they do not exist. run_analysis writes
    # to "models/" and "metrics/"; "results/" is kept from the original setup.
    for directory in ("models", "metrics", "results"):
        os.makedirs(directory, exist_ok=True)

    # Two-week analysis
    print("Ejecutando análisis para 2 semanas...")
    results_2w, models_2w = run_analysis(ticker, period, interval, '2w')

    # Two-month analysis
    print("\nEjecutando análisis para 2 meses...")
    results_2m, models_2m = run_analysis(ticker, period, interval, '2m')

    # Print the metrics of both runs. The averaged keys are used when
    # run_analysis provides them; otherwise the per-fold keys produced by
    # train_and_evaluate_models are printed instead of raising KeyError.
    for label, horizon_results in (("2 semanas", results_2w), ("2 meses", results_2m)):
        print(f"\n🔹 Resultados promediados para {label}:")
        for name, res in horizon_results.items():
            print(f"{name}:")
            print(f"  Accuracy: {res.get('mean_accuracy', res['accuracy']):.4f}")
            print(f"  F1: {res.get('mean_f1', res['f1_score']):.4f}")
            print(f"  Kappa: {res.get('mean_kappa', res['kappa']):.4f}")

Ejecutando análisis para 2 semanas...


[*********************100%***********************]  1 of 1 completed


Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'Day', 'Month',
       'Hour', 'Year', 'Dia_de_la_Semana', 'Dia_del_Año', 'Volumen_Color_Num',
       'Porcentaje_Velas_Verdes', 'Tendencia_Semanal', 'EMA12', 'EMA26',
       'EMA20', 'EMA50', 'SMA50', 'SMA20', 'BB_upper', 'BB_lower', 'BBW',
       'Volume_MA', 'Vol_Ratio_10h', 'Hourly_Return', 'Cumulative_Return',
       'MACD', 'MACD_Signal', 'RSI', 'ATR', 'ADX', 'Stoch_K', 'Stoch_D', 'MOM',
       'OBV', 'CMF', 'Past_Change_1d', 'Past_Change_2d', 'Past_Change_1w',
       'Past_Change_2w', 'Future_Change_2w', 'Past_Class_1w', 'Past_Class_2w',
       'Future_Class_2w', 'Volatility_Spike_VH', 'Volatility_Spike_H',
       'Volatility_Spike_I', 'RSI_change_1w', 'ADX_change_1w',
       'Volume_MA_change_1w', 'RSI_change_2w', 'ADX_change_2w'],
      dtype='object', name='Price')
--------------------------------------------------------------------------
