In [2]:
!pip -q install lightgbm pyarrow fastparquet tqdm_joblib imbalanced-learn ta

In [3]:
!pip -q install lightgbm pyarrow fastparquet tqdm imbalanced-learn ta

import numpy as np
import pandas as pd
import yfinance as yf
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import ta  # technical analysis
from google.colab import drive

# ============================
# General configuration
# ============================
START_DATE = "2005-01-01"

# Allow back-filling gaps in FEATURES (useful when trading calendars differ).
# Back-fill copies later values into earlier rows, so it can introduce leakage
# ("when unavoidable").
ALLOW_BFILL_EXOGENOUS = True

# Lag applied to FEATURES, in rows (1 avoids trivial leakage; 0 allows more leakage).
SHIFT_FEATURES = 0

# Moving-average windows to use (ALL crossovers with slow > fast are kept)
AVERAGES = [1, 2, 5, 10, 15, 20, 25, 50, 100]

# Horizon used when computing targets
# NOTE(review): the per-ticker loop further down passes up_thr=0.20 / dd_thr=-0.05
# as literals; UP_THR / DD_THR are not referenced in the visible part of the
# notebook — confirm which thresholds are intended.
HORIZON = 90
UP_THR = 0.30   # +30%
DD_THR = -0.10  # -10%


SAVE_PARQUET = True
SAVE_CSV_FALLBACK = False
OUTPUT_PATH = "drive/My Drive/Colab Notebooks/stock/expanded_stock.parquet"
drive.mount('/content/drive')

# ============================
# Ticker lists
# ============================
# Symbols are passed verbatim to yf.download() below; the four lists are simply
# concatenated into ALL_TICKERS.
ibovespa_tickers = [
    'ABEV3.SA', 'B3SA3.SA', 'BBAS3.SA', 'BBDC4.SA', 'BBSE3.SA', 'BRFS3.SA', 'BRKM5.SA', 'CCRO3.SA',
    'CIEL3.SA', 'CMIG4.SA', 'CSAN3.SA', 'CSNA3.SA', 'CVCB3.SA', 'CYRE3.SA', 'ECOR3.SA', 'EGIE3.SA', 'ELET3.SA', 'EMBR3.SA',
    'ENGI11.SA', 'EQTL3.SA', 'EVEN3.SA', 'FIBR3.SA', 'GGBR4.SA', 'HAPV3.SA', 'ITUB4.SA', 'JBSS3.SA',
    'JHSF3.SA', 'LAME4.SA', 'LOGG3.SA', 'LREN3.SA', 'MULT3.SA', 'NATU3.SA', 'MRFG3.SA', 'MOVI3.SA',
    'MYPK3.SA', 'MDIA3.SA', 'IRBR3.SA', 'NTCO3.SA', 'PETR3.SA', 'PETR4.SA', 'PRIO3.SA', 'RADL3.SA',
    'RAIL3.SA', 'RENT3.SA', 'RAIZ4.SA', 'SBSP3.SA', 'SANB3.SA', 'SAPR3.SA', 'SUZB3.SA', 'TCSA3.SA',
    'VIVA3.SA', 'AZUL4.SA', 'GOLL4.SA', 'WEGE3.SA','BBDC3.SA', 'VVAR3.SA', 'BEEF3.SA', 'CESP6.SA',
    'USIM5.SA', 'VALE3.SA', 'POMO4.SA', 'LEVE3.SA', 'TUPY3.SA', 'RAPT4.SA', 'ROMI3.SA'
]

fii_tickers = [
    'MXRF11.SA','HGLG11.SA','KNRI11.SA','VISC11.SA','XPLG11.SA','VILG11.SA','BTLG11.SA',
    'BRCO11.SA','GGRC11.SA','LVBI11.SA','XPML11.SA','HSML11.SA',
    'BRCR11.SA','HGRE11.SA','PVBI11.SA','RCRB11.SA','VINO11.SA',
    'ALZR11.SA','TRXF11.SA','RBVA11.SA','RBRP11.SA',
    'KNCR11.SA','KNHY11.SA','KNSC11.SA','CPTS11.SA','HCTR11.SA','IRDM11.SA','URPR11.SA',
    'OUJP11.SA','VRTA11.SA','HGCR11.SA','DEVA11.SA','RBRR11.SA',
    'HFOF11.SA','KFOF11.SA','XPSF11.SA','RBRF11.SA','VGHF11.SA',
]

global_indices = [
    '^GSPC', '^DJI', '^IXIC', '^FTSE', '^FCHI', '^GDAXI', '^N225', '^HSI', '^AXJO', '^BSESN', '^SSE', '^JKSE', '^BVSP'
]

currency_commodity_tickers = [
    'UUP','FXE','FXY','GLD','USO',
    'EURUSD=X','GBPUSD=X','CNYUSD=X','AUDUSD=X','CHFUSD=X','BRLUSD=X','MXNUSD=X',
    'BTC-USD','ETH-USD','DOGE-USD','LTC-USD','SOL-USD',
    'CL=F','GC=F','NG=F','HG=F','ZC=F','HE=F','ZW=F','S=F','BZ=F',
    'XAUEUR','XAGEUR','COPX','MGC=F','HO=F'
]

# Full download universe, in list order (equities, FIIs, indices, FX/commodities/crypto).
ALL_TICKERS = ibovespa_tickers + fii_tickers + global_indices + currency_commodity_tickers

# ============================
# Utilidades de preenchimento/casting
# ============================
def fill_100pct(df: pd.DataFrame, allow_bfill=True) -> pd.DataFrame:
    """Return a fully populated copy of ``df`` (no NaN, no +/-Inf).

    Steps, in order: +/-Inf become NaN, gaps are forward-filled, optionally
    back-filled (``allow_bfill``), and whatever is still missing is set to 0.
    The input frame is not modified.
    """
    cleaned = df.replace([np.inf, -np.inf], np.nan).ffill()
    if allow_bfill:
        cleaned = cleaned.bfill()

    # Columns with no valid value at all (can happen for pattern columns): zero them.
    empty_cols = cleaned.columns[cleaned.isna().all()].tolist()
    if empty_cols:
        cleaned[empty_cols] = 0.0

    # Any NaN that survived (e.g. leading gaps when back-fill is off) becomes 0.
    return cleaned.fillna(0.0)

def cast_int8_multi(df: pd.DataFrame, prefixes=(), suffixes=()):
    """Cast selected columns of a MultiIndex-column frame to ``int8``.

    A column is selected when its level-0 label (as a string) starts with one
    of ``prefixes`` or ends with one of ``suffixes``.

    Parameters
    ----------
    df : DataFrame whose columns are a ``pd.MultiIndex``. Frames with flat
        columns are returned unchanged (same object).
    prefixes, suffixes : str or iterable of str. Empty means "no match".

    Returns
    -------
    DataFrame — ``df`` itself when nothing matches, otherwise a new frame in
    which the matching columns have dtype ``int8``. ``df`` is never mutated.

    Raises
    ------
    Whatever ``astype('int8')`` raises, e.g. when a selected column holds NaN.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df

    def _as_tuple(patterns):
        # str.startswith/endswith accept a str or a tuple of str.
        return (patterns,) if isinstance(patterns, str) else tuple(patterns)

    level0 = pd.Index(df.columns.get_level_values(0).astype(str))
    mask = np.zeros(len(level0), dtype=bool)

    if prefixes:
        mask |= np.asarray(level0.str.startswith(_as_tuple(prefixes)), dtype=bool)
    if suffixes:
        mask |= np.asarray(level0.str.endswith(_as_tuple(suffixes)), dtype=bool)

    if not mask.any():
        return df

    # Use astype() rather than ``df.loc[:, cols] = ...``: on pandas >= 2.0 the
    # .loc assignment writes the values into the existing float blocks, so the
    # dtype would silently stay unchanged.
    return df.astype({col: 'int8' for col in df.columns[mask]})

# ============================
# Data download
# ============================
print("Baixando cotações do Yahoo Finance...")
data = yf.download(
    ALL_TICKERS,
    start=START_DATE,
    group_by='column',
    auto_adjust=True,
    progress=False,
    threads=True
)

# Keep only the relevant OHLCV fields and drop unused MultiIndex levels.
# The code below treats column level 0 as the field name and level 1 as the ticker.
allowed_columns = ['Open','High','Low','Close','Adj Close','Volume']
data = data.loc[:, data.columns.get_level_values(0).isin(allowed_columns)].copy()
data.columns = data.columns.remove_unused_levels()

# Forward-fill to align calendars (back-fill is applied only to FEATURES later on)
data = data.ffill()

# Tickers actually present in the downloaded frame
tickers = np.unique(data.columns.get_level_values(1))
print(f"Período: {data.index.min().date()} → {data.index.max().date()}")
print(f"Tickers com dados: {len(tickers)}")

# ============================
# Funções de padrões (com prefixos)
# ============================
def detect_head_shoulder(df, window=3, prefix="hs_"):
    """Flag rows whose High (or Low) dips below (or pokes above) both neighbours.

    Output columns (all prefixed with ``prefix``):
      - ``high_roll_max`` / ``low_roll_min``: rolling max of High / min of Low
        over ``window`` rows.
      - ``pattern``: 1 when the rolling max exceeds the previous and next High
        while the current High is below both; -1 for the mirrored condition on
        Low; 0 otherwise. -1 wins if both conditions hold.

    NOTE: the conditions use ``shift(-1)``, i.e. the *next* row, so the flag at
    row t depends on data from t+1 (look-ahead).
    """
    highs, lows = df['High'], df['Low']
    prev_high, next_high = highs.shift(1), highs.shift(-1)
    prev_low, next_low = lows.shift(1), lows.shift(-1)

    roll_max = highs.rolling(window).max()
    roll_min = lows.rolling(window).min()

    top_like = (
        (roll_max > prev_high) & (roll_max > next_high)
        & (highs < prev_high) & (highs < next_high)
    )
    bottom_like = (
        (roll_min < prev_low) & (roll_min < next_low)
        & (lows > prev_low) & (lows > next_low)
    )

    result = pd.DataFrame(index=df.index)
    result[prefix + 'high_roll_max'] = roll_max
    result[prefix + 'low_roll_min'] = roll_min
    # Inner np.where handles the +1 case; the outer one lets -1 take precedence.
    flags = np.where(bottom_like.to_numpy(), -1, np.where(top_like.to_numpy(), 1, 0))
    result[prefix + 'pattern'] = flags.astype('int64')
    return result

def detect_multiple_tops_bottoms(df, window=3, prefix="mtb_"):
    """Rolling High/Low/Close extremes plus a top/bottom flag.

    Output columns (prefixed with ``prefix``): ``high_roll_max``,
    ``low_roll_min``, ``close_roll_max``, ``close_roll_min`` and ``pattern``.
    ``pattern`` is 1 when the rolling High max is >= the previous High and the
    rolling Close max is < the previous Close; -1 when the rolling Low min is
    <= the previous Low and the rolling Close min is > the previous Close;
    0 otherwise (-1 wins if both hold).

    NOTE(review): for ``window >= 2`` the rolling window contains the previous
    row, so ``close_roll_max < Close.shift(1)`` and
    ``close_roll_min > Close.shift(1)`` can never be True and ``pattern`` is
    always 0. Behaviour is kept as-is; confirm the intended comparison.
    """
    prev_row = df[['High', 'Low', 'Close']].shift(1)

    extremes = {
        'high_roll_max': df['High'].rolling(window).max(),
        'low_roll_min': df['Low'].rolling(window).min(),
        'close_roll_max': df['Close'].rolling(window).max(),
        'close_roll_min': df['Close'].rolling(window).min(),
    }

    is_top = (
        (extremes['high_roll_max'] >= prev_row['High'])
        & (extremes['close_roll_max'] < prev_row['Close'])
    )
    is_bottom = (
        (extremes['low_roll_min'] <= prev_row['Low'])
        & (extremes['close_roll_min'] > prev_row['Close'])
    )

    result = pd.DataFrame(index=df.index)
    for suffix, series in extremes.items():
        result[prefix + suffix] = series
    flags = np.where(is_bottom.to_numpy(), -1, np.where(is_top.to_numpy(), 1, 0))
    result[prefix + 'pattern'] = flags.astype('int64')
    return result

def calculate_support_resistance(df, window=3, prefix="sr_"):
    """Band-style support/resistance levels from rolling Low/High statistics.

    Output columns (prefixed with ``prefix``):
      - ``support``:    rolling mean of Low  minus 2 rolling std of Low
      - ``resistance``: rolling mean of High plus  2 rolling std of High
      - ``diff_support``:    Close - support
      - ``diff_resistance``: resistance - Close
    """
    low_window = df['Low'].rolling(window)
    high_window = df['High'].rolling(window)

    support = low_window.mean() - 2 * low_window.std()
    resistance = high_window.mean() + 2 * high_window.std()

    result = pd.DataFrame(index=df.index)
    result[prefix + 'support'] = support
    result[prefix + 'resistance'] = resistance
    result[prefix + 'diff_support'] = df['Close'] - result[prefix + 'support']
    result[prefix + 'diff_resistance'] = result[prefix + 'resistance'] - df['Close']
    return result

def detect_triangle_pattern(df, window=3, prefix="tri_"):
    """Rolling High/Low extremes plus an ascending/descending flag.

    Output columns (prefixed with ``prefix``): ``high_roll_max``,
    ``low_roll_min`` and ``pattern``. ``pattern`` is
      -  1 when rolling max >= previous High, rolling min <= previous Low and
           Close rose versus the previous row;
      - -1 when rolling max <= previous High, rolling min >= previous Low and
           Close fell versus the previous row (wins if both hold);
      -  0 otherwise, including rows where a comparison involves NaN.
    """
    prev_row = df[['High', 'Low', 'Close']].shift(1)
    roll_max = df['High'].rolling(window).max()
    roll_min = df['Low'].rolling(window).min()

    ascending = (
        (roll_max >= prev_row['High'])
        & (roll_min <= prev_row['Low'])
        & (df['Close'] > prev_row['Close'])
    )
    descending = (
        (roll_max <= prev_row['High'])
        & (roll_min >= prev_row['Low'])
        & (df['Close'] < prev_row['Close'])
    )

    result = pd.DataFrame(index=df.index)
    result[prefix + 'high_roll_max'] = roll_max
    result[prefix + 'low_roll_min'] = roll_min
    flags = np.where(descending.to_numpy(), -1, np.where(ascending.to_numpy(), 1, 0))
    result[prefix + 'pattern'] = flags.astype('int64')
    return result

def detect_wedge(df, window=3, prefix="wed_"):
    """Rolling High/Low extremes plus a rising/falling wedge flag.

    Output columns (prefixed with ``prefix``): ``high_roll_max``,
    ``low_roll_min`` and ``pattern``. ``pattern`` is
      -  1 when rolling max >= previous High, rolling min <= previous Low and
           both High and Low ended the window above where they started;
      - -1 when rolling max <= previous High, rolling min >= previous Low and
           both High and Low ended the window below where they started
           (wins if both hold);
      -  0 otherwise.

    The window trend is the sign of (last - first) value of the window. It is
    computed as a vectorised difference against ``shift(window - 1)`` instead
    of a Python callback per window. A window with a NaN in its interior still
    yields 0 because the rolling max/min of that window is NaN and every
    comparison against NaN is False.
    """
    highs, lows = df['High'], df['Low']

    out = pd.DataFrame(index=df.index)
    out[prefix + 'high_roll_max'] = highs.rolling(window).max()
    out[prefix + 'low_roll_min'] = lows.rolling(window).min()

    # Sign of (window end - window start): +1 rising, -1 falling, 0 flat, NaN unknown.
    span = window - 1
    trend_high = np.sign(highs - highs.shift(span))
    trend_low = np.sign(lows - lows.shift(span))

    prev_high, prev_low = highs.shift(1), lows.shift(1)
    roll_max, roll_min = out[prefix + 'high_roll_max'], out[prefix + 'low_roll_min']

    mask_up = (roll_max >= prev_high) & (roll_min <= prev_low) & (trend_high == 1) & (trend_low == 1)
    mask_down = (roll_max <= prev_high) & (roll_min >= prev_low) & (trend_high == -1) & (trend_low == -1)

    out[prefix + 'pattern'] = 0
    out.loc[mask_up, prefix + 'pattern'] = 1
    out.loc[mask_down, prefix + 'pattern'] = -1
    return out

def detect_channel(df, window=3, prefix="chan_", channel_range=0.1):
    """Rolling High/Low extremes plus a narrow up/down channel flag.

    Output columns (prefixed with ``prefix``): ``high_roll_max``,
    ``low_roll_min`` and ``pattern``. The conditions for 1 / -1 are:
    rolling max >= / <= previous High, rolling min <= / >= previous Low,
    both High and Low rising / falling over the window, and the band
    ``rolling max - rolling min`` no wider than ``channel_range`` times the
    band mid-point. -1 wins if both hold; 0 otherwise.

    The window trend is the sign of (last - first) value of the window,
    computed with a vectorised ``shift(window - 1)`` difference rather than a
    Python callback per window. Windows with interior NaN still yield 0
    because their rolling max/min is NaN.
    """
    highs, lows = df['High'], df['Low']

    out = pd.DataFrame(index=df.index)
    out[prefix + 'high_roll_max'] = highs.rolling(window).max()
    out[prefix + 'low_roll_min'] = lows.rolling(window).min()
    roll_max, roll_min = out[prefix + 'high_roll_max'], out[prefix + 'low_roll_min']

    # Sign of (window end - window start): +1 rising, -1 falling, 0 flat, NaN unknown.
    span = window - 1
    trend_high = np.sign(highs - highs.shift(span))
    trend_low = np.sign(lows - lows.shift(span))

    width = roll_max - roll_min
    mid = (roll_max + roll_min) / 2
    is_narrow = width <= channel_range * mid

    prev_high, prev_low = highs.shift(1), lows.shift(1)
    mask_up = (roll_max >= prev_high) & (roll_min <= prev_low) & is_narrow & (trend_high == 1) & (trend_low == 1)
    mask_down = (roll_max <= prev_high) & (roll_min >= prev_low) & is_narrow & (trend_high == -1) & (trend_low == -1)

    out[prefix + 'pattern'] = 0
    out.loc[mask_up, prefix + 'pattern'] = 1
    out.loc[mask_down, prefix + 'pattern'] = -1
    return out

def detect_double_top_bottom(df, window=3, threshold=0.05, prefix="dbl_"):
    """Rolling High/Low extremes plus a double top / double bottom flag.

    Output columns (prefixed with ``prefix``): ``high_roll_max``,
    ``low_roll_min`` and ``pattern``.

    ``pattern`` is 1 when the rolling High max is >= both the previous and the
    next High, the current High is below both, and the High-Low range of both
    neighbouring rows is at most ``threshold`` times their (High+Low)/2.
    It is -1 for the mirrored condition on Low (wins if both hold), else 0.

    NOTE: the conditions use ``shift(-1)``, i.e. the *next* row, so the flag at
    row t depends on data from t+1 (look-ahead).
    """
    highs, lows = df['High'], df['Low']
    prev_high, next_high = highs.shift(1), highs.shift(-1)
    prev_low, next_low = lows.shift(1), lows.shift(-1)

    roll_max = highs.rolling(window).max()
    roll_min = lows.rolling(window).min()

    # Both neighbouring bars must be "narrow" relative to their mid price.
    prev_narrow = (prev_high - prev_low) <= threshold * (prev_high + prev_low) / 2
    next_narrow = (next_high - next_low) <= threshold * (next_high + next_low) / 2
    neighbours_narrow = prev_narrow & next_narrow

    is_top = (
        (roll_max >= prev_high) & (roll_max >= next_high)
        & (highs < prev_high) & (highs < next_high)
        & neighbours_narrow
    )
    is_bottom = (
        (roll_min <= prev_low) & (roll_min <= next_low)
        & (lows > prev_low) & (lows > next_low)
        & neighbours_narrow
    )

    result = pd.DataFrame(index=df.index)
    result[prefix + 'high_roll_max'] = roll_max
    result[prefix + 'low_roll_min'] = roll_min
    flags = np.where(is_bottom.to_numpy(), -1, np.where(is_top.to_numpy(), 1, 0))
    result[prefix + 'pattern'] = flags.astype('int64')
    return result

def detect_trendline(df, window=2, prefix="trend_"):
    """Least-squares trend line over the preceding ``window`` closes.

    For every row i >= ``window`` a straight line is fitted (``np.linalg.lstsq``)
    to Close at row positions i-window .. i-1 — the current row is excluded —
    and evaluated at position i. Rows before ``window`` keep slope and
    intercept 0.

    Output columns (prefixed with ``prefix``):
      - ``slope``, ``intercept``: fit coefficients; the x axis is the integer
        row position, so the intercept refers to position 0 of the frame.
      - ``support2``: line value at the row when slope > 0, else NaN.
      - ``resistance2``: line value at the row when slope < 0, else NaN.
      - ``diff_support2``: Close - support2.
      - ``diff_resistance2``: resistance2 - Close.
    """
    n_rows = len(df)
    positions = np.arange(n_rows)
    prices = df['Close'].values

    slopes = np.zeros(n_rows, dtype='float64')
    intercepts = np.zeros(n_rows, dtype='float64')
    for stop in range(window, n_rows):
        start = stop - window
        xs = positions[start:stop].astype(float)
        design = np.stack([xs, np.ones_like(xs)], axis=1)
        coef = np.linalg.lstsq(design, prices[start:stop], rcond=None)[0]
        slopes[stop] = coef[0]
        intercepts[stop] = coef[1]

    # Line evaluated at each row's own position.
    fitted = slopes * positions.astype(float) + intercepts

    result = pd.DataFrame(index=df.index)
    result[prefix + 'slope'] = slopes
    result[prefix + 'intercept'] = intercepts
    # These two may end up entirely NaN; callers are expected to fill them later.
    result[prefix + 'support2'] = np.where(slopes > 0, fitted, np.nan)
    result[prefix + 'resistance2'] = np.where(slopes < 0, fitted, np.nan)
    result[prefix + 'diff_support2'] = df['Close'] - result[prefix + 'support2']
    result[prefix + 'diff_resistance2'] = result[prefix + 'resistance2'] - df['Close']
    return result
# ============================
# New PATH-AWARE targets (split)
# ============================
def make_targets_up_down(df, horizon=30, up_thr=0.20, dd_thr=-0.05,
                         name_up='target_up20', name_dd='target_dd5',
                         keep_order=False, name_order='target_up_before_dd'):
    """
    Binary "threshold touched within the horizon" labels, one row per day.

    For each row t, looking at rows t+1 .. t+horizon (truncated at the end of
    the frame):
      - up label   = 1 if any High >= Close[t] * (1 + up_thr), else 0
      - down label = 1 if any Low  <= Close[t] * (1 + dd_thr), else 0
    Rows with no future data (the last row, or horizon <= 0) keep 0 / 0.

    If keep_order=True a third series is returned:
      1 if the UP threshold is hit strictly before the DOWN one (or only UP is
      hit), 0 if DOWN is hit first, on the same row, or alone, -1 if neither.

    Returns (up, down) or (up, down, order) as int8 Series indexed like ``df``
    and named ``name_up`` / ``name_dd`` / ``name_order``.
    """
    closes = df['Close'].to_numpy()
    highs = df['High'].to_numpy()
    lows = df['Low'].to_numpy()
    n_rows = len(df)

    reached_up = np.zeros(n_rows, dtype='int8')
    reached_dd = np.zeros(n_rows, dtype='int8')
    first_hit = np.full(n_rows, -1, dtype='int8')  # -1 => neither threshold hit

    for t in range(n_rows):
        start, stop = t + 1, min(n_rows, t + horizon + 1)
        if stop <= start:
            continue  # no future rows to inspect

        up_flags = highs[start:stop] >= closes[t] * (1.0 + up_thr)
        dd_flags = lows[start:stop] <= closes[t] * (1.0 + dd_thr)
        got_up = bool(up_flags.any())
        got_dd = bool(dd_flags.any())

        reached_up[t] = 1 if got_up else 0
        reached_dd[t] = 1 if got_dd else 0

        if not keep_order or not (got_up or got_dd):
            continue
        if got_up and got_dd:
            # argmax on a boolean array gives the position of the first True.
            first_hit[t] = 1 if up_flags.argmax() < dd_flags.argmax() else 0
        else:
            first_hit[t] = 1 if got_up else 0

    labels = (
        pd.Series(reached_up, index=df.index, name=name_up),
        pd.Series(reached_dd, index=df.index, name=name_dd),
    )
    if keep_order:
        return labels + (pd.Series(first_hit, index=df.index, name=name_order),)
    return labels

# ============================
# FEATURES (ta + cruzamentos + padrões)
# ============================
def indicators_for_ticker(ohlcv: pd.DataFrame, shift_features: int = 1) -> pd.DataFrame:
    """Build the indicator feature frame for one ticker.

    Contents, in column order:
      1. everything returned by ``ta.add_all_ta_features`` (called on a copy of
         ``ohlcv`` with ``fillna=True``);
      2. ``SMA_<n>`` for every n in ``AVERAGES``;
      3. ``cross_<fast>_<slow>`` for every pair with slow > fast: 1 on the row
         where the fast SMA moves above the slow one, -1 where it moves below,
         0 otherwise;
      4. ``pct_change`` of the close.

    The close series is ``'Adj Close'`` when present, else ``'Close'``.

    Parameters
    ----------
    ohlcv : frame with Open/High/Low/Volume and a close column.
    shift_features : rows to lag every feature by (0 = no lag).

    Returns
    -------
    DataFrame indexed like the ``ta`` output; ``cross_*`` columns are int8, all
    others float32. Rows emptied by the lag are 0 in ``cross_*`` columns and
    NaN elsewhere.
    """
    close_col = 'Adj Close' if 'Adj Close' in ohlcv.columns else 'Close'

    feats = ta.add_all_ta_features(
        ohlcv.copy(),
        open="Open", high="High", low="Low", close=close_col, volume="Volume",
        fillna=True
    )

    close = ohlcv[close_col]

    # Collect the new columns in a dict and attach them with a single concat;
    # inserting ~50 columns one at a time fragments the frame.
    extra = {f'SMA_{avg}': close.rolling(avg).mean() for avg in AVERAGES}

    for fast in AVERAGES:
        for slow in AVERAGES:
            if slow <= fast:
                continue
            sma_fast, sma_slow = extra[f'SMA_{fast}'], extra[f'SMA_{slow}']
            prev_fast, prev_slow = sma_fast.shift(1), sma_slow.shift(1)
            cross = pd.Series(0, index=feats.index, dtype='int8')
            cross[(sma_fast < sma_slow) & (prev_fast >= prev_slow)] = -1
            cross[(sma_fast > sma_slow) & (prev_fast <= prev_slow)] = 1
            extra[f"cross_{fast}_{slow}"] = cross

    extra['pct_change'] = close.pct_change()

    feats = pd.concat([feats, pd.DataFrame(extra, index=feats.index)], axis=1)

    if shift_features > 0:
        feats = feats.shift(shift_features)

    int_cols = [c for c in feats.columns if str(c).startswith("cross_")]
    int_set = set(int_cols)
    float_cols = [c for c in feats.columns if c not in int_set]

    # shift() leaves NaN in the first rows, and NaN cannot be cast to int8.
    # 0 means "no crossover", so it is the neutral fill for those rows.
    feats[int_cols] = feats[int_cols].fillna(0).astype('int8')
    feats[float_cols] = feats[float_cols].astype('float32')
    return feats


def patterns_for_ticker(ohlcv: pd.DataFrame, shift_features: int = 1) -> pd.DataFrame:
    """Run every pattern detector on one ticker and concatenate the results.

    Parameters
    ----------
    ohlcv : frame with at least High/Low/Close columns.
    shift_features : rows to lag every output column by (0 = no lag).

    Returns
    -------
    DataFrame whose ``*pattern`` columns are int8 and all other columns
    float32. Rows emptied by the lag are 0 in ``*pattern`` columns and NaN
    elsewhere.
    """
    parts = [
        detect_head_shoulder(ohlcv),
        detect_multiple_tops_bottoms(ohlcv),
        calculate_support_resistance(ohlcv),
        detect_triangle_pattern(ohlcv),
        detect_wedge(ohlcv),
        detect_channel(ohlcv),
        detect_double_top_bottom(ohlcv),
        detect_trendline(ohlcv),
    ]
    P = pd.concat(parts, axis=1)
    if shift_features > 0:
        P = P.shift(shift_features)

    patt_cols = [c for c in P.columns if str(c).endswith('pattern')]
    patt_set = set(patt_cols)
    other_cols = [c for c in P.columns if c not in patt_set]

    # shift() leaves NaN in the first rows, and NaN cannot be cast to int8.
    # 0 means "no pattern", so it is the neutral fill for those rows.
    P[patt_cols] = P[patt_cols].fillna(0).astype('int8')
    P[other_cols] = P[other_cols].astype('float32')
    return P

# ============================
# TARGETS
# ============================
def make_target_path_aware(df, horizon=30, up=0.15, dd=-0.05, name='target_path'):
    """Label rows where the upside threshold is reached before the downside one.

    For each row t, rows t+1 .. t+horizon (truncated at the end of the frame)
    are scanned. The label is 1 when some High >= Close[t] * (1 + up) and
    either no Low <= Close[t] * (1 + dd) occurs in the window or the first
    such Low comes strictly later than the first qualifying High. Otherwise,
    and for rows without future data, the label is 0.

    Returns an int8 Series indexed like ``df`` and named ``name``.
    """
    closes = df['Close'].to_numpy()
    highs = df['High'].to_numpy()
    lows = df['Low'].to_numpy()
    n_rows = len(df)

    labels = np.zeros(n_rows, dtype='int8')
    for t in range(n_rows):
        start, stop = t + 1, min(n_rows, t + horizon + 1)
        if stop <= start:
            continue  # no future rows: label stays 0

        up_flags = highs[start:stop] >= closes[t] * (1.0 + up)
        if not up_flags.any():
            continue

        dd_flags = lows[start:stop] <= closes[t] * (1.0 + dd)
        # argmax on a boolean array gives the position of the first True.
        if not dd_flags.any() or up_flags.argmax() < dd_flags.argmax():
            labels[t] = 1

    return pd.Series(labels, index=df.index, name=name)

def make_best_entry_sale(df, horizon=30):
    """Lowest Low and highest High over rows t .. t+horizon (current row included).

    NaN values inside a window are ignored (``np.nanmin`` / ``np.nanmax``).
    The window is truncated at the end of the frame.

    Returns two float32 Series indexed like ``df``: ``target_best_entry``
    (window minimum of Low) and ``target_best_sale`` (window maximum of High).
    """
    lows = df['Low'].to_numpy()
    highs = df['High'].to_numpy()
    n_rows = len(df)
    span = horizon + 1  # slicing clamps at the end of the array

    entry = np.fromiter(
        (float(np.nanmin(lows[t:t + span])) for t in range(n_rows)),
        dtype='float32', count=n_rows,
    )
    sale = np.fromiter(
        (float(np.nanmax(highs[t:t + span])) for t in range(n_rows)),
        dtype='float32', count=n_rows,
    )
    return (
        pd.Series(entry, index=df.index, name='target_best_entry'),
        pd.Series(sale, index=df.index, name='target_best_sale'),
    )

from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import check_random_state

# ============================
# Feature reduction helpers
# ============================
def _is_discrete_series(colname: str) -> bool:
    # treat crosses and pattern flags as discrete
    return colname.startswith("cross_") or colname.endswith("pattern")

def _drop_near_constant(df_tk: pd.DataFrame, var_thr: float = 1e-12):
    variances = df_tk.var(axis=0).astype(float)
    keep = variances > var_thr
    return df_tk.loc[:, keep], keep.index[keep].tolist()

def _mi_rank_per_ticker(X_tk: pd.DataFrame, y_up: pd.Series, y_dd: pd.Series,
                        random_state=42):
    """
    Rank features by mutual information with the two targets.

    MI is estimated against ``y_up`` and then ``y_dd`` (both cast to int8);
    each feature is scored with the larger of its two values. Columns
    recognised by ``_is_discrete_series`` are passed to sklearn as discrete.

    Returns a Series of scores indexed by feature name, sorted descending.
    """
    feature_names = X_tk.columns.tolist()
    is_discrete = np.array([_is_discrete_series(name) for name in feature_names], dtype=bool)

    # One generator shared by both estimates, consumed in up -> dd order.
    rng = check_random_state(random_state)

    # Constant features may break the estimator; X is expected to be pre-filtered.
    scores = [
        mutual_info_classif(X_tk.values, labels.astype('int8').values,
                            discrete_features=is_discrete,
                            random_state=rng)
        for labels in (y_up, y_dd)
    ]
    best = np.maximum(scores[0], scores[1])
    return pd.Series(best, index=feature_names).sort_values(ascending=False)

def _greedy_cor_filter(X_tk: pd.DataFrame, ranking: pd.Series,
                       corr_thr: float = 0.995):
    """
    Keep features in order of 'ranking' (desc), discard any that
    correlate (|r| >= corr_thr) with a feature already kept.
    """
    if X_tk.shape[1] <= 1:
        return X_tk.columns.tolist()

    ordered = [c for c in ranking.index if c in X_tk.columns]
    keep = []
    # precompute correlation in chunks to save time
    # Pearson on normalized data
    Z = (X_tk - X_tk.mean()) / (X_tk.std(ddof=0) + 1e-12)

    for c in ordered:
        if not keep:
            keep.append(c)
            continue
        # correlate c with kept
        r = Z[keep].T.dot(Z[c]) / (len(Z) - 1)
        max_abs_r = np.abs(r.values).max()
        if not np.isfinite(max_abs_r) or max_abs_r < corr_thr:
            keep.append(c)
    return keep

from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import check_random_state

from sklearn.feature_selection import mutual_info_classif
from sklearn.utils import check_random_state

def reduce_features_automatic(
    X: pd.DataFrame,
    y_up: pd.DataFrame,
    y_dd: pd.DataFrame,
    top_fraction: float = 0.35,
    min_keep: int = 48,
    var_thr: float | None = None,
    corr_thr: float = 0.995,
    always_keep_prefixes=("pct_change","SMA_20","SMA_50","SMA_100","tri_","sr_"),
    verbose: bool = True,
):
    """
    Per-ticker feature reduction.

    For every ticker (level 1 of ``X.columns``):
      1. drop constant columns, and optionally columns with variance <= ``var_thr``;
      2. score the rest by mutual information against the ticker's up and down
         labels (max of the two) and keep the top
         ``max(min_keep, ceil(top_fraction * n))`` plus every column starting
         with one of ``always_keep_prefixes``; if MI is all zero, or the
         ticker has no labels, fall back to the ``min_keep`` highest-variance
         columns;
      3. greedily drop columns whose |Pearson r| with an already kept column is
         >= ``corr_thr``, then pad back up to ``min_keep`` from the step-2 list.

    Parameters
    ----------
    X : frame with MultiIndex columns (feature, ticker).
    y_up, y_dd : label frames with MultiIndex columns whose level 1 is the
        ticker; the first column found for a ticker is used. Rows are assumed
        to be aligned with ``X``.
    verbose : print the before/after column count.

    Returns
    -------
    A copy of the selected columns of ``X``. ``cross_*`` / ``*pattern`` columns
    are int8 (float32 if they contain NaN); every other column is float32.

    Raises
    ------
    ValueError if ``X.columns`` is not a MultiIndex.
    """
    if not isinstance(X.columns, pd.MultiIndex):
        raise ValueError("X must use MultiIndex columns=(feature, ticker).")

    before_cnt = X.shape[1]
    if before_cnt == 0:
        # Nothing to select from (and the percentage below would divide by zero).
        return X.copy()

    def is_flag(name) -> bool:
        # Crossovers and pattern flags are discrete {-1, 0, 1} features.
        name = str(name)
        return name.startswith("cross_") or name.endswith("pattern")

    def top_variance(frame):
        ranked = frame.var().sort_values(ascending=False)
        return ranked.index[:min(min_keep, len(ranked))].tolist()

    tickers = np.unique(X.columns.get_level_values(1))
    kept_cols = []
    rs = check_random_state(42)

    def safe_mi(values, labels, discrete_mask):
        # Best effort: a single-class target or an estimator failure scores 0.
        if np.unique(labels).size < 2:
            return np.zeros(values.shape[1], dtype=float)
        try:
            return mutual_info_classif(values, labels, discrete_features=discrete_mask, random_state=rs)
        except Exception:
            return np.zeros(values.shape[1], dtype=float)

    up_tk_set = set(y_up.columns.get_level_values(1))
    dd_tk_set = set(y_dd.columns.get_level_values(1))

    for tk in tickers:
        X_tk = X.xs(tk, level=1, axis=1).copy()

        # numeric only + ensure no NaN/Inf
        X_tk = (X_tk.apply(pd.to_numeric, errors='coerce')
                    .replace([np.inf, -np.inf], np.nan)
                    .fillna(0.0))

        # drop constants
        X_tk_nc = X_tk.loc[:, X_tk.nunique(dropna=False) > 1]

        # optional variance threshold
        if var_thr is not None and X_tk_nc.shape[1] > 0:
            variances = X_tk_nc.var().astype(float)
            X_tk_nc = X_tk_nc.loc[:, variances > var_thr]

        if X_tk_nc.shape[1] == 0:
            # Everything is constant: keep the anchor columns, else the first min_keep.
            base = [c for c in X_tk.columns if any(str(c).startswith(p) for p in always_keep_prefixes)]
            base = base[:min_keep] if base else X_tk.columns[:min_keep].tolist()
            kept_cols.extend((c, tk) for c in base)
            continue

        # if labels are missing for this ticker, keep the highest-variance columns
        if (tk not in up_tk_set) or (tk not in dd_tk_set):
            kept_cols.extend((c, tk) for c in top_variance(X_tk_nc))
            continue

        y_up_tk = y_up.xs(tk, level=1, axis=1).iloc[:, 0].astype('int8')
        y_dd_tk = y_dd.xs(tk, level=1, axis=1).iloc[:, 0].astype('int8')

        cols = X_tk_nc.columns.tolist()
        discrete_mask = np.array([is_flag(c) for c in cols], dtype=bool)

        mi_up = safe_mi(X_tk_nc.values, y_up_tk.values, discrete_mask)
        mi_dd = safe_mi(X_tk_nc.values, y_dd_tk.values, discrete_mask)
        mi_rank = pd.Series(np.maximum(mi_up, mi_dd), index=cols).sort_values(ascending=False)

        k_top = min(len(mi_rank), max(min_keep, int(np.ceil(len(mi_rank) * top_fraction))))
        top_feats = mi_rank.index[:k_top].tolist()

        # ensure interpretable anchors, then de-duplicate preserving order
        for pref in always_keep_prefixes:
            top_feats.extend(c for c in X_tk_nc.columns if str(c).startswith(pref))
        top_feats = list(dict.fromkeys(top_feats))

        # fallback if MI is flat
        if len(top_feats) == 0 or (mi_rank.iloc[0] == 0 and mi_rank.sum() == 0):
            top_feats = top_variance(X_tk_nc)

        # greedy correlation de-dup
        X_top = X_tk_nc[top_feats]
        Z = ((X_top - X_top.mean()) / (X_top.std(ddof=0) + 1e-12)).to_numpy(dtype=float)
        # z-scores use ddof=0, so Pearson r is the dot product divided by n (not n-1).
        n_obs = max(1, Z.shape[0])
        kept_pos = []
        for j in range(Z.shape[1]):
            if kept_pos:
                r = Z[:, kept_pos].T @ Z[:, j] / n_obs
                max_abs_r = np.abs(r).max()
                if np.isfinite(max_abs_r) and max_abs_r >= corr_thr:
                    continue
            kept_pos.append(j)
        keep = [top_feats[j] for j in kept_pos]

        # pad to min_keep
        if len(keep) < min_keep:
            already = set(keep)
            for c in top_feats:
                if c not in already:
                    keep.append(c)
                    already.add(c)
                    if len(keep) >= min_keep:
                        break

        kept_cols.extend((c, tk) for c in keep)

    if len(kept_cols) == 0:
        for tk in tickers:
            cols_tk = X.xs(tk, level=1, axis=1).columns[:min_keep].tolist()
            kept_cols.extend((c, tk) for c in cols_tk)

    kept_index = pd.MultiIndex.from_tuples(kept_cols, names=X.columns.names)
    X_red = X.loc[:, kept_index].copy()

    # Compact dtypes with astype(): assigning through .loc writes into the
    # existing blocks on pandas >= 2.0 and would leave the dtypes unchanged.
    has_nan = X_red.isna().any().to_numpy()
    dtype_map = {
        col: ('int8' if (is_flag(col[0]) and not missing) else 'float32')
        for col, missing in zip(X_red.columns, has_nan)
    }
    X_red = X_red.astype(dtype_map)

    if verbose:
        after_cnt = X_red.shape[1]
        print(f"[Feature reduction] columns: {before_cnt} → {after_cnt} ({100.0*after_cnt/before_cnt:.1f}% kept)")
    return X_red





# ============================
# Construção por ticker
# ============================
from collections import Counter

# Accumulators filled by the per-ticker loop below.
feat_frames = []           # one feature frame per processed ticker
tgt_frames  = []           # target series collected per processed ticker
skip_reasons = Counter()   # reason -> number of tickers skipped for that reason

print("Gerando features e targets por ticker...")
for tk in tqdm(tickers):
    try:
        ohlcv = data.xs(tk, level=1, axis=1).copy()

        # ✅ Require only what you truly need for features/targets
        # Targets need High/Low; indicators need a close; SMAs may use Open but not required
        have = set(ohlcv.columns.astype(str))
        req = {'Close','High','Low'}   # do NOT require 'Adj Close' or 'Volume' here
        if not req.issubset(have):
            skip_reasons['missing_basic_ohlc'] += 1
            continue

        # Fallbacks to avoid skipping indices/FX/crypto
        if 'Adj Close' not in have:
            ohlcv['Adj Close'] = ohlcv['Close'].astype('float32')
        if 'Open' not in have:
            ohlcv['Open'] = ohlcv['Close'].astype('float32')
        if 'Volume' not in have:
            ohlcv['Volume'] = 0.0

        # Build features
        feats = indicators_for_ticker(ohlcv, shift_features=SHIFT_FEATURES)
        pats  = patterns_for_ticker(ohlcv, shift_features=SHIFT_FEATURES)
        if feats is None or feats.shape[1] == 0:
            skip_reasons['empty_feats'] += 1
            continue
        X_tk  = pd.concat([feats, pats], axis=1)

        # Fill & attach ticker level
        X_tk = fill_100pct(X_tk, allow_bfill=ALLOW_BFILL_EXOGENOUS)
        if X_tk.shape[1] == 0:
            skip_reasons['empty_after_fill'] += 1
            continue

        X_tk.columns = pd.MultiIndex.from_product([X_tk.columns, [tk]])
        feat_frames.append(X_tk)

        # Targets (split up/down)
        y_up20, y_dd5 = make_targets_up_down(
            ohlcv, horizon=HORIZON, up_thr=0.20, dd_thr=-0.05, keep_order=False
        )
        y_up20.name = ('target_up20', tk)
        y_dd5.name  = ('target_dd5',  tk)
        tgt_frames.extend([y_up20, y_dd5])

        # Best entry/sale (keep if you use downstream)
        y_entry, y_sale = make_best_entry_sale(ohlcv, horizon=HORIZON)
        y_entry.name = ('target_best_entry', tk)
        y_sale.name  = ('target_best_sale',  tk)
        tgt_frames.extend([y_entry, y_sale])

    except Exception as e:
        # don’t crash the whole build for one bad ticker
        skip_reasons[f'exception:{type(e).__name__}'] += 1
        # uncomment to inspect:
        # print(f"[{tk}] skipped due to {type(e).__name__}: {e}")
        continue

# Safety + diagnostics
if len(feat_frames) == 0:
    raise RuntimeError(f"No features built for any ticker. Skip reasons: {dict(skip_reasons)}")
else:
    print("Tickers processados:", len(feat_frames), "| Motivos de skip:", dict(skip_reasons))

# Global concatenation: one wide feature frame and one wide target frame,
# both with (name, ticker) column levels, sorted by date index.
X = pd.concat(feat_frames, axis=1).sort_index()
y = pd.concat(tgt_frames,  axis=1).sort_index()

# Consistent dtypes.
# Columns whose level-0 name starts with "cross_" or ends with "pattern" are
# handed to cast_int8_multi (presumably cast to int8 there - defined elsewhere
# in the notebook); every other feature column is downcast to float32.
X = cast_int8_multi(X, prefixes=("cross_",), suffixes=("pattern",))
float_cols = [col for col in X.columns
              if not (str(col[0]).startswith("cross_") or str(col[0]).endswith("pattern"))]
if float_cols:
    # Use astype with an explicit column -> dtype mapping. Assigning the cast
    # result back through `X.loc[:, cols] = ...` writes the values into the
    # existing columns and can leave their original dtype unchanged, which
    # silently defeats the downcast.
    X = X.astype({col: 'float32' for col in float_cols})

# Targets: binary labels as int8, price-like targets as float32.
target_dtypes = {
    'target_up20': 'int8',
    'target_dd5': 'int8',
    'target_best_entry': 'float32',
    'target_best_sale': 'float32',
}
y_cast = {col: target_dtypes[col[0]] for col in y.columns if col[0] in target_dtypes}
if y_cast:
    y = y.astype(y_cast)


# Guarantee features are 100% filled (inf/NaN) globally; per-ticker frames were
# already filled, this is an additional safety net after the concat.
X = fill_100pct(X, allow_bfill=ALLOW_BFILL_EXOGENOUS)
# Targets must not be imputed here, to avoid leaking information from the
# future horizon; only +/-inf is normalised to NaN.
y = y.replace([np.inf, -np.inf], np.nan)

# Final check
assert not X.isna().any().any(), "Ainda há NaN em X após preenchimento!"

# ============================
# Single combined output
# ============================
# Features and targets side by side, sorted by date, then passed through
# fill_100pct as a final safety net in case the concat introduced gaps.
# NOTE(review): the fill is applied to the whole frame, target columns
# included, while the preceding step states that targets should not be
# imputed - confirm whether filling targets here is intended.
DATASET = fill_100pct(
    pd.concat([X, y], axis=1).sort_index(),
    allow_bfill=ALLOW_BFILL_EXOGENOUS,
)

# Sanity check: the combined frame must not contain any NaN.
assert not DATASET.isna().any().any(), "Ainda há NaN no dataset final!"

# ============================
# Feature reduction (per ticker, MI vs targets)
# ============================
# Column-wise views of y holding only the up / drawdown label columns.
_y_level0 = y.columns.get_level_values(0)
y_up = y.loc[:, _y_level0 == 'target_up20']
y_dd = y.loc[:, _y_level0 == 'target_dd5']

# Reduction settings, gathered in one place before the call.
_reduction_kwargs = dict(
    top_fraction=0.85,   # take ~85% by MI before de-duplicating
    min_keep=96,         # aim for roughly 100 features per ticker (when that many exist)
    var_thr=None,        # do not drop by variance at this stage
    corr_thr=0.9995,     # only drop near-identical features
    always_keep_prefixes=("pct_change", "SMA_", "tri_", "sr_"),
    verbose=True,
)
X_reduced = reduce_features_automatic(X, y_up, y_dd, **_reduction_kwargs)



# ============================
# Output (full and reduced)
# ============================
# Two datasets are assembled: all features + targets, and reduced features +
# targets. Each is sorted by date, passed through fill_100pct and checked for
# NaN before being written.
# NOTE(review): DATASET_FULL repeats the same concat + fill already performed
# for DATASET earlier in this cell; the fill also covers the target columns.
import os

DATASET_FULL = pd.concat([X, y], axis=1).sort_index()
DATASET_FULL = fill_100pct(DATASET_FULL, allow_bfill=ALLOW_BFILL_EXOGENOUS)
assert not DATASET_FULL.isna().any().any(), "NaN in full dataset!"

DATASET_REDUCED = pd.concat([X_reduced, y], axis=1).sort_index()
DATASET_REDUCED = fill_100pct(DATASET_REDUCED, allow_bfill=ALLOW_BFILL_EXOGENOUS)
assert not DATASET_REDUCED.isna().any().any(), "NaN in reduced dataset!"

if SAVE_PARQUET:
    DATASET_FULL.to_parquet(OUTPUT_PATH, compression="snappy")
    # Derive the reduced file name from the extension rather than by substring
    # replacement: str.replace is a no-op when OUTPUT_PATH does not contain
    # ".parquet" (the reduced file would then overwrite the full one) and it
    # rewrites every occurrence, including directory names.
    _out_root, _out_ext = os.path.splitext(OUTPUT_PATH)
    OUT_REDUCED = f"{_out_root}_reduced{_out_ext}"
    DATASET_REDUCED.to_parquet(OUT_REDUCED, compression="snappy")
    print(f"Saved FULL:     {OUTPUT_PATH}  shape={DATASET_FULL.shape}")
    print(f"Saved REDUCED:  {OUT_REDUCED}  shape={DATASET_REDUCED.shape}")
else:
    print("FULL:", DATASET_FULL.shape, " | REDUCED:", DATASET_REDUCED.shape)

# Shapes of the three main frames.
print("X full:", X.shape)
print("y:", y.shape)
print("X_reduced:", X_reduced.shape)

# Number of features kept for each ticker present in the reduced frame,
# sorted from fewest to most.
_reduced_tickers = np.unique(X_reduced.columns.get_level_values(1))
kept_counts = pd.Series(
    {tk: X_reduced.xs(tk, level=1, axis=1).shape[1] for tk in _reduced_tickers}
).sort_values(ascending=True)
print("Min/median/max kept per ticker:", kept_counts.min(), kept_counts.median(), kept_counts.max())

# Tickers that made it into X versus the requested list.
processed = np.unique(X.columns.get_level_values(1))
print("Processed tickers:", len(processed))
missing = sorted(set(tickers).difference(processed))
print("Missing tickers:", len(missing))

Mounted at /content/drive
Baixando cotações do Yahoo Finance...


ERROR:yfinance:HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: NTCO3.SA"}}}
ERROR:yfinance:HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: XAUEUR"}}}
ERROR:yfinance:
11 Failed downloads:
ERROR:yfinance:['NTCO3.SA', 'JBSS3.SA', 'XAUEUR', 'NG=F', 'GOLL4.SA', 'XAGEUR', 'VVAR3.SA', 'CIEL3.SA', 'CESP6.SA', 'CCRO3.SA', 'LAME4.SA']: YFTzMissingError('possibly delisted; no timezone found')


Período: 2005-01-03 → 2025-11-02
Tickers com dados: 147
Gerando features e targets por ticker...
Gerando features e targets por ticker...


100%|██████████| 147/147 [05:34<00:00,  2.28s/it]


Tickers processados: 147 | Motivos de skip: {}
[Feature reduction] columns: 24696 → 14435 (58.5% kept)
Saved FULL:     drive/My Drive/Colab Notebooks/stock/expanded_stock.parquet  shape=(6597, 25284)
Saved REDUCED:  drive/My Drive/Colab Notebooks/stock/expanded_stock_reduced.parquet  shape=(6597, 15023)
X full: (6597, 24696)
y: (6597, 588)
X_reduced: (6597, 14435)
Min/median/max kept per ticker: 15 103.0 122
Processed tickers: 147
Missing tickers: 0


In [4]:
DATASET_REDUCED.tail()

Unnamed: 0_level_0,trend_visual_ichimoku_b,volatility_dch,trend_ichimoku_b,volatility_dcl,volume_obv,volume_nvi,trend_ichimoku_base,trend_resistance2,mtb_close_roll_min,chan_high_roll_max,...,target_best_entry,target_best_sale,target_up20,target_dd5,target_best_entry,target_best_sale,target_up20,target_dd5,target_best_entry,target_best_sale
Unnamed: 0_level_1,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,...,^JKSE,^JKSE,^N225,^N225,^N225,^N225,^SSE,^SSE,^SSE,^SSE
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-10-29,12.315001,12.45,12.280001,11.74,30825930000.0,2198.156982,12.065001,11.91,12.03,12.39,...,8042.629883,8231.882812,0,0,50365.621094,52411.339844,1,1,100.029999,149.509995
2025-10-30,12.315001,12.66,12.280001,11.74,30917550000.0,2198.156982,12.17,12.03,12.03,12.66,...,8144.077148,8231.882812,0,0,50972.558594,52411.339844,1,1,100.029999,149.509995
2025-10-31,12.315001,12.92,12.3,11.74,30949660000.0,2219.108398,12.3,12.03,12.03,12.92,...,8144.077148,8215.545898,0,0,51613.03125,52411.339844,1,1,100.029999,149.509995
2025-11-01,12.315001,12.92,12.3,11.75,30981770000.0,2219.108398,12.3,12.03,12.59,12.92,...,8144.077148,8215.545898,0,0,51613.03125,52411.339844,1,1,100.029999,149.509995
2025-11-02,12.295,12.92,12.3,11.75,31013880000.0,2219.108398,12.3,12.03,12.71,12.92,...,8144.077148,8215.545898,0,0,51613.03125,52411.339844,0,0,100.029999,149.509995
