In [12]:
import pandas as pd
import numpy as np


In [13]:
# Cargar el archivo preprocesado
df = pd.read_csv('data/BTC_5m_preprocessed.csv')

# Verificar las primeras filas
df.head()


Unnamed: 0,timestamp,open,high,low,close,volume,log_return,pct_change,sma_20,stddev_20,bb_upper,bb_lower,bb_width
0,2024-01-01 07:35:00,42495.22,42500.0,42495.21,42496.55,30.61851,3.2e-05,3.2e-05,42391.037,74.828737,42540.694473,42241.379527,299.314946
1,2024-01-01 07:40:00,42496.55,42496.55,42446.71,42446.72,44.09963,-0.001173,-0.001173,42399.5085,70.753221,42541.014941,42258.002059,283.012883
2,2024-01-01 07:45:00,42446.71,42470.0,42432.56,42463.5,51.69519,0.000395,0.000395,42408.174,67.182746,42542.539492,42273.808508,268.730985
3,2024-01-01 07:50:00,42463.5,42470.82,42463.1,42470.81,26.74081,0.000172,0.000172,42417.914,60.834331,42539.582661,42296.245339,243.337323
4,2024-01-01 07:55:00,42470.81,42492.47,42466.71,42492.46,36.54524,0.00051,0.00051,42425.4365,60.2536,42545.943699,42304.929301,241.014398


In [14]:
# Crea la columna target: 1 si sube el precio en la siguiente vela, 0 si baja o se mantiene igual
df['target'] = (df['close'].shift(-1) > df['close']).astype(int)

# Mostrar algunas filas para verificar
df[['close', 'target']].head(10)


Unnamed: 0,close,target
0,42496.55,0
1,42446.72,1
2,42463.5,1
3,42470.81,1
4,42492.46,0
5,42460.0,1
6,42484.89,1
7,42514.28,0
8,42514.21,0
9,42510.84,0


In [15]:
# Indicadores básicos
df['log_return'] = np.log(df['close'] / df['close'].shift(1))
df['pct_change'] = df['close'].pct_change()

# Exponential Moving Averages (EMA)
df['ema_12'] = df['close'].ewm(span=12, adjust=False).mean()
df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean()

# MACD
df['macd'] = df['ema_12'] - df['ema_26']
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
df['macd_histogram'] = df['macd'] - df['macd_signal']


In [16]:
bb_window = 20
bb_std = 2
df['bb_middle'] = df['close'].rolling(window=bb_window).mean()
df['bb_std'] = df['close'].rolling(window=bb_window).std()
df['bb_upper'] = df['bb_middle'] + bb_std * df['bb_std']
df['bb_lower'] = df['bb_middle'] - bb_std * df['bb_std']
df['bb_width'] = df['bb_upper'] - df['bb_lower']


In [17]:
window_length = 14
delta = df['close'].diff()
gain = delta.clip(lower=0)
loss = -1 * delta.clip(upper=0)

avg_gain = gain.ewm(alpha=1/window_length, min_periods=window_length).mean()
avg_loss = loss.ewm(alpha=1/window_length, min_periods=window_length).mean()

rs = avg_gain / avg_loss
df['rsi'] = 100 - (100 / (1 + rs))


In [18]:
def calculate_adx(df, n=14):
    high = df['high']
    low = df['low']
    close = df['close']

    df['tr1'] = high - low
    df['tr2'] = (high - close.shift(1)).abs()
    df['tr3'] = (low - close.shift(1)).abs()
    df['TR'] = df[['tr1', 'tr2', 'tr3']].max(axis=1)

    df['+DM'] = np.where((high - high.shift(1)) > (low.shift(1) - low),
                         np.maximum(high - high.shift(1), 0), 0)
    df['-DM'] = np.where((low.shift(1) - low) > (high - high.shift(1)),
                         np.maximum(low.shift(1) - low, 0), 0)

    tr14 = df['TR'].rolling(window=n).sum()
    plus_dm14 = df['+DM'].rolling(window=n).sum()
    minus_dm14 = df['-DM'].rolling(window=n).sum()

    plus_di14 = 100 * (plus_dm14 / tr14)
    minus_di14 = 100 * (minus_dm14 / tr14)

    dx = (abs(plus_di14 - minus_di14) / (plus_di14 + minus_di14)) * 100
    adx = dx.rolling(window=n).mean()

    df.drop(columns=['tr1', 'tr2', 'tr3', 'TR', '+DM', '-DM'], inplace=True)

    return adx

# Aplicar el cálculo
df['adx'] = calculate_adx(df)


In [19]:
high = df['high']
low = df['low']
close = df['close']

tr1 = high - low
tr2 = (high - close.shift(1)).abs()
tr3 = (low - close.shift(1)).abs()

df['TR'] = pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)
df['atr'] = df['TR'].rolling(window=14).mean()

# Elimina la columna TR temporal si no la necesitas
df.drop(columns=['TR'], inplace=True)


In [20]:
df['returns_mean_5'] = df['log_return'].rolling(window=5).mean()
df['returns_std_5'] = df['log_return'].rolling(window=5).std()

df['returns_mean_20'] = df['log_return'].rolling(window=20).mean()
df['returns_std_20'] = df['log_return'].rolling(window=20).std()

df['ema_diff'] = df['ema_12'] - df['ema_26']
df['rsi_macd_ratio'] = df['rsi'] / (df['macd'] + 1e-6)
df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

df['rsi_overbought'] = np.where(df['rsi'] > 70, 1, 0)
df['rsi_oversold'] = np.where(df['rsi'] < 30, 1, 0)
df['macd_cross'] = np.where(df['macd'] > df['macd_signal'], 1, 0)

df['volatility'] = df['returns_std_20']
df['vol_ratio'] = df['volume'] / df['volume'].rolling(window=20).mean()


In [21]:
df.dropna(inplace=True)

# Revisar estructura final
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 126794 entries, 26 to 126819
Data columns (total 36 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   timestamp        126794 non-null  object 
 1   open             126794 non-null  float64
 2   high             126794 non-null  float64
 3   low              126794 non-null  float64
 4   close            126794 non-null  float64
 5   volume           126794 non-null  float64
 6   log_return       126794 non-null  float64
 7   pct_change       126794 non-null  float64
 8   sma_20           126794 non-null  float64
 9   stddev_20        126794 non-null  float64
 10  bb_upper         126794 non-null  float64
 11  bb_lower         126794 non-null  float64
 12  bb_width         126794 non-null  float64
 13  target           126794 non-null  int64  
 14  ema_12           126794 non-null  float64
 15  ema_26           126794 non-null  float64
 16  macd             126794 non-null  float64


In [22]:
df.to_csv('data/BTC_5m_features.csv', index=False)
print("✅ Dataset guardado en 'data/BTC_5m_features.csv'")


✅ Dataset guardado en 'data/BTC_5m_features.csv'
