# In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
import pandas as pd


# In [5]:

def add_lags(df, target_col='Close', n_lags=30):
    """
    Append shifted copies of one column as lag features.

    For every k in 1..n_lags a column named ``{target_col}_lag_{k}`` is
    added, holding ``target_col`` shifted down by k rows. The caller's
    frame is not modified; the columns are added to a copy.

    Parameters:
    - df: DataFrame containing the data.
    - target_col: Name of the column to lag.
    - n_lags: Number of lag columns to create.

    Returns:
    - A new DataFrame with the lag columns appended, with every row that
      contains a NaN in any column removed.
    """
    lagged = {
        f'{target_col}_lag_{step}': df[target_col].shift(step)
        for step in range(1, n_lags + 1)
    }
    # assign() copies the frame and adds the columns in dict order
    return df.assign(**lagged).dropna()

# Add technical indicators
def add_technical_indicators(df):
    """
    Add price- and volume-based technical indicator columns to ``df``.

    The frame is modified in place and the same object is returned. It must
    contain the columns 'Close', 'Volume' and 'GreenDay'.

    Columns written:
    - SMA_5, SMA_20: simple moving averages of Close.
    - EMA_12, EMA_26: exponential moving averages of Close.
    - RSI: relative strength index from 14-row simple rolling means.
    - 20d_std, upper_band, lower_band: Bollinger bands (SMA_20 +/- 2 std).
    - MACD, MACD_signal: EMA_12 - EMA_26 and its 9-span EMA.
    - volatility: 20-row rolling std divided by the 20-row rolling mean.
    - volume_change, volume_ma, volume_ratio: volume features.
    - Close_ratio_{h}d_MA, Trend_{h}d_MA for h in (2, 5, 60, 250).

    Rolling windows leave NaN in their leading rows; nothing is dropped here.

    Parameters:
    - df: DataFrame with 'Close', 'Volume' and 'GreenDay' columns.

    Returns:
    - The same DataFrame, with the indicator columns added.
    """
    close = df['Close']

    # Moving averages
    df['SMA_5'] = close.rolling(window=5).mean()
    df['SMA_20'] = close.rolling(window=20).mean()
    df['EMA_12'] = close.ewm(span=12, adjust=False).mean()
    df['EMA_26'] = close.ewm(span=26, adjust=False).mean()

    # RSI: average gain over average loss, using plain rolling means
    delta = close.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    # A window with no losses divides by zero, giving an infinite rs and RSI 100
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))

    # Bollinger Bands
    df['20d_std'] = close.rolling(window=20).std()
    df['upper_band'] = df['SMA_20'] + (df['20d_std'] * 2)
    df['lower_band'] = df['SMA_20'] - (df['20d_std'] * 2)

    # MACD
    df['MACD'] = df['EMA_12'] - df['EMA_26']
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Volatility: reuse the 20-row std and mean computed above instead of
    # running both rolling passes a second time
    df['volatility'] = df['20d_std'] / df['SMA_20']

    # Volume features
    volume = df['Volume']
    df['volume_change'] = volume.pct_change()
    df['volume_ma'] = volume.rolling(window=10).mean()
    df['volume_ratio'] = volume / df['volume_ma']

    # Shift by one row so each trend count only uses days before the current
    # one; the shift does not depend on the horizon, so compute it once
    previous_green = df['GreenDay'].shift(1)

    for horizon in [2, 5, 60, 250]:
        # Price relative to its moving average over the horizon
        rolling_averages = close.rolling(window=horizon).mean()
        df[f'Close_ratio_{horizon}d_MA'] = close / rolling_averages

        # Number of green days in the preceding `horizon` rows
        df[f'Trend_{horizon}d_MA'] = previous_green.rolling(window=horizon).sum()

    return df

def add_seasonal_features(df):
    """
    Add calendar-derived columns to ``df`` in place and return it.

    Reads ``dayofweek``, ``month`` and ``quarter`` from ``df.index``, so the
    index must be datetime-like.

    Columns written:
    - day_of_week, month, quarter: taken directly from the index.
    - day_0 .. day_4: 0/1 indicators for day_of_week values 0-4 only;
      values 5 and 6 get no indicator column.
    - month_1 .. month_12: 0/1 indicators for each month.

    Returns:
    - The same DataFrame, with the columns above added.
    """
    calendar = df.index
    df['day_of_week'] = calendar.dayofweek
    df['month'] = calendar.month
    df['quarter'] = calendar.quarter

    # Indicator columns: (source column, prefix, values to encode)
    encodings = (
        ('day_of_week', 'day', range(5)),
        ('month', 'month', range(1, 13)),
    )
    for source, prefix, codes in encodings:
        for code in codes:
            matches = df[source] == code
            df[f'{prefix}_{code}'] = matches.astype(int)

    return df


# Start of function feature_engineering

def feature_engineering(data):
    """
    Build the full feature set from raw price data.

    Steps, in order:
    1. Horizon features for 2, 5, 60 and 250 rows: 'Close_ratio_{h}d MA'
       (Close divided by its rolling mean) and 'Trend_{h}d MA' (count of
       green days in the preceding h rows).
    2. 30 lag columns of 'Close' via add_lags, which also drops NaN rows.
    3. Technical indicators via add_technical_indicators.
    4. Calendar features via add_seasonal_features.
    5. A final drop of every row that still contains a NaN.

    The caller's frame is not modified; all work happens on a copy.

    NOTE: add_technical_indicators writes its own 250-row features (named
    with an underscore, '..._250d_MA') after step 2 has already trimmed the
    leading rows, so the final dropna removes a second 250-row warm-up
    period. About 500 leading rows are lost in total.

    Parameters:
    - data: DataFrame with 'Close', 'Volume' and 'GreenDay' columns and a
      datetime-like index.

    Returns:
    - A new DataFrame with all feature columns and no NaN rows.
    """
    # Work on a copy so the horizon columns are not written to the caller's frame
    data = data.copy()

    # Horizons: two days, week, quarter-ish, year (in trading rows)
    horizons = [2, 5, 60, 250]

    # Shift by one row so each trend count excludes the current day
    previous_green = data['GreenDay'].shift(1)

    for horizon in horizons:
        rolling_averages = data['Close'].rolling(window=horizon).mean()

        # How far the current price is from the horizon moving average
        data[f'Close_ratio_{horizon}d MA'] = data['Close'] / rolling_averages

        # Number of green days in the preceding `horizon` rows
        data[f'Trend_{horizon}d MA'] = previous_green.rolling(window=horizon).sum()

    # Lagged features (returns a new frame with NaN rows removed)
    data = add_lags(data, target_col='Close', n_lags=30)

    # Technical indicators
    data = add_technical_indicators(data)

    # Seasonal features
    data = add_seasonal_features(data)

    # Drop the rows left incomplete by the indicator warm-up windows
    return data.dropna()
    


# In [6]:
def split_data(data, train_size=0.8):
    """
    Split a frame chronologically into train/test features and target.

    The first ``int(len(data) * train_size)`` rows form the training set and
    the remaining rows the test set; no shuffling is done. 'Close' is the
    target and every other column is a feature.

    Args:
        data (DataFrame): Data to split; must contain a 'Close' column.
        train_size (float): Fraction of rows assigned to the training set.

    Returns:
        tuple: (X_train, X_test, y_train, y_test).
    """
    cutoff = int(len(data) * train_size)

    # Cut the rows first, then separate target from features in each part
    head = data[:cutoff]
    tail = data[cutoff:]

    X_train = head.drop(columns=['Close'])
    X_test = tail.drop(columns=['Close'])
    y_train = head['Close']
    y_test = tail['Close']

    return X_train, X_test, y_train, y_test
# End of function split_data

# Start of function scale_data

from sklearn.preprocessing import MinMaxScaler

def scale_data(X_train, X_test, y_train=None, y_test=None, feature_scaler=None, target_scaler=None):
    """
    Scale train/test features, and optionally targets, fitting on train only.

    Scalers are always (re)fitted on the training data, including any scaler
    passed in; the test data is only transformed.

    Parameters:
    - X_train: Training features (numpy array or DataFrame).
    - X_test: Testing features (numpy array or DataFrame).
    - y_train: (Optional) Training target values (numpy array, list or Series).
    - y_test: (Optional) Testing target values (numpy array, list or Series).
      Only scaled when y_train is also given.
    - feature_scaler: (Optional) Scaler object for the features. If None, a
      new MinMaxScaler is created.
    - target_scaler: (Optional) Scaler object for the targets. If None and
      y_train is given, a new MinMaxScaler is created.

    Returns:
    A 6-tuple, always in this order:
    - X_train_scaled: Scaled training features.
    - X_test_scaled: Scaled testing features.
    - y_train_scaled: Scaled training targets as a 1-D array, or None if
      y_train was not given.
    - y_test_scaled: Scaled testing targets as a 1-D array, or None if
      y_train or y_test was not given.
    - feature_scaler: Scaler used for the features.
    - target_scaler: Scaler used for the targets (None if none was given or
      created).
    """
    # Initialize scalers if not provided
    if feature_scaler is None:
        feature_scaler = MinMaxScaler()
    if y_train is not None and target_scaler is None:
        target_scaler = MinMaxScaler()

    # Fit on the training features only, then apply the same scaling to test
    X_train_scaled = feature_scaler.fit_transform(X_train)
    X_test_scaled = feature_scaler.transform(X_test)

    # Default to None so the return below is valid when targets are omitted
    y_train_scaled = None
    y_test_scaled = None

    if y_train is not None:
        # np.asarray lets a pandas Series (which has no .reshape) be turned
        # into the single-column 2-D shape the scaler is given
        y_train_column = np.asarray(y_train).reshape(-1, 1)
        y_train_scaled = target_scaler.fit_transform(y_train_column).flatten()

        if y_test is not None:
            y_test_column = np.asarray(y_test).reshape(-1, 1)
            y_test_scaled = target_scaler.transform(y_test_column).flatten()

    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled, feature_scaler, target_scaler



def create_sequences(X, y, time_steps=10):
    """
    Turn row-aligned features and targets into sliding-window sequences.

    Window i holds rows i .. i + time_steps - 1 of X, and its target is
    y at row i + time_steps (the row right after the window).

    Inputs are converted to numpy arrays first so that indexing is always
    positional. This matters for a pandas Series, where y[k] would otherwise
    be a lookup by index label.

    Parameters:
    X (array-like): Features, one row per time step.
    y (array-like): Target values, aligned row-for-row with X.
    time_steps (int): Number of time steps in each sequence.

    Returns:
    X_seq, y_seq: numpy arrays with len(X) - time_steps entries each. Both
    are empty when X has time_steps rows or fewer.
    """
    features = np.asarray(X)
    targets = np.asarray(y)

    # A non-positive count gives an empty range, hence empty outputs
    n_windows = len(features) - time_steps

    X_seq = [features[start:start + time_steps] for start in range(n_windows)]
    y_seq = [targets[start + time_steps] for start in range(n_windows)]
    return np.array(X_seq), np.array(y_seq)

