In [46]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout
from sklearn.preprocessing import MinMaxScaler
import joblib  # For saving the scaler
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Example usage
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler

In [42]:
def split_stock_data(data, target_column, test_size=0.2, time_steps=60):
    """
    Splits stock market prediction data into train and test sets.

    Parameters:
        data (pandas.DataFrame): The dataset containing stock market data.
        target_column (str): The column name for the target variable (e.g., 'Close').
        test_size (float): The proportion of the dataset to include in the test split.
        time_steps (int): The number of past time steps to use as features.

    Returns:
        tuple: X_train, y_train, X_test, y_test
    """
    # Convert the data to numpy arrays
    values = data[target_column].values

    # Create sequences of features and corresponding targets
    X, y = [], []
    for i in range(len(values) - time_steps):
        X.append(values[i:i + time_steps])
        y.append(values[i + time_steps])

    X, y = np.array(X), np.array(y)

    # Split into training and testing datasets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=False)

    return X_train, y_train, X_test, y_test


In [43]:

# Example dataset
data_path_google = "inputs/google_stock_cleaned.csv"
data = pd.read_csv(data_path_google)

X_train, y_train, X_test, y_test = split_stock_data(
    data, target_column='Close', test_size=0.2, time_steps=60
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (554, 60), y_train shape: (554,)
X_test shape: (139, 60), y_test shape: (139,)


In [48]:
def scale_and_save_data(X_train, y_train, X_val, y_val, X_test, y_test, scaler_file):
    """
    Scales the given stock price data (for Close option) using MinMaxScaler and saves the scaler to a .pkl file.

    Parameters:
        X_train (numpy.ndarray): Training features.
        y_train (numpy.ndarray): Training labels (Close prices).
        X_val (numpy.ndarray): Validation features.
        y_val (numpy.ndarray): Validation labels (Close prices).
        X_test (numpy.ndarray): Testing features.
        y_test (numpy.ndarray): Testing labels (Close prices).
        scaler_file (str): Path to the .pkl file to save the scaler.

    Returns:
        tuple: Scaled X_train, y_train, X_val, y_val, X_test, y_test
    """
    # Initialize scalers
    scaler_X = MinMaxScaler(feature_range=(0, 1))
    scaler_y = MinMaxScaler(feature_range=(0, 1))

    # Reshape y data to 2D for scaling (Close prices)
    y_train = y_train.reshape(-1, 1)
    y_val = y_val.reshape(-1, 1)
    y_test = y_test.reshape(-1, 1)

    # Fit and transform the scalers on the respective datasets
    X_train_scaled = scaler_X.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
    X_val_scaled = scaler_X.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)
    X_test_scaled = scaler_X.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

    y_train_scaled = scaler_y.fit_transform(y_train)
    y_val_scaled = scaler_y.transform(y_val)
    y_test_scaled = scaler_y.transform(y_test)

    # Save scalers to a .pkl file
    scalers_to_save = {
        "scaler_X": scaler_X,
        "scaler_y": scaler_y,
    }

    with open(scaler_file, 'wb') as f:
        pickle.dump(scalers_to_save, f)

    print(f"Scalers saved to {scaler_file}")

    return X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled, X_test_scaled, y_test_scaled


In [49]:
scaler_path = "scaled_stock_data_close_option.pkl"
X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled, X_test_scaled, y_test_scaled = scale_and_save_data(X_train, y_train, X_val, y_val, X_test, y_test, scaler_path)
# Assuming X_train, y_train, X_val, y_val, X_test, y_test are already defined
# scaled_data = scale_and_save_data(X_train, y_train, X_val, y_val, X_test, y_test, "scaler.pkl")


ValueError: X has 1 features, but MinMaxScaler is expecting 60 features as input.

In [45]:
class StockPricePredictor_Google:
    def __init__(self, input_shape, learning_rate=0.0007971184552975506, num_layers=2, units=256):
        self.input_shape = input_shape
        self.learning_rate = learning_rate
        self.num_layers = num_layers
        self.units = units
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        for i in range(self.num_layers):
            # Add RNN layers, making the last layer return sequences only if it's not the final RNN
            return_sequences = i < (self.num_layers - 1)
            model.add(SimpleRNN(self.units, activation='relu', return_sequences=return_sequences, input_shape=self.input_shape))
            model.add(Dropout(0.2))  # Regularization

        # Output layer for regression (single output: close value)
        model.add(Dense(1))

    # Compile the model with explicit loss and metric functions
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss=tf.keras.losses.MeanSquaredError(),  # Explicitly use the TensorFlow implementation
            metrics=[tf.keras.metrics.MeanAbsoluteError()]  # Use TensorFlow's metric function
        )
        return model

    def train(self, X_train, y_train, batch_size=16, epochs=50, validation_data=None):
        history = self.model.fit(X_train, y_train, 
                                 batch_size=batch_size, 
                                 epochs=epochs, 
                                 validation_data=validation_data,
                                 verbose=2)
        return history

    def predict(self, X):
        return self.model.predict(X)

    def save(self, model_path, scaler_path, scaler):
        # Save the model
        self.model.save(model_path)
        print(f"Model saved to {model_path}")

        # Save the scaler
        joblib.dump(scaler, scaler_path)
        print(f"Scaler saved to {scaler_path}")

    @staticmethod
    def load(model_path, scaler_path):
        # Load the model
        model = load_model(model_path)
        print(f"Model loaded from {model_path}")

        # Load the scaler
        scaler = joblib.load(scaler_path)
        print(f"Scaler loaded from {scaler_path}")

        return model, scaler




In [39]:
data_path_google = "inputs/google_stock_cleaned.csv"

output_file = "scaled_stock_data_close_option.pkl"





In [40]:
# Sample Usage
if __name__ == "__main__":
   
    # Scale data
    #scaler = MinMaxScaler()
    #y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
    #y_val_scaled = scaler.transform(y_val.reshape(-1, 1)).flatten()

    # Initialize and train the predictor
    predictor = StockPricePredictor_Google(input_shape=(X_train.shape[1], X_train.shape[2]))
    history = predictor.train(X_train, y_train_scaled, batch_size=16, epochs=20, validation_data=(X_val, y_val_scaled))

    # Save model and scaler
    model_path = "stock_price_rnn_model.h5"
    
    predictor.save(model_path, scaler_path, scaler)

    # Load model and scaler for inference
    loaded_model, loaded_scaler = StockPricePredictor.load(model_path, scaler_path)
    predictions = loaded_model.predict(X_val)
    predictions_rescaled = loaded_scaler.inverse_transform(predictions)

    print("Predictions (rescaled):", predictions_rescaled[:5])

  super().__init__(**kwargs)


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 554
'y' sizes: 800
