In [None]:
import json

import numpy as np
import pandas as pd
from tensorflow.keras import utils
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import serialize_keras_object

In [None]:
# Training period: the price history is requested between these two datetimes
start, end = datetime(2020, 1, 1), datetime(2022, 1, 1)

In [None]:
# Load every minute bar for the symbol over the training period
qb = QuantBook()
cfd = qb.add_cfd("XAUUSD", Resolution.MINUTE)
symbol = cfd.symbol

# The history request is indexed by symbol first; keep only this symbol's rows
all_history = qb.history(symbol, start, end)
history = all_history.loc[symbol]
history.head()

In [None]:
# Replace each OHLC value with its change relative to the previous row
# (pct_change), dropping the first row, which has no predecessor.
ohlc_columns = ["open", "high", "low", "close"]
df = history[ohlc_columns].pct_change().dropna()

# Same frame under its original name; note the bars are minute bars, not daily
daily_pct_change = df
df.head()

In [None]:
# Creates features and labels for the model to be trained on.
# Each sample is a window of n_steps consecutive rows (half an hour of minute
# bars); its label describes the close change of the row right after the window.
n_steps = 30

# Pull the values out of pandas once: calling .iloc on the DataFrame inside the
# loop pays pandas indexing overhead on every iteration, whereas slicing the
# underlying NumPy arrays yields the same windows far more cheaply.
change_values = df.values
close_changes = df['close'].values
n_windows = len(df) - n_steps

features = [change_values[i:i + n_steps] for i in range(n_windows)]

# Label 1 when the following close change is >= 0 (price up OR unchanged),
# label 0 when it is negative.
labels = [1 if close_changes[i + n_steps] >= 0 else 0 for i in range(n_windows)]



In [None]:
# Convert the collected windows and labels into NumPy arrays
features, labels = np.array(features), np.array(labels)

In [None]:
# Chronological 60/20/20 split into training, validation, and test sets.
# The slices are taken in order (no shuffling), so validation and test data
# always come after the training data in time.
total_samples = len(features)
train_length = int(total_samples * 0.6)
val_length = int(total_samples * 0.2)
val_end = train_length + val_length

X_train, X_val, X_test = (
    features[:train_length],
    features[train_length:val_end],
    features[val_end:],
)
y_train, y_val, y_test = (
    labels[:train_length],
    labels[train_length:val_end],
    labels[val_end:],
)

for split_name, split_features in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_name} set size: {len(split_features)}")

In [None]:
# Share of label-1 samples in each split; a value near 0.5 means the two
# classes are roughly balanced.
train_class_distribution, val_class_distribution, test_class_distribution = (
    np.mean(split_labels) for split_labels in (y_train, y_val, y_test)
)

print(f"Training set class distribution (should be ~0.5): {train_class_distribution:.4f}")
print(f"Validation set class distribution: {val_class_distribution:.4f}")
print(f"Test set class distribution: {test_class_distribution:.4f}")

In [None]:
# NOTE: Deprecated placeholder. An earlier version of this cell swapped the
# train and test sets incorrectly. The split is now performed by the 60/20/20
# train/validation/test cell earlier in the notebook (referred to as "cell 6"
# in the message below).
# The cell now only prints a notice, so running it does not touch any data.
print("This cell has been deprecated. Use the split from cell 6 instead.")

In [None]:
# Share of label-1 samples in the held-out test set
test_positive_share = np.mean(y_test)
print(f"Test set class distribution: {test_positive_share:.4f}")

In [None]:
# Feed-forward classifier with L2 regularization, batch normalization and dropout.
# Architecture: flattened window (timesteps x features, e.g. 30 x 4 = 120 inputs)
#   -> Dense(64) -> BatchNorm -> Dropout(0.3)
#   -> Dense(32) -> BatchNorm -> Dropout(0.3)
#   -> Dense(1, sigmoid)
# The network has no Flatten layer: it expects already-flattened windows, which
# are produced with reshape further down in this cell.

window_length, n_features = X_train[0].shape

model = Sequential()

# First hidden block; input_shape is the flattened window size
model.add(Dense(64,
                input_shape=(window_length * n_features,),
                activation='relu',
                kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Second hidden block
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Single sigmoid unit: probability of the positive class
model.add(Dense(1, activation='sigmoid'))

# Collapse each (timesteps, features) window into one flat feature vector
X_train_flat, X_val_flat, X_test_flat = (
    split.reshape(split.shape[0], -1) for split in (X_train, X_val, X_test)
)

print("Model architecture:")
model.summary()

In [None]:
# Configure the model for binary classification:
# binary cross-entropy loss and the Adam optimizer (the learning rate is
# adjusted during training by the ReduceLROnPlateau callback).
#
# Precision and recall are passed as metric objects rather than the strings
# 'precision'/'recall': lowercase string aliases for these metrics are not
# resolved by every Keras version, while the objects always are. The explicit
# names keep the History keys ('precision', 'val_precision', ...) and the
# model.evaluate order [loss, accuracy, precision, recall] stable.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall')]
)

In [None]:
# Fit the model on the training set while monitoring the validation set.

# Stop once validation loss has not improved for 10 epochs and roll the model
# back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Halve the learning rate after 5 epochs without validation-loss improvement,
# never going below 1e-7.
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)

callbacks = [early_stopping, lr_reduction]

# 50 epochs is an upper bound; early stopping may end training sooner
fit_options = dict(epochs=50, batch_size=32, callbacks=callbacks, verbose=1)

# NOTE: this rebinds `history`, which until now held the price DataFrame
# loaded from QuantBook. Re-run the data-loading cell before re-running the
# pct-change cell.
history = model.fit(
    X_train_flat, y_train,
    validation_data=(X_val_flat, y_val),
    **fit_options
)

In [None]:
# Predicted probabilities for every split: the test predictions (y_hat) feed
# the results table, the training/validation ones are kept for comparison.
y_hat, y_hat_train, y_hat_val = (
    model.predict(split_inputs, verbose=0)
    for split_inputs in (X_test_flat, X_train_flat, X_val_flat)
)

In [None]:
# Results table for the test set: true label, predicted probability, and the
# class obtained by thresholding that probability at 0.5.
test_probabilities = y_hat.flatten()
test_predicted_classes = (test_probabilities > 0.5).astype(int)

results = pd.DataFrame({
    'y_actual': y_test.flatten(),
    'y_predicted': test_probabilities,
    'y_predicted_binary': test_predicted_classes,
})

In [None]:
# Training curves (loss and accuracy per epoch) and test-set predictions
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# One panel per metric: training curve against validation curve
for ax, metric_key, metric_label in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    ax.plot(history.history[metric_key], label=f'Training {metric_label}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    ax.set_title(f'Model {metric_label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()

plt.tight_layout()
plt.show()

# Predicted probability plotted against the actual label for the test set
prediction_columns = ['y_actual', 'y_predicted']
results[prediction_columns].plot(title="Model Performance: Predicted vs Actual", alpha=0.7)
plt.show()

In [None]:
# Evaluate model on all datasets
# Note: model.evaluate returns [loss, accuracy, precision, recall] based on metrics in compile


def _print_split_scores(split_name, scores):
    """Print the evaluation scores of one data split.

    Args:
        split_name: Heading label for the split, e.g. "Training".
        scores: Sequence ordered as [loss, accuracy, precision, recall],
            i.e. the value returned by model.evaluate for this notebook's
            compile settings.
    """
    print(f"\n{split_name} Set:")
    print(f"  Loss: {scores[0]:.4f}")
    print(f"  Accuracy: {scores[1]:.4f} ({scores[1]*100:.2f}%)")
    print(f"  Precision: {scores[2]:.4f}")
    print(f"  Recall: {scores[3]:.4f}")


print("=" * 50)
print("MODEL EVALUATION RESULTS")
print("=" * 50)

# Training set evaluation
train_scores = model.evaluate(X_train_flat, y_train, verbose=0)
_print_split_scores("Training", train_scores)

# Validation set evaluation
val_scores = model.evaluate(X_val_flat, y_val, verbose=0)
_print_split_scores("Validation", val_scores)

# Test set evaluation
test_scores = model.evaluate(X_test_flat, y_test, verbose=0)
_print_split_scores("Test", test_scores)

# Check for overfitting (training accuracy >> validation accuracy).
# Every outcome prints a verdict; a gap between 0.05 and 0.1 used to fall
# through both branches silently.
accuracy_gap = train_scores[1] - val_scores[1]
if accuracy_gap > 0.1:
    print("\n⚠️  WARNING: Potential overfitting detected (training accuracy much higher than validation)")
elif accuracy_gap < 0.05:
    print("\n✓ Model shows good generalization (training and validation accuracy are close)")
else:
    print(f"\nNote: training accuracy exceeds validation accuracy by {accuracy_gap:.4f}; "
          "generalization is borderline")

In [None]:
# Model representatiion
model_str = json.dumps(serialize_keras_object(model))

In [None]:
model_key = 'forex_price_predictor'

In [None]:
# Saves the model so it can be accessed later
qb.ObjectStore.Save(model_key, model_str)

In [None]:
if qb.ObjectStore.ContainsKey(model_key):
    model_str = qb.ObjectStore.Read(model_key)
    config = json.loads(model_str)['config']
    model = Sequential.from_config(config)

In [None]:
# Reference time for the prediction below. datetime.now() is evaluated when the
# cell runs, so the data fetched (and the prediction) changes between runs.
testDate = datetime.now()

In [None]:
# Get recent data for prediction: the 40 days of history leading up to testDate.
# NOTE: this rebinds `df`, which previously held the training pct-change frame.
df = qb.History(symbol, testDate - timedelta(40), testDate).loc[symbol]
df_change = df[["open", "high", "low", "close"]].pct_change().dropna()

# The model expects exactly n_steps rows per window (same as in training).
# Fail here with a clear message instead of handing the model a wrongly shaped
# input, which would only surface later as a shape error in predict().
if len(df_change) < n_steps:
    raise ValueError(
        f"Only {len(df_change)} data points available, need at least {n_steps}"
    )

# Prepare model input (same format as training): shape (1, n_steps, 4)
model_input = df_change.tail(n_steps).values[np.newaxis, ...]

# Flatten input to match model architecture
model_input_flat = model_input.reshape(1, -1)

In [None]:
# Predict the direction of the next bar and report a confidence score.
# The model outputs the probability of class 1, the label used for "up".
raw_output = model.predict(model_input_flat, verbose=0)
prediction_prob = raw_output[0][0]
prediction = round(prediction_prob)

direction = "Down" if prediction == 0 else "Up"

# Distance of the probability from the 0.5 decision boundary, rescaled to 0-1
confidence = abs(prediction_prob - 0.5) * 2

print(f"Prediction: {direction}")
print(f"Confidence: {confidence*100:.2f}%")
print(f"Raw probability: {prediction_prob:.4f}")
