In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

sns.set_palette('rocket')
plt.rcParams['image.cmap'] = 'rocket'
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 7)

In [None]:
ds = pd.read_csv('datasets/cleaned/merged_fx_dataset.csv')

In [None]:
ds['Date'] = pd.to_datetime(ds['Date'])

In [None]:
ds.info()

In [None]:
ds.drop(columns=['Selling', 'Buying'], inplace=True)

In [None]:
ds.info()

In [None]:
df_weekly = ds.set_index('Date').resample('W-FRI').last()
df_weekly

In [None]:
# agg_dict_weekly = {
#     'MidRate': 'last',
#     # 'Buying': 'last', 'Selling': 'last',
#     'MPR': 'last', 'GhInflationRate': 'last', 'BrentOil': 'last',
#     'Cocoa': 'last', 'Gold': 'last', 'GhInterestRate': 'last',
#     'NetForeignAssets': 'last', 'NIR': 'last', 'Imports': 'sum', 'Exports': 'sum',
#     'USInflationRate': 'last', 'USInterestRate': 'last'
# }

# df_weekly = ds.set_index('Date').resample('W-FRI')

In [None]:
df_weekly.info()

In [None]:
selected_features = [
    'MidRate', 'GhInflationRate', 'USInflationRate', 'GhInterestRate', 'USInterestRate',
    'BrentOil', 'Cocoa', 'Gold', 'Imports', 'Exports'
]

In [None]:
df_weekly = df_weekly[selected_features].copy()

In [None]:
df_weekly.dropna(inplace=True)

print("Weekly Data Head:")
df_weekly.head()

In [None]:
df_weekly.info()

#### Data Preprocessing

In [None]:
# For now, we'll just define the function.
def scale_data(train, test):
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    return scaler, train_scaled, test_scaled

In [None]:
# Function to create sequences for LSTM
def create_sequences(data, lookback_period):
    X, y = [], []
    for i in range(len(data) - lookback_period):
        X.append(data[i:(i + lookback_period), :])
        # The target is the 'MidRate' of the next week (which is at index 0 of our features)
        y.append(data[i + lookback_period, 0])
    return np.array(X), np.array(y)

In [35]:
LOOKBACK_WEEKS = 20

#### Build the LSTM Model

In [36]:
# def build_lstm_model(input_shape):
#     model = Sequential()
#     # From paper: 34 nodes in LSTM layer
#     model.add(LSTM(34, activation='tanh', input_shape=input_shape))
#     # From paper: 1 node in output layer
#     model.add(Dense(1))
#     # From paper: Adam optimizer with lr=0.002
#     optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
#     model.compile(optimizer=optimizer, loss='mean_squared_error')
#     return model


# --- 5. Build a MORE STABLE LSTM Model Architecture ---
def build_lstm_model(input_shape):
    model = Sequential()
    # Reduce units and add gradient clipping to the optimizer
    model.add(LSTM(34, activation='relu', input_shape=input_shape)) # Reduced from 34 to 16 units
    model.add(Dense(1))

    # Use a lower learning rate and add gradient clipping
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005, clipnorm=1.0) # Lower LR and added clipping
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Re-define the input shape (it remains the same)
input_shape = (LOOKBACK_WEEKS, len(selected_features))
stable_lstm_model = build_lstm_model(input_shape)
stable_lstm_model.summary()

  super().__init__(**kwargs)


In [37]:
# Define the input shape (lookback weeks, number of features)
input_shape = (LOOKBACK_WEEKS, len(selected_features))
lstm_model = build_lstm_model(input_shape)
lstm_model.summary()

##### Walk-Forward Validation

In [30]:
df_weekly.index.min()

Timestamp('2008-01-04 00:00:00')

In [40]:
# --- 6. Walk-Forward Validation (WITH DEBUGGING) ---
print("\nStarting LSTM Walk-Forward Validation...")
results_lstm = []

# We need a bit more data to start, so let's start the test set a bit later
start_test_date = df_weekly.index.min() + pd.DateOffset(years=2)

# Loop through time, testing on 6-month periods
for end_test_date in pd.date_range(start_test_date, df_weekly.index.max(), freq='6M'):
    start_test_date = end_test_date - pd.DateOffset(months=6)

    # Define train and test sets for this fold
    train = df_weekly[(df_weekly.index < start_test_date)]
    test = df_weekly[(df_weekly.index >= start_test_date) & (df_weekly.index < end_test_date)]

    print(f"\n--- Iteration for test period {start_test_date.date()} - {end_test_date.date()} ---")
    print(f"Initial train size: {len(train)}, test size: {len(test)}")

    if len(train) < LOOKBACK_WEEKS * 2 or len(test) < LOOKBACK_WEEKS:
        print("SKIPPED: Not enough data in train/test split.")
        continue

    # Scale data
    scaler, train_scaled, test_scaled = scale_data(train, test)

    # Create sequences
    X_train, y_train = create_sequences(train_scaled, LOOKBACK_WEEKS)
    X_test, y_test = create_sequences(test_scaled, LOOKBACK_WEEKS)

    print(f"Sequence shapes - X_train: {X_train.shape}, y_train: {y_train.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}")

    if len(X_train) == 0 or len(X_test) == 0:
        print("SKIPPED: Not enough data to create sequences.")
        continue

    # Build and train model for this fold
    model = build_lstm_model(input_shape=(LOOKBACK_WEEKS, len(selected_features)))
    # From paper: batch_size=128, epochs=100
    model.fit(X_train, y_train, epochs=100, batch_size=128, verbose=0)


    # Make predictions
    y_pred_scaled = model.predict(X_test, verbose=0)

    # --- SIMPLIFIED ROBUST EVALUATION ---
    # Calculate R² directly on the scaled data. This avoids all inverse transform issues.
    # Note: y_test is already scaled.
    r2_model = r2_score(y_test, y_pred_scaled)

    # For the naive baseline on scaled data, we predict the last value of the sequence
    # The last value of each X_test sequence is X_test[:, -1, 0]
    naive_pred_scaled = X_test[:, -1, 0]
    r2_naive = r2_score(y_test, naive_pred_scaled)

    print(
        f"SUCCESS: Scaled R² - Model: {r2_model:.4f}, Naive: {r2_naive:.4f}")

    # We still need to store the results. Let's store the scaled scores for now.
    results_lstm.append({
        'test_start': start_test_date,
        'r2_model': r2_model,  # Storing the scaled R²
        'r2_naive': r2_naive  # Storing the scaled R²
    })
    print(f"SUCCESS: Appended result. Model R²={r2_model:.4f}, Naive R²={r2_naive:.4f}")


# --- 7. Final Analysis (WITH SAFETY CHECK) ---
print("\n--- LSTM Walk-Forward Results ---")

# Add a safety check before creating the DataFrame
if not results_lstm:
    print("WARNING: The results list is empty. The loop did not complete any successful iterations.")
    print("This is likely due to insufficient data after resampling to weekly.")
    print("Please check the debug output above to see why the loop was skipped.")
else:
    results_lstm_df = pd.DataFrame(results_lstm)
    print(results_lstm_df)

    avg_r2_model_lstm = results_lstm_df['r2_model'].mean()
    avg_r2_naive_lstm = results_lstm_df['r2_naive'].mean()

    print(f"\nAverage LSTM Model R²: {avg_r2_model_lstm:.4f}")
    print(f"Average Naive R² (Weekly): {avg_r2_naive_lstm:.4f}")

    if avg_r2_model_lstm > avg_r2_naive_lstm:
        print("\nSUCCESS: The LSTM model (from paper methodology) provided a meaningful edge.")
    else:
        print("\nFAILURE: The LSTM model did not beat the naive baseline.")


Starting LSTM Walk-Forward Validation...

--- Iteration for test period 2009-07-31 - 2010-01-31 ---
Initial train size: 82, test size: 27
Sequence shapes - X_train: (62, 20, 10), y_train: (62,), X_test: (7, 20, 10), y_test: (7,)


  for end_test_date in pd.date_range(start_test_date, df_weekly.index.max(), freq='6M'):
  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -19145.1940, Naive: 0.5489
SUCCESS: Appended result. Model R²=-19145.1940, Naive R²=0.5489

--- Iteration for test period 2010-01-31 - 2010-07-31 ---
Initial train size: 109, test size: 26
Sequence shapes - X_train: (89, 20, 10), y_train: (89,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -8256.0877, Naive: 0.3737
SUCCESS: Appended result. Model R²=-8256.0877, Naive R²=0.3737

--- Iteration for test period 2010-07-31 - 2011-01-31 ---
Initial train size: 135, test size: 26
Sequence shapes - X_train: (115, 20, 10), y_train: (115,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -532.7260, Naive: 0.4178
SUCCESS: Appended result. Model R²=-532.7260, Naive R²=0.4178

--- Iteration for test period 2011-01-31 - 2011-07-31 ---
Initial train size: 161, test size: 26
Sequence shapes - X_train: (141, 20, 10), y_train: (141,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -8903334.1789, Naive: -0.2300
SUCCESS: Appended result. Model R²=-8903334.1789, Naive R²=-0.2300

--- Iteration for test period 2011-07-31 - 2012-01-31 ---
Initial train size: 187, test size: 26
Sequence shapes - X_train: (167, 20, 10), y_train: (167,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -16.5709, Naive: 0.0719
SUCCESS: Appended result. Model R²=-16.5709, Naive R²=0.0719

--- Iteration for test period 2012-01-31 - 2012-07-31 ---
Initial train size: 213, test size: 26
Sequence shapes - X_train: (193, 20, 10), y_train: (193,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -297.8637, Naive: 0.5312
SUCCESS: Appended result. Model R²=-297.8637, Naive R²=0.5312

--- Iteration for test period 2012-07-31 - 2013-01-31 ---
Initial train size: 239, test size: 26
Sequence shapes - X_train: (219, 20, 10), y_train: (219,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -350.0376, Naive: 0.4364
SUCCESS: Appended result. Model R²=-350.0376, Naive R²=0.4364

--- Iteration for test period 2013-01-31 - 2013-07-31 ---
Initial train size: 265, test size: 26
Sequence shapes - X_train: (245, 20, 10), y_train: (245,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -61.9246, Naive: -2.1142
SUCCESS: Appended result. Model R²=-61.9246, Naive R²=-2.1142

--- Iteration for test period 2013-07-31 - 2014-01-31 ---
Initial train size: 291, test size: 26
Sequence shapes - X_train: (271, 20, 10), y_train: (271,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -2.7586, Naive: 0.7020
SUCCESS: Appended result. Model R²=-2.7586, Naive R²=0.7020

--- Iteration for test period 2014-01-31 - 2014-07-31 ---
Initial train size: 317, test size: 26
Sequence shapes - X_train: (297, 20, 10), y_train: (297,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -742.8899, Naive: -3.0159
SUCCESS: Appended result. Model R²=-742.8899, Naive R²=-3.0159

--- Iteration for test period 2014-07-31 - 2015-01-31 ---
Initial train size: 343, test size: 27
Sequence shapes - X_train: (323, 20, 10), y_train: (323,), X_test: (7, 20, 10), y_test: (7,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -17.7076, Naive: 0.7370
SUCCESS: Appended result. Model R²=-17.7076, Naive R²=0.7370

--- Iteration for test period 2015-01-31 - 2015-07-31 ---
Initial train size: 370, test size: 25
Sequence shapes - X_train: (350, 20, 10), y_train: (350,), X_test: (5, 20, 10), y_test: (5,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -1.2487, Naive: 0.5020
SUCCESS: Appended result. Model R²=-1.2487, Naive R²=0.5020

--- Iteration for test period 2015-07-31 - 2016-01-31 ---
Initial train size: 395, test size: 27
Sequence shapes - X_train: (375, 20, 10), y_train: (375,), X_test: (7, 20, 10), y_test: (7,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -366.9318, Naive: 0.6274
SUCCESS: Appended result. Model R²=-366.9318, Naive R²=0.6274

--- Iteration for test period 2016-01-31 - 2016-07-31 ---
Initial train size: 422, test size: 26
Sequence shapes - X_train: (402, 20, 10), y_train: (402,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -63.3891, Naive: 0.4344
SUCCESS: Appended result. Model R²=-63.3891, Naive R²=0.4344

--- Iteration for test period 2016-07-31 - 2017-01-31 ---
Initial train size: 448, test size: 26
Sequence shapes - X_train: (428, 20, 10), y_train: (428,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -206.7063, Naive: 0.2880
SUCCESS: Appended result. Model R²=-206.7063, Naive R²=0.2880

--- Iteration for test period 2017-01-31 - 2017-07-31 ---
Initial train size: 474, test size: 26
Sequence shapes - X_train: (454, 20, 10), y_train: (454,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -6828.1887, Naive: -0.5264
SUCCESS: Appended result. Model R²=-6828.1887, Naive R²=-0.5264

--- Iteration for test period 2017-07-31 - 2018-01-31 ---
Initial train size: 500, test size: 26
Sequence shapes - X_train: (480, 20, 10), y_train: (480,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -1430.5562, Naive: 0.6562
SUCCESS: Appended result. Model R²=-1430.5562, Naive R²=0.6562

--- Iteration for test period 2018-01-31 - 2018-07-31 ---
Initial train size: 526, test size: 26
Sequence shapes - X_train: (506, 20, 10), y_train: (506,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -48.2259, Naive: 0.4836
SUCCESS: Appended result. Model R²=-48.2259, Naive R²=0.4836

--- Iteration for test period 2018-07-31 - 2019-01-31 ---
Initial train size: 552, test size: 26
Sequence shapes - X_train: (532, 20, 10), y_train: (532,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -17.1427, Naive: 0.6788
SUCCESS: Appended result. Model R²=-17.1427, Naive R²=0.6788

--- Iteration for test period 2019-01-31 - 2019-07-31 ---
Initial train size: 578, test size: 26
Sequence shapes - X_train: (558, 20, 10), y_train: (558,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -2197.1762, Naive: -4.3692
SUCCESS: Appended result. Model R²=-2197.1762, Naive R²=-4.3692

--- Iteration for test period 2019-07-31 - 2020-01-31 ---
Initial train size: 604, test size: 26
Sequence shapes - X_train: (584, 20, 10), y_train: (584,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -685.0283, Naive: 0.1803
SUCCESS: Appended result. Model R²=-685.0283, Naive R²=0.1803

--- Iteration for test period 2020-01-31 - 2020-07-31 ---
Initial train size: 630, test size: 26
Sequence shapes - X_train: (610, 20, 10), y_train: (610,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -3961.8502, Naive: 0.1502
SUCCESS: Appended result. Model R²=-3961.8502, Naive R²=0.1502

--- Iteration for test period 2020-07-31 - 2021-01-31 ---
Initial train size: 656, test size: 27
Sequence shapes - X_train: (636, 20, 10), y_train: (636,), X_test: (7, 20, 10), y_test: (7,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -56.8370, Naive: 0.4634
SUCCESS: Appended result. Model R²=-56.8370, Naive R²=0.4634

--- Iteration for test period 2021-01-31 - 2021-07-31 ---
Initial train size: 683, test size: 26
Sequence shapes - X_train: (663, 20, 10), y_train: (663,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -88.6679, Naive: 0.6770
SUCCESS: Appended result. Model R²=-88.6679, Naive R²=0.6770

--- Iteration for test period 2021-07-31 - 2022-01-31 ---
Initial train size: 709, test size: 26
Sequence shapes - X_train: (689, 20, 10), y_train: (689,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -1179.5358, Naive: -0.4050
SUCCESS: Appended result. Model R²=-1179.5358, Naive R²=-0.4050

--- Iteration for test period 2022-01-31 - 2022-07-31 ---
Initial train size: 735, test size: 26
Sequence shapes - X_train: (715, 20, 10), y_train: (715,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -127.1223, Naive: 0.6663
SUCCESS: Appended result. Model R²=-127.1223, Naive R²=0.6663

--- Iteration for test period 2022-07-31 - 2023-01-31 ---
Initial train size: 761, test size: 26
Sequence shapes - X_train: (741, 20, 10), y_train: (741,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -34.9983, Naive: 0.5881
SUCCESS: Appended result. Model R²=-34.9983, Naive R²=0.5881

--- Iteration for test period 2023-01-31 - 2023-07-31 ---
Initial train size: 787, test size: 26
Sequence shapes - X_train: (767, 20, 10), y_train: (767,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -243902.2345, Naive: 0.4255
SUCCESS: Appended result. Model R²=-243902.2345, Naive R²=0.4255

--- Iteration for test period 2023-07-31 - 2024-01-31 ---
Initial train size: 813, test size: 26
Sequence shapes - X_train: (793, 20, 10), y_train: (793,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -5356.8606, Naive: 0.1860
SUCCESS: Appended result. Model R²=-5356.8606, Naive R²=0.1860

--- Iteration for test period 2024-01-31 - 2024-07-31 ---
Initial train size: 839, test size: 26
Sequence shapes - X_train: (819, 20, 10), y_train: (819,), X_test: (6, 20, 10), y_test: (6,)


  super().__init__(**kwargs)


SUCCESS: Scaled R² - Model: -37.1975, Naive: 0.4185
SUCCESS: Appended result. Model R²=-37.1975, Naive R²=0.4185

--- LSTM Walk-Forward Results ---
   test_start      r2_model  r2_naive
0  2009-07-31 -1.914519e+04  0.548894
1  2010-01-31 -8.256088e+03  0.373666
2  2010-07-31 -5.327260e+02  0.417811
3  2011-01-31 -8.903334e+06 -0.230000
4  2011-07-31 -1.657090e+01  0.071861
5  2012-01-31 -2.978637e+02  0.531181
6  2012-07-31 -3.500376e+02  0.436406
7  2013-01-31 -6.192464e+01 -2.114160
8  2013-07-31 -2.758647e+00  0.701956
9  2014-01-31 -7.428899e+02 -3.015919
10 2014-07-31 -1.770758e+01  0.736994
11 2015-01-31 -1.248707e+00  0.501974
12 2015-07-31 -3.669318e+02  0.627432
13 2016-01-31 -6.338915e+01  0.434405
14 2016-07-31 -2.067063e+02  0.288004
15 2017-01-31 -6.828189e+03 -0.526350
16 2017-07-31 -1.430556e+03  0.656210
17 2018-01-31 -4.822594e+01  0.483593
18 2018-07-31 -1.714272e+01  0.678762
19 2019-01-31 -2.197176e+03 -4.369159
20 2019-07-31 -6.850283e+02  0.180295
21 2020-01-31 -3