In [9]:
import pandas as pd
import numpy as np

In [10]:
# Full load with date parsing
df = pd.read_csv(
    'datasets/cleaned/fx_predictions_dataset.csv',
    parse_dates=['Date'],         # adjust to your date column name
    index_col='Date',
    dayfirst=True
).sort_index()
print(df.shape)

(3950, 10)


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3950 entries, 2008-01-02 to 2023-11-30
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Buying                           3950 non-null   float64
 1   Selling                          3950 non-null   object 
 2   Mid Rate                         3950 non-null   object 
 3   GDP                              3950 non-null   float64
 4   Interbank weighted average rate  3950 non-null   float64
 5   Monetary policy rate             3950 non-null   float64
 6   Oil_Price                        3950 non-null   float64
 7   Cocoa_Price                      3950 non-null   float64
 8   Gold_Price                       3950 non-null   float64
 9   Headline Inflation               3950 non-null   float64
dtypes: float64(8), object(2)
memory usage: 339.5+ KB


In [12]:
df['Selling'] = pd.to_numeric(df['Selling'], errors='coerce')
df['Mid_Rate'] = pd.to_numeric(df['Mid Rate'], errors='coerce')


In [13]:
columns = {'Buying': 'Buying', 'GDP': 'GDP', 'Interbank weighted average rate': 'InterestRate',
           'Monetary policy rate': 'MPR', 'Oil_Price': 'OilPrice', 'Cocoa_Price': 'CocoaPrice',
           'Gold_Price': 'GoldPrice', 'Headline Inflation': 'Inflation'}

for column in columns.items():
    df.rename(columns={column[0]: column[1]}, inplace=True)

In [14]:
df_fe = df.copy()

# 2.1 Lag features (1, 3, 6 months)
for col in ['Buying', 'GDP', 'Inflation', 'MPR', 'InterestRate',
            'GoldPrice', 'OilPrice', 'CocoaPrice']:
    for lag in [1, 3, 6]:
        df_fe[f'{col}_lag{lag}'] = df_fe[col].shift(lag)

# 2.2 Rolling stats (3-month mean & std)
for col in ['GoldPrice', 'OilPrice', 'CocoaPrice']:
    df_fe[f'{col}_rmean3'] = df_fe[col].rolling(3).mean()
    df_fe[f'{col}_rstd3'] = df_fe[col].rolling(3).std()

# 2.3 Log transforms
for col in ['GoldPrice', 'OilPrice', 'CocoaPrice', 'Buying']:
    df_fe[f'log_{col}'] = np.log1p(df_fe[col])

# 2.4 Inflation-interest spread
df_fe['Inf_Interest_Spread'] = df_fe['Inflation'] - df_fe['InterestRate']

# 2.5 Differenced series
df_fe['d_Buying'] = df_fe['Buying'].diff()

# Drop rows with NaNs from feature creation
df_fe.dropna(inplace=True)

In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.linear_model import LassoCV

x = df_fe.drop(columns=['Buying', 'd_Buying'])
y = df_fe['Buying']

# 3.1 Remove near-zero variance
vt = VarianceThreshold(threshold=1e-5)
x_variance_threshold = vt.fit_transform(x)

# 3.2 Univariate selection
skb = SelectKBest(f_regression, k=20)
x_skb = skb.fit_transform(x_variance_threshold, y)

# 3.3 Lasso for final importance
lasso = LassoCV(cv=5).fit(x_skb, y)
coef_mask = np.abs(lasso.coef_) > 1e-3
X_final = x_skb[:, coef_mask]

# Keep feature names for tracking
selected_features = x.columns[vt.get_support()][skb.get_support()][coef_mask]
print("Selected features:", selected_features)

Selected features: Index([], dtype='object')


In [17]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# 4.1 Split
n = len(df_fe)
train_idx = int(n * 0.8)
X_train_raw, X_test_raw = X_final[:train_idx], X_final[train_idx:]
y_train_raw, y_test_raw = y[:train_idx], y[train_idx:]

In [18]:
X_train_raw.shape, X_test_raw.shape, y_train_raw.shape, y_test_raw.shape

((3153, 0), (789, 0), (3153,), (789,))

In [None]:
# 4.2 Scale
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train = scaler_X.fit_transform(X_train_raw)
X_test = scaler_X.transform(X_test_raw)

y_train = scaler_y.fit_transform(y_train_raw.values.reshape(-1, 1))
y_test = scaler_y.transform(y_test_raw.values.reshape(-1, 1))



ValueError: X_train_raw has 0 features. Please check your feature selection steps before scaling.

In [None]:
# 4.3 Sequence generator
def create_sequences(X, y, window=12):
    Xs, ys = [], []
    for i in range(window, len(X)):
        Xs.append(X[i-window:i])
        ys.append(y[i])
    return np.array(Xs), np.array(ys)


window = 12
X_tr, y_tr = create_sequences(X_train, y_train, window)
X_te, y_te = create_sequences(X_test, y_test, window)

# 4.4 Model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(window, X_tr.shape[2])),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mse'])
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_tr, y_tr,
    epochs=200,
    batch_size=16,
    validation_split=0.2,
    callbacks=[es],
    verbose=2
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predictions
y_pred_scaled = model.predict(X_te)
y_pred = scaler_y.inverse_transform(y_pred_scaled)
y_true = scaler_y.inverse_transform(y_te)

# Metrics
mse_val = mean_squared_error(y_true, y_pred)
r2_val = r2_score(y_true, y_pred)

print(f"Test MSE: {mse_val:.4f}")
print(f"Test R²: {r2_val:.4f}")