# In [None]:
# --- Imports ---
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import copy
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import random
import torch.nn.functional as F

# --- Reproducibility ---
# One seed is pushed into every RNG the script relies on (Python, NumPy,
# PyTorch CPU and all CUDA devices).
SEED = 3407
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# --- Device Configuration ---
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load Data ---
# Rows are ordered by ticker, then date; rows with VOL below 1 are discarded
# and the index is rebuilt so it is contiguous again.
data = (
    pd.read_parquet('data/volume_pred.parquet')  # adjust path if needed
    .sort_values(['ticker', 'date'])
)
data = data.loc[data['VOL'] >= 1].reset_index(drop=True)
# Missing sentiment scores are replaced with 0.
data['sentiment_score'] = data['sentiment_score'].fillna(0)

# --- Feature Engineering ---
# Prediction target: log(1 + VOL).
data['log_vol'] = np.log1p(data['VOL'])

# Integer id per ticker, stored as its own column.
ticker_encoder = LabelEncoder().fit(data['ticker'])
data['ticker_encoded'] = ticker_encoder.transform(data['ticker'])

# Model inputs: every column except the raw volume, the target, the
# identifier columns and the features explicitly excluded below.
problematic_features = ['execid', 'distcd']
excluded_columns = {'VOL', 'date', 'ticker', 'log_vol', *problematic_features}
input_features = [col for col in data.columns if col not in excluded_columns]

# --- Train/Test Split ---
def train_test_split_time_series(df, stock_col='ticker', date_col='date', test_ratio=0.2):
    """Split each stock's history chronologically into train and test parts.

    Every group of ``stock_col`` is sorted by ``date_col``; the earliest
    ``int(len(group) * (1 - test_ratio))`` rows go to the training frame and
    the remaining rows to the test frame.

    Args:
        df: Frame holding at least ``stock_col`` and ``date_col``.
        stock_col: Column identifying the stock each row belongs to.
        date_col: Column used to order rows within a stock.
        test_ratio: Fraction of each stock's rows reserved for testing.

    Returns:
        ``(train_df, test_df)``, each the concatenation of the per-stock
        pieces in groupby order; original index labels are kept.
    """
    train_parts = []
    test_parts = []
    keep_fraction = 1 - test_ratio
    for _, rows in df.groupby(stock_col):
        ordered = rows.sort_values(date_col)
        # int() truncates, so the cutoff is rounded towards the test side.
        cutoff = int(len(ordered) * keep_fraction)
        train_parts.append(ordered.iloc[:cutoff])
        test_parts.append(ordered.iloc[cutoff:])
    return pd.concat(train_parts), pd.concat(test_parts)

# Chronological per-ticker split (default 80/20).
train_data, test_data = train_test_split_time_series(data)

# --- Scaling ---
# The scaler is fitted on the training rows only and then applied to both
# splits, so no statistics from the test period enter the transform.
scaler = StandardScaler()
scaler.fit(train_data[input_features])
train_data[input_features] = scaler.transform(train_data[input_features])
test_data[input_features] = scaler.transform(test_data[input_features])

# --- Sanity checks ---
# Name every training feature that contains NaN/Inf after scaling ...
for col in input_features:
    if not np.isfinite(train_data[col]).all():
        print(f"⚠️ Feature {col} has NaN or Inf after scaling!")
# ... and dump the offending rows, but only when there are any (printing an
# empty frame on every clean run is just noise).
bad_rows = ~np.isfinite(train_data[input_features]).all(axis=1)
if bad_rows.any():
    print(train_data.loc[bad_rows])

# Hard stop before training if any input or target is non-finite. The test
# target is checked as well because it drives validation loss, early
# stopping and the final metrics.
assert np.isfinite(train_data[input_features].values).all(), "Train data has NaN or Inf!"
assert np.isfinite(test_data[input_features].values).all(), "Test data has NaN or Inf!"
assert np.isfinite(train_data['log_vol'].values).all(), "Train target has NaN or InF!"
assert np.isfinite(test_data['log_vol'].values).all(), "Test target has NaN or Inf!"
# --- Dataset Class ---
class StockVolumeDataset(Dataset):
    """Sliding-window dataset for next-step target prediction.

    Each sample is ``time_steps`` consecutive rows of ``input_features``
    paired with the ``target_col`` value of the row that immediately follows
    the window.

    Windows are built separately for every value of ``group_col`` so that a
    window never mixes rows of two different groups (for example the tail of
    one ticker with the head of the next one in a concatenated frame).

    Args:
        df: Source frame; rows of each group must already be in time order.
        input_features: Names of the feature columns.
        target_col: Name of the target column.
        time_steps: Number of rows in each input window.
        group_col: Column whose values delimit independent series. When it
            is ``None`` or not a column of ``df``, the whole frame is treated
            as one series. Rows whose group value is missing are skipped by
            pandas' default groupby behaviour.

    Attributes:
        X: float32 tensor of shape ``(num_samples, time_steps, num_features)``.
        y: float32 tensor of shape ``(num_samples, 1)``.
    """

    def __init__(self, df, input_features, target_col='log_vol', time_steps=10,
                 group_col='ticker'):
        self.features = df[input_features].values
        self.targets = df[target_col].values
        self.time_steps = time_steps

        # Positional row indices of every independent series.
        if group_col is not None and group_col in df.columns:
            positions = df.groupby(group_col, sort=False).indices.values()
            groups = [np.sort(rows) for rows in positions if len(rows)]
            # Keep groups in order of first appearance so sample order is
            # deterministic and follows the frame.
            self._row_groups = sorted(groups, key=lambda rows: rows[0])
        else:
            self._row_groups = [np.arange(len(self.features))]

        self.X, self.y = self.build_sequences()

    def build_sequences(self):
        """Return ``(X, y)`` tensors holding every window and its target."""
        steps = self.time_steps
        windows, labels = [], []
        for rows in self._row_groups:
            group_features = self.features[rows]
            group_targets = self.targets[rows]
            # A series of length n yields n - steps samples (none if too short).
            for start in range(len(rows) - steps):
                windows.append(group_features[start:start + steps])
                labels.append(group_targets[start + steps])

        if not windows:
            # Keep a consistent rank even when no series is long enough.
            n_features = self.features.shape[1] if self.features.ndim == 2 else 0
            empty_X = torch.empty((0, steps, n_features), dtype=torch.float32)
            empty_y = torch.empty((0, 1), dtype=torch.float32)
            return empty_X, empty_y

        X = torch.tensor(np.array(windows), dtype=torch.float32)
        y = torch.tensor(np.array(labels), dtype=torch.float32).unsqueeze(1)
        return X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# --- DataLoader ---
# Window length (rows per input sequence), shared by both splits.
time_steps = 10

# One windowed dataset per split, built with identical settings.
train_dataset, test_dataset = (
    StockVolumeDataset(frame, input_features, time_steps=time_steps)
    for frame in (train_data, test_data)
)

# Batches of 32 served in dataset order for both splits.
# NOTE(review): the training loader is not shuffled either — confirm this is
# intentional.
train_loader, test_loader = (
    DataLoader(dataset, batch_size=32, shuffle=False)
    for dataset in (train_dataset, test_dataset)
)

# --- Modified Stacked Bidirectional LSTM Model ---
class StackedBiLSTM(nn.Module):
    """Stacked bidirectional LSTM followed by a two-layer regression head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction; also the width of the
            intermediate dense layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout rate handed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=3, dropout=0.05):
        super().__init__()
        lstm_config = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.lstm = nn.LSTM(**lstm_config)
        # Both directions are concatenated, hence the doubled input width.
        self.fc1 = nn.Linear(2 * hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to ``(batch, 1)``."""
        sequence_out, _state = self.lstm(x)
        # Only the features of the final time step feed the head.
        # NOTE(review): for the reverse direction this position has processed
        # a single step only — confirm that is the intended summary.
        last_step = sequence_out[:, -1]
        return self.fc2(torch.relu(self.fc1(last_step)))

# --- Model ---
# One LSTM input channel per feature column; the network lives on `device`.
model = StackedBiLSTM(input_size=len(input_features))
model = model.to(device)

# --- Loss / Optimizer / Scheduler ---
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=1e-5,
    weight_decay=5e-6,
)
# Halve the learning rate once the monitored loss has failed to improve by a
# relative 1e-4 for more than 2 consecutive epochs.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold=1e-4,
    threshold_mode='rel',
)

# --- Early-stopping state ---
num_epochs = 50                 # upper bound on training epochs
patience, counter = 5, 0        # allowed / observed epochs without improvement
best_loss = float('inf')        # lowest validation loss seen so far
best_model_wts = copy.deepcopy(model.state_dict())  # snapshot of the best weights

# --- Training loop with validation, LR scheduling and early stopping ---
for epoch in range(num_epochs):
    # ---- optimisation pass over the training loader ----
    model.train()
    train_losses = []
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training")
    for features, targets in train_bar:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_losses.append(batch_loss.item())
        train_bar.set_postfix(loss=batch_loss.item())

    # ---- gradient-free pass over the held-out loader ----
    model.eval()
    val_losses = []
    val_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation")
    with torch.no_grad():
        for features, targets in val_bar:
            features = features.to(device)
            targets = targets.to(device)
            batch_val_loss = criterion(model(features), targets)
            val_losses.append(batch_val_loss.item())
            val_bar.set_postfix(val_loss=batch_val_loss.item())

    # The mean of the per-batch losses drives the scheduler and early stopping.
    val_loss_epoch = np.mean(val_losses)
    scheduler.step(val_loss_epoch)

    tqdm.write(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {np.mean(train_losses):.6f} | Val Loss: {val_loss_epoch:.6f}")

    # A NaN validation loss aborts training immediately.
    if np.isnan(val_loss_epoch):
        print("⚠️ Validation loss became NaN! Stopping training.")
        break

    # New best: snapshot the weights, reset the patience counter, carry on.
    if val_loss_epoch < best_loss:
        best_loss = val_loss_epoch
        best_model_wts = copy.deepcopy(model.state_dict())
        counter = 0
        continue

    # No improvement: stop once `patience` such epochs have accumulated.
    counter += 1
    if counter >= patience:
        print("✅ Early stopping triggered.")
        break

# --- Restore the best snapshot ---
model.load_state_dict(best_model_wts)
print("Best model loaded with loss:", best_loss)

# --- Evaluation ---
# Gather predictions and targets for the whole test loader with gradient
# tracking switched off.
model.eval()
pred_list, true_list = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        batch_preds = model(X_batch.to(device))
        pred_list.append(batch_preds.cpu().numpy())
        true_list.append(y_batch.cpu().numpy())

# Stack the per-batch (batch, 1) arrays into single column vectors.
pred_all_log = np.concatenate(pred_list, axis=0)
true_all_log = np.concatenate(true_list, axis=0)

# --- Inverse Transform ---
# expm1 undoes the log1p applied to the target.
pred_all_original = np.expm1(pred_all_log)
true_all_original = np.expm1(true_all_log)

# --- Plot in Log Space ---
# Overlay truth and prediction for the first 200 test samples.
plt.figure(figsize=(12,6))
plt.plot(true_all_log[:200], label='True log_vol')
plt.plot(pred_all_log[:200], label='Predicted log_vol', linestyle='--')
plt.title('Prediction vs True (First 200 samples, log space)')
plt.legend()
plt.grid(True)
plt.show()

# --- Evaluate on Original Scale ---
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse_orig = mean_squared_error(true_all_original, pred_all_original)
mae_orig = mean_absolute_error(true_all_original, pred_all_original)
r2_orig = r2_score(true_all_original, pred_all_original)

# Report all three metrics, one per line, after a header.
print(
    f"\n--- Metrics on Original Volume Scale ---",
    f"MSE (Original Volume): {mse_orig:.4f}",
    f"MAE (Original Volume): {mae_orig:.4f}",
    f"R² (Original Volume): {r2_orig:.4f}",
    sep="\n",
)