<a href="https://colab.research.google.com/github/fantasybarry/MSFT-Prediction/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Attempt to forecast the price of MSFT by analyzing the prices of multiple stocks, including MSFT, over several consecutive days leading up to the target day.

## Setup from HW2 (Modified)


In [None]:
from torch.utils.data import DataLoader,Dataset
import torch
import torch.nn as nn
import torch.utils.data as data
import pandas as pd
import yfinance as yf

class StockDataset(data.Dataset):
    """Sliding-window dataset over a matrix of per-feature series.

    Every sample pairs ``days`` consecutive time steps of all rows of ``X``
    with the value of ``Y`` at the step immediately after that window. Both
    ``X`` (per row) and ``Y`` are z-score normalised on construction.

    Args:
        X: Array indexed as ``[feature, time]`` (it is windowed along axis 1).
        Y: Target series; flattened to one dimension.
        days: Number of time steps in each input window.
    """

    def __init__(self,X,Y,days):
        self.days = days

        # Store normalization parameters
        self.X_mean = X.mean(axis=1, keepdims=True)
        self.X_std = X.std(axis=1, keepdims=True)
        self.Y_mean = Y.mean()
        self.Y_std = Y.std()

        # A constant series has zero spread; dividing by it would fill the
        # data with NaN/inf. Fall back to unit scale so it normalises to 0.
        self.X_std[self.X_std == 0] = 1.0
        if self.Y_std == 0:
            self.Y_std = 1.0

        # Normalize data
        self.X = (X - self.X_mean) / self.X_std
        self.Y = (Y.reshape(-1) - self.Y_mean) / self.Y_std

    def inverse_transform(self, y):
        """Map a normalised target value (or array) back to original units."""
        return y * self.Y_std + self.Y_mean

    def __len__(self):
        # Each sample needs `days` inputs plus one following target step.
        # Clamp at zero: a negative length makes len() raise.
        return max(0, len(self.Y) - self.days)

    def __getitem__(self,index):
        """Return ``(window, target)`` as float32 tensors for sample ``index``."""
        x = self.X[:, index:index+self.days].T  # Shape: [seq_len, num_features]
        y = self.Y[index + self.days]
        return torch.tensor(x, dtype = torch.float32), torch.tensor(y, dtype=torch.float32)

In [None]:
import numpy as np
from numpy import exp, sum, log, log10

# Fixed data fetching
def get_price(tickers, start='2020-01-01', end=None):
    """Collect closing prices for several tickers into one table.

    The price history of each ticker is requested through yfinance and its
    ``Close`` column is stored under the ticker's name.

    Args:
        tickers: Iterable of ticker symbols.
        start: Start date forwarded to ``Ticker.history``.
        end: End date forwarded to ``Ticker.history``.

    Returns:
        DataFrame with one column per ticker (empty when ``tickers`` is empty).
    """
    closes = pd.DataFrame()
    for symbol in tickers:
        history = yf.Ticker(symbol).history(start=start, end=end)
        closes[symbol] = history['Close']
    return closes

# Tickers whose closing prices are the model inputs (the target is among them)
feature_stocks = [
    'tsla', 'meta', 'nvda', 'amzn', 'nflx', 'gbtc', 'gdx', 'intc', 'dal', 'c',
    'goog', 'aapl', 'msft', 'ibm', 'hp', 'orcl', 'sap', 'crm', 'hubs', 'twlo',
]
# Ticker whose closing price is the prediction target
predict_stock = 'msft'

# Earliest date requested from the price source
start_date = '2020-01-01'

allX = get_price(feature_stocks, start=start_date)
ally = get_price([predict_stock], start=start_date)

# Convert to float32 numpy arrays; features are transposed to [ticker, time]
X = allX.to_numpy().T.astype(np.float32)
y = ally.to_numpy().astype(np.float32)

In [None]:
import torch.utils.data as data
import torch

# Window length fed to the models (try 5, 10, 20, 32)
days_window = 32  # <-- Adjust this value (1-32)
stockData = StockDataset(X, y, days=days_window)

# 70% / 20% / remainder split of the windowed samples
total_samples = len(stockData)
train_set_size = int(total_samples*0.7)
valid_set_size = int(total_samples*0.2)
test_set_size = total_samples - train_set_size - valid_set_size

# NOTE(review): random_split assigns sample indices at random, and samples with
# neighbouring indices share most of their window, so the validation/test
# subsets overlap in time with the training subset — confirm this is intended.
split_generator = torch.Generator().manual_seed(42)  # fixed seed for a repeatable split
train_set, valid_set, test_set = data.random_split(
    stockData,
    [train_set_size, valid_set_size, test_set_size],
    generator=split_generator,
)

batch_size = train_set_size  # one batch spans the whole training subset
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

## Question 1: RNN with at least one LSTM layer.

In [None]:
class StockLSTM(nn.Module):
    """LSTM regressor that predicts one value per input sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size = 64, num_layers = 1):
        super().__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed for a single layer, so enable it only when
        # there is more than one layer.
        lstm_dropout = 0.2 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout=lstm_dropout)
        self.linear = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 1))

    def forward(self, x):
        """Return a ``[batch]`` tensor of predictions for ``x`` of shape [batch, seq_len, features]."""
        out, _ = self.lstm(x) # x shape: [batch, seq_len, features]
        # Squeeze only the trailing size-1 axis; a bare squeeze() would also
        # drop the batch axis for a batch of one and return a 0-dim tensor.
        return self.linear(out[:, -1, :]).squeeze(-1)

    # Training setup
# Mini-batch loaders used for training and evaluation
batch_size = 32
train_dataloader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = data.DataLoader(valid_set, batch_size=batch_size)
test_dataloader = data.DataLoader(test_set, batch_size=batch_size)

model = StockLSTM(input_size=len(feature_stocks), hidden_size=64, num_layers=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

# Training loop
for epoch in range(50):
    model.train()
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    # Accumulate with plain float addition: `sum` is rebound to numpy.sum by
    # the earlier `from numpy import ... sum`, and numpy.sum over a generator
    # is deprecated.
    val_loss = 0.0
    with torch.no_grad():
        for X_val, y_val in valid_dataloader:
            val_loss += criterion(model(X_val), y_val).item()
    val_loss /= len(valid_dataloader)

    # Feed the plateau scheduler; without this call the learning rate never changes
    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}, Val Loss: {val_loss:.4f}")

def forecast_future(model, dataset, last_known_index):
    """Predict the target that follows the window ending at ``last_known_index``.

    The input window is ``dataset.X[:, last_known_index - dataset.days :
    last_known_index]``, i.e. ``last_known_index`` itself is excluded.

    Args:
        model: Module mapping a ``[1, seq_len, features]`` tensor to one value.
        dataset: Object exposing ``X``, ``days`` and ``inverse_transform``.
        last_known_index: Exclusive end of the input window along axis 1 of ``X``.

    Returns:
        The prediction passed through ``dataset.inverse_transform``.

    Raises:
        ValueError: If a full window ending at ``last_known_index`` does not
            fit inside ``dataset.X``.
    """
    window = dataset.days
    n_steps = dataset.X.shape[1]
    # Without this check a negative start index would silently wrap around
    # (or yield an empty/short window) instead of failing loudly.
    if last_known_index < window or last_known_index > n_steps:
        raise ValueError(
            f"last_known_index must be between {window} and {n_steps}, got {last_known_index}"
        )

    # Extract the last 'days'-length sequence
    x = dataset.X[:, last_known_index - window : last_known_index].T
    x = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    model.eval()
    with torch.no_grad():
        prediction = model(x)

    return dataset.inverse_transform(prediction.item())

# The forecast window has to end at the final available time step, so pass the
# number of time steps in X. len(stockData) counts samples, which stops `days`
# steps before the end of the series and would yield a stale window.
last_idx = stockData.X.shape[1]
predicted_price = forecast_future(model, stockData, last_idx)
print(f"Next-day MSFT price prediction: ${predicted_price:.2f}")

# Add to validation/test phases
def evaluate_model(dataloader, dataset):
    """Compute MAE and RMSE of the module-level ``model`` in original units.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` tensor batches.
        dataset: Object whose ``inverse_transform`` undoes target normalisation.

    Returns:
        Tuple ``(mae, rmse)``.
    """
    model.eval()
    predictions, truths = [], []
    with torch.no_grad():
        for inputs, targets in dataloader:
            # Flatten before extending: for a batch of one the model output
            # may be 0-dimensional, which cannot be iterated by extend().
            preds = dataset.inverse_transform(model(inputs).numpy().reshape(-1))
            actual = dataset.inverse_transform(targets.numpy().reshape(-1))
            predictions.extend(preds)
            truths.extend(actual)

    # Calculate metrics
    errors = np.array(predictions) - np.array(truths)
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    return mae, rmse

# Report the validation error of the current model
val_mae, val_rmse = evaluate_model(dataloader=valid_dataloader, dataset=stockData)
print(f"Validation MAE: ${val_mae:.2f}, RMSE: ${val_rmse:.2f}")


  val_loss = sum(criterion(model(X_val), y_val) for X_val, y_val in valid_dataloader)


Epoch 1, Val Loss: 0.3519
Epoch 2, Val Loss: 0.0471
Epoch 3, Val Loss: 0.0250
Epoch 4, Val Loss: 0.0197
Epoch 5, Val Loss: 0.0167
Epoch 6, Val Loss: 0.0134
Epoch 7, Val Loss: 0.0119
Epoch 8, Val Loss: 0.0100
Epoch 9, Val Loss: 0.0117
Epoch 10, Val Loss: 0.0091
Epoch 11, Val Loss: 0.0088
Epoch 12, Val Loss: 0.0091
Epoch 13, Val Loss: 0.0082
Epoch 14, Val Loss: 0.0083
Epoch 15, Val Loss: 0.0075
Epoch 16, Val Loss: 0.0076
Epoch 17, Val Loss: 0.0082
Epoch 18, Val Loss: 0.0071
Epoch 19, Val Loss: 0.0066
Epoch 20, Val Loss: 0.0076
Epoch 21, Val Loss: 0.0074
Epoch 22, Val Loss: 0.0066
Epoch 23, Val Loss: 0.0088
Epoch 24, Val Loss: 0.0062
Epoch 25, Val Loss: 0.0064
Epoch 26, Val Loss: 0.0061
Epoch 27, Val Loss: 0.0068
Epoch 28, Val Loss: 0.0091
Epoch 29, Val Loss: 0.0066
Epoch 30, Val Loss: 0.0059
Epoch 31, Val Loss: 0.0060
Epoch 32, Val Loss: 0.0064
Epoch 33, Val Loss: 0.0064
Epoch 34, Val Loss: 0.0060
Epoch 35, Val Loss: 0.0058
Epoch 36, Val Loss: 0.0060
Epoch 37, Val Loss: 0.0057
Epoch 38, 

## Question 2: Attention Network with at least one self-attention layer

In [None]:
# Attention-based model with same data pipeline
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

class StockAttentionModel(nn.Module):
    """Self-attention followed by an LSTM, predicting one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Embedding width used by the attention and LSTM layers.
        num_heads: Number of attention heads.
    """

    def __init__(self, input_size, hidden_size=64, num_heads=2):
        super().__init__()
        # Linear projection of each time step's features to hidden_size
        self.input_proj = nn.Linear(input_size, hidden_size)

        # Self-attention layer
        self.self_attn = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            batch_first=True
        )

        # Temporal processing
        self.temporal = nn.LSTM(hidden_size, hidden_size, batch_first=True)

        # Final prediction
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return a ``[batch]`` tensor for ``x`` of shape [batch_size, seq_len, num_features]."""
        x = F.relu(self.input_proj(x))

        # Self-attention (query, key and value are all the projected sequence)
        attn_out, _ = self.self_attn(x, x, x)

        # Temporal processing
        temp_out, _ = self.temporal(attn_out)

        # Use last timestep for prediction. Squeeze only the trailing axis so
        # a batch of one still yields shape [1] rather than a 0-dim tensor.
        return self.regressor(temp_out[:, -1, :]).squeeze(-1)

# Attention model together with its loss, optimiser and LR scheduler
model = StockAttentionModel(input_size=len(feature_stocks), hidden_size=128, num_heads=4)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

for epoch in range(100):
    # --- optimisation pass over the training batches ---
    model.train()
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the update
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # --- mean validation loss over the validation batches ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_val, y_val in valid_dataloader:
            outputs = model(X_val)
            batch_loss = criterion(outputs, y_val)
            val_loss += batch_loss.item()
    val_loss /= len(valid_dataloader)

    # The scheduler lowers the learning rate when validation loss plateaus
    scheduler.step(val_loss)
    print(f"Epoch {epoch+1}, Val Loss: {val_loss:.4f}")

# Single-step forecast with the attention model
def attention_forecast(model, dataset, last_index):
    """Predict the value following the window that ends just before ``last_index``.

    Args:
        model: Module mapping a ``[1, seq_len, features]`` tensor to one value.
        dataset: Object exposing ``X``, ``days`` and ``inverse_transform``.
        last_index: Exclusive end of the input window along axis 1 of ``X``.

    Returns:
        The prediction passed through ``dataset.inverse_transform``.

    Raises:
        ValueError: If ``last_index`` is smaller than ``dataset.days``.
    """
    window = dataset.days
    if last_index < window:
        raise ValueError(f"Index must be >= {dataset.days}")

    # Window of `days` steps, transposed to [seq_len, num_features]
    window_start = last_index - window
    sequence = dataset.X[:, window_start:last_index].T
    batch = torch.tensor(sequence, dtype=torch.float32).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        normalized_pred = model(batch).item()

    # Undo target normalisation on the scalar prediction
    return dataset.inverse_transform(normalized_pred)


# Enhanced evaluation with attention weights visualization
def plot_attention(model, sample_input):
    """Plot the self-attention weights of the first sequence in a batch.

    Args:
        model: Module exposing a ``self_attn`` multi-head attention layer and,
            for raw inputs, the ``input_proj`` layer that feeds it.
        sample_input: Batch-first tensor holding either raw features or a
            sequence already projected to the attention embedding size.
    """
    model.eval()
    with torch.no_grad():
        attn_in = sample_input
        # The attention layer consumes the projected sequence, not raw
        # features. When raw features are supplied, apply the same projection
        # the model's forward pass uses; otherwise the layer would reject the
        # input for having the wrong embedding size.
        if attn_in.shape[-1] != model.self_attn.embed_dim:
            attn_in = F.relu(model.input_proj(attn_in))
        _, attn_weights = model.self_attn(attn_in, attn_in, attn_in)
    plt.matshow(attn_weights[0].cpu().numpy())
    plt.title('Attention Pattern for Last Prediction')
    plt.show()

def evaluate_model(dataloader, dataset):
    """Return ``(mae, rmse)`` of the module-level ``model`` in original units.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` tensor batches.
        dataset: Object whose ``inverse_transform`` undoes target normalisation.
    """
    model.eval()
    predictions, truths = [], []

    with torch.no_grad():
        for inputs, targets in dataloader:
            # Flatten both sides, then map back from normalised units
            flat_preds = model(inputs).numpy().flatten()
            flat_targets = targets.numpy().flatten()
            predictions.extend(dataset.inverse_transform(flat_preds))
            truths.extend(dataset.inverse_transform(flat_targets))

    # Element-wise error shared by both metrics
    errors = np.array(predictions) - np.array(truths)
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))

    return mae, rmse

# Usage with validation set
val_mae, val_rmse = evaluate_model(valid_dataloader, stockData)
print("\nValidation Metrics:")
print(f"MAE: ${val_mae:.2f}")
print(f"RMSE: ${val_rmse:.2f}")

# For test set
test_mae, test_rmse = evaluate_model(test_dataloader, stockData)
print("\nTest Metrics:")
print(f"MAE: ${test_mae:.2f}")
print(f"RMSE: ${test_rmse:.2f}")

# Forecast from the most recent window. The window must end at the final
# available time step, so pass the number of time steps in X; len(stockData)
# counts samples and stops `days` steps before the end of the series.
last_idx = stockData.X.shape[1]
pred_price = attention_forecast(model, stockData, last_idx)
print(f"Attention Model Prediction of MSFT stock price: ${pred_price:.2f}")


Epoch 1, Val Loss: 0.6949
Epoch 2, Val Loss: 0.2180
Epoch 3, Val Loss: 0.0725
Epoch 4, Val Loss: 0.0433
Epoch 5, Val Loss: 0.0365
Epoch 6, Val Loss: 0.0320
Epoch 7, Val Loss: 0.0285
Epoch 8, Val Loss: 0.0274
Epoch 9, Val Loss: 0.0252
Epoch 10, Val Loss: 0.0238
Epoch 11, Val Loss: 0.0231
Epoch 12, Val Loss: 0.0232
Epoch 13, Val Loss: 0.0229
Epoch 14, Val Loss: 0.0242
Epoch 15, Val Loss: 0.0199
Epoch 16, Val Loss: 0.0183
Epoch 17, Val Loss: 0.0175
Epoch 18, Val Loss: 0.0172
Epoch 19, Val Loss: 0.0166
Epoch 20, Val Loss: 0.0161
Epoch 21, Val Loss: 0.0164
Epoch 22, Val Loss: 0.0160
Epoch 23, Val Loss: 0.0156
Epoch 24, Val Loss: 0.0160
Epoch 25, Val Loss: 0.0160
Epoch 26, Val Loss: 0.0158
Epoch 27, Val Loss: 0.0143
Epoch 28, Val Loss: 0.0147
Epoch 29, Val Loss: 0.0144
Epoch 30, Val Loss: 0.0132
Epoch 31, Val Loss: 0.0129
Epoch 32, Val Loss: 0.0133
Epoch 33, Val Loss: 0.0152
Epoch 34, Val Loss: 0.0139
Epoch 35, Val Loss: 0.0141
Epoch 36, Val Loss: 0.0133
Epoch 37, Val Loss: 0.0126
Epoch 38, 

## Question 3: Knowledge distillation to shrink the attention network model by half


### Model Set up, Size reduction

In [None]:
# 1. Define Teacher and Student Models
class TeacherModel(nn.Module):
    """Larger distillation teacher: 4-head attention, 2-layer LSTM, MLP head.

    Args:
        input_size: Number of features per time step; also the attention
            embedding size, so it must be divisible by the 4 heads.
    """

    def __init__(self, input_size):
        super().__init__()
        self.attn = nn.MultiheadAttention(input_size, 4, batch_first=True)
        self.lstm = nn.LSTM(input_size, 128, num_layers=2, batch_first=True)
        self.regressor = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        """Return a ``[batch]`` tensor for batch-first input ``x``."""
        attn_out, _ = self.attn(x, x, x)
        lstm_out, _ = self.lstm(attn_out)
        # Squeeze only the trailing axis so a batch of one keeps shape [1]
        # instead of collapsing to a 0-dim tensor.
        return self.regressor(lstm_out[:, -1, :]).squeeze(-1)

class StudentModel(nn.Module):
    """Smaller distillation student: 2-head attention, 1-layer LSTM, linear head.

    Args:
        input_size: Number of features per time step; also the attention
            embedding size, so it must be divisible by the 2 heads.
    """

    def __init__(self, input_size):
        super().__init__()
        # Reduced-capacity counterpart of TeacherModel: half the heads, a
        # single LSTM layer of half the width, and one linear layer as head.
        self.attn = nn.MultiheadAttention(input_size, 2, batch_first=True)
        self.lstm = nn.LSTM(input_size, 64, num_layers=1, batch_first=True)
        self.regressor = nn.Linear(64, 1)

    def forward(self, x):
        """Return a ``[batch]`` tensor for batch-first input ``x``."""
        attn_out, _ = self.attn(x, x, x)
        lstm_out, _ = self.lstm(attn_out)
        # Squeeze only the trailing axis so a batch of one keeps shape [1]
        # instead of collapsing to a 0-dim tensor.
        return self.regressor(lstm_out[:, -1, :]).squeeze(-1)

# 2. Build the teacher and the student on the same feature count
teacher = TeacherModel(input_size=len(feature_stocks))
student = StudentModel(input_size=len(feature_stocks))

# Fixed evaluate_model function
def evaluate_model(dataloader, model, dataset):
    """Compute MAE and RMSE of ``model`` in original (denormalised) units.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` tensor batches.
        model: Module to evaluate; it is switched to eval mode.
        dataset: Object whose ``inverse_transform`` undoes target normalisation.

    Returns:
        Tuple ``(mae, rmse)``.
    """
    model.eval()
    # Run on whatever device the model lives on; a parameterless model gives
    # no hint, so its inputs are left where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    predictions = []
    truths = []

    with torch.no_grad():
        for inputs, targets in dataloader:
            if device is not None:
                inputs = inputs.to(device)
            preds = model(inputs)
            # Move to CPU before converting: numpy() rejects non-CPU tensors
            preds_denorm = dataset.inverse_transform(preds.cpu().numpy().flatten())
            y_denorm = dataset.inverse_transform(targets.cpu().numpy().flatten())
            predictions.extend(preds_denorm)
            truths.extend(y_denorm)

    errors = np.array(predictions) - np.array(truths)
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    return mae, rmse

# Fixed training loop with correct function calls
def train_jointly(teacher, student, epochs=100, label_weight=0.3, teacher_weight=0.7, lr=0.001):
    """Train the teacher, then distil it into the student.

    Despite the name, the two phases run one after the other: the teacher is
    fitted on the labels for ``epochs // 2`` epochs, then the student is
    trained for ``epochs // 2`` epochs on a weighted sum of its MSE against
    the labels and its MSE against the teacher's predictions. Progress is
    printed after every epoch using the module-level data loaders.

    Args:
        teacher: Model trained first and then used as the distillation target.
        student: Model trained with teacher guidance.
        epochs: Total epoch budget, split evenly between the two phases.
        label_weight: Weight of the student's loss against the true targets.
        teacher_weight: Weight of the student's loss against teacher outputs.
        lr: Learning rate of both Adam optimisers.
    """
    # Train teacher first
    teacher_optim = torch.optim.Adam(teacher.parameters(), lr=lr)

    print("Training Teacher Model:")
    for epoch in range(epochs//2):
        teacher.train()
        for X, y in train_dataloader:
            teacher_optim.zero_grad()
            preds = teacher(X)
            loss = F.mse_loss(preds, y)
            loss.backward()
            teacher_optim.step()

        # Validation
        teacher.eval()
        val_mae, val_rmse = evaluate_model(valid_dataloader, teacher, stockData)
        print(f"Teacher Epoch {epoch+1} | Val MAE: ${val_mae:.2f} | Val RMSE: ${val_rmse:.2f}")

    # The teacher is only a fixed reference from here on; make sure it is in
    # eval mode even when the loop above ran zero times.
    teacher.eval()

    # Train student with teacher guidance
    student_optim = torch.optim.Adam(student.parameters(), lr=lr)

    print("\nTraining Student with Teacher Guidance:")
    for epoch in range(epochs//2):
        student.train()
        total_loss = 0

        for X, y in train_dataloader:
            student_optim.zero_grad()

            # Teacher outputs are targets only; no gradients flow into the teacher
            with torch.no_grad():
                teacher_preds = teacher(X)

            student_preds = student(X)
            loss = (label_weight*F.mse_loss(student_preds, y)
                    + teacher_weight*F.mse_loss(student_preds, teacher_preds))

            loss.backward()
            student_optim.step()
            total_loss += loss.item()

        # Validation
        student.eval()
        val_mae, val_rmse = evaluate_model(valid_dataloader, student, stockData)
        print(f"Student Epoch {epoch+1} | Loss: {total_loss/len(train_dataloader):.4f} | Val MAE: ${val_mae:.2f} | Val RMSE: ${val_rmse:.2f}")

# 4. Run teacher training followed by student distillation
train_jointly(teacher=teacher, student=student, epochs=100)


Training Teacher Model:
Teacher Epoch 1 | Val MAE: $22.33 | Val RMSE: $28.15
Teacher Epoch 2 | Val MAE: $13.66 | Val RMSE: $17.33
Teacher Epoch 3 | Val MAE: $11.10 | Val RMSE: $14.13
Teacher Epoch 4 | Val MAE: $11.36 | Val RMSE: $14.17
Teacher Epoch 5 | Val MAE: $10.57 | Val RMSE: $13.61
Teacher Epoch 6 | Val MAE: $9.70 | Val RMSE: $12.35
Teacher Epoch 7 | Val MAE: $9.92 | Val RMSE: $12.60
Teacher Epoch 8 | Val MAE: $9.22 | Val RMSE: $11.70
Teacher Epoch 9 | Val MAE: $9.54 | Val RMSE: $12.04
Teacher Epoch 10 | Val MAE: $9.62 | Val RMSE: $11.98
Teacher Epoch 11 | Val MAE: $11.04 | Val RMSE: $13.67
Teacher Epoch 12 | Val MAE: $8.85 | Val RMSE: $11.50
Teacher Epoch 13 | Val MAE: $9.23 | Val RMSE: $11.65
Teacher Epoch 14 | Val MAE: $10.08 | Val RMSE: $12.19
Teacher Epoch 15 | Val MAE: $9.71 | Val RMSE: $12.18
Teacher Epoch 16 | Val MAE: $8.19 | Val RMSE: $10.54
Teacher Epoch 17 | Val MAE: $8.43 | Val RMSE: $10.71
Teacher Epoch 18 | Val MAE: $9.09 | Val RMSE: $11.40
Teacher Epoch 19 | Val M

### Final Student Performance

In [None]:

# 5. Final Evaluation: student error on the held-out test loader
print("\nFinal Student Performance:")
test_mae, test_rmse = evaluate_model(dataloader=test_dataloader, model=student, dataset=stockData)
print(f"Test MAE: ${test_mae:.2f}")
print(f"Test RMSE: ${test_rmse:.2f}")

# 6. Size Comparison
def count_params(model):
    """Return the total number of elements across all parameters of ``model``."""
    # Accumulate explicitly: `sum` is rebound to numpy.sum by the earlier
    # `from numpy import ... sum`, and numpy.sum over a generator is deprecated.
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

# Reduction is the share of teacher parameters removed, i.e. 1 - student/teacher
# (printing the bare ratio under this label understates the shrinkage).
print(f"\nModel Size Reduction: {1 - count_params(student)/count_params(teacher):.1%}")


Final Student Performance:
Test MAE: $6.97
Test RMSE: $8.59

Model Size Reduction: 10.9%


  return sum(p.numel() for p in model.parameters())


### MSFT stock price

In [None]:
def forecast_msft_price(model, dataset, days=5):
    """Forecast the next value from the most recent ``days`` time steps.

    Args:
        model: Module mapping a ``[1, seq_len, features]`` tensor to one value.
        dataset: Object exposing ``X`` (indexed ``[feature, time]``), ``Y`` and
            ``inverse_transform``.
        days: Number of trailing time steps used as the input window.

    Returns:
        Tuple ``(pred_price, last_actual_price)``, both denormalised.

    Raises:
        ValueError: If ``days`` is not between 1 and the number of time steps.
    """
    n_steps = dataset.X.shape[1]
    if days < 1 or days > n_steps:
        raise ValueError(f"days must be between 1 and {n_steps}, got {days}")

    # Take the final `days` columns. len(dataset) counts samples and ends
    # before the tail of the series, so indexing by it would give a stale window.
    x = dataset.X[:, n_steps - days:].T  # Shape: [seq_len, num_features]
    x = torch.tensor(x, dtype=torch.float32).unsqueeze(0)  # Add batch dimension

    # Make prediction
    model.eval()
    with torch.no_grad():
        pred_normalized = model(x)

    # Convert to actual price
    pred_price = dataset.inverse_transform(pred_normalized.item())

    # Reference value: the final entry of the target series
    last_actual_price = dataset.inverse_transform(dataset.Y[-1])

    return pred_price, last_actual_price

# Forecast with the distilled student model
predicted_price, last_price = forecast_msft_price(model=student, dataset=stockData, days=5)

# Absolute and relative difference between the forecast and the reference price
price_delta = predicted_price - last_price
pct_delta = ((predicted_price/last_price)-1)*100

print("\nLatest MSFT Closing Price:", f"${last_price:.2f}")
print("Predicted Next-Day Price:", f"${predicted_price:.2f}")
print("Predicted Change:", f"{price_delta:.2f} ({pct_delta:.2f}%)")

# Validation error of the student, reported alongside the forecast
val_mae, val_rmse = evaluate_model(dataloader=valid_dataloader, model=student, dataset=stockData)
print("\nModel Validation Metrics:")
print(f"Typical Error Range: ±${val_mae:.2f} (MAE)")
print(f"Maximum Likely Error: ${val_rmse:.2f} (RMSE)")


Latest MSFT Closing Price: $378.77
Predicted Next-Day Price: $386.06
Predicted Change: 7.29 (1.93%)

Model Validation Metrics:
Typical Error Range: ±$6.75 (MAE)
Maximum Likely Error: $8.48 (RMSE)


## Question 4: Mamba Network

### model set up

In [None]:
!pip install mamba_ssm causal-conv1d
!pip install mistral-inference



In [None]:
import torch
import torch.nn as nn
from mamba_ssm import Mamba

# Use the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define the MambaStockModel class
class MambaStockModel(nn.Module):
    """Linear projection, one Mamba layer and an MLP head; one value per sequence.

    The module performs no device placement itself; move the whole model with
    ``.to(device)`` after construction so every sub-module lands on the same
    device.

    Args:
        input_size: Number of features per time step.
        hidden_size: Model width passed to Mamba as ``d_model``.
        state_size: Value passed to Mamba as ``d_state``.
    """

    def __init__(self, input_size, hidden_size=64, state_size=16):
        super().__init__()
        # Input projection
        self.proj = nn.Linear(input_size, hidden_size)

        # Mamba layer (state-space model). It is not moved to a device here:
        # moving only some sub-modules would leave `proj` behind on the CPU.
        self.mamba = Mamba(
            d_model=hidden_size,  # Input dimension
            d_state=state_size,   # State expansion factor
            d_conv=4,             # Convolution kernel size
            expand=2              # Expansion factor
        )

        # Prediction head
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 32),
            nn.SiLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return a ``[batch]`` tensor for ``x`` of shape [batch_size, seq_len, num_features]."""
        x = self.proj(x)          # Project to hidden dimension
        x = self.mamba(x)         # Process with Mamba
        # Last timestep prediction; squeeze only the trailing axis so a batch
        # of one keeps shape [1] instead of collapsing to a 0-dim tensor.
        return self.regressor(x[:, -1, :]).squeeze(-1)

# Initialize Mamba model
mamba_model = MambaStockModel(
    input_size=len(feature_stocks),
    hidden_size=128,
    state_size=32
).to(device)

# Modified training setup
optimizer = torch.optim.AdamW(mamba_model.parameters(), lr=0.0002)
criterion = nn.HuberLoss()  # More robust than MSE

# Training loop with Mamba-specific adjustments
for epoch in range(100):
    mamba_model.train()
    epoch_loss = 0

    for X_batch, y_batch in train_dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()

        # Batches are passed to the model unchanged, as (batch_size, seq_len, features)
        outputs = mamba_model(X_batch)
        loss = criterion(outputs, y_batch)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(mamba_model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += loss.item()

    # Validation
    mamba_model.eval()
    with torch.no_grad():
        val_loss = 0
        predictions = []
        truths = []

        for X_val, y_val in valid_dataloader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            preds = mamba_model(X_val)
            val_loss += criterion(preds, y_val).item()

            # Denormalize. atleast_1d keeps extend() working when a batch of
            # one produces a 0-dimensional prediction.
            preds_denorm = stockData.inverse_transform(np.atleast_1d(preds.cpu().numpy()))
            y_denorm = stockData.inverse_transform(np.atleast_1d(y_val.cpu().numpy()))
            predictions.extend(preds_denorm)
            truths.extend(y_denorm)

        errors = np.array(predictions) - np.array(truths)
        mae = np.mean(np.abs(errors))
        rmse = np.sqrt(np.mean(errors**2))

    print(f"Epoch {epoch+1} | Train Loss: {epoch_loss/len(train_dataloader):.4f} | Val MAE: ${mae:.2f} | Val RMSE: ${rmse:.2f}")

Epoch 1 | Train Loss: 0.3861 | Val MAE: $52.10 | Val RMSE: $63.95
Epoch 2 | Train Loss: 0.1930 | Val MAE: $28.62 | Val RMSE: $37.23
Epoch 3 | Train Loss: 0.0512 | Val MAE: $13.65 | Val RMSE: $16.99
Epoch 4 | Train Loss: 0.0152 | Val MAE: $10.05 | Val RMSE: $12.55
Epoch 5 | Train Loss: 0.0084 | Val MAE: $7.97 | Val RMSE: $9.66
Epoch 6 | Train Loss: 0.0059 | Val MAE: $6.84 | Val RMSE: $8.32
Epoch 7 | Train Loss: 0.0048 | Val MAE: $6.39 | Val RMSE: $7.83
Epoch 8 | Train Loss: 0.0046 | Val MAE: $6.23 | Val RMSE: $7.57
Epoch 9 | Train Loss: 0.0040 | Val MAE: $5.91 | Val RMSE: $7.13
Epoch 10 | Train Loss: 0.0037 | Val MAE: $5.66 | Val RMSE: $6.94
Epoch 11 | Train Loss: 0.0034 | Val MAE: $5.51 | Val RMSE: $6.73
Epoch 12 | Train Loss: 0.0032 | Val MAE: $5.45 | Val RMSE: $6.65
Epoch 13 | Train Loss: 0.0031 | Val MAE: $5.31 | Val RMSE: $6.59
Epoch 14 | Train Loss: 0.0030 | Val MAE: $5.48 | Val RMSE: $6.67
Epoch 15 | Train Loss: 0.0029 | Val MAE: $5.54 | Val RMSE: $6.70
Epoch 16 | Train Loss: 0.0

In [None]:
def mamba_forecast(model, dataset, days=32):
    """Forecast the next value from the most recent ``days`` time steps.

    Args:
        model: Module mapping a ``[1, seq_len, features]`` tensor to one value.
        dataset: Object exposing ``X`` (indexed ``[feature, time]``), ``Y`` and
            ``inverse_transform``.
        days: Number of trailing time steps used as the input window.

    Returns:
        Tuple ``(pred_price, last_price)``, both denormalised.

    Raises:
        ValueError: If ``days`` is below 1 or exceeds the available time steps.
    """
    # Ensure model is in eval mode
    model.eval()
    # Build the input on the model's device (CPU for a parameterless model)
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    n_steps = dataset.X.shape[1]
    if days < 1:
        raise ValueError(f"days must be at least 1, got {days}")
    if n_steps < days:
        raise ValueError(f"Need at least {days} days of historical data")

    # Take the final `days` columns. len(dataset) counts samples and ends
    # before the tail of the series, so indexing by it would give a stale window.
    x = dataset.X[:, n_steps - days:].T  # [seq_len, num_features]
    x = torch.tensor(x, dtype=torch.float32, device=device).unsqueeze(0)  # Add batch dim

    # Make prediction
    with torch.no_grad():
        pred_normalized = model(x)

    # Convert to actual price
    pred_price = dataset.inverse_transform(pred_normalized.cpu().item())

    # Reference value: the final entry of the target series
    last_price = dataset.inverse_transform(dataset.Y[-1])

    return pred_price, last_price

# Modified evaluate_model with device handling
def evaluate_model(dataloader, model, dataset):
    """Return ``(mae, rmse)`` of ``model`` in original (denormalised) units.

    Batches are moved to the device of the model's first parameter before the
    forward pass; results are brought back to the CPU for the metrics.

    Args:
        dataloader: Iterable of ``(inputs, targets)`` tensor batches.
        model: Module to evaluate; it is switched to eval mode.
        dataset: Object whose ``inverse_transform`` undoes target normalisation.
    """
    model.eval()
    device = next(model.parameters()).device
    predictions, truths = [], []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)

            # numpy conversion needs CPU tensors
            flat_preds = outputs.cpu().numpy().flatten()
            flat_targets = targets.cpu().numpy().flatten()

            predictions.extend(dataset.inverse_transform(flat_preds))
            truths.extend(dataset.inverse_transform(flat_targets))

    # Element-wise error shared by both metrics
    errors = np.array(predictions) - np.array(truths)
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    return mae, rmse

# Get prediction
try:
    pred_price, last_price = mamba_forecast(mamba_model, stockData, days=5)

    # Absolute and relative difference between forecast and reference price
    change = pred_price - last_price
    pct_change = (pred_price/last_price-1)*100

    print("\n📈 MSFT Stock Forecast using Mamba Network")
    print(f"│ Last Closing Price:   ${last_price:.2f}")
    print(f"│ Predicted Next Close: ${pred_price:.2f}")
    # The `+` format flag prints the real sign; a hard-coded "+" would render
    # a predicted drop as "+-x.xx".
    print(f"│ Expected Change:      {change:+.2f} ({pct_change:+.2f}%)")

    # Show model accuracy
    val_mae, val_rmse = evaluate_model(valid_dataloader, mamba_model, stockData)
    print("\n📊 Model Validation Metrics")
    print(f"│ Typical Error (MAE):  ±${val_mae:.2f}")
    print(f"│ Max Likely Error:     ${val_rmse:.2f} (RMSE)")

except ValueError as e:
    print(f"Forecast error: {e}")


📈 MSFT Stock Forecast using Mamba Network
│ Last Closing Price:   $378.77
│ Predicted Next Close: $389.99
│ Expected Change:      +11.22 (+2.96%)

📊 Model Validation Metrics
│ Typical Error (MAE):  ±$4.42
│ Max Likely Error:     $5.75 (RMSE)
