In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

### Data
--- 
- Process Log Returns and Volatility for largest 11 cryptocurrencies
- Volatility is aggregated over a rolling 5 interval period
- Aggregate market data (Using Market Data for largest Coins since smaller ones produce too much noise)

In [70]:
coins = ['bitcoin','ethereum','ripple','binancecoin','solana','dogecoin','staked-ether','cardano','tron','avalanche-2','chainlink']
data = pd.read_csv('./processed_data/data1.csv')
for col in data.columns:
    if col != 'timestamp':
        data[f'{col}_squared_log_returns'] = np.log(data[col] / data[col].shift(1)) ** 2
        data[f'{col}_log_volatility'] = np.log(data[col] / data[col].shift(1)).rolling(5).std()
        
        # Calculate market volatility
data['market_squared_log_returns'] = data[[f'{coin}_squared_log_returns' for coin in coins]].mean(axis=1)
data['market_log_volatility'] = data[[f'{coin}_log_volatility' for coin in coins]].mean(axis=1)
data.dropna(inplace=True)

### Dataset
---

In [71]:
class VolatilityDataset(Dataset):
    def __init__(self, features, target):
        self.features = features
        self.target = target

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]

### Training Data
---
- Features - Selected Coin Log Volatility and Squared Log Returns, Market Log Volatility and Squared Log Returns
- Target - Log_Volatility for Selected Coin
- Training Data = 80%, Testing Data = 20%

In [72]:
coin = 'bitcoin'
features = [data[f'{coin}_log_volatility'].values, data[f'{coin}_squared_log_returns'].values, data['market_log_volatility'].values, data['market_squared_log_returns'].values]
target = [data[f'{coin}_log_volatility'].values]
features = np.array(features).T
target = np.array(target).T

train_size = int(len(features) * 0.8)  
X_train, X_test = features[:train_size], features[train_size:]
y_train, y_test = target[:train_size], target[train_size:]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = VolatilityDataset(X_train_tensor, y_train_tensor)
test_dataset = VolatilityDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

input_size = X_train.shape[1]



### LSTM Neural Network
---
- LSTM (128 -> 64)
- ReLU
- Linear (64 -> 32)
- ReLU
- Lineart (32 -> 1)

In [60]:
import torch.nn as nn
class VolatilityPredictor(nn.Module):
    def __init__(self, input_size):
        super(VolatilityPredictor, self).__init__()
        self.LSTM = nn.LSTM(input_size, 64, batch_first=True)
        self.seq = nn.Sequential(
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
    def forward(self, x):
        x,_ = self.LSTM(x)
        x = self.seq(x)
        return x

### Train Model
---
- Use MSE Loss

In [73]:
model = VolatilityPredictor(input_size)
loss_f = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 100
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for features, target in train_loader:
        optimizer.zero_grad()
        outputs = model(features).squeeze()  
        loss = loss_f(outputs,target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/100, Loss: 0.0008342255357795036
Epoch 2/100, Loss: 1.9514381866465196e-05
Epoch 3/100, Loss: 1.3269973396858188e-05
Epoch 4/100, Loss: 1.1554107912573246e-05
Epoch 5/100, Loss: 1.0923276754578417e-05
Epoch 6/100, Loss: 1.0680203449692722e-05
Epoch 7/100, Loss: 1.0512612286022138e-05
Epoch 8/100, Loss: 1.045835193257526e-05
Epoch 9/100, Loss: 1.0324619952629885e-05
Epoch 10/100, Loss: 1.0340795488260293e-05
Epoch 11/100, Loss: 1.0500204637375448e-05
Epoch 12/100, Loss: 1.0690089098656614e-05
Epoch 13/100, Loss: 1.0808843901295043e-05
Epoch 14/100, Loss: 1.1034061988607842e-05
Epoch 15/100, Loss: 1.1078548172180814e-05
Epoch 16/100, Loss: 1.103612056331348e-05
Epoch 17/100, Loss: 1.1144203889224321e-05
Epoch 18/100, Loss: 1.1306433035143203e-05
Epoch 19/100, Loss: 1.135907813838147e-05
Epoch 20/100, Loss: 1.1245113852434841e-05
Epoch 21/100, Loss: 1.1216053734285809e-05
Epoch 22/100, Loss: 1.1105907757405251e-05
Epoch 23/100, Loss: 1.0871927456589831e-05
Epoch 24/100, Loss: 1.11

### Evaluate Model
---
- Plot Actuals vs Predictions for the testing dataset

In [None]:
from sklearn.metrics import mean_squared_error
with torch.no_grad():
    predictions = []
    actuals = []
    for features, targets in test_loader:
        outputs = model(features).squeeze()
        predictions.append(outputs.numpy())
        actuals.append(targets.numpy())

predictions = np.concatenate(predictions)
actuals = np.concatenate(actuals).T[0]

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(actuals, predictions)
results = pd.DataFrame({
    'actual': actuals,
    'prediction': predictions
})
actuals = results['actual']
predictions = results['prediction']
index = results.index

plt.figure(figsize=(10, 6))
plt.plot(index, actuals, label='Actuals', linestyle='-')
plt.plot(index, predictions, label='Predictions', linestyle='--')

plt.title('Actuals vs Predictions')
plt.xlabel('Index')
plt.ylabel('Values')
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.show()

### Todo
---
- Try different time intervals