In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
from scipy.stats import pearsonr
from sklearn.metrics import mutual_info_score
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch.nn as nn
import torch
from torch_geometric.data import Data
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import seaborn as sns





# Underlying SCM

In [2]:
# {
# 0: x_1 [], 
# 1: x_2 [((0, 0), coeff, lin_f), ((5, 0), coeff, lin_f)], 
# 2: M   [((1, 0), coeff, lin_f), ((5, 0), coeff, lin_f)],
# 3: Y   [((1, 0), coeff, lin_f), ((2, 0), coeff, lin_f), ((6, 0), coeff, lin_f), ((7, 0), coeff, lin_f)],
# 4: Z_1 [((5, 0), coeff, lin_f), ((7, 0), coeff, lin_f)], 
# 5: Z_2 [], 
# 6: Z_3 [],
# 7: [],
# }

# Loading data

In [3]:
df = pd.read_pickle('intervention_data2.pkl')
# df = pd.read_pickle('lin_df.pkl')
df.head(3)

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7
0,0.0,0.0,0.248834,-0.087867,-1.288305,-1.123094,0.973667,
1,0.0,0.0,0.929797,-0.160463,1.274541,0.237258,-0.501115,
2,0.0,0.0,-1.187943,0.356178,-0.912174,-1.018844,0.055279,


# GNN

In [4]:

class TimeSeriesGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TimeSeriesGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        
        # Only predict for x0
        x0_pred = self.linear(x[0].unsqueeze(0))
        
        return x0_pred




def lin_f(x):
    return x

coeff = 0.5
links_coeffs = {
    0: [], 
    1: [((0, 0), coeff, lin_f), ((5, 0), coeff, lin_f)], 
    2: [((1, 0), coeff, lin_f), ((5, 0), coeff, lin_f)],
    3: [((1, 0), coeff, lin_f), ((2, 0), coeff, lin_f), ((6, 0), coeff, lin_f), ((7, 0), coeff, lin_f)],
    4: [((5, 0), coeff, lin_f), ((7, 0), coeff, lin_f)], 
    5: [],
    6: [],
    7: [],
}

def prepare_data(df, num_timesteps=None, max_lag=1):
    if num_timesteps is None or num_timesteps > len(df):
        num_timesteps = len(df)

    graphs = []
    for t in range(max_lag, num_timesteps):
        try:
            # Create a list to hold features for each variable
            features = []
            
            # Prepare features for each variable
            for var in range(8):  # 8 variables in total
                var_features = [df.loc[t, f'x{var}']]  # Current value
                for link in links_coeffs[var]:
                    source_var, lag = link[0]
                    if t - lag >= 0:
                        var_features.append(df.loc[t - lag, f'x{source_var}'])
                features.append(var_features)
            
            # Convert to tensor, padding shorter sequences with zeros
            max_features = max(len(f) for f in features)
            x = torch.tensor([f + [0] * (max_features - len(f)) for f in features], dtype=torch.float)
            
            # Create edge_index based on links_coeffs
            edge_index = []
            for target, sources in links_coeffs.items():
                for source, _, _ in sources:
                    edge_index.append([source[0], target])
            
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
            
            # Use all variables as targets for simplicity
            y = torch.tensor([df.loc[t+1, f'x3']], dtype=torch.float)
            
            graphs.append(Data(x=x, edge_index=edge_index, y=y))
            
            # Debugging: print the shape of x for the first few graphs
            if len(graphs) <= 5:
                print(f"Graph {len(graphs)}: x shape = {x.shape}")
            
        except KeyError as e:
            print(f"KeyError at timestep {t}: {e}")
            print("Skipping this timestep.")
            continue
        except Exception as e:
            print(f"Unexpected error at timestep {t}: {e}")
            print("DataFrame at this timestep:")
            print(df.loc[t-max_lag:t+1])
            raise
    
    return graphs



graphs = prepare_data(df)

# Split data into train and test sets
train_graphs = graphs[:-100]
test_graphs = graphs[-100:]

# Initialize the model
model = TimeSeriesGNN(input_dim=5, hidden_dim=64, output_dim=1)

# Training loop (simplified)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(100):
    losses_ = []
    for graph in train_graphs:
        optimizer.zero_grad()
        out = model(graph)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()
        losses_.append(loss.detach())
    print(f'epoch {epoch} __ loss {np.mean(losses_)}')

# Evaluation
model.eval()
predictions = []
true_values = []

with torch.no_grad():
    for graph in test_graphs:
        pred = model(graph)
        predictions.append(pred.item())
        true_values.append(graph.y.item())



Graph 1: x shape = torch.Size([8, 5])
Graph 2: x shape = torch.Size([8, 5])
Graph 3: x shape = torch.Size([8, 5])
Graph 4: x shape = torch.Size([8, 5])
Graph 5: x shape = torch.Size([8, 5])
KeyError at timestep 9999: 10000
Skipping this timestep.


RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x5 and 8x64)

In [None]:
df.head(1)

# Accuracy metrics

In [None]:

def calculate_smape(true_values, predictions):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    
    :param true_values: List or array of actual values
    :param predictions: List or array of predicted values
    :return: SMAPE value
    """
    true_values = np.array(true_values)
    predictions = np.array(predictions)
    
    # Avoid division by zero
    denominator = np.abs(true_values) + np.abs(predictions)
    
    # If both true and predicted are zero, consider it as a perfect prediction
    zero_indices = denominator == 0
    denominator[zero_indices] = 1  # Temporarily set to 1 to avoid division by zero
    
    smape = np.mean(2 * np.abs(predictions - true_values) / denominator)
    
    # Adjust for cases where both true and predicted were zero
    smape = smape * (1 - zero_indices.mean())
    
    return smape * 100  # Convert to percentage

# Usage:
smape = calculate_smape(true_values, predictions)
print(f"SMAPE: {smape:.2f}%")

# Plotting

In [None]:

def plot_true_vs_predicted(true_values, predictions):
    plt.figure(figsize=(12, 6))
    plt.plot(true_values, label='True Values', color='blue')
    plt.plot(predictions, label='Predictions', color='red')
    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.title('True Values vs Predictions')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

plot_true_vs_predicted(true_values, predictions)