In [1]:
import torch
import pandas as pd
import sqlite3
import numpy as np
import functions

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib import pyplot as plt
from datetime import datetime
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch_geometric.data import  Batch
from torch_geometric.loader import DataLoader


In [2]:
conn = sqlite3.connect('db.db')
data = pd.read_sql_query("SELECT * from test", conn)
data = functions.cleanData(data)
data['MWD_sin'] = np.sin(data['MWD'] * (np.pi/180))
data['MWD_cos'] = np.cos(data['MWD'] * (np.pi/180))
data

Unnamed: 0,datetime,buoy_id,WVHT,MWD,APD,lat,long,MWD_sin,MWD_cos
0,2024-02-15 23:00:00,46001,4.8,138.0,7.7,56.300,-148.018,0.669131,-0.743145
1,2024-02-15 22:00:00,46001,4.7,148.0,7.4,56.300,-148.018,0.529919,-0.848048
2,2024-02-15 22:00:00,46001,4.9,137.0,7.8,56.300,-148.018,0.681998,-0.731354
3,2024-02-15 21:00:00,46001,4.5,142.0,7.7,56.300,-148.018,0.615661,-0.788011
4,2024-02-15 21:00:00,46001,4.6,146.0,7.9,56.300,-148.018,0.559193,-0.829038
...,...,...,...,...,...,...,...,...,...
139274,2024-01-01 02:00:00,51212,1.4,314.0,8.0,21.323,-158.149,-0.719340,0.694658
139275,2024-01-01 01:00:00,51212,1.5,316.0,9.1,21.323,-158.149,-0.694658,0.719340
139276,2024-01-01 01:00:00,51212,1.5,312.0,9.8,21.323,-158.149,-0.743145,0.669131
139277,2024-01-01 00:00:00,51212,1.4,315.0,9.8,21.323,-158.149,-0.707107,0.707107


In [3]:
# encodde MWD (degrees) into sin and cos direction
data['MWD_sin'] = np.sin(data['MWD'] * (np.pi/180))
data['MWD_cos'] = np.cos(data['MWD'] * (np.pi/180))
data['is_present'] = 1

In [4]:
data.columns

Index(['datetime', 'buoy_id', 'WVHT', 'MWD', 'APD', 'lat', 'long', 'MWD_sin',
       'MWD_cos', 'is_present'],
      dtype='object')

In [4]:
data

Unnamed: 0,datetime,buoy_id,WVHT,MWD,APD,lat,long,MWD_sin,MWD_cos,is_present
0,2024-02-15 23:00:00,46001,4.8,138.0,7.7,56.300,-148.018,0.669131,-0.743145,1
1,2024-02-15 22:00:00,46001,4.7,148.0,7.4,56.300,-148.018,0.529919,-0.848048,1
2,2024-02-15 22:00:00,46001,4.9,137.0,7.8,56.300,-148.018,0.681998,-0.731354,1
3,2024-02-15 21:00:00,46001,4.5,142.0,7.7,56.300,-148.018,0.615661,-0.788011,1
4,2024-02-15 21:00:00,46001,4.6,146.0,7.9,56.300,-148.018,0.559193,-0.829038,1
...,...,...,...,...,...,...,...,...,...,...
139274,2024-01-01 02:00:00,51212,1.4,314.0,8.0,21.323,-158.149,-0.719340,0.694658,1
139275,2024-01-01 01:00:00,51212,1.5,316.0,9.1,21.323,-158.149,-0.694658,0.719340,1
139276,2024-01-01 01:00:00,51212,1.5,312.0,9.8,21.323,-158.149,-0.743145,0.669131,1
139277,2024-01-01 00:00:00,51212,1.4,315.0,9.8,21.323,-158.149,-0.707107,0.707107,1


In [5]:
all_buoy_ids = data['buoy_id'].unique()
# Get all unique times
all_times = data['datetime'].unique()

# Iterate through each time
for time in all_times:
    # Get the data for the current time
    time_data = data[data['datetime'] == time]
    
    # Get the missing buoy_ids for the current time
    missing_buoy_ids = np.setdiff1d(all_buoy_ids, time_data['buoy_id'])
    
    # if len(missing_buoy_ids) != 0: 
        # print("there is", len(missing_buoy_ids), " missing buoys at time: ", time, "they are: ", missing_buoy_ids)  

    # Iterate through each missing buoy_id
    for buoy_id in missing_buoy_ids:
        # Find the previous time
        # previous_time = data[data['buoy_id'] == buoy_id]['datetime'].max()
        
        # Get the previous row for the missing buoy_id
        # previous_row = data[(data['datetime'] == previous_time) & (data['buoy_id'] == buoy_id)]
        
        # Create a new row with the previous values and the current time
        # new_row = previous_row.copy()
        # new_row['datetime'] = time
        # Create a new row with the missing buoy_id, current time, and is_present = 0

        new_row = pd.DataFrame({'buoy_id': [buoy_id], 'datetime': [time], 'is_present': [0]})
    
        # Append the new row to the data
        data = pd.concat([data, new_row], ignore_index=True)

In [6]:
# Example DataFrame
# df = pd.DataFrame(your_data)

# Normalize the continuous features
scaler = StandardScaler()
data[['WVHT', 'APD', 'MWD_sin', 'MWD_cos', 'lat', 'long']] = scaler.fit_transform(data[['WVHT', 'APD', 'MWD_sin', 'MWD_cos', 'lat', 'long']])

# Add binary feature for missing data
# Here, you would need a way to identify missing data. Assuming 'WVHT' is null if data is missing
# data['data_present'] = data['WVHT'].notnull().astype(int)

# Assuming missing data is filled with zeros or imputed before this step
data.fillna(0, inplace=True)

# Convert to PyTorch tensors
features = torch.tensor(data[['WVHT', 'APD', 'MWD_sin', 'MWD_cos', 'lat', 'long', 'is_present']].values, dtype=torch.float)
times = data['datetime'].unique()

graphs = []
for time in times:
    time_data = data[data['datetime'] == time]
    node_features = torch.tensor(time_data[['WVHT', 'APD', 'MWD_sin', 'MWD_cos', 'lat', 'long', 'is_present']].values, dtype=torch.float)
    
    # Fully connected graph: all nodes are connected, excluding self-loops
    num_nodes = len(time_data)
    edge_index = [[i, j] for i in range(num_nodes) for j in range(num_nodes) if i != j]
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    
    # Create graph data object
    graph_data = Data(x=node_features, edge_index=edge_index)
    graphs.append(graph_data)

# Now, `graphs` contains a list of Data objects for each time step


In [7]:
len(graphs)

1104

In [9]:
class GraphEmbeddingModel(torch.nn.Module):
    def __init__(self, num_node_features):
        super(GraphEmbeddingModel, self).__init__()
        self.conv1 = GCNConv(num_node_features, 128)
        self.conv2 = GCNConv(128, 64)
        self.fc = Linear(64, 32)  # Embedding size

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        # Global mean pooling to get a graph-level embedding
        x = global_mean_pool(x, torch.zeros(x.size(0), dtype=int).to(x.device))  # Assuming batch size of 1 for simplicity

        x = self.fc(x)
        return x


class SequenceModel(torch.nn.Module):
    def __init__(self, input_size, sequence_length, nhead, num_encoder_layers, dim_feedforward):
        super(SequenceModel, self).__init__()
        self.encoder_layer = TransformerEncoderLayer(d_model=input_size, nhead=nhead, dim_feedforward=dim_feedforward)
        self.transformer_encoder = TransformerEncoder(self.encoder_layer, num_layers=num_encoder_layers)
        self.fc_out = Linear(input_size, input_size)  # Predicting the next graph embedding

    def forward(self, src):
        output = self.transformer_encoder(src)
        output = self.fc_out(output)
        return output

In [10]:
# Assuming graphs is your list of Data objects
train_size = int(len(graphs) * 0.8)
train_graphs, test_graphs = graphs[:train_size], graphs[train_size:]

In [30]:
def create_embedding_sequences(embeddings, sequence_length):
    """
    Create sequences of embeddings for training the sequence model.
    
    :param embeddings: A list of embeddings, where each embedding corresponds to a graph.
    :param sequence_length: The fixed length of each sequence.
    :return: A list of sequences suitable for training/testing the sequence model.
    """
    sequences = []
    for i in range(len(embeddings) - sequence_length):
        sequence = torch.stack(embeddings[i:i+sequence_length])
        sequences.append(torch.Tensor(sequence))
    return torch.stack(sequences)


In [56]:
# Assume graph_model is your GraphEmbeddingModel instance
def graph_to_embedding(data_loader, graph_model, device):
    """
    Process graphs through the GNN model to obtain embeddings.
    
    :param data_loader: DataLoader containing graph Data objects.
    :param graph_model: The trained graph embedding model.
    :param device: The device (CPU or GPU) to perform computation on.
    :return: A list of graph embeddings.
    """
    graph_model.eval()
    embeddings = []
    with torch.no_grad():
        for data in data_loader:
            data = data.to(device)
            embedding = graph_model(data)
            embeddings.extend(embedding.detach().cpu())
    return embeddings

# Create a DataLoader for your graphs
batch_size = 32 # Adjust based on your system's capability
train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
print(len(train_graphs))

# Convert graphs to embeddings
graph_model = GraphEmbeddingModel(num_node_features=7)  # Assuming 7 features for each node
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_embeddings = graph_to_embedding(train_loader, graph_model, device)
print(len(train_embeddings))

# Now, use create_embedding_sequences to prepare input for the Transformer
sequence_length = 96  # Define the sequence length for your application
sequences = create_embedding_sequences(train_embeddings, sequence_length)


883


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torch_geometric.data.data.Data'>

In [51]:
from torch.utils.data import TensorDataset, DataLoader

# Split sequences into inputs and targets
input_sequences = sequences[:-1]  # All but the last for each sequence
target_sequences = sequences[1:]  # From the second to the end for each sequence

# Create TensorDataset and DataLoader for sequence data
sequence_dataset = TensorDataset(input_sequences, target_sequences)
sequence_loader = DataLoader(sequence_dataset, batch_size=batch_size, shuffle=True)

# Now, you can iterate over sequence_loader to train your Transformer model


In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
graph_model = GraphEmbeddingModel(num_node_features=7).to(device)  # 7 node features as per your dataset
sequence_model = SequenceModel(input_size=32, sequence_length=10, nhead=4, num_encoder_layers=3, dim_feedforward=128).to(device)  # Adjust parameters as needed

optimizer = torch.optim.Adam(list(graph_model.parameters()) + list(sequence_model.parameters()), lr=0.001)
criterion = torch.nn.MSELoss()  # Assuming you're predicting continuous values

# Example training loop for one epoch
graph_model.train()
sequence_model.train()
sequence_length = 96 # 4 days
# for data in train_graphs:  # You need to adapt this loop to handle sequences properly
embedding_sequence = create_embedding_sequences(train_embeddings, sequence_length + 1)

# for i in range(len(train_graphs) - sequence_length):
for i, seq in enumerate(embedding_sequence[:-1]):
    # data = data.to(device)
    optimizer.zero_grad()
    #graph_embeddings = [graph_model(x) for x in train_graphs[i:i+sequence_length]]
    # graph_embeddings = torch.stack(graph_embeddings)  

    graph_sequence = embedding_sequence[i]
    target_sequence = embedding_sequence[i+1]

    # sequence_of_embeddings = create_embedding_sequences(graph_embeddings, sequence_length)
    # target_embeddings = sequence_of_embeddings[1:]  # Assuming you're predicting the next embedding in the sequence

    # Here, you should collect a sequence of embeddings and then pass them through the sequence model
    # output = sequence_model(sequence_of_embeddings)
    output = sequence_model(graph_sequence)
    loss = criterion(output, target_sequence)
    loss.backward()
    optimizer.step()

# Similarly, set up your evaluation loop, ensuring to track and evaluate the loss over time




In [52]:
# Now, you can iterate over sequence_loader to train your Transformer model
for epoch in range(10):
    for i, (inputs, targets) in enumerate(sequence_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        output = sequence_model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch+1}/{10}], Step [{i+1}/{len(sequence_loader)}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [1/786], Loss: 0.0016
Epoch [1/10], Step [2/786], Loss: 0.0016
Epoch [1/10], Step [3/786], Loss: 0.0018
Epoch [1/10], Step [4/786], Loss: 0.0015
Epoch [1/10], Step [5/786], Loss: 0.0015
Epoch [1/10], Step [6/786], Loss: 0.0014
Epoch [1/10], Step [7/786], Loss: 0.0014
Epoch [1/10], Step [8/786], Loss: 0.0014
Epoch [1/10], Step [9/786], Loss: 0.0012
Epoch [1/10], Step [10/786], Loss: 0.0013
Epoch [1/10], Step [11/786], Loss: 0.0011
Epoch [1/10], Step [12/786], Loss: 0.0013
Epoch [1/10], Step [13/786], Loss: 0.0012
Epoch [1/10], Step [14/786], Loss: 0.0012
Epoch [1/10], Step [15/786], Loss: 0.0011
Epoch [1/10], Step [16/786], Loss: 0.0011
Epoch [1/10], Step [17/786], Loss: 0.0011
Epoch [1/10], Step [18/786], Loss: 0.0011
Epoch [1/10], Step [19/786], Loss: 0.0010
Epoch [1/10], Step [20/786], Loss: 0.0010
Epoch [1/10], Step [21/786], Loss: 0.0010
Epoch [1/10], Step [22/786], Loss: 0.0010
Epoch [1/10], Step [23/786], Loss: 0.0011
Epoch [1/10], Step [24/786], Loss: 0.0010
E