In [1]:
import pickle
import random

# Parameters
filename = "cat"
useRatio = 0.0001   # fraction of all journeys to keep
HIDDEN_DIM = 256
SEED = 0            # seed for the journey sub-sampling, so reruns pick the same subset

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load journey/graph pickles that come from a trusted source.

# open file ../data/{filename}_journeys.pkl and keep a random subset of journeys
with open(f'../data/{filename}_journeys.pkl', 'rb') as f:
    journeys = pickle.load(f)

# int(len * ratio) truncates to 0 for small files; keep at least one journey
# whenever the file has any, and never ask for more than exist.
num_sampled = min(len(journeys), max(1, int(len(journeys) * useRatio)))
# A dedicated Random instance keeps the sampling reproducible without
# touching the global random state.
journeys = random.Random(SEED).sample(journeys, num_sampled)

# open the graph pkl
with open(f'../../Dominators/graphs/{filename}_combined_graph.pkl', 'rb') as f:
    graph = pickle.load(f)

In [2]:
# Collect the distinct node types.
# Every graph node is scanned (not only the nodes of the first sampled
# journey) because the feature matrix built later looks up the type of every
# node in the graph; a type missing from this set would raise a KeyError there.
types = set()
for node in graph.nodes:
    types.add(graph.nodes[node]['node_type'])

# experiment with single digit features


### for graphsage

In [3]:
# Dense integer ids for node types, in the iteration order of `types`.
type2id = {node_type: idx for idx, node_type in enumerate(types)}

# Dense integer ids for labels, assigned in first-seen order over the graph.
label2id = {}
for node in graph:
    label2id.setdefault(graph.nodes[node]['label'], len(label2id))

# Node ids are 1-based: position in graph iteration order, plus one.
node2id = {node: position for position, node in enumerate(graph, start=1)}
    

# Build the edge list as two parallel lists of 0-based node indices:
# edge_list_dep[k] is the node an edge starts from, edge_list_rev[k] its neighbor.
edge_list_dep = []
edge_list_rev = []

for src in graph.nodes:
    src_idx = node2id[src] - 1          # node2id is 1-based, indices are 0-based
    for dst in graph.neighbors(src):
        edge_list_dep.append(src_idx)
        edge_list_rev.append(node2id[dst] - 1)
        
# One feature row per graph node, in graph iteration order: [type id, label id].
featureMatrix = [
    [type2id[attrs['node_type']], label2id[attrs['label']]]
    for attrs in (graph.nodes[node] for node in graph.nodes)
]

# Width of a feature row (taken from the first node).
num_features = len(featureMatrix[0])

### for lstm

In [4]:
import torch

# Translate every journey into a LongTensor of (1-based) node ids.
journeys_id = [
    torch.tensor([node2id[node] for node in journey], dtype=torch.long)
    for journey in journeys
]

In [5]:
# Turn each journey of node ids into a (journey_length, num_features) float
# tensor of per-node feature rows.
journeys_fm = []

for journey in journeys_id:
    # Ids in journeys_id are 1-based (node2id = position + 1) while
    # featureMatrix rows are 0-based, so shift by one. Indexing with the raw
    # id would pick the *next* node's features and overflow on the last node.
    rows = [featureMatrix[node_id - 1] for node_id in journey.tolist()]
    journeys_fm.append(torch.tensor(rows, dtype=torch.float))

In [6]:
# pad to the same length
from torch.nn.utils.rnn import pad_sequence

# Right-pad every journey with zeros up to the length of the longest one,
# giving batch-first tensors.
traces_x = pad_sequence(journeys_fm, batch_first=True, padding_value=0.0)
traces_y = pad_sequence(journeys_id, batch_first=True, padding_value=0)

traces_x.shape, traces_y.shape

(torch.Size([100, 80, 2]), torch.Size([100, 80]))

In [7]:
# Convert the graph-side Python lists to tensors.
featureMatrix = torch.tensor(featureMatrix, dtype=torch.float)
# Row 0 holds the edge start indices, row 1 the neighbor indices.
edge_list = torch.stack([
    torch.tensor(edge_list_dep, dtype=torch.long),
    torch.tensor(edge_list_rev, dtype=torch.long),
])

# create dataset

In [8]:
# to dataset
from torch.utils.data import Dataset, DataLoader

class TraceDataset(Dataset):
    """Pairs each padded trace of features with its padded trace of targets.

    Item ``idx`` is the tuple ``(traces_x[idx], traces_y[idx])``; the dataset
    length is the length of ``traces_x``.
    """

    def __init__(self, traces_x, traces_y):
        self.traces_x, self.traces_y = traces_x, traces_y

    def __len__(self):
        num_traces = len(self.traces_x)
        return num_traces

    def __getitem__(self, idx):
        features = self.traces_x[idx]
        targets = self.traces_y[idx]
        return features, targets

# Wrap the padded traces in a dataset ...
trace_dataset = TraceDataset(traces_x=traces_x, traces_y=traces_y)

# ... and serve them in shuffled mini-batches of 8 (adjust as necessary).
trace_loader = DataLoader(dataset=trace_dataset, batch_size=8, shuffle=True)


# experiment

In [9]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from tqdm import tqdm

class BiLSTM(torch.nn.Module):
    """Batch-first bidirectional LSTM that returns only the per-step outputs.

    Args:
        in_channels: size of each input step's feature vector.
        hidden_channels: hidden size of each direction.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, in_channels, hidden_channels, num_layers=1):
        super().__init__()
        lstm_config = dict(
            input_size=in_channels,
            hidden_size=hidden_channels,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**lstm_config)

    def forward(self, x):
        """Run the LSTM over ``x`` and drop the final hidden/cell states."""
        per_step_output, _final_states = self.lstm(x)
        return per_step_output
    
class GraphSAGE(torch.nn.Module):
    """Single SAGEConv layer mapping node features to node embeddings.

    Args:
        in_channels: size of each node's input feature vector.
        hidden_channels: size of each node's output embedding.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Apply the convolution to node features ``x`` over ``edge_index``."""
        # Dropout is currently disabled:
        # x = F.dropout(x, p=0.9, training=self.training)
        return self.conv1(x, edge_index)

class CombinedModel(torch.nn.Module):
    """Scores every graph node for every step of a trace.

    A GraphSAGE layer embeds all graph nodes and a bidirectional LSTM encodes
    each trace step; the score of a node at a step is the dot product of the
    step encoding with the node embedding.

    Args:
        in_channels: feature size shared by node features and trace steps.
        hidden_channels: node embedding size. Must be even, because the
            bidirectional LSTM outputs ``2 * (hidden_channels // 2)`` values
            per step and that has to equal the embedding size.

    Raises:
        ValueError: if ``hidden_channels`` is odd.
    """

    def __init__(self, in_channels, hidden_channels):
        super(CombinedModel, self).__init__()
        # Fail early with a clear message instead of a shape error deep in forward().
        if hidden_channels % 2 != 0:
            raise ValueError(
                f"hidden_channels must be even to match the BiLSTM output width, got {hidden_channels}"
            )
        self.graphsage = GraphSAGE(in_channels, hidden_channels)
        self.bilstm = BiLSTM(in_channels, hidden_channels // 2)

    def forward(self, trace, x, edge_index):
        """Return per-step node scores.

        Args:
            trace: batch of trace features fed to the BiLSTM.
            x: node feature matrix fed to GraphSAGE.
            edge_index: graph connectivity fed to GraphSAGE.

        Returns:
            Tensor whose last dimension indexes graph nodes (rows of ``x``) and
            whose leading dimensions are those of the BiLSTM output.
        """
        embeddings = self.graphsage(x, edge_index)      # (num_nodes, hidden)
        out_bilstm = self.bilstm(trace)                 # (..., hidden)
        # matmul broadcasts the 2-D embedding matrix over the batch, which is
        # the same product as expanding it per batch element and calling bmm,
        # without building a per-batch copy of the embeddings.
        return torch.matmul(out_bilstm, embeddings.t())

# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the model on that device and optimise all of its parameters with Adam.
model = CombinedModel(num_features, HIDDEN_DIM)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01, weight_decay=5e-4)

def train(batch, featureMatrix, edge_list):
    """Run one optimisation step on a batch and return its loss.

    Uses the module-level ``model`` and ``optimizer``.

    Args:
        batch: ``(trace_x, trace_y)`` pair. ``trace_y`` holds 1-based node ids
            with 0 marking padded positions.
        featureMatrix: node feature matrix passed to the model.
        edge_list: graph connectivity passed to the model.

    Returns:
        The batch loss as a Python float, averaged over non-padded positions;
        ``0.0`` (and no parameter update) if the batch contains only padding.
    """
    model.train()
    optimizer.zero_grad()

    # Keep every input on the same device as the model (no-op if already there).
    model_device = next(model.parameters()).device
    trace_x, trace_y = batch
    trace_x = trace_x.to(model_device)
    trace_y = trace_y.to(model_device)
    featureMatrix = featureMatrix.to(model_device)
    edge_list = edge_list.to(model_device)

    logits = model(trace_x, featureMatrix, edge_list)
    logits = logits.reshape(-1, logits.size(-1))

    # The model's output columns are 0-based node rows, while trace_y stores
    # 1-based ids with 0 as padding. Shifting by one aligns the classes and
    # turns padding into -1, which the loss is told to ignore.
    targets = trace_y.reshape(-1) - 1
    if not bool((targets >= 0).any()):
        # Nothing to learn from; a mean over zero targets would be NaN.
        return 0.0

    loss = F.cross_entropy(logits, targets, ignore_index=-1)
    loss.backward()
    optimizer.step()
    return loss.item()

# Adjust the test function to handle batches
def test(data_loader, featureMatrix, edge_list):
    model.eval()
    num_correct = 0
    num_questions = 0

    with torch.no_grad():
        for batch in data_loader:
            trace_x, trace_y = batch
            combined = model(trace_x, featureMatrix, edge_list)
            print(combined.shape, trace_y.shape)
            out = combined.transpose(0, 1)
            pred = out.argmax(dim=1)
            num_correct += pred.eq(trace_y).sum().item()
            num_questions += len(trace_y)
    
    return num_correct / num_questions

# Main training loop: one pass over trace_loader per epoch, then evaluation.
for epoch in range(1, 201):
    epoch_loss = 0.0
    num_batches = 0
    for batch in tqdm(trace_loader):
        epoch_loss += train(batch, featureMatrix, edge_list)
        num_batches += 1
    # Report the mean loss over the epoch rather than only the last batch's
    # loss; max() keeps this defined when the loader yields no batches.
    loss = epoch_loss / max(num_batches, 1)
    # NOTE(review): accuracy is measured on trace_loader, the same data used
    # for training, so the "Test Acc" label below is really training accuracy.
    acc = test(trace_loader, featureMatrix, edge_list)
    print(f'Epoch: {epoch}, Loss: {loss:.4f}, Test Acc: {acc:.4f}')


100%|██████████| 4/4 [02:18<00:00, 34.72s/it]


RuntimeError: The size of tensor a (71698) must match the size of tensor b (80) at non-singleton dimension 1