In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [17]:
df = pd.read_csv("/content/drive/MyDrive/bus_sample 2.csv")

In [18]:
df.head()

Unnamed: 0,Datetime,DirectionRef,PublishedLineName,NextStopPointName,ArrivalProximityText,ExpectedArrivalTime
0,2017-08-01 00:01:03,0.0,0.765432,0.925,0.8,0.119565
1,2017-08-01 00:00:52,0.0,0.092593,0.286364,0.9,0.347826
2,2017-08-01 00:01:18,1.0,0.845679,0.020455,1.0,0.01087
3,2017-08-01 00:01:05,0.0,0.635802,0.575,0.7,0.967391
4,2017-08-01 00:01:05,0.0,0.635802,0.575,0.7,0.967391


In [19]:
df.shape

(500, 6)

In [20]:
df.dtypes

Unnamed: 0,0
Datetime,object
DirectionRef,float64
PublishedLineName,float64
NextStopPointName,float64
ArrivalProximityText,float64
ExpectedArrivalTime,float64


In [21]:
!pip install torch
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install torch-geometric


Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu113.html
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cu113.html


In [22]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

def create_edge_index_from_groups(df):
    """
    Create edge index where edges are only between nodes with the same PublishedLineName.
    """
    import torch
    # Dictionary to hold node indices for each PublishedLineName
    line_groups = df.groupby('PublishedLineName').apply(lambda x: x.index.tolist()).to_dict()

    # Create edge index list
    edge_index_list = []

    for nodes in line_groups.values():
        # Create a fully connected subgraph for nodes in this group
        if len(nodes) > 1:
            edges = [[i, j] for i in nodes for j in nodes if i != j]
            edge_index_list.extend(edges)

    # Convert to tensor
    # Ensure the indices in edge_index_list are within the range of the number of nodes in the training data
    num_nodes = len(df)
    edge_index_list = [[i % num_nodes, j % num_nodes] for i, j in edge_index_list]

    edge_index = torch.tensor(edge_index_list, dtype=torch.long).t().contiguous()

    return edge_index

# Example: Assuming `train` is your DataFrame with node features
features = train.drop(['ExpectedArrivalTime', 'Datetime', 'PublishedLineName'], axis=1).values
target = train['ExpectedArrivalTime'].values
line_names = train['PublishedLineName'].values

# Convert the features and target to PyTorch tensors
features = torch.tensor(features, dtype=torch.float)
target = torch.tensor(target, dtype=torch.float)

# Create edge index based on the PublishedLineName column
edge_index = create_edge_index_from_groups(train)

# Create the GNN model
input_dim = features.shape[1]
hidden_dim = 64
output_dim = 1  # Each node will have one output value

class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, hidden_dim)  # Add more layers if necessary
        self.fc = nn.Linear(hidden_dim, output_dim)  # Output dimension for each node

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)  # Forward through additional layers
        x = F.relu(x)
        x = self.fc(x)  # Each node gets its own prediction
        return x

model = GNN(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(features, edge_index)
    loss = criterion(outputs.squeeze(), target)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 0.0610
Epoch [20/100], Loss: 0.0582
Epoch [30/100], Loss: 0.0564
Epoch [40/100], Loss: 0.0553
Epoch [50/100], Loss: 0.0536
Epoch [60/100], Loss: 0.0510
Epoch [70/100], Loss: 0.0500
Epoch [80/100], Loss: 0.0485
Epoch [90/100], Loss: 0.0477
Epoch [100/100], Loss: 0.0474


In [24]:
# Extract features from test data
test_features = test.drop(['ExpectedArrivalTime', 'Datetime', 'PublishedLineName'], axis=1).values
test_features = torch.tensor(test_features, dtype=torch.float)


# Extract target from test data
test_target = test['ExpectedArrivalTime'].values
test_target = torch.tensor(test_target, dtype=torch.float)

import torch
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Define edge index for test data (assuming fully connected)
test_edge_index = torch.tensor([[i, j] for i in range(test_features.shape[0])
                                 for j in range(test_features.shape[0])], dtype=torch.long)
test_edge_index = test_edge_index.t().contiguous()

# Convert test features and target to PyTorch tensors
test_features = torch.tensor(test_features, dtype=torch.float)
test_target = torch.tensor(test_target, dtype=torch.float).view(-1, 1)  # Ensure target is of shape (N, 1)

# Get model predictions for test data
with torch.no_grad():
    test_outputs = model(test_features, test_edge_index)

# Convert tensors to numpy arrays for metric calculation
test_outputs_np = test_outputs.squeeze().numpy()  # Shape (num_nodes,)
test_target_np = test_target.squeeze().numpy()  # Shape (num_nodes,)

# Calculate metrics
mae = mean_absolute_error(test_target_np, test_outputs_np)
mse = mean_squared_error(test_target_np, test_outputs_np)
rmse = np.sqrt(mse)

# Calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    nonzero_indices = y_true != 0  # Avoid division by zero
    return np.mean(np.abs((y_true[nonzero_indices] - y_pred[nonzero_indices]) / y_true[nonzero_indices])) * 100

mape = mean_absolute_percentage_error(test_target_np, test_outputs_np)

print(f'MAE: {mae:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAPE: {mape:.4f}%')



MAE: 0.1889
MSE: 0.0630
RMSE: 0.2510
MAPE: 736.0327%


  test_features = torch.tensor(test_features, dtype=torch.float)
  test_target = torch.tensor(test_target, dtype=torch.float).view(-1, 1)  # Ensure target is of shape (N, 1)


In [25]:
print("Predicted Outputs:")
print(test_outputs.squeeze())
print("\nReal Outputs:")
print(test_target.squeeze())

Predicted Outputs:
tensor([0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552, 0.1552,
        0.1552])

Real Outputs:
tensor([0.0435, 0.2065, 0.0109, 0.0109, 0.2826, 0.0109, 0.0435, 0.010

In [14]:
torch.save(model.state_dict(), 'trained_gnn_model.pth')