In [9]:
import os
import sys

# notebook's parent directory path
module_path = os.path.abspath('..')

# add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)

# Graph Classification Demo

Demo classifying the work status of day-long schedules using node lables (activity and zone).

In [10]:
from pathlib import Path

from torch import stack, cat, nn
from torch_geometric.loader import DataLoader
from ntsx.nx_to_torch import nx_to_torch_geo
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool, GCNConv

from ntsx import graph_ops, nts_to_nx
from ntsx import read_nts
from ntsx.models.models import GCNGraphLabeller, GATGraphLabeller
from ntsx.models.embed import MultiTokenEmbedSum
from ntsx.encoders.trip_encoder import TripEncoder
from ntsx.encoders.table_encoder import TableTokeniser

In [11]:
# load dummy data (synthesised from UK NTS)

dir = Path("data/dummyNTS/")
trips_path = dir / "trips.tab"
attributes_path = dir / "individuals.tab"
hhs_path = dir / "households.tab"

years = [2021]

write_dir = Path("tmp")
write_dir.mkdir(exist_ok=True)

In [12]:
# load data from disk
trips, labels = read_nts.load_nts(
    trips_path, attributes_path, hhs_path, years=years
)

# assign human readable values to the labels
labels = read_nts.label_mapping(labels)

# initaite the encoders
label_encoder = TableTokeniser(labels, verbose=False)
trip_encoder = TripEncoder(trips)

display(labels[["work_status"]].head())
display(trips.head())

HIDs in people and households do not match, attempting to fix...
Fixed: People 6 -> 7, HHs 5 -> 5


Unnamed: 0_level_0,work_status
iid,Unnamed: 1_level_1
1,unemployed
2,employed
3,unemployed
4,unemployed
5,employed


Unnamed: 0,tid,year,day,iid,hid,seq,mode,oact,dact,freq,tst,tet,ozone,dzone,did,pid
0,1,2021,2,1,1,1,car,home,social,0.989618,675,683,7,7,0,1_1
1,2,2021,2,1,1,2,car,social,other,1.002945,720,735,7,7,0,1_1
2,3,2021,2,1,1,3,car,other,social,0.989618,770,780,7,7,0,1_1
3,4,2021,2,1,1,4,taxi,social,home,0.989618,1110,1130,7,7,0,1_1
4,5,2021,3,1,1,1,car,home,social,0.999891,760,770,7,7,1,1_1


In [13]:
# first encode the trips and lables tables
trips_encoded = trip_encoder.encode_trips_table(trips)
print(f"Activity mapping: {trip_encoder.encoders["oact"].mapping}")

labels_encoded = label_encoder.encode_table(labels)

0        car
1        car
2        car
3       taxi
4        car
       ...  
144      car
145      car
146      car
147    train
148    train
Name: mode, Length: 149, dtype: object
0        home
1      social
2       other
3      social
4        home
        ...  
144      shop
145      home
146    social
147      home
148      work
Name: oact, Length: 149, dtype: object
0      social
1       other
2      social
3        home
4      social
        ...  
144      home
145    social
146      home
147      work
148      home
Name: dact, Length: 149, dtype: object
0       2
1       2
2       2
3       2
4       3
       ..
144    47
145    47
146    47
147    49
148    49
Name: day, Length: 149, dtype: int64
0       675
1       720
2       770
3      1110
4       760
       ... 
144     900
145    1013
146    1306
147     495
148    1052
Name: tst, Length: 149, dtype: int64
0       683
1       735
2       780
3      1130
4       770
       ... 
144     906
145    1023
146    1312
147     

In [14]:
# then build individuals and then days graphs from the trips table, note that we only merge on home (2)
individuals = nts_to_nx.to_individuals_nx(
    trips_encoded, attribute_data=labels_encoded
)
days = []
for ind in individuals:
    g = graph_ops.anchor_activities(ind, [2])
    g = graph_ops.merge_similar(g, duration_tolerance=0.2)

    # now we can create a graph for each day
    indiv_days = [d for _, d in graph_ops.iter_days(g, stop=None)]
    days.extend(indiv_days)

# now we can create a graph dataset
dataset = nx_to_torch_geo(days)

# finally we can create a dataloader
loader = DataLoader(dataset, batch_size=16, shuffle=True)
for data in loader:
    print(data)

DataBatch(edge_index=[2, 67], act=[59], location=[59], duration=[67], day=[16], tst=[67], tet=[67], travel=[67], iid=[16], age=[16], gender=[16], ethnicity=[16], education=[16], license=[16], car_access=[16], work_status=[16], year=[16], area=[16], income=[16], hh_size=[16], hh_composition=[16], hh_children=[16], hh_cars=[16], hh_bikes=[16], hh_motorcycles=[16], num_nodes=59, batch=[59], ptr=[17])
DataBatch(edge_index=[2, 64], act=[58], location=[58], duration=[64], day=[16], tst=[64], tet=[64], travel=[64], iid=[16], age=[16], gender=[16], ethnicity=[16], education=[16], license=[16], car_access=[16], work_status=[16], year=[16], area=[16], income=[16], hh_size=[16], hh_composition=[16], hh_children=[16], hh_cars=[16], hh_bikes=[16], hh_motorcycles=[16], num_nodes=58, batch=[58], ptr=[17])
DataBatch(edge_index=[2, 18], act=[18], location=[18], duration=[18], day=[7], tst=[18], tet=[18], travel=[18], iid=[7], age=[7], gender=[7], ethnicity=[7], education=[7], license=[7], car_access=[7

In [15]:
# train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

node_embed_sizes = [
    trip_encoder.embed_sizes()["oact"],
    trip_encoder.embed_sizes()["ozone"],
]
target_size = label_encoder.embed_sizes()["work_status"]

model = GCNGraphLabeller(
    node_embed_sizes=node_embed_sizes, target_size=target_size, hidden_size=32
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-3)

model.train()
for epoch in range(10):
    for data in loader:
        data.to(device)
        optimizer.zero_grad()
        out = model(data)
        y = data.work_status
        loss = F.nll_loss(out, y)
        preds = out.argmax(dim=1)
        correct = (preds == y).sum().item()
        acc = correct / len(y)
        print(f"Epoch {epoch}: Loss {loss.item():.4f}. Accuracy: {acc:.4f}")
        loss.backward()
        optimizer.step()

Epoch 0: Loss 0.7464. Accuracy: 0.2500
Epoch 0: Loss 0.6310. Accuracy: 0.7500
Epoch 0: Loss 0.5517. Accuracy: 0.7143
Epoch 1: Loss 0.5789. Accuracy: 0.8125
Epoch 1: Loss 0.6920. Accuracy: 0.5625
Epoch 1: Loss 0.5669. Accuracy: 0.7143
Epoch 2: Loss 0.6180. Accuracy: 0.6875
Epoch 2: Loss 0.5157. Accuracy: 0.8125
Epoch 2: Loss 0.3930. Accuracy: 0.8571
Epoch 3: Loss 0.5071. Accuracy: 0.6875
Epoch 3: Loss 0.6151. Accuracy: 0.7500
Epoch 3: Loss 0.5650. Accuracy: 0.7143
Epoch 4: Loss 0.3647. Accuracy: 0.9375
Epoch 4: Loss 0.6442. Accuracy: 0.6250
Epoch 4: Loss 0.4808. Accuracy: 0.8571
Epoch 5: Loss 0.5635. Accuracy: 0.6875
Epoch 5: Loss 0.7138. Accuracy: 0.5000
Epoch 5: Loss 0.2678. Accuracy: 1.0000
Epoch 6: Loss 0.3574. Accuracy: 0.8125
Epoch 6: Loss 0.6045. Accuracy: 0.6250
Epoch 6: Loss 0.4372. Accuracy: 0.7143
Epoch 7: Loss 0.3088. Accuracy: 0.9375
Epoch 7: Loss 0.5170. Accuracy: 0.7500
Epoch 7: Loss 0.3337. Accuracy: 0.8571
Epoch 8: Loss 0.3905. Accuracy: 0.8750
Epoch 8: Loss 0.4022. Acc

In [16]:
# train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

node_embed_sizes = [
    trip_encoder.embed_sizes()["oact"],
    trip_encoder.embed_sizes()["ozone"],
]
edge_embed_sizes = [trip_encoder.embed_sizes()["mode"]]

target_size = label_encoder.embed_sizes()["work_status"]

model = GATGraphLabeller(
    node_embed_sizes=node_embed_sizes,
    edge_embed_sizes=edge_embed_sizes,
    target_size=target_size,
    hidden_size=32,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-3)

model.train()
for epoch in range(10):
    for data in loader:
        data.to(device)
        optimizer.zero_grad()
        out = model(data)
        y = data.work_status
        loss = F.nll_loss(out, y)
        preds = out.argmax(dim=1)
        correct = (preds == y).sum().item()
        acc = correct / len(y)
        print(f"Epoch {epoch}: Loss {loss.item():.4f}. Accuracy: {acc:.4f}")
        loss.backward()
        optimizer.step()

Epoch 0: Loss 0.7302. Accuracy: 0.5000
Epoch 0: Loss 0.7852. Accuracy: 0.6250
Epoch 0: Loss 0.6476. Accuracy: 0.5714
Epoch 1: Loss 0.8892. Accuracy: 0.4375
Epoch 1: Loss 0.4691. Accuracy: 0.8750
Epoch 1: Loss 0.8000. Accuracy: 0.5714
Epoch 2: Loss 0.6960. Accuracy: 0.5625
Epoch 2: Loss 0.5086. Accuracy: 0.6875
Epoch 2: Loss 0.5554. Accuracy: 0.7143
Epoch 3: Loss 0.4289. Accuracy: 0.7500
Epoch 3: Loss 0.8163. Accuracy: 0.5000
Epoch 3: Loss 0.5591. Accuracy: 0.5714
Epoch 4: Loss 0.4607. Accuracy: 0.8125
Epoch 4: Loss 0.4761. Accuracy: 0.8125
Epoch 4: Loss 0.8089. Accuracy: 0.5714
Epoch 5: Loss 0.5399. Accuracy: 0.8125
Epoch 5: Loss 0.4284. Accuracy: 0.7500
Epoch 5: Loss 0.4224. Accuracy: 1.0000
Epoch 6: Loss 0.4611. Accuracy: 0.8125
Epoch 6: Loss 0.4177. Accuracy: 0.8750
Epoch 6: Loss 0.2843. Accuracy: 0.8571
Epoch 7: Loss 0.4042. Accuracy: 0.8750
Epoch 7: Loss 0.4418. Accuracy: 0.7500
Epoch 7: Loss 0.4205. Accuracy: 0.7143
Epoch 8: Loss 0.4633. Accuracy: 0.8125
Epoch 8: Loss 0.4116. Acc