In [1]:
# tutorial from: https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
# https://github.com/khuangaf/PyTorch-Geometric-YooChoose
# 
# 

In [1]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data

# Part 1 - Naive application

In [2]:
# Node inputs and outputs.
# Feature matrix: one row per node, two features per node.
x = torch.tensor(
    [
        [2, 1],
        [5, 6],
        [3, 7],
        [12, 0],
    ],
    dtype=torch.float,
)
# One target value per node, in the same node order as the rows of x.
y = torch.tensor([0, 1, 0, 1], dtype=torch.float)

In [3]:
# Define the links of the graph.
# The graph is directed: each pair below is (source node, target node).
# The pair list is transposed into the [2, num_edges] layout
# (row 0 = sources, row 1 = targets).
edge_index = torch.tensor(
    [(0, 1), (1, 0), (2, 1), (0, 3), (3, 2)],
    dtype=torch.long,
).t().contiguous()

In [4]:
# Create the graph data object.
# sw: this is the basic format for preparing a data set: node features,
# node targets and the edge list bundled into one Data object.
data = Data(
    x=x,
    edge_index=edge_index,
    y=y,
)
data

Data(edge_index=[2, 5], x=[4, 2], y=[4])

In [6]:
# Note: InMemoryDataset vs. Dataset.
# The choice depends on the size of the data set: InMemoryDataset is for data that fits entirely in memory.
# Q: How is mini-batching over graphs even possible?
# Q: How can the global (whole-graph) computation in a basic GCN be reconciled with local mini-batches?

In [7]:
# # read the 7z archive and extract its files
# import py7zr
# archive = py7zr.SevenZipFile('./data_shenhao/graph_choose_buy/yoochoose-data.7z', mode='r')
# archive.extractall(path="./data_shenhao/graph_choose_buy/")
# archive.close()

# Part 2. graph data preparation

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Read the click log.
# sw: the basic data set is very similar to many travel data sets.
CLICK_COLUMNS = ['session_id', 'timestamp', 'item_id', 'category']
df = pd.read_csv(
    './data_shenhao/graph_choose_buy/yoochoose-clicks.dat',
    header=None,
    names=CLICK_COLUMNS,
    # NOTE(review): the warning tail left in this cell's saved output looks like
    # pandas' mixed-type DtypeWarning for the category column -- confirm.
    # Reading the column as text gives it a single, well-defined dtype.
    dtype={'category': str},
)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [8]:
# 
# Read the purchase log; the file has no header row, so the column names are supplied here.
buy_df = pd.read_csv(
    './data_shenhao/graph_choose_buy/yoochoose-buys.dat',
    header=None,
    names=['session_id', 'timestamp', 'item_id', 'price', 'quantity'],
)

In [11]:
# 
# Q: What does LabelEncoder do? A: it maps the discrete item ids onto 0..N-1.
item_encoder = LabelEncoder()
encoded_item_ids = item_encoder.fit_transform(df['item_id'])
df['item_id'] = encoded_item_ids
df.head()


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,2053,0
1,1,2014-04-07T10:54:09.868Z,2052,0
2,1,2014-04-07T10:54:46.998Z,2054,0
3,1,2014-04-07T10:57:00.306Z,9876,0
4,2,2014-04-07T13:56:37.614Z,19448,0


In [12]:
# Randomly sample a subset of sessions.
# The draw uses its own seeded generator so that a fresh run selects the same sessions again.
NUM_SAMPLED_SESSIONS = 100000
SAMPLING_SEED = 42

unique_session_ids = df.session_id.unique()
sampling_rng = np.random.RandomState(SAMPLING_SEED)
sampled_session_id = sampling_rng.choice(
    unique_session_ids,
    # Never ask for more sessions than exist; choice(..., replace=False) would raise.
    min(NUM_SAMPLED_SESSIONS, len(unique_session_ids)),
    replace=False,
)

# sw: isin() is a good indexing function.
# .copy() makes the subset an independent frame, so adding columns to it later
# does not assign onto a slice of the full click log.
df = df.loc[df.session_id.isin(sampled_session_id)].copy()
df.nunique()  # sw: good summary report function.

session_id    100000
timestamp     356209
item_id        20523
category         111
dtype: int64

In [13]:
# Flag every click whose session also appears in the purchase log.
# sw: imagine it is the relationship between travel activities and travel modes.
session_has_purchase = df['session_id'].isin(buy_df['session_id'])
df['label'] = session_has_purchase
df.head(15)

Unnamed: 0,session_id,timestamp,item_id,category,label
198,62,2014-04-06T15:42:34.618Z,28438,0,False
199,62,2014-04-06T15:43:00.299Z,12957,0,False
200,62,2014-04-06T15:44:51.971Z,41158,0,False
201,62,2014-04-06T15:45:44.613Z,31369,0,False
202,62,2014-04-06T15:45:49.462Z,39937,0,False
203,62,2014-04-06T15:46:27.323Z,29199,0,False
204,62,2014-04-06T15:46:47.147Z,41270,0,False
205,62,2014-04-06T15:47:14.526Z,31812,0,False
206,62,2014-04-06T15:48:36.052Z,41262,0,False
207,62,2014-04-06T15:48:56.201Z,37075,0,False


In [37]:
import torch
from torch_geometric.data import InMemoryDataset
from tqdm import tqdm

In [38]:
# To-Be read carefully...

class YooChooseBinaryDataset(InMemoryDataset):
    """One small directed graph per click session, with one binary target each.

    The graphs are built from the module-level ``df`` click frame (its
    ``session_id``, ``item_id`` and ``label`` columns are read) and are saved
    to ``self.processed_paths[0]``; the constructor then loads that file.

    Args:
        root: Root directory handed to ``InMemoryDataset``.
        transform: Passed through to ``InMemoryDataset`` unchanged.
        pre_transform: Passed through to ``InMemoryDataset`` unchanged.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(YooChooseBinaryDataset, self).__init__(root, transform, pre_transform)
        # sw Q: what is processed_paths[0]?
        # sw A: it is inherited from InMemoryDataset / Dataset. Processing is
        # skipped when the file named in processed_file_names already exists.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared; the clicks come from the in-memory ``df``."""
        return []

    @property
    def processed_file_names(self):
        """Name of the single file that ``process`` writes."""
        return ['yoochoose_click_binary_1M_sess.dataset']

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Turn every session in ``df`` into a ``Data`` graph and save the collated result.

        Per session:
          * nodes are the distinct items clicked, numbered 0..k-1 by a
            session-local ``LabelEncoder``;
          * ``x`` has shape [k, 1] and holds the (global) ``item_id`` of each node;
          * edges connect each click to the next click of the session;
          * ``y`` is the ``label`` value of the session's first row.
        """
        # sw: data is processed here and automatically saved into a folder called processed.
        data_list = []

        # sw: tqdm is a handy way to get a progress bar over the grouped object.
        for session_id, group in tqdm(df.groupby('session_id')):
            encoder = LabelEncoder()
            # Session-local node index of every click, in click order.
            sess_item_id = encoder.fit_transform(group.item_id)

            # classes_ holds the sorted distinct item ids, i.e. entry i is the
            # item behind node i -- exactly the node feature column needed.
            x = torch.as_tensor(encoder.classes_, dtype=torch.long).unsqueeze(1)

            # Stack into one [2, E] array before handing it to torch: building a
            # tensor from a Python list of ndarrays is a very slow path.
            source_nodes = sess_item_id[:-1]
            target_nodes = sess_item_id[1:]
            edge_index = torch.as_tensor(
                np.stack([source_nodes, target_nodes]), dtype=torch.long
            )

            y = torch.FloatTensor([group.label.values[0]])

            # sw: a list of graphs -- every session is reduced to one graph.
            data_list.append(Data(x=x, edge_index=edge_index, y=y))

        # collate() merges the list into one storage object plus slice indices,
        # which is the (data, slices) pair the constructor loads back.
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


In [34]:
# sw: step-by-step walkthrough of YooChooseBinaryDataset.process for a single session. Very INFORMATIVE.
# Every intermediate value is printed so the graph construction can be followed.
grouped = df.groupby('session_id')
# Take only the first (session_id, group) pair of the groupby, then stop iterating.
for session_id, group in tqdm(grouped):
    break
print("Initial session id is {}".format(session_id))
print("Show an individual group in the groupby obj", group)

# Re-encode the item ids of this one session (see the printed array for the result).
sess_item_id = LabelEncoder().fit_transform(group.item_id)
print("Fit transformed item id is {}".format(sess_item_id))

# Drop the index inherited from the full frame so that rows are numbered from 0.
group = group.reset_index(drop=True)
print("reindexed group")
print(group)

group['sess_item_id'] = sess_item_id

# Item ids ordered by their session-local code, with repeated items removed.
node_features = group.loc[group.session_id==session_id,['sess_item_id','item_id']].sort_values('sess_item_id').item_id.drop_duplicates().values
print("node features are", node_features) # sw: note that the node features are simply the item ids.

node_features = torch.LongTensor(node_features).unsqueeze(1)
print("node features are", node_features) # sw: reshape the node features to N x 1.

# Each click (source) is linked to the click that follows it (target).
target_nodes = group.sess_item_id.values[1:]
source_nodes = group.sess_item_id.values[:-1]
print("targeting nodes are ", target_nodes)
print("source nodes are ", source_nodes)

edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)
print("edge index are", edge_index) # sw: note that the edge indices (session-local codes) differ from the item ids stored as node features.

x = node_features
print("Final node features are the item IDs {}".format(x))

# The target is the label of the session's first row.
y = torch.FloatTensor([group.label.values[0]])
print(group.label.values[0])
print("Final targeting variable is {}".format(y))

data = Data(x=x, edge_index=edge_index, y=y)
print("One block of data looks like:", data)


  0%|          | 0/100000 [00:00<?, ?it/s]

Initial session id is 62
Show an individual group in the groupby obj      session_id                 timestamp  item_id category  label
198          62  2014-04-06T15:42:34.618Z    28438        0  False
199          62  2014-04-06T15:43:00.299Z    12957        0  False
200          62  2014-04-06T15:44:51.971Z    41158        0  False
201          62  2014-04-06T15:45:44.613Z    31369        0  False
202          62  2014-04-06T15:45:49.462Z    39937        0  False
203          62  2014-04-06T15:46:27.323Z    29199        0  False
204          62  2014-04-06T15:46:47.147Z    41270        0  False
205          62  2014-04-06T15:47:14.526Z    31812        0  False
206          62  2014-04-06T15:48:36.052Z    41262        0  False
207          62  2014-04-06T15:48:56.201Z    37075        0  False
Fit transformed item id is [1 0 7 3 6 2 9 4 8 5]
reindexed group
   session_id                 timestamp  item_id category  label
0          62  2014-04-06T15:42:34.618Z    28438        0  False




In [39]:
# sw: root is the directory used for data processing.
#! First run costs 30 min
dataset = YooChooseBinaryDataset(root='./data_shenhao/graph_choose_buy/')

In [40]:
# Shuffle once, then split 80% / 10% / 10% into train / validation / test.
# The seed makes the permutation (and therefore the split) repeatable on a fresh run.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)
dataset = dataset.shuffle()

# Integer arithmetic keeps the boundaries exact; for 100000 graphs they are 80000 and 90000.
num_graphs = len(dataset)
train_end = num_graphs * 8 // 10
val_end = num_graphs * 9 // 10

train_dataset = dataset[:train_end]
val_dataset = dataset[train_end:val_end]
test_dataset = dataset[val_end:]
len(train_dataset), len(val_dataset), len(test_dataset)

(80000, 10000, 10000)

In [41]:
from torch_geometric.data import DataLoader

# Every split is served in mini-batches of the same size.
batch_size = 1024
train_loader, val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [42]:
# Item ids are encoded as consecutive integers starting at 0,
# so the largest id plus one is the size of the id range.
largest_item_id = df['item_id'].max()
num_items = largest_item_id + 1
num_items

52707

In [48]:
# sw: analyze the data structure produced by the data loader.
# sw: train_loader holds many IID graphs. Each graph has node attributes, edge attributes, edge index, etc.

# Fetch just the first mini-batch. `data` is bound as well as `d`,
# so both names refer to this batch afterwards.
data = d = next(iter(train_loader))

# Print the batch summary, then its graph-assignment vector, edges, node features and targets.
for field in (d, d.batch, d.edge_index, d.x, d.y):
    print(field)


Batch(batch=[2890], edge_index=[2, 2481], x=[2890, 1], y=[1024])
tensor([   0,    0,    0,  ..., 1022, 1023, 1023])
tensor([[   3,    2,    1,  ..., 2885, 2887, 2889],
        [   2,    1,    2,  ..., 2885, 2887, 2888]])
tensor([[  420],
        [ 2287],
        [20230],
        ...,
        [14040],
        [41981],
        [47208]])
tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [41]:
import torch
from torch.nn import Sequential as Seq, Linear, ReLU
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import remove_self_loops, add_self_loops
class SAGEConv(MessagePassing):
    """Message-passing layer with max aggregation over transformed neighbour features.

    Each neighbour feature goes through ``lin`` + ReLU, the results are
    max-aggregated per node, and the aggregate is concatenated with the node's
    own input feature and passed through ``update_lin`` + ReLU.

    Args:
        in_channels: Width of the incoming node features.
        out_channels: Width of the node features returned by ``forward``.
    """

    def __init__(self, in_channels, out_channels):
        super(SAGEConv, self).__init__(aggr='max')  # "Max" aggregation.
        self.lin = torch.nn.Linear(in_channels, out_channels)
        self.act = torch.nn.ReLU()
        # The update step sees [aggregate (out_channels) | own feature (in_channels)]
        # and must emit out_channels. Projecting back to in_channels only went
        # unnoticed while both widths were equal.
        self.update_lin = torch.nn.Linear(in_channels + out_channels, out_channels, bias=False)
        self.update_act = torch.nn.ReLU()

    def forward(self, x, edge_index):
        """Run one round of message passing.

        Args:
            x: Node features of shape [N, in_channels].
            edge_index: Edge list of shape [2, E].

        Returns:
            Updated node features, one row per node.
        """
        # Normalise self-loops: drop any that exist, then add exactly one per node.
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    def message(self, x_j):
        """Transform the neighbour features ``x_j`` (shape [E, in_channels]) before aggregation."""
        x_j = self.lin(x_j)
        x_j = self.act(x_j)

        return x_j

    def update(self, aggr_out, x):
        """Combine the aggregate ``aggr_out`` (shape [N, out_channels]) with each node's own feature ``x``."""
        new_embedding = torch.cat([aggr_out, x], dim=1)

        new_embedding = self.update_lin(new_embedding)
        new_embedding = self.update_act(new_embedding)

        return new_embedding

In [42]:
embed_dim = 128
from torch_geometric.nn import GraphConv, TopKPooling, GatedGraphConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
import torch.nn.functional as F
class Net(torch.nn.Module):
    """Session-graph classifier: item embedding, three conv + pooling stages, MLP head.

    ``forward`` returns one sigmoid score per graph in the batch.
    """

    def __init__(self):
        super().__init__()

        # Three convolution / top-k pooling stages.
        self.conv1 = SAGEConv(embed_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)

        # One learnable vector per encoded item id.
        self.item_embedding = torch.nn.Embedding(
            num_embeddings=df.item_id.max() + 1,
            embedding_dim=embed_dim,
        )

        # Classification head; its input is the 256-wide [max | mean] readout.
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, 1)
        # The batch-norm layers are registered but not applied in forward().
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()

    @staticmethod
    def _readout(h, batch):
        """Concatenate the per-graph max and mean of the node features ``h``."""
        return torch.cat([gmp(h, batch), gap(h, batch)], dim=1)

    def forward(self, data):
        """Score every graph in the mini-batch ``data``."""
        edge_index = data.edge_index
        batch = data.batch

        # Look up the embedding of each node's item id, then drop the singleton axis.
        h = self.item_embedding(data.x).squeeze(1)

        # Each stage convolves, pools, and contributes one graph-level readout.
        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        readouts = []
        for conv, pool in stages:
            h = F.relu(conv(h, edge_index))
            h, edge_index, _, batch, _, _ = pool(h, edge_index, None, batch)
            readouts.append(self._readout(h, batch))

        first, second, third = readouts
        h = first + second + third

        h = self.act1(self.lin1(h))
        h = self.act2(self.lin2(h))
        h = F.dropout(h, p=0.5, training=self.training)

        return torch.sigmoid(self.lin3(h)).squeeze(1)

In [43]:
# Use the first GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
# Net.forward already applies a sigmoid, so plain BCELoss is the matching criterion.
crit = torch.nn.BCELoss()

In [44]:
def train():
    """Run one optimisation pass over ``train_loader``.

    Returns:
        The mean loss per graph: every batch loss is weighted by the number
        of graphs in that batch and the sum is divided by ``len(train_dataset)``.
    """
    model.train()

    weighted_loss_sum = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        batch_loss = crit(model(batch), batch.y.to(device))
        batch_loss.backward()
        weighted_loss_sum += batch.num_graphs * batch_loss.item()
        optimizer.step()
    return weighted_loss_sum / len(train_dataset)

In [45]:
from sklearn.metrics import roc_auc_score
def evaluate(loader):
    """Return the ROC AUC of ``model`` over all batches yielded by ``loader``.

    The model is put in evaluation mode and gradients are disabled while
    the predictions are collected.
    """
    model.eval()

    batch_scores, batch_targets = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            scores = model(batch)
            batch_scores.append(scores.detach().cpu().numpy())
            batch_targets.append(batch.y.detach().cpu().numpy())

    # Flatten the per-batch arrays into one vector each before scoring.
    return roc_auc_score(np.hstack(batch_targets), np.hstack(batch_scores))

In [47]:
# Train for three epochs and report the AUC on every split after each one.
# (The *_acc names hold ROC AUC values, as returned by evaluate.)
for epoch in range(3):
    loss = train()
    train_acc, val_acc, test_acc = [
        evaluate(loader) for loader in (train_loader, val_loader, test_loader)
    ]
    report = 'Epoch: {:03d}, Loss: {:.5f}, Train Auc: {:.5f}, Val Auc: {:.5f}, Test Auc: {:.5f}'
    print(report.format(epoch, loss, train_acc, val_acc, test_acc))

Epoch: 000, Loss: 0.19731, Train Auc: 0.83172, Val Auc: 0.66619, Test Auc: 0.65998
Epoch: 001, Loss: 0.17814, Train Auc: 0.86945, Val Auc: 0.65645, Test Auc: 0.64241
Epoch: 002, Loss: 0.15777, Train Auc: 0.88363, Val Auc: 0.65822, Test Auc: 0.64234


In [8]:
# Header of a package module (imports plus the download URL) pasted into a cell.
import csv
import os

import pandas as pd

# NOTE(review): the two relative imports below only resolve inside the package this
# code was copied from; run as a notebook cell they raise
# "ValueError: attempted relative import beyond top-level package".
from ..datasets.dataset_base import DatasetBase
from ..utils.constants import (
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)

# Download URL of the YooChoose 7z archive
YOOCHOOSE_URL = "https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z"


ValueError: attempted relative import beyond top-level package