In [31]:
import torch 
from torch import Tensor 
print(torch.__version__)
import os 

2.4.0+cu124


#### Heterogeneous graph creation

In [32]:
from torch_geometric.data import download_url, extract_zip

# MovieLens "latest-small" archive hosted by GroupLens.
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Download the archive into ./data (an existing copy is reused), then unpack
# it into the same folder.
archive_path = download_url(url, './data')
extract_zip(archive_path, './data')

# CSV files read by the following cells.
movies_path = './data/ml-latest-small/movies.csv'
ratings_path = './data/ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting ./data\ml-latest-small.zip


In [33]:
import pandas as pd

# Preview the first rows of both raw tables (movies first, then ratings).
for csv_path in (movies_path, ratings_path):
    print(pd.read_csv(csv_path).head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [34]:
# Show only the columns this notebook actually uses from each CSV file.
movie_cols = ["movieId", "genres"]
rating_cols = ["userId", "movieId"]

print('movies.csv')
print(pd.read_csv(movies_path)[movie_cols].head())
print()
print("ratings.csv")
print(pd.read_csv(ratings_path)[rating_cols].head())

movies.csv
   movieId                                       genres
0        1  Adventure|Animation|Children|Comedy|Fantasy
1        2                   Adventure|Children|Fantasy
2        3                               Comedy|Romance
3        4                         Comedy|Drama|Romance
4        5                                       Comedy

ratings.csv
   userId  movieId
0       1        1
1       1        3
2       1        6
3       1       47
4       1       50


In [35]:
# Build the input feature matrix for the movie nodes.

# Read all movies, keyed by their original MovieLens id:
movies_df = pd.read_csv(movies_path, index_col='movieId')
print(len(movies_df)) # 9742 movies

# Turn the pipe-separated genre string into one 0/1 indicator column per genre:
genres = movies_df['genres'].str.get_dummies(sep='|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())

# The genre indicators (as float32) are the movie input features:
movie_feat = torch.from_numpy(genres.to_numpy()).float()
assert tuple(movie_feat.shape) == (9742, 20)  # 9742 movies, 20 genres in total.

9742
         Action  Adventure  Drama  Horror
movieId                                  
1             0          1      0       0
2             0          1      0       0
3             0          0      0       0
4             0          0      1       0
5             0          0      0       0


In [36]:
# Build the lookup tables needed for the edge_index between users and movies

# Step 1: map the raw user and movie ids onto consecutive node indices:
# 610 users (0 --> 609) and 9742 movies (0 --> 9741)

ratings_df = pd.read_csv(ratings_path)
print(ratings_df.head())
print(ratings_df.shape)

# Distinct raw user ids found in ratings.csv:
user_ids = ratings_df['userId'].unique()
print(user_ids.shape)

# Lookup table: raw userId -> index in [0, num_user_nodes)
unique_user_id = pd.DataFrame(data={
    'userId': user_ids,
    'mappedID': pd.RangeIndex(len(user_ids)),
})
print(unique_user_id.head())

print("================================================")

# Lookup table: raw movieId -> index in [0, num_movie_nodes).
# Rows follow movies_df, i.e. the same order as the rows of `movie_feat`.
unique_movie_id = pd.DataFrame(data={
    'movieId': movies_df.index,
    'mappedID': pd.RangeIndex(len(movies_df)),
})
print(unique_movie_id.head())


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
(100836, 4)
(610,)
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4
   movieId  mappedID
0        1         0
1        2         1
2        3         2
3        4         3
4        5         4


In [37]:
# Step 2: translate every userId / movieId in ratings.csv into the consecutive
# node indices defined by the two lookup tables.

# Left merge on the raw id; `validate='many_to_one'` makes pandas raise if the
# lookup table contains a duplicated id (which would duplicate rating rows).
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                           on='userId', how='left', validate='many_to_one')
print(ratings_user_id.shape)  # one row per rating: (userId, mappedID)
# An id missing from the lookup table would become NaN and silently turn the
# index column into floats, so fail loudly instead.
assert ratings_user_id['mappedID'].notna().all(), "unmapped userId in ratings.csv"
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].to_numpy())  # 100836
print(ratings_user_id.shape)

ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            on='movieId', how='left', validate='many_to_one')
assert ratings_movie_id['mappedID'].notna().all(), "unmapped movieId in ratings.csv"
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].to_numpy())
print(ratings_movie_id.shape)

print(ratings_user_id[:10])
print(ratings_movie_id[:10])

(100836, 2)
torch.Size([100836])
torch.Size([100836])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([  0,   2,   5,  43,  46,  62,  89,  97, 124, 130])


In [38]:
# Step 3: assemble the COO `edge_index` following PyG semantics:
# row 0 holds the mapped user index and row 1 the mapped movie index of each rating.
edge_index_user_to_movie = torch.stack((ratings_user_id, ratings_movie_id))
assert tuple(edge_index_user_to_movie.shape) == (2, 100836)
print(edge_index_user_to_movie)
print(edge_index_user_to_movie.shape)

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])
torch.Size([2, 100836])


##### Construct the HeteroData()

In [39]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

data = HeteroData()

# Store one consecutive id per node (users first, then movies):
num_users, num_movies = len(unique_user_id), len(movies_df)
data["user"].node_id = torch.arange(num_users)
data["movie"].node_id = torch.arange(num_movies)

# Attach the movie features and the user -> movie rating edges:
data["movie"].x = movie_feat
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie

# A GNN should pass messages in both directions, so the reverse
# (movie -> user) edges are needed as well. PyG's `T.ToUndirected()`
# transform adds them as the `rev_rates` edge type:
add_reverse_edges = T.ToUndirected()
data = add_reverse_edges(data)

print(data)

# Sanity checks on the resulting graph.
rates_type = ("user", "rates", "movie")
rev_rates_type = ("movie", "rev_rates", "user")

assert data.node_types == ["user", "movie"]
assert data.edge_types == [rates_type, rev_rates_type]
assert data["user"].num_nodes == 610
assert data["user"].num_features == 0
assert data["movie"].num_nodes == 9742
assert data["movie"].num_features == 20
assert data[rates_type].num_edges == 100836
assert data[rev_rates_type].num_edges == 100836

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)


#### Defining edge-level training splits
- Note: All training edges (message passing + supervision = 56469 + 24201 = 80670) are available as message-passing edges (`edge_index`) in the validation/test sets during evaluation, as can be seen below.
- `RandomLinkSplit`: performs an edge-level random split into training, validation, and test sets.

In [40]:
# Edge-level split with PyG's `RandomLinkSplit()` transform:
#   * 80% / 10% / 10% of the edges go to training / validation / testing;
#   * among the training edges, 70% are used for message passing and
#     30% are held out as supervision edges;
#   * evaluation splits get fixed negative edges with a ratio of 2:1;
#   * training negatives are generated on-the-fly later, so none are
#     added to the training graph here.
rates_type = ("user", "rates", "movie")
rev_rates_type = ("movie", "rev_rates", "user")

transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    add_negative_train_samples=False,
    edge_types=rates_type,
    rev_edge_types=rev_rates_type,
)
train_data, val_data, test_data = transform(data)

print("Training data:")
print("==============")
print(train_data)
print()
print("Validation data:")
print("================")
print(val_data)

# Training split: message-passing edges vs. supervision edges.
train_rates = train_data[rates_type]
assert train_rates.num_edges == 56469 # training edges
assert train_rates.edge_label_index.size(1) == 24201 # supervision edges
assert train_data[rev_rates_type].num_edges == 56469
# No negative edges added (every label is 1):
assert train_rates.edge_label.min() == 1
assert train_rates.edge_label.max() == 1

# Validation split.
val_rates = val_data[rates_type]
assert val_rates.num_edges == 80670
assert val_rates.edge_label_index.size(1) == 30249
assert val_data[rev_rates_type].num_edges == 80670
# Negative edges with ratio 2:1 (label 0 count, label 1 count):
assert val_rates.edge_label.long().bincount().tolist() == [20166, 10083]

Training data:
HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={
    edge_index=[2, 56469],
    edge_label=[24201],
    edge_label_index=[2, 24201],
  },
  (movie, rev_rates, user)={ edge_index=[2, 56469] }
)

Validation data:
HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={
    edge_index=[2, 80670],
    edge_label=[30249],
    edge_label_index=[2, 30249],
  },
  (movie, rev_rates, user)={ edge_index=[2, 80670] }
)


#### Defining mini-batch loaders
- loader.LinkNeighborLoader: samples multiple hops from both ends of a link (edge) and creates a subgraph from it

In [None]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:
from torch_geometric.loader import LinkNeighborLoader

# Seed edges: the supervision edges of the training split and their labels.
train_rates = train_data["user", "rates", "movie"]

edge_label_index = train_rates.edge_label_index # supervision/positive edges
print(edge_label_index.shape)
print(edge_label_index)

edge_label = train_rates.edge_label # all 1's as positive edges
print(edge_label)
print(edge_label.shape)
print('----------')

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],
    neg_sampling_ratio=2.0,
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    # 128 positive seed edges per batch; with the 2:1 negative sampling
    # ratio a batch carries 384 labelled edges in total.
    batch_size=128,
    shuffle=True,
)

# Draw one mini-batch to inspect its structure:
sampled_data = next(iter(train_loader))
print(sampled_data)


torch.Size([2, 24201])
tensor([[ 558,  176,  176,  ...,  602,  462,  245],
        [ 418, 7324, 6273,  ..., 1797, 4360, 7456]])
tensor([1., 1., 1.,  ..., 1., 1., 1.])
torch.Size([24201])
----------
HeteroData(
  user={
    node_id=[607],
    n_id=[607],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2760],
    x=[2760, 20],
    n_id=[2760],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 16988],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[16988],
    num_sampled_edges=[2],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7624],
    e_id=[7624],
    num_sampled_edges=[2],
  }
)


#### Heterogeneous Link-level GNN

In [51]:
# Display the full heterogeneous graph built above (all 100836 rating edges).
data 
# data["user"]

HeteroData(
  user={ node_id=[610] },
  movie={
    node_id=[9742],
    x=[9742, 20],
  },
  (user, rates, movie)={ edge_index=[2, 100836] },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [54]:
# Number of user and movie nodes; the model sizes its embedding tables from these.
data["user"].num_nodes, data["movie"].num_nodes

(610, 9742)

In [115]:
from torch_geometric.nn import SAGEConv, to_hetero

class GNN(torch.nn.Module):
    """Two `SAGEConv` layers with a ReLU in between.

    Written as a homogeneous GNN; `Model` converts it with `to_hetero`
    before use.

    Args:
        hidden_channels: Input and output size of both `SAGEConv` layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the resulting node embeddings."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    """Scores candidate (user, movie) edges by the dot product of their embeddings."""

    def forward(self, x_user, x_movie, edge_label_index):
        """Return one score per column of `edge_label_index`.

        Args:
            x_user: User node embeddings, indexed by `edge_label_index[0]`.
            x_movie: Movie node embeddings, indexed by `edge_label_index[1]`.
            edge_label_index: Row 0 holds user indices, row 1 movie indices.
        """
        user_idx = edge_label_index[0]
        movie_idx = edge_label_index[1]

        # Element-wise product of the paired embeddings, summed over the
        # feature dimension, i.e. a per-edge dot product.
        pair_products = x_user[user_idx] * x_movie[movie_idx]
        return pair_products.sum(dim=-1)

class Model(torch.nn.Module):
    """Link predictor for ("user", "rates", "movie") edges.

    Users are represented by a learned embedding; movies by a linear
    projection of their genre features plus a learned embedding. A
    heterogeneous version of `GNN` refines these representations and a
    dot-product `Classifier` scores the supervision edges.

    Args:
        hidden_channels: Embedding size used by every layer of the model.
        graph: Optional `HeteroData` from which the number of user and movie
            nodes, the movie feature width and the graph metadata are read.
            When omitted, the notebook-level `data` object is used, so
            `Model(hidden_channels=64)` keeps working as before.
    """

    def __init__(self, hidden_channels, graph=None):
        super().__init__()

        if graph is None:
            # Backward-compatible fallback to the full graph defined in the notebook.
            graph = data

        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices (look-up tables) for users and movies:
        self.movie_lin = torch.nn.Linear(graph["movie"].num_features, hidden_channels)
        self.user_emb = torch.nn.Embedding(graph["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(graph["movie"].num_nodes, hidden_channels)

        # Instantiate the homogeneous GNN and convert it into a heterogeneous
        # variant that matches the node/edge types of the graph:
        self.gnn = to_hetero(GNN(hidden_channels), metadata=graph.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData):
        """Return one logit per column of the ("user", "rates", "movie") `edge_label_index`.

        Args:
            data: A (sub)graph providing `node_id` for both node types, `x`
                for movies, the message-passing edges in `edge_index_dict`
                and the supervision edges in `edge_label_index`.
        """
        # `node_id` selects the rows of the embedding tables for the nodes
        # present in `data`.
        x_dict = {
            "user": self.user_emb(data["user"].node_id),
            "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        }

        # `x_dict` holds feature matrices of all node types;
        # `edge_index_dict` holds the edge indices of all edge types
        # (both directions) and is what messages are passed along.
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        # `edge_label_index` holds the one-directional (user rates movie)
        # supervision edges to be scored.
        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user","rates","movie"].edge_label_index
        )

        return pred
    
# Build the model with 64-dimensional embeddings and print its layer structure.
hidden_dim = 64
model = Model(hidden_channels=hidden_dim)

print(model)

Model(
  (movie_lin): Linear(in_features=20, out_features=64, bias=True)
  (user_emb): Embedding(610, 64)
  (movie_emb): Embedding(9742, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


In [116]:
import tqdm 
import torch.nn.functional as F

# Train on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Device: {device}')

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1,6):
    # Make sure the model is in training mode, e.g. when this cell is re-run
    # after an evaluation cell switched it to eval mode.
    model.train()
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        # `pred` holds raw logits, hence the "with_logits" loss.
        loss = F.binary_cross_entropy_with_logits(pred,ground_truth)
        loss.backward()
        optimizer.step()
        # Weight the batch-mean loss by the batch size so the epoch figure is
        # a per-example average.
        total_loss += loss.item() * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, loss: {total_loss / total_examples:.4f}")


Device: cuda


100%|██████████| 190/190 [00:01<00:00, 121.74it/s]


Epoch: 001, loss: 0.4394


100%|██████████| 190/190 [00:01<00:00, 129.02it/s]


Epoch: 002, loss: 0.3488


100%|██████████| 190/190 [00:01<00:00, 130.94it/s]


Epoch: 003, loss: 0.3259


100%|██████████| 190/190 [00:01<00:00, 123.56it/s]


Epoch: 004, loss: 0.3138


100%|██████████| 190/190 [00:01<00:00, 130.83it/s]

Epoch: 005, loss: 0.2975





#### Evaluation on val_data

In [134]:
# Seed edges for validation: the labelled (positive and negative) edges of
# the validation split.
val_rates = val_data["user", "rates", "movie"]
edge_label_index = val_rates.edge_label_index
edge_label = val_rates.edge_label

# No `neg_sampling_ratio` here: only the labelled edges already stored in
# `val_data` are scored, in a fixed order.
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

# Draw one mini-batch to inspect its structure:
sampled_data = next(iter(val_loader))
print(sampled_data)

HeteroData(
  user={
    node_id=[607],
    n_id=[607],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2655],
    x=[2655, 20],
    n_id=[2655],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 18540],
    edge_label=[384],
    edge_label_index=[2, 384],
    e_id=[18540],
    num_sampled_edges=[2],
    input_id=[384],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 7715],
    e_id=[7715],
    num_sampled_edges=[2],
  }
)


In [140]:
from sklearn.metrics import roc_auc_score

preds = []
ground_truths = []

# Switch to evaluation mode and disable gradient tracking for the whole pass.
model.eval()
with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user","rates","movie"].edge_label)

# Concatenate the per-batch logits and labels and score them with ROC AUC.
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print(f"Val score: {auc:.4f}")


100%|██████████| 79/79 [00:00<00:00, 153.97it/s]

Val score: 0.9283





In [139]:
# Scratch check of `torch.cat`: 1-D tensors are joined end to end along dim 0.
# A dedicated name is used so the evaluation's `preds` list is not overwritten.
toy_chunks = [torch.tensor([1,2]),torch.tensor([3,4])]
torch.cat(toy_chunks, dim=0)

tensor([1, 2, 3, 4])