In [1]:
!pip install torch



In [2]:
!pip install torch_geometric



In [3]:
import torch
print(torch.__version__)

2.2.1+cu121


In [4]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.1+cpu.html

Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html


In [5]:
from torch_geometric.datasets import AmazonBook
from torch.nn.functional import one_hot
import torch
from torch.nn.functional import dropout
from torch_geometric.nn import to_hetero
from torch_geometric.nn import Linear, SAGEConv
from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch_geometric.nn import to_hetero
from torch_geometric.loader import NeighborLoader
from torch_geometric.utils import to_networkx
from torch_geometric.transforms import RandomLinkSplit
from torch.nn.functional import mse_loss, binary_cross_entropy_with_logits
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch_geometric.nn import LightGCN
from sklearn.metrics import classification_report
from torch_geometric.utils import degree
import numpy as np
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score

In [6]:
data = AmazonBook('../data')
dataset = data[0]

print(dataset)

HeteroData(
  user={ num_nodes=52643 },
  book={ num_nodes=91599 },
  (user, rates, book)={
    edge_index=[2, 2380730],
    edge_label_index=[2, 603378],
  },
  (book, rated_by, user)={ edge_index=[2, 2380730] }
)


#Предвидување на врски со мерки за сличност

In [7]:
graph = to_networkx(AmazonBook('/content/data/Amazon')[0]).to_undirected()

In [8]:
edges = graph.edges()
edges_list = np.array(edges)

In [9]:
length = edges_list.shape[0]
size = int(length * 0.2)

test_edges = np.random.choice(edges_list.flatten(), size=(size, edges_list.shape[1]), replace=False)
graph.remove_edges_from(test_edges)

In [10]:
jac_ind = list(nx.jaccard_coefficient(graph, test_edges))
jac_ind = np.array(jac_ind)

In [11]:
y_hats = (jac_ind[:, -1] > 0.5).astype(int)

In [12]:
num_nodes = graph.number_of_nodes()

negative_edges = set()
set_of_edges = set(map(tuple, edges_list))

while len(negative_edges) < size:
    random_edge = tuple(np.random.randint(0, num_nodes, size=2))
    if random_edge not in set_of_edges and random_edge not in negative_edges:
        negative_edges.add(random_edge)

negative_edges = list(negative_edges)

In [13]:
jac_ind_neg = list(nx.jaccard_coefficient(graph, negative_edges))
jac_ind_neg = np.array(jac_ind_neg)

In [14]:
y_hats_neg = (jac_ind_neg[:, -1] > 0.5).astype(int)

In [15]:
y_score = np.concatenate([y_hats, y_hats_neg])
true_y_pos = np.ones(test_edges.shape[0])
true_y_neg = np.zeros(test_edges.shape[0])
y_true = np.concatenate([true_y_pos, true_y_neg])

In [16]:
roc_auc_score(y_true, y_score)

0.5000157514711874

In [17]:
average_precision_score(y_true, y_score)

0.5000135012610177

#Предвидување на врски со GNN

In [18]:
num_users = dataset['user']['num_nodes']
num_books = dataset['book']['num_nodes']

In [19]:
dataset['user'].x = torch.ones(num_users, 1)
dataset['book'].x = torch.ones(num_books, 1)

In [20]:
dataset = dataset.to('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

In [22]:
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['book'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

In [23]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels, data):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [24]:
def train_link_prediction(model, train_data, val_data, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        pred = model(train_data.x_dict, train_data.edge_index_dict,
                     train_data['user', 'book'].edge_label_index)
        target = train_data['user', 'book'].edge_label
        loss = mse_loss(pred, target)
        loss.backward()
        optimizer.step()

        model.eval()
        pred = model(val_data.x_dict, val_data.edge_index_dict,
                     val_data['user', 'book'].edge_label_index)
        pred = pred.clamp(min=0, max=5)
        target = val_data['user', 'book'].edge_label.float()
        val_loss = mse_loss(pred, target).sqrt()

        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Loss: {val_loss:.4f}')

In [25]:
train_val_test_split = RandomLinkSplit(num_val=0.2,
                                           num_test=0.2,
                                           add_negative_train_samples=True,
                                           edge_types=('user', 'rates', 'book'),
                                           rev_edge_types=('book', 'rated_by', 'user'))

In [26]:
train_data, val_data, test_data = train_val_test_split(dataset)

In [27]:
model = Model(hidden_channels=64, data=dataset)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [28]:
optimizer = SGD(model.parameters(), lr=0.001)

In [29]:
train_link_prediction(model, train_data, val_data, optimizer, 50)

Epoch: 000, Loss: 0.5188, Val Loss: 0.7070
Epoch: 001, Loss: 0.4999, Val Loss: 0.6944
Epoch: 002, Loss: 0.4823, Val Loss: 0.6826
Epoch: 003, Loss: 0.4660, Val Loss: 0.6714
Epoch: 004, Loss: 0.4509, Val Loss: 0.6621
Epoch: 005, Loss: 0.4384, Val Loss: 0.6532
Epoch: 006, Loss: 0.4267, Val Loss: 0.6448
Epoch: 007, Loss: 0.4158, Val Loss: 0.6368
Epoch: 008, Loss: 0.4055, Val Loss: 0.6292
Epoch: 009, Loss: 0.3959, Val Loss: 0.6220
Epoch: 010, Loss: 0.3869, Val Loss: 0.6152
Epoch: 011, Loss: 0.3784, Val Loss: 0.6087
Epoch: 012, Loss: 0.3705, Val Loss: 0.6025
Epoch: 013, Loss: 0.3630, Val Loss: 0.5967
Epoch: 014, Loss: 0.3560, Val Loss: 0.5911
Epoch: 015, Loss: 0.3494, Val Loss: 0.5859
Epoch: 016, Loss: 0.3433, Val Loss: 0.5809
Epoch: 017, Loss: 0.3374, Val Loss: 0.5762
Epoch: 018, Loss: 0.3320, Val Loss: 0.5718
Epoch: 019, Loss: 0.3269, Val Loss: 0.5676
Epoch: 020, Loss: 0.3221, Val Loss: 0.5636
Epoch: 021, Loss: 0.3176, Val Loss: 0.5598
Epoch: 022, Loss: 0.3133, Val Loss: 0.5562
Epoch: 023,

####Тестирање на моделот

In [30]:
def test_link_prediction(model, test_data, optimizer, epochs=5):
    model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

    with torch.inference_mode():
        pred = model(test_data.x_dict, test_data.edge_index_dict,
                      test_data['user', 'book'].edge_label_index)
        pred = pred.clamp(min=0, max=5)
        target = test_data['user', 'book'].edge_label.float()

        print(classification_report(y_true=target.cpu().numpy(), y_pred=pred.round().detach().cpu().numpy()))

        val_loss = mse_loss(pred, target).sqrt()

        print(f'Loss: {val_loss:.4f}')

In [31]:
test_link_prediction(model, test_data, optimizer, 50)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67    476146
         1.0       0.00      0.00      0.00    476146

    accuracy                           0.50    952292
   macro avg       0.25      0.50      0.33    952292
weighted avg       0.25      0.50      0.33    952292

Loss: 0.5102


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Системи за препораки со LightGCN

In [32]:
num_users, num_books = dataset['user'].num_nodes, dataset['book'].num_nodes
num_nodes = dataset.num_nodes
#dataset = dataset.to('cuda' if torch.cuda.is_available() else 'cpu')
dataset = dataset.to_homogeneous()

In [33]:
def train(dataset, train_loader, model, optimizer, num_users, num_books, epochs=1):
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
    for epoch in range(epochs):
        total_loss, total_examples = 0, 0

        for node_ids in train_loader:
            pos_edge_label_index = dataset.edge_index[:, node_ids]
            generated = torch.randint(num_users, num_users + num_books,
                                                              (node_ids.numel(),))
            neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                                generated.to('cuda' if torch.cuda.is_available() else 'cpu')],
                                               dim=0)
            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            optimizer.zero_grad()

            pos_rank, neg_rank = model(dataset.edge_index, edge_label_index).chunk(2)

            loss = model.recommendation_loss(pos_rank, neg_rank, node_id=edge_label_index.unique())
            loss.backward()
            optimizer.step()

            total_loss += float(loss) * pos_rank.numel()
            total_examples += pos_rank.numel()

            print(f'Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}')

In [34]:
train_test_split = RandomLinkSplit(num_val=0.01,
                                           num_test=0.98)
train_data, val_data, _ = train_test_split(dataset)

In [35]:
# _, _, dataset_sm = train_test_split(dataset)
# print(type(dataset_sm)), print(type(dataset))

In [36]:
data_loader = DataLoader(range(train_data.edge_index.size(1)),
                             shuffle=True,
                             batch_size=16)

In [37]:
model = LightGCN(num_nodes=num_nodes, embedding_dim=128, num_layers=1)

optimizer = Adam(model.parameters(), lr=0.01)

train(dataset, data_loader, model, optimizer, num_users, num_books, 1)

Epoch: 000, Loss: 0.6931
Epoch: 000, Loss: 0.6888
Epoch: 000, Loss: 0.6803
Epoch: 000, Loss: 0.6673
Epoch: 000, Loss: 0.6518
Epoch: 000, Loss: 0.6299
Epoch: 000, Loss: 0.6147
Epoch: 000, Loss: 0.6144
Epoch: 000, Loss: 0.6086
Epoch: 000, Loss: 0.6051
Epoch: 000, Loss: 0.6042
Epoch: 000, Loss: 0.5860
Epoch: 000, Loss: 0.5745
Epoch: 000, Loss: 0.5643
Epoch: 000, Loss: 0.5536
Epoch: 000, Loss: 0.5473
Epoch: 000, Loss: 0.5335
Epoch: 000, Loss: 0.5343
Epoch: 000, Loss: 0.5348
Epoch: 000, Loss: 0.5304
Epoch: 000, Loss: 0.5243
Epoch: 000, Loss: 0.5175
Epoch: 000, Loss: 0.5184
Epoch: 000, Loss: 0.5112
Epoch: 000, Loss: 0.5139
Epoch: 000, Loss: 0.5087
Epoch: 000, Loss: 0.4981
Epoch: 000, Loss: 0.4927
Epoch: 000, Loss: 0.4995
Epoch: 000, Loss: 0.4955
Epoch: 000, Loss: 0.4924
Epoch: 000, Loss: 0.4817
Epoch: 000, Loss: 0.4783
Epoch: 000, Loss: 0.4746
Epoch: 000, Loss: 0.4706
Epoch: 000, Loss: 0.4636
Epoch: 000, Loss: 0.4595
Epoch: 000, Loss: 0.4558
Epoch: 000, Loss: 0.4502
Epoch: 000, Loss: 0.4480


In [38]:
test_loader = DataLoader(range(val_data.edge_index.size(1)),
                             shuffle=True,
                             batch_size=16)

In [39]:
def test(model, test_loader, num_users, num_books):
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()

    true_positives = 0
    false_positives = 0
    false_negatives = 0

    with torch.no_grad():
        for node_ids in test_loader:
            pos_edge_label_index = dataset.edge_index[:, node_ids]
            generated = torch.randint(num_users, num_users + num_books,
                                      (node_ids.numel(),))
            neg_edge_label_index = torch.stack([pos_edge_label_index[0],
                                                generated.to('cuda' if torch.cuda.is_available() else 'cpu')],
                                               dim=0)
            edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)

            pos_rank, neg_rank = model(dataset.edge_index, edge_label_index).chunk(2)

            # Calculate true positives, false positives and false negatives
            true_positives += (pos_rank > 0.5).sum().item()
            false_positives += (neg_rank > 0.5).sum().item()
            false_negatives += (pos_rank <= 0.5).sum().item()

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}')

In [40]:
test(model, test_loader, num_users, num_books)

Precision: 0.7755, Recall: 0.9762


#####тестирав на валидациско бидејќи со тестирачкото множество премногу време се чека