# Reuters Graph version

### Sources
- [Philipp tutorial](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial7/GNN_overview.html)
- [pytorch-geometric](https://pytorch-geometric.readthedocs.io/en/latest/)
- [nltk docs](https://www.nltk.org/book/ch02.html)

In [1]:
import torch
import numpy as np

from datasets.reuters_graph import R8, R52

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.cuda("cpu")

[nltk_data] Downloading package reuters to /home/matyi/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [2]:
r8 = R8(device)

Prepare Reuters dataset
Compute tf.idf
Compute PMI scores
Generate edges
Generate masks
Generate feature matrix
Features mtx is 0.580713604 GBs in size


In [3]:
print(np.unique(r8.data.y[r8.data.train_mask].cpu(), return_counts=True))
print(np.unique(r8.data.y[r8.data.val_mask].cpu(), return_counts=True))

(array([0, 1, 2, 3, 4, 5, 6, 7]), array([265,  41, 470,  16,  24,  35,  29, 120]))
(array([0, 1, 2, 4, 5, 6, 7]), array([31,  3, 56,  1,  4,  3,  2]))


In [4]:
print(sum(r8.data.val_mask * r8.data.train_mask * r8.data.test_mask))
print(sum(r8.data.train_mask))
print(sum(r8.data.val_mask))
print(sum(r8.data.test_mask))

tensor(0, device='cuda:0')
tensor(1000, device='cuda:0')
tensor(100, device='cuda:0')
tensor(100, device='cuda:0')


In [5]:
# GraphConv, GATConv
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(len(r8.iton), 200)
        self.conv2 = GCNConv(200, 8)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x

def eval(model, data, mask):
    _, pred = model(data).max(dim=1)
    #print(pred[mask])
    #print(data.y[mask])
    correct = pred[mask].eq(data.y[mask]).sum().item()
    acc = correct / mask.sum()
    print('Accuracy: {:.4f}'.format(acc))

In [6]:
model = Net()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)#, weight_decay=5e-4)
r8.data.to(device)

Data(edge_attr=[1208572], edge_index=[2, 1208572], test_mask=[12049], train_mask=[12049], val_mask=[12049], x=[12049, 12049], y=[12049])

In [7]:
from tqdm.notebook import tqdm

model.train()
for epoch in tqdm(range(40)):
    optimizer.zero_grad()
    out = model(r8.data)
    # We might want to use the "weight" parameter for the loss with unbalanced dataset
    # since with a low learning rate the model just assigns every doc to class "earn"
    # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    loss = F.cross_entropy(out[r8.data.train_mask], r8.data.y[r8.data.train_mask])
    print('Loss:', loss.item())
    loss.backward()
    optimizer.step()
    print(np.unique(out[r8.data.train_mask].max(dim=1)[1].detach().cpu().numpy(), return_counts=True))
    #print(np.unique(r8.data.y[r8.data.train_mask].detach().cpu().numpy(), return_counts=True))
    eval(model, r8.data, r8.data.val_mask)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))

Loss: 2.079289436340332
(array([0, 1, 2, 3, 4, 5, 6, 7]), array([ 56,  10, 285, 430, 161,  12,  19,  27]))
Accuracy: 0.5600
Loss: 1.577468752861023
(array([2]), array([1000]))
Accuracy: 0.5600
Loss: 1.3939603567123413
(array([2]), array([1000]))
Accuracy: 0.5600
Loss: 1.3254753351211548
(array([2]), array([1000]))
Accuracy: 0.6400
Loss: 1.1599255800247192
(array([0, 2]), array([194, 806]))
Accuracy: 0.8000
Loss: 1.0375745296478271
(array([0, 2]), array([481, 519]))
Accuracy: 0.8200
Loss: 0.9047172665596008
(array([0, 2, 7]), array([444, 555,   1]))
Accuracy: 0.8200
Loss: 0.8084975481033325
(array([0, 2, 7]), array([417, 577,   6]))
Accuracy: 0.8200
Loss: 0.7237847447395325
(array([0, 2, 7]), array([410, 563,  27]))
Accuracy: 0.8400
Loss: 0.6571189165115356
(array([0, 2, 7]), array([404, 533,  63]))
Accuracy: 0.8600
Loss: 0.5918500423431396
(array([0, 2, 7]), array([333, 515, 152]))
Accuracy: 0.8400
Loss: 0.5329527854919434
(array([0, 2, 7]), array([281, 498, 221]))
Accuracy: 0.8200
Los

In [8]:
eval(model, r8.data, r8.data.test_mask)

Accuracy: 0.9000
