# Graph dataset examples

### Sources
- [Philipp tutorial](https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial7/GNN_overview.html)
- [pytorch-geometric](https://pytorch-geometric.readthedocs.io/en/latest/)
- [nltk docs](https://www.nltk.org/book/ch02.html)

In [1]:
import torch
import numpy as np

from data_prep.agnews_graph import AGNewsGraph
from data_prep.imdb_graph import IMDbGraph
from data_prep.reuters_graph import R8Graph, R52Graph

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

[nltk_data] Downloading package reuters to /home/mat/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [11]:
# dataset = AGNewsGraph(device, n_train_docs=1000)
# dataset = R8Graph(device, n_train_docs=1000)
# dataset = R52Graph(device, n_train_docs=300)
dataset = IMDbGraph(device, n_train_docs=300)

Reusing dataset imdb (/home/mat/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


Compute tf.idf
Compute PMI scores
Generate edges
Generate masks
Generate feature matrix
Features mtx is 0.609892416 GBs in size


In [12]:
dataset.num_classes

2

In [13]:
print(np.unique(dataset.data.y[dataset.data.train_mask].cpu(), return_counts=True))
print(np.unique(dataset.data.y[dataset.data.val_mask].cpu(), return_counts=True))

(array([0, 1]), array([139, 161]))
(array([0, 1]), array([ 9, 21]))


In [14]:
print(sum(dataset.data.val_mask * dataset.data.train_mask * dataset.data.test_mask))
print(sum(dataset.data.train_mask))
print(sum(dataset.data.val_mask))
print(sum(dataset.data.test_mask))

tensor(0, device='cuda:0')
tensor(300, device='cuda:0')
tensor(30, device='cuda:0')
tensor(30, device='cuda:0')


In [15]:
# GraphConv, GATConv
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GraphConv

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(len(dataset.iton), 200)
        self.conv2 = GCNConv(200, dataset.num_classes)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x

def eval(model, data, mask):
    out = model(data)
#     print(out.size())
    _, pred = out.max(dim=1)
#     print("out", out[mask])
#     print("pred", pred[mask])
#     print(pred[mask].size())
    #print(data.y[mask])
    correct = pred[mask].eq(data.y[mask]).sum().item()
    acc = correct / mask.sum()
    print('Accuracy: {:.4f}'.format(acc))

In [16]:
model = Net()
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)#, weight_decay=5e-4)
dataset.data.to(device)

Data(edge_attr=[1010862], edge_index=[2, 1010862], test_mask=[12348], train_mask=[12348], val_mask=[12348], x=[12348, 12348], y=[12348])

In [17]:
eval(model, dataset.data, dataset.data.val_mask)

Accuracy: 0.3333


In [18]:
from tqdm.notebook import tqdm

model.train()
for epoch in tqdm(range(40)):
    optimizer.zero_grad()
    out = model(dataset.data)
    # We might want to use the "weight" parameter for the loss with unbalanced dataset
    # since with a low learning rate the model just assigns every doc to class "earn"
    # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    loss = F.cross_entropy(out[dataset.data.train_mask], dataset.data.y[dataset.data.train_mask])
    print('Loss:', loss.item())
    loss.backward()
    optimizer.step()
    print(np.unique(out[dataset.data.train_mask].max(dim=1)[1].detach().cpu().numpy(), return_counts=True))
    #print(np.unique(r8.data.y[r8.data.train_mask].detach().cpu().numpy(), return_counts=True))
    eval(model, dataset.data, dataset.data.val_mask)
#     break

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))

Loss: 0.6931571960449219
(array([0, 1]), array([281,  19]))
Accuracy: 0.7000
Loss: 0.7331043481826782
(array([1]), array([300]))
Accuracy: 0.7000
Loss: 0.6677138805389404
(array([1]), array([300]))
Accuracy: 0.3000
Loss: 0.686011552810669
(array([0, 1]), array([212,  88]))
Accuracy: 0.3000
Loss: 0.6865906119346619
(array([0, 1]), array([286,  14]))
Accuracy: 0.3000
Loss: 0.6772840619087219
(array([0, 1]), array([242,  58]))
Accuracy: 0.5000
Loss: 0.655045747756958
(array([0, 1]), array([142, 158]))
Accuracy: 0.7000
Loss: 0.6209892630577087
(array([0, 1]), array([  2, 298]))
Accuracy: 0.7000
Loss: 0.587864875793457
(array([1]), array([300]))
Accuracy: 0.7000
Loss: 0.5621703863143921
(array([1]), array([300]))
Accuracy: 0.7000
Loss: 0.5264880061149597
(array([1]), array([300]))
Accuracy: 0.7000
Loss: 0.46876031160354614
(array([0, 1]), array([  6, 294]))
Accuracy: 0.6667
Loss: 0.40634408593177795
(array([0, 1]), array([121, 179]))
Accuracy: 0.6667
Loss: 0.36764416098594666
(array([0, 1])

In [19]:
eval(model, dataset.data, dataset.data.test_mask)

Accuracy: 0.5333
