In [107]:
%%shell

wget "https://web.archive.org/web/20150918182409/http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz"
tar -xzvf cora.tgz

pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cpu.html
pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cpu.html
pip install torch-geometric

pip install networkx

pip install icecream
pip install tqdm

--2021-11-19 16:23:20--  https://web.archive.org/web/20150918182409/http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz
Resolving web.archive.org (web.archive.org)... 207.241.237.3
Connecting to web.archive.org (web.archive.org)|207.241.237.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/x-gzip]
Saving to: ‘cora.tgz.5’

cora.tgz.5              [  <=>               ] 163.15K   501KB/s    in 0.3s    

2021-11-19 16:23:25 (501 KB/s) - ‘cora.tgz.5’ saved [167063]

cora/
cora/README
cora/cora.content
cora/cora.cites
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cpu.html
Looking in links: https://data.pyg.org/whl/torch-1.10.0+cpu.html




In [150]:
import numpy as np
import pandas as pd
import random
from icecream import ic

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch_geometric
import networkx as nx
import scipy
from tqdm.notebook import tqdm   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

print("Torch version:", torch.__version__)
print("CUDA Present:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)

Torch version: 1.10.0+cu111
CUDA Present: False
CUDA Version: 11.1


In [210]:
CONFIG = {
    'PATH': './cora',
    'LIMIT': 250,
    'HIDDEN_CHANNELS': 1024,
    'NUM_LAYERS': 2,
    'DROPOUT_RATE': 0,
    'EPOCHS': 50
}

print("Here's the configuration: ")
for k, v in CONFIG.items():
    print(f"{k} = {v}")

Here's the configuration: 
PATH = ./cora
LIMIT = 250
HIDDEN_CHANNELS = 1024
NUM_LAYERS = 2
DROPOUT_RATE = 0
EPOCHS = 50


In [211]:
class Data:
    def __init__(self, path):
        self.path = path
    
    def readFile(self, path):
        lines = []
        with open(path) as file:
            lines = file.readlines()
        return lines

    def readContent(self, data):
        nodes, labels, x = [], [], []
        for d in data:
            words = d.split("\t")
            nodes.append(words[0].strip())
            labels.append(words[-1].strip())
            x.append([ord(w) - 48 for w in words[1:-1]])

        LE = LabelEncoder()
        labels = LE.fit_transform(labels)
        x_req = torch.Tensor(x)
        # ic(x.shape)
        x = pd.DataFrame.from_records(x)
        
        return nodes, labels, LE, x_req, x

    def getLabels(self, LE, data):
        return LE.inverse_transform(data)

    def readCites(self, data):
        edges = []
        for d in data:
            words = d.split("\t")
            edges.append([
                words[0].strip(),
                words[1].strip()
            ])
        return edges

    def splitDataCount(self, data, labels):
        lcounter = dict((l, 0) for l in labels)
        indices = []
        for i in range(len(labels)):
            label = labels[i]
            if lcounter[label] < CONFIG['LIMIT']:
                indices.append(i)
                lcounter[label] += 1
        rest = [x for x in range(len(labels)) if x not in indices]
        rest = torch.LongTensor(rest)
        indices = torch.LongTensor(indices)
        return rest, indices

    def normalizeMatrix(self, A):
        return scipy.sparse.diags(np.array(A.sum(1)).flatten() ** -1).dot(A)

    def toTensor(self, A):
        A = A.tocoo()
        i = torch.tensor(np.vstack((A.row, A.col)), dtype=torch.long)
        v = torch.tensor(A.data, dtype=torch.float)
        return torch.sparse_coo_tensor(i, v, torch.Size(A.shape))

    def buildGraph(self):
        nodes, edges = self.getGraph()
        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        A = nx.adjacency_matrix(G)
        I = scipy.sparse.identity(A.shape[0])
        A = A + I
        A = self.normalizeMatrix(A)
        A = self.toTensor(A)
        A = A.to_dense()
        ic(A.shape)
        ic(nx.info(G))
        return A, G

    def getIndices(self):
        return self.train, self.test

    def getGraph(self):
        return self.nodes, self.edges

    def getMatrix(self):
        return self.A

    def getXY(self):
        return self.x, torch.LongTensor(self.labels)

    def printData(self):
        print(f"Number of nodes: {len(self.nodes)}")
        print(f"Number of features per node: {len(self.x[0])}")
        print(f"Categories: {set(self.labels)}")

    def handle(self):
        data = self.readFile(self.path + '/cora.content')
        e_data = self.readFile(self.path + '/cora.cites')
        self.nodes, self.labels, self.LE, self.x, self.x_req = self.readContent(data)
        self.edges = self.readCites(e_data)
        self.test, self.train = self.splitDataCount(self.x_req, self.labels)
        self.A, self.G = self.buildGraph()

In [212]:
class MyGATLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        """
        in_channels: #features in the input
        out_channels: #features in the output
        these layers have their *own* independent weights and biases
        """
        super().__init__()
        self.out_channels = out_channels
        
        # W = FxH
        self.W = nn.Parameter(torch.empty(in_channels, out_channels))
        nn.init.xavier_uniform_(self.W, gain=1.414)

        # a = 2Hx1 (So its essentially concating features of 2 nodes)
        # Note: torch.empty does random assignments of value
        self.a = nn.Parameter(torch.empty(size=(2*out_channels, 1)))
        nn.init.xavier_uniform_(self.a, gain=1.414)

        self.leaky = nn.LeakyReLU()

    def forward(self, X, A):
        """
        does the neat math on *symmetrically normalized* A
        """

        # Normal feature calculation, Wh = NxH
        wh = torch.mm(X, self.W)

        # wh = NxH, a = 2Hx1, halfA = Hx1
        # both half = NxH * Hx1 = Nx1
        first_half = wh@self.a[self.out_channels:, :]
        second_half = wh@self.a[:self.out_channels, :]

        # NxN
        after = second_half + first_half.T
        after = self.leaky(after)
        neginf = -1e17*torch.ones_like(after)
        attention = torch.where(A>0, after, neginf)
        attention = F.softmax(attention, dim=1)
        hprime = torch.matmul(attention, wh)
        # ic(hprime.shape)
        return F.elu(hprime)

In [213]:
class MyGAT(nn.Module):
    def __init__(
            self, 
            in_channels, 
            hidden_channels, 
            num_layers, 
            out_channels, 
            dropout_rate,
            attention_channels,
        ):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.num_layers = num_layers
        self.out_channels = out_channels
        self.dropout_rate = dropout_rate
        
        self.MyGATLayers = []
        self.MyGATLayers.append(
            MyGATLayer(self.in_channels, self.hidden_channels)
        )
        self.outputLayers = MyGATLayer(self.hidden_channels, self.out_channels)

        for _ in range(1, self.num_layers):
            self.MyGATLayers.append(
                MyGATLayer(self.hidden_channels, self.hidden_channels)
            )

    def sdMul(self, X, Y):
        i = X._indices()
        v = X._values()
        dv = Y[i[0,:], i[1,:]]
        return torch.sparse.FloatTensor(i, v*dv, X.size())
    
    def forward(self, X, A):
        """
        math done on *symmetrically normalized* A
        """
        for layer in self.MyGATLayers:
            X = layer.forward(X, A)
        X = F.relu(X)
        X = F.dropout(X, p=self.dropout_rate, training=self.training)
        X = self.outputLayers.forward(X, A)
        return F.log_softmax(X)

In [214]:
dataset = Data(CONFIG['PATH'])
dataset.handle()
X, y = dataset.getXY()
train, test = dataset.getIndices()
A = dataset.getMatrix()
ic(X.shape, y.shape)
ic(train.shape, test.shape)

ic| A.shape: torch.Size([2708, 2708])
ic| nx.info(G): 'Graph with 2708 nodes and 5278 edges'
ic| X.shape: torch.Size([2708, 1433]), y.shape: torch.Size([2708])
ic| train.shape: torch.Size([1647]), test.shape: torch.Size([1061])


(torch.Size([1647]), torch.Size([1061]))

In [215]:
model = MyGAT(
    in_channels=X.shape[1],
    hidden_channels=CONFIG['HIDDEN_CHANNELS'],
    num_layers=CONFIG['NUM_LAYERS'],
    out_channels=7,
    dropout_rate=CONFIG['DROPOUT_RATE'],
    attention_channels=A.shape[1]
)

loss = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())

In [216]:
losses = []
for _ in tqdm(range(CONFIG['EPOCHS'])):
    optimizer.zero_grad()
    output = model.forward(X, A)
    train_x = torch.index_select(output, 0, train)
    train_y = torch.index_select(y, 0, train)
    l = loss(train_x, train_y)
    l.backward()
    losses.append(l.item())
    optimizer.step()

  0%|          | 0/50 [00:00<?, ?it/s]



In [217]:
losses # benchmark

[1.9682247638702393,
 1.953641653060913,
 1.9391576051712036,
 1.9247713088989258,
 1.910477876663208,
 1.8962702751159668,
 1.8821415901184082,
 1.8680912256240845,
 1.8541179895401,
 1.8402163982391357,
 1.8263906240463257,
 1.8126386404037476,
 1.7989606857299805,
 1.7853552103042603,
 1.7718250751495361,
 1.7583675384521484,
 1.7449820041656494,
 1.7316724061965942,
 1.7184377908706665,
 1.7052839994430542,
 1.6922128200531006,
 1.679226040840149,
 1.6663248538970947,
 1.6535110473632812,
 1.6407874822616577,
 1.6281553506851196,
 1.6156151294708252,
 1.6031684875488281,
 1.5908154249191284,
 1.5785554647445679,
 1.5663892030715942,
 1.5543150901794434,
 1.5423345565795898,
 1.5304479598999023,
 1.5186558961868286,
 1.506960153579712,
 1.4953608512878418,
 1.4838588237762451,
 1.47245454788208,
 1.4611480236053467,
 1.4499382972717285,
 1.438825249671936,
 1.4278104305267334,
 1.4168928861618042,
 1.406071424484253,
 1.3953477144241333,
 1.384720802307129,
 1.3741891384124756,
 1.3

In [218]:
output = model.forward(X, A)
test_x = torch.index_select(output, 0, test)
test_y = torch.index_select(y, 0, test)



In [219]:
predictions = torch.argmax(test_x, dim=1)

In [220]:
predictions.shape

torch.Size([1061])

In [221]:
predictions, test_y = predictions.numpy(), test_y.numpy()

In [222]:
acc = 0
for i in range(len(test_y)):
    if predictions[i] == test_y[i]:
        acc += 1
ic(acc/len(test_y)*100)

ic| acc/len(test_y)*100: 79.26484448633366


79.26484448633366

In [206]:
predictions

array([2, 2, 2, ..., 1, 0, 3])

In [207]:
test_y

array([2, 2, 2, ..., 1, 0, 2])

In [208]:
from collections import Counter
c = Counter(predictions)
print(c)

Counter({2: 333, 3: 220, 1: 188, 6: 159, 0: 128, 4: 18, 5: 15})


In [209]:
from collections import Counter
c = Counter(test_y)
print(c)

Counter({2: 568, 3: 176, 1: 168, 6: 101, 0: 48})
