In [1]:
import torch as th
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
os.environ["DGLBACKEND"] = "pytorch"
import dgl
import dgl.data
import dgl.function as fn
import dgl.nn as gnn
import pickle
from tqdm import tqdm

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
# import faiss

In [2]:
class NegativeSampler(object):
    """Draws negative edges for link-prediction training.

    Destination nodes are sampled with probability proportional to
    ``in_degree ** 0.75`` of the graph given to the constructor.

    Parameters
    ----------
    g : graph
        Graph whose ``in_degrees()`` define the sampling weights.
    k : int
        Number of negative destinations drawn per edge.
    neg_share : bool, optional
        When True and the number of edges is divisible by ``k``, each group of
        ``k`` consecutive edges shares one set of ``k`` sampled destinations.
    device : optional
        Device that holds the sampling weights; defaults to ``g.device``.
    """

    def __init__(self, g, k, neg_share=False, device=None):
        if device is None:
            device = g.device
        self.weights = g.in_degrees().float().to(device) ** 0.75
        self.k = k
        self.neg_share = neg_share

    def __call__(self, g):
        """Build a graph over the nodes of ``g`` holding ``k`` negative edges per edge of ``g``.

        Every source node of ``g`` is repeated ``k`` times and paired with a
        sampled destination node.
        """
        src, _ = g.edges()
        n = len(src)
        if self.neg_share and n % self.k == 0:
            # Sample n destinations and reuse each chunk of k for k consecutive edges.
            dst = self.weights.multinomial(n, replacement=True)
            dst = dst.view(-1, 1, self.k).expand(-1, self.k, -1).flatten()
        else:
            dst = self.weights.multinomial(n * self.k, replacement=True)
        # The sampled destinations live on the weights' device, which can differ
        # from the device of g; align the sources before building the graph.
        src = src.repeat_interleave(self.k).to(dst.device)
        return dgl.graph((src, dst), num_nodes=g.num_nodes())

class CrossEntropyLoss(nn.Module):
    """Binary cross-entropy loss over positive and negative edge scores.

    An edge is scored by the dot product of the embeddings of its two
    endpoints. Edges of ``pos_graph`` get target 1, edges of ``neg_graph``
    get target 0.
    """

    @staticmethod
    def _edge_scores(graph, h):
        """Return the dot product of endpoint embeddings ``h`` for every edge of ``graph``."""
        # local_scope keeps the temporary 'h' / 'score' fields out of the caller's graph.
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            return graph.edata['score']

    def forward(self, h, pos_graph, neg_graph):
        """Compute the loss.

        Parameters
        ----------
        h : tensor
            Node embeddings, assigned as node data on both graphs.
        pos_graph, neg_graph : graph
            Graphs holding the positive and the negative edges.

        Returns
        -------
        tensor
            Result of ``binary_cross_entropy_with_logits`` over all edge scores.
        """
        pos_score = self._edge_scores(pos_graph, h)
        neg_score = self._edge_scores(neg_graph, h)

        score = th.cat([pos_score, neg_score])
        # Targets are created directly in the dtype of the scores.
        label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)])
        return F.binary_cross_entropy_with_logits(score, label)

In [3]:
class Sage(nn.Module):
    """Stack of ``SAGEConv`` layers with the 'mean' aggregator.

    With ``n_layers > 1`` the stack maps ``in_feats -> n_hidden -> ... ->
    n_classes``; otherwise it is a single ``in_feats -> n_classes`` layer.
    ``activation`` followed by dropout is applied after every layer except
    the last one.
    """

    def __init__(self, in_feats, n_hidden, n_classes, n_layers, activation, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        # Consecutive pairs of widths give the (input, output) size of each layer.
        # A non-positive repeat count yields no hidden widths, i.e. one layer.
        widths = [in_feats] + [n_hidden] * (n_layers - 1) + [n_classes]
        self.layers = nn.ModuleList([
            gnn.SAGEConv(fan_in, fan_out, aggregator_type='mean')
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        ])
        self.dropout = nn.Dropout(dropout)
        self.activation = activation

    def forward(self, g, x):
        """Run features ``x`` through all layers on graph ``g`` and return the last layer's output."""
        final_idx = len(self.layers) - 1
        h = x
        for idx, conv in enumerate(self.layers):
            h = conv(g, h)
            if idx == final_idx:
                # The output layer is left without activation and dropout.
                continue
            h = self.dropout(self.activation(h))
        return h

In [4]:
# NOTE: pickle.load can execute arbitrary code stored in the file;
# only load graph files that come from a trusted source.
with open('../data/graph/okved_graph.pickle', 'rb') as fp:
    g = pickle.load(fp)

# clf_graph is the graph the model is trained on. Set the flag to True to
# restrict it to the edges whose 'type' equals 1; the default uses every edge.
USE_TYPE1_EDGES_ONLY = False
if USE_TYPE1_EDGES_ONLY:
    clf_graph = dgl.edge_subgraph(graph=g,
                                  edges=(g.edata['type'] == 1).nonzero().flatten())
else:
    clf_graph = g

okved_data = pd.read_csv('../data/okved2/okved_2014_w_sections.csv', index_col=0)
# Section ids, used further down as the colour of the points in the scatter plots.
sections = okved_data['section_id'].values

In [5]:
# Show the summary (node/edge counts, data schemes) of the graph used for training.
clf_graph

Graph(num_nodes=2637, num_edges=438730,
      ndata_schemes={'feat': Scheme(shape=(312,), dtype=torch.float64)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.int64), 'type': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'norm': Scheme(shape=(), dtype=torch.float32)})

In [6]:
# Show the summary of the full graph loaded from the pickle file.
g

Graph(num_nodes=2637, num_edges=438730,
      ndata_schemes={'feat': Scheme(shape=(312,), dtype=torch.float64)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.int64), 'type': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'norm': Scheme(shape=(), dtype=torch.float32)})

In [5]:
# Distinct values stored in the edge attribute 'type'.
np.unique(g.edata['type'])

array([0, 1, 2], dtype=int64)

In [6]:
g # number of edges: 438730, number of nodes: 2637

Graph(num_nodes=2637, num_edges=438730,
      ndata_schemes={'feat': Scheme(shape=(312,), dtype=torch.float64)}
      edata_schemes={'weight': Scheme(shape=(), dtype=torch.int64), 'type': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'norm': Scheme(shape=(), dtype=torch.float32)})

In [7]:
g.ndata['feat'].shape # 312 is the dimensionality of the feature space (what these features are is still to be clarified)

torch.Size([2637, 312])

In [8]:
g.edata.keys() # keys of the edge data

dict_keys(['weight', 'type', 'train_mask', 'norm'])

In [9]:
g.edata['weight'].shape # one weight per edge (length equals num_edges); TODO: document where the weights come from

torch.Size([438730])

In [10]:
g.edata['type'].shape # one relation type per edge (length equals num_edges); TODO: document where the types come from

torch.Size([438730])

In [11]:
# Number of edges for each value of the 'type' attribute.
th.unique(g.edata['type'], return_counts=True)

(tensor([0, 1, 2]), tensor([175868,   5272, 257590]))

In [12]:
g.edata['train_mask'].shape # one boolean flag per edge (length equals num_edges); TODO: document how the mask was built

torch.Size([438730])

In [13]:
th.unique(g.edata['train_mask'], return_counts=True) # presumably a train/test split over edges — TODO confirm

(tensor([False,  True]), tensor([ 87746, 350984]))

In [14]:
g.edata['norm'].shape # one 'norm' value per edge (length equals num_edges); TODO: document how it is computed

torch.Size([438730])

In [15]:

th.unique(g.edata['norm'], return_counts=True) # many distinct values; the meaning of 'norm' is not clear yet (TODO)

(tensor([3.2410e-04, 3.6836e-04, 3.9812e-04,  ..., 5.7735e-01, 6.7700e-01,
         7.0711e-01]),
 tensor([  2,   2,   2,  ..., 798,   2,   4]))

In [16]:
def split_okved(s: str) -> tuple:
    """Split a code string into its five nested prefixes.

    Returns ``(class, subclass, group, subgroup, type)``: the prefixes of
    length 2, 4, 5 and 7 plus the full 8-character code. Levels the code is
    too short for are filled with the string ``'None'``.

    Raises ``AssertionError`` if ``len(s)`` is not one of 2, 4, 5, 7, 8.
    """
    size = len(s)
    assert size in {2, 4, 5, 7, 8}
    missing = 'None'
    # The three middle levels exist whenever the code is at least that long.
    middle = tuple(s[:width] if size >= width else missing for width in (4, 5, 7))
    # The deepest level is present only for a full 8-character code.
    deepest = s if size == 8 else missing
    return (s[:2],) + middle + (deepest,)

def build_nfeat_from_okved_data(okved_data) -> th.Tensor:
    """Build a standardized categorical feature matrix from the OKVED table.

    Parameters
    ----------
    okved_data : pandas.DataFrame
        Must provide the columns ``'native_code'`` (split with
        ``split_okved``) and ``'section_id'``.

    Returns
    -------
    th.Tensor
        Float tensor with ``len(okved_data) + 1`` rows and 6 columns: the five
        code levels followed by the section id, ordinal-encoded and then
        standardized. Row 0 keeps the ``'None'`` filler in every column before
        encoding (presumably a placeholder node 0 — TODO confirm).
    """
    nfeat = np.full((len(okved_data) + 1, 6), 'None', dtype=object)
    nfeat[1:, :-1] = np.array(okved_data['native_code'].map(split_okved).tolist())
    # Take the section ids from the frame that was passed in, not from a notebook
    # global, so that the rows always belong to the same table as the codes.
    nfeat[1:, -1] = okved_data['section_id'].values
    nfeat = ce.OrdinalEncoder().fit_transform(nfeat).values
    nfeat = StandardScaler().fit_transform(nfeat)
    nfeat = th.from_numpy(nfeat).float()
    return nfeat

In [22]:
# --- Hyperparameters -------------------------------------------------------
num_negs = 1                 # negative edges sampled per positive edge
neg_share = False            # whether groups of edges share their negatives
device = th.device('cpu')
num_hidden = 256             # hidden width, also used as the embedding size
num_layers = 2
dropout = 0.25
lr = 0.0001
num_epochs = 10000
# --- Early-stopping state --------------------------------------------------
best_loss = 1000000
last_improvement = 0         # epochs since the loss last improved
require_improvements = 50    # patience: stop after this many epochs without improvement
best_state = None            # snapshot of the weights that gave best_loss

nfeat = g.ndata['feat'].float().to(device)
# nfeat = build_nfeat_from_okved_data(okved_data).to(device)
in_feats = nfeat.shape[1]
n_edges = g.num_edges()


model = Sage(in_feats, num_hidden, num_hidden, num_layers, F.relu, dropout)
model = model.to(device)
sampler = NegativeSampler(g, num_negs, neg_share, device)
criterion = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


for epoch in range(num_epochs):
    # Fresh negative edges are drawn every epoch.
    neg_graph = sampler(clf_graph).to(device)
    # Compute loss and prediction
    pred = model(clf_graph, nfeat)
    loss = criterion(pred, clf_graph, neg_graph)
    loss_value = loss.item()
    print(f'Epoch {epoch:05d} |  Loss {loss_value:.4f}')

    if loss_value < best_loss:
        best_loss = loss_value
        last_improvement = 0
        # state_dict() returns references to the live parameters, which the
        # optimizer updates in place; clone them to keep a real snapshot. It is
        # taken before the step so it matches the weights that produced this loss.
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
    else:
        last_improvement += 1
    if last_improvement > require_improvements:
        print(f"No improvement found during the {require_improvements} last iterations, stopping optimization.")
        break

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Restore the best weights whether or not early stopping was triggered.
if best_state is not None:
    model.load_state_dict(best_state)
        

Epoch 00000 |  Loss 1.9893
Epoch 00001 |  Loss 1.8887
Epoch 00002 |  Loss 1.7659
Epoch 00003 |  Loss 1.6600
Epoch 00004 |  Loss 1.5674
Epoch 00005 |  Loss 1.4714
Epoch 00006 |  Loss 1.3888
Epoch 00007 |  Loss 1.3225
Epoch 00008 |  Loss 1.2554
Epoch 00009 |  Loss 1.1976
Epoch 00010 |  Loss 1.1565
Epoch 00011 |  Loss 1.0849
Epoch 00012 |  Loss 1.0679
Epoch 00013 |  Loss 1.0215
Epoch 00014 |  Loss 0.9740
Epoch 00015 |  Loss 0.9481
Epoch 00016 |  Loss 0.9195
Epoch 00017 |  Loss 0.8894
Epoch 00018 |  Loss 0.8701
Epoch 00019 |  Loss 0.8543
Epoch 00020 |  Loss 0.8361
Epoch 00021 |  Loss 0.8282
Epoch 00022 |  Loss 0.8097
Epoch 00023 |  Loss 0.7942
Epoch 00024 |  Loss 0.7864
Epoch 00025 |  Loss 0.7751
Epoch 00026 |  Loss 0.7695
Epoch 00027 |  Loss 0.7602
Epoch 00028 |  Loss 0.7550
Epoch 00029 |  Loss 0.7494
Epoch 00030 |  Loss 0.7435
Epoch 00031 |  Loss 0.7379
Epoch 00032 |  Loss 0.7358
Epoch 00033 |  Loss 0.7316
Epoch 00034 |  Loss 0.7271
Epoch 00035 |  Loss 0.7244
Epoch 00036 |  Loss 0.7218
E

In [21]:
# Show the summary of the negative-edge graph sampled in the last executed epoch.
neg_graph

Graph(num_nodes=2637, num_edges=438730,
      ndata_schemes={}
      edata_schemes={})

In [19]:
# Model output for all nodes from the last executed training epoch.
pred

tensor([[ 0.1266, -0.0913, -0.1717,  ..., -0.2550, -0.1028, -0.0374],
        [ 0.0236,  0.0076,  0.0754,  ..., -0.1338, -0.1767, -0.0850],
        [ 0.1176, -0.1793, -0.0510,  ..., -0.1772,  0.0130,  0.0804],
        ...,
        [ 0.0239, -0.1121, -0.0222,  ..., -0.0029, -0.0696, -0.0118],
        [ 0.2438,  0.2699, -0.0350,  ..., -0.0766, -0.2062,  0.2366],
        [ 0.0999, -0.1860,  0.1036,  ..., -0.1389, -0.0271,  0.1005]],
       grad_fn=<AddBackward0>)

In [None]:
model.eval()
# Inference only: no autograd graph is needed, and the tensor must be in host
# memory before it can be converted to a NumPy array.
with th.no_grad():
    embeddings_model = model(clf_graph, nfeat).cpu().numpy()[1:]  # [1:] drops the row of node 0
embeddings_model_2d = TSNE(n_components=2).fit_transform(embeddings_model)

In [None]:
# Load the precomputed embeddings from disk and project them to 2-D with t-SNE.
embeddings_bert = np.load('../data/okved2/okved_embeddings.npy')
_bert_tsne = TSNE(n_components=2)
embeddings_bert_2d = _bert_tsne.fit_transform(embeddings_bert)

In [None]:
# Scatter plot of the 2-D projection of the BERT embeddings, coloured by section id.
fig, ax = plt.subplots(figsize=(10, 5))
bert_x = embeddings_bert_2d[:, 0]
bert_y = embeddings_bert_2d[:, 1]
ax.scatter(bert_x, bert_y, c=sections)
ax.set_title('TSNE of BERT embeddings')

In [None]:
# Scatter plot of the 2-D projection of the model embeddings, coloured by section id.
fig, ax = plt.subplots(figsize=(10, 5))
# Only the first len(sections) points have a section label to colour by.
n_labeled = sections.shape[0]
ax.scatter(embeddings_model_2d[:n_labeled, 0], embeddings_model_2d[:n_labeled, 1], c=sections)
# clf_graph is the full graph in this notebook, so the title does not claim an edge subset.
ax.set_title('TSNE of SAGE embeddings')