In [1]:
import numpy as np
from neighbourhood import Neighbourhood
from preprocessor import *
from graph import BipartiteGraph
from model import PinSAGE

[nltk_data] Downloading package stopwords to /Users/heidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
nodes = pd.read_csv("data/nodes.csv")
links = pd.read_csv("data/links_type.csv")
bilinks = pd.read_csv("data/bilinks.csv")

## Idea of the Graph Neural Network
### Graph Convolutional Network
#### PinSAGE? Pins and Boards vs Items and Users vs Products and Customers

1. Collecting **neighbourhood** of item and presenting the item through its neighbourhood. -> We need to define the neighbourhood of item in bipartite graph.
2. Items **initial representation** should be embeddings created based on its features. -> We need to determine features for the items.
3. ...
4. PinSAGE creates embeddings based on item and its neighbourhood features.


### **Collecting Neighbourhood in Bipartite graph**

Define the neighbourhood for any Product item in our Bipartite graph.

In [3]:
N = Neighbourhood(bilinks,links)

Generate random product and find its neighbourhood through common customer.

In [4]:
random_product = np.random.choice(range(bilinks.Id.nunique()))
print("Random product: ", random_product)
N.find_neighbourhood(random_product)

Random product:  34044


([], [])

Get the Product's whole neighbourhood including direct neighbours and neighbourhood of common Customer.

In [5]:
N.get_neighbourhood(random_product)

([], [])

Receptive field is neighbourhood of K randomly selected neighbours. Neighbours can be selected multiple times.

In [6]:
for k in range(7):
    print(N.get_receptive_field(random_product,K=k))

None
None
None
None
None
None
None


Generate a random walk from Product.

In [7]:
N.generate_random_walk(143845)

[147830, 143845, 521846, 143845, 142912]

Importance pooling: selecting T (default 10) most important neighbours of Product by generating T random walks from T-hop neighbourhood.

In [8]:
print(N.importance_pooling(random_product,T=3))

{}


### **Feature table for Products (nodes)**

We decided to select different types of values: textual, categorical and numerical values.

In [9]:
columns = {
    "ID": ["Id"],
    "textual": ["Title"],
    "categorical": ["Group"],
    "numerical": ["Salesrank","AvgRating"]
}

In [10]:
Features = get_features(nodes,columns)

In [11]:
Features.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('int8')], dtype=object)

In [12]:
Features.shape # normaliseerimine? Standard scaler

(542664, 2004)

#### Additional features from Categories

We would like to add more information for nodes: categories

In [13]:
# df_cat = pd.read_csv('data/categories.csv')
# df_cat.head()

Each category path has its own unique CatId. We can represent each node as vectors representing which categories they belong to.

In [14]:
# df_cat["nbr"] = 1
# Features2 = pd.pivot_table(data=df_cat,index="Id",columns="CatId",values="nbr",fill_value=0)

In [15]:
# Features2.shape

In [16]:
# df_cat.Id.nunique()

In [17]:
# np.min(df_cat.CatId.values),np.max(df_cat.CatId.values)

In [18]:
# 542664-519781

### **Graph neural network**

**Architecture from Paper**

Input: 
- Set of nodes $\mathcal{M}\subset \mathcal{V}$ (minibatch from nodes $\mathcal{V}$);
- depth parameter $K$;
- neighbourhood function $\mathcal{N}:\mathcal{V}\rightarrow 2^{\mathcal{V}}$

Output:
- Embeddings $z_u, \forall u\in \mathcal{M}$

Sampling neighbourhoods for nodes in minibatch:
- $K$-th round consist of batchnodes: $\mathcal{S}^{(k)} \leftarrow \mathcal{M}$;
- for $k = K,\dots, 1$ do
  - $\mathcal{S}^{(k-1)}\leftarrow \mathcal{S}^{(k)}$
  - for $u\in \mathcal{S}^{(k)}$ do
    - $\mathcal{S}^{(k-1)}\leftarrow\mathcal{S}^{(k-1)}\cup \mathcal{N}(u)$ 
    - ($K-1$)-st round consist of $K$-th nodes and their neighbourhood nodes

Generating embeddings for nodes in minibatch:
- $h^{(0)}_u \leftarrow x_u \forall u\in \mathcal{S}^{(0)}$; init emb is feature vector $x_u$
- for $k = 1,\dots,K$ do
  - for $u\in \mathcal{S}^{(k)}$ do
    - $\mathcal{H}\leftarrow \big\{ h^{(k-1)}_v, \forall v\in \mathcal{N}(u) \big\}$
    - $h^{(k)}_u\leftarrow \text{convolve}^{(k)}\big( h^{(k-1)}_u,\mathcal{H} \big)$
- for $u\in \mathcal{M}$ do
  - $z_u\leftarrow G_2\cdot\text{ReLU}\big( G_1h^{(K)}_u+g \big)$

<hr>
<u>Convolve</u>:

Input:
- current embedding $z_u$ for node $u$;
- set of neighbour embeddings $\{ z_v|v\in\mathcal{N}(u) \}$ with set of neighbour importances **$\alpha$**;
- symmetric vector function $\gamma(\cdot)$

Output: 
- new embedding $z^{\text{NEW}}_u$ for node u

Generating an embedding:
- neighbourhood embedding: $n_u\leftarrow \gamma\big(\{ \text{ReLU}(Qh_v+q)|v\in\mathcal{N}(u) \}, \alpha \big)$;
- node $u$ embedding: $z^{\text{NEW}}_u \leftarrow \text{ReLU}\big(W\cdot \text{concat}(z_u,n_u)+w\big)$;
- normalized node $u$ emb: $z^{\text{NEW}}_u \leftarrow \frac{z^{\text{NEW}}_u}{\lVert z^{\text{NEW}}_u\rVert _2}$

In [19]:
import tensorflow as tf
from tensorflow.keras import Model
from keras.layers import Input, Dot
from tensorflow.keras.optimizers import Adam
from model import PinSAGE, Sampler, Embedder
from graph import BipartiteGraph
from tqdm import tqdm

G = BipartiteGraph(Features,bilinks,links)

In [20]:
def sampling(inputs, depth_K, pool_T):
    stacks, alphas = [],[]
    node_ids = tf.constant(inputs,dtype="int32")
    for k in tqdm(range(depth_K),
                  desc=f"Sampling neighbourhoods and importances for nodes"): # k = 0,...,K-1
        neigh_ids = tf.constant(G.pooling(node_ids,pool_T)[0]) # get neighbours
        alpha = tf.constant(G.pooling(node_ids,pool_T)[1]) # get importances
        node_ids = tf.constant(node_ids,shape=(node_ids.numpy().size,1),dtype="int32") # reshape nodes
        stack = tf.concat([node_ids,neigh_ids],axis=-1) # stack nodes with their neighbours
        stacks.append(stack)
        alphas.append(alpha)
        if k > 0:
            stacks[k] = tf.concat([stack,stacks[k-1]],axis=0)
            alphas[k] = tf.concat([alpha,alphas[k-1]],axis=0)
        node_ids = tf.concat(tf.unstack(neigh_ids, axis=1),axis=0) # flatten neighbours for next round nodes
    return stacks[-1], alphas[-1]

In [21]:
def generate_posneg_samples(G, node_ids, nbrOfPos, nbrOfNeg):
    targets, labels = [], []
    pos_samples, pos_weights = [], []
    neg_samples, neg_weights = [], []
    pos, posw = G.pooling(node_ids,pool_size=nbrOfPos)
    for idx in range(len(node_ids)):
        for i in range(len(pos[idx])):
            targets.append(node_ids[idx])
            pos_samples.append(pos[idx][i])
            pos_weights.append(posw[idx][i])
            labels.append(1)
        for j in range(nbrOfNeg):
            targets.append(node_ids[idx])
            neg_samples.append(np.random.choice(G.Features.Id.values))
            neg_weights.append(0.)
            labels.append(0)
    return np.array(targets), np.array(labels), \
        np.array(pos_samples+neg_samples), np.array(pos_weights + neg_weights)

In [22]:
def stack_over(stack, depth_K, alphas_tensor=False):
    if alphas_tensor: pool_T = stack.shape[1]
    else: pool_T = stack.shape[1]-1
    s = [int(stack.shape[0]*(pool_T-1)/(pool_T**(depth_K)-1))]
    for i in range(1,depth_K):
        s.append(s[i-1]*pool_T)
    s = [0]+s
    cums = np.cumsum(s)
    pieces = []
    for k in range(depth_K):
        pieces.append(tf.unstack(stack,axis=0)[cums[k]:cums[k+1]])
    for k in range(depth_K-1,0,-1):
        pieces[k-1] = tf.concat(
            [pieces[k-1]]
            +[tf.unstack(pieces[k],axis=0)[int(i*s[k+1]/pool_T):int((i+1)*s[k+1]/pool_T)] for i in range(pool_T)],axis=1)
    return pieces[k-1]

In [23]:
def stack_back(stack, pool_T, depth_K, alphas_tensor=False):
    if alphas_tensor: y = pool_T
    else: y = pool_T + 1
    pieces = [tf.concat(
        [tf.reshape(tf.unstack(stack,axis=1)[i],shape=(stack.shape[0],1)) for i in range((y)*j, (y)*(j+1))]
        ,axis=1) for j in range(int(stack.shape[1]/(y)))]
    s = [pieces[0]]
    for i in range((pool_T+1)):
        for j in range(depth_K+1):
            s.append(pieces[1+i+(pool_T+1)*j])
    return tf.concat(s,axis=0)

In [24]:
batch = G.random_subgraph(5)
stacks, alphas = sampling(batch,depth_K=3, pool_T=4)

2022-05-16 14:01:22.696462: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Finding neighbours and importances: 100%|██████████| 5/5 [00:01<00:00,  3.83it/s] ?it/s]
Finding neighbours and importances: 100%|██████████| 5/5 [00:00<00:00,  6.15it/s]
Finding neighbours and importances: 100%|██████████| 20/20 [00:03<00:00,  5.97it/s]4,  2.16s/it]
Finding neighbours and importances: 100%|██████████| 20/20 [00:03<00:00,  6.55it/s]
Finding neighbours and importances: 100%|██████████| 80/80 [00:11<00:00,  6.85it/s]4,  4.66s/it]
Finding neighbours and importances: 100%|██████████| 80/80 [00:11<00:00,  6.93it/s]
Sampling neighbourhoods and importances for nodes: 100%|██████████| 3/3 [00:31<00:00, 10.61s/it]


In [25]:
stacks.shape, alphas.shape

(TensorShape([105, 5]), TensorShape([105, 4]))

In [41]:
stack_over(stacks,depth_K=3).shape, stack_over(alphas,depth_K=3,alphas_tensor=True).shape

(TensorShape([40, 52]), TensorShape([40, 39]))

In [27]:
stack_back(stack_over(stacks,depth_K=3),pool_T=4,depth_K=3).shape,stack_back(stack_over(alphas,depth_K=3,alphas_tensor=True),pool_T=4,depth_K=3, alphas_tensor=True).shape

(TensorShape([105, 5]), TensorShape([105, 4]))

In [29]:
x = Embedder(G, 3, 4, 20)([stack_over(stacks,3),stack_over(alphas,3,alphas_tensor=True)])
x

<tf.Tensor: shape=(5, 1, 20), dtype=float32, numpy=
array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         1.47997394e-01, 0.00000000e+00, 5.77476807e-02, 3.78233850e-01,
         0.00000000e+00, 6.02768511e-02, 0.00000000e+00, 0.00000000e+00,
         2.13850021e-01, 0.00000000e+00, 1.13240167e-01, 5.71299940e-02]],

       [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 1.59668431e-01, 6.80349185e-05,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.01896435e-01,
         5.99330179e-02, 0.00000000e+00, 1.57962404e-02, 0.00000000e+00,
         1.73881471e-01, 1.28060440e-02, 8.67689103e-02, 0.00000000e+00]],

       [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         1.26779988e-01, 0.00000000e+00, 0.00000000e+00, 5.05898893e-02,
         9.57183540e-02, 0.00000000e+00, 8.88336450e-02, 4.5556631

In [30]:
batch = G.random_subgraph(10)
targets, labels, samples, weights = generate_posneg_samples(G, batch, 2, 2)
targets.size, labels.size, samples.size, weights.size

Finding neighbours and importances: 100%|██████████| 10/10 [00:00<00:00, 18.43it/s]


(40, 40, 40, 40)

Idea:
- Generate positive and negative samples for nodes with labels 1 and 0
- "target", "context", "weight", "label"
- write the loss function

In [31]:
stacks, alphas = sampling(targets,3,3)
stacks_s, alphas_s = sampling(samples,3,3)
targets, samples = [stacks, alphas], [stacks_s, alphas_s]

Finding neighbours and importances: 100%|██████████| 40/40 [00:02<00:00, 16.40it/s]it/s]
Finding neighbours and importances: 100%|██████████| 40/40 [00:02<00:00, 16.89it/s]
Finding neighbours and importances: 100%|██████████| 120/120 [00:08<00:00, 14.55it/s]  4.82s/it]
Finding neighbours and importances: 100%|██████████| 120/120 [00:07<00:00, 15.37it/s]
Finding neighbours and importances: 100%|██████████| 360/360 [00:25<00:00, 13.99it/s] 11.44s/it]
Finding neighbours and importances: 100%|██████████| 360/360 [00:25<00:00, 14.30it/s]
Sampling neighbourhoods and importances for nodes: 100%|██████████| 3/3 [01:11<00:00, 23.94s/it]
Finding neighbours and importances: 100%|██████████| 40/40 [00:03<00:00, 10.76it/s]it/s]
Finding neighbours and importances: 100%|██████████| 40/40 [00:11<00:00,  3.63it/s]
Finding neighbours and importances: 100%|██████████| 120/120 [00:13<00:00,  9.09it/s] 14.77s/it]
Finding neighbours and importances: 100%|██████████| 120/120 [00:09<00:00, 12.41it/s]
Finding 

In [32]:
stacks_s.shape, alphas_s.shape

(TensorShape([520, 4]), TensorShape([520, 3]))

In [44]:
targets = [stack_over(stacks,depth_K=3),stack_over(alphas,depth_K=3,alphas_tensor=True)]
samples = [stack_over(stacks_s,depth_K=3),stack_over(alphas_s,depth_K=3,alphas_tensor=True)]

In [45]:
for set in targets+samples: print(set.shape)

(40, 52)
(40, 39)
(40, 52)
(40, 39)


In [46]:
def create_dataset(targets, samples, labels, weights, batch_size):
    inputs = {
        "stack_t": targets[0],
        "alpha_t": targets[1],
        "stack_s": samples[0],
        "alpha_s": samples[1]
    }
    dataset = tf.data.Dataset.from_tensor_slices((inputs, labels, weights))
    dataset = dataset.shuffle(buffer_size=batch_size * 2)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

batch_size = 10
dataset = create_dataset(
    targets=targets,
    samples=samples,
    labels=labels,
    weights=weights,
    batch_size=batch_size,
)

Siiamaani töötab... saaks nüüd mudeli ka tööle

In [47]:
inputs = {
    "stack_t": Input(name="stack_t", shape=(), dtype="int32"),
    "alpha_t": Input(name="alpha_t", shape=(), dtype="int32"),
    "stack_s": Input(name="stack_s", shape=(), dtype="int32"),
    "alpha_s": Input(name="alpha_s", shape=(), dtype="int32")
}
target_emb = Embedder(G, 3, 3, 20)([inputs["stack_t"],inputs["alpha_t"]])
sample_emb = Embedder(G, 3, 3, 20)([inputs["stack_s"],inputs["alpha_s"]])

logits = Dot(axes=1, normalize=False, name="similarity")([target_emb, sample_emb])

pinsage = Model(inputs=inputs, outputs=logits)

StagingError: Exception encountered when calling layer "embedder_2" (type Embedder).

in user code:

    File "/Users/heidi/Documents/Andmeteadus/NetworkScience/Project/network_science/model.py", line 64, in call  *
        stack = stack_back(inputs[0],self.pool_T,self.depth_K)
    File "/Users/heidi/Documents/Andmeteadus/NetworkScience/Project/network_science/model.py", line 95, in stack_back  *
        pieces = [tf.concat(

    IndexError: list index out of range


Call arguments received:
  • inputs=['tf.Tensor(shape=(None,), dtype=int32)', 'tf.Tensor(shape=(None,), dtype=int32)']