# Imports

In [1]:
import gzip
import gc
import math
import json
from collections import Counter, defaultdict
import random
from tqdm.notebook import tqdm
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd

from stellargraph import StellarGraph
from scipy.sparse import load_npz

def jl_to_list(fname):
    output = []
    with gzip.open(fname, "rb") as f:
        for line in f:
            output.append(json.loads(line))
    return output

# Load user data

In [2]:
samples = None
test_size = .2
rows = jl_to_list("data/train_dataset.jl.gz")
if samples:
    rows = rows[:samples]

# Train-test split (revise)

In [3]:
val_size = .15
rows_train, rows_test = train_test_split(rows, test_size = test_size, random_state = 42)
print(f"Train data: {len(rows_train)}.")
print(f"Test data: {len(rows_test)}.")

Train data: 330530.
Test data: 82633.


# Function to return the count of users who searched item pairs

In [4]:
def create_dict(rows):
    counts = Counter()
    for i, row in tqdm(enumerate(rows)):
        viewed = [item["event_info"] for item in row["user_history"]
                  if item["event_type"] == "view"]

        for i, v1 in enumerate(viewed):
            for v2 in viewed[i + 1:]:
                counts[str(v1) + "_" + str(v2)] += 1
    
    return counts

In [5]:
c = create_dict(rows_train)
del rows
del rows_train
del rows_test

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




# Load item data

In [6]:
item_data = jl_to_list("data/item_data.jl.gz")
metadata = {x["item_id"] : x for x in item_data}

# Map a unique integer to each item for Stellar

In [7]:
ids_map = {}
for i, item_id in tqdm(enumerate(metadata.keys())):
    ids_map[str(item_id)] = i

del item_data
del metadata

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




# Function to get edges

In [8]:
def get_edges(counts, ids_map, thresh = 2):
    source = []
    target = []
    for ids, val in tqdm(counts.items()):
        if val >= thresh:
            indx = ids.index("_")
            id1 = ids_map[ids[:indx]]
            id2 = ids_map[ids[indx+1:]]
            source += [id1]
            target += [id2]
        
    return source, target

In [9]:
t = 2
source, target = get_edges(c, ids_map, t)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30115117.0), HTML(value='')))




# Load feature matrix (sparse)

In [10]:
item_feat = load_npz('data/item_feats_v2.npz')

# Convert feature matrix to dense

In [11]:
import sys

dtype = np.float32
item_feats = item_feat.toarray().astype(dtype)
print(sys.getsizeof(item_feats)/(10e9))

3.6192800944


In [12]:
item_feats[0]

array([0.        , 1.        , 0.        , ..., 0.        , 0.        ,
       0.04414384], dtype=float32)

# Create edges dataframe

In [13]:
edges = pd.DataFrame({"source": source, "target": target})

# Finally, create the graph

In [31]:
G = StellarGraph(item_feats, edges, is_directed = False, dtype = np.float32)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 2102277, Edges: 15322937

 Node types:
  default: [2102277]
    Features: float32 vector, length 4304
    Edge types: default-default->default

 Edge types:
    default-default->default: [15322937]
        Weights: all 1 (default)
        Features: none


# Lets start training! GraphSAGE Imports:

In [15]:
import networkx as nx
import pandas as pd
import numpy as np
import os
import random

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import accuracy_score

from stellargraph import globalvar

from stellargraph import datasets
from IPython.display import display, HTML

# Check if we are running on gpu

In [16]:
tf.debugging.set_log_device_placement(True)

# Place tensors on the GPU
with tf.device("/GPU:0"):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# Run on the GPU
C = tf.matmul(a, b)
print(C)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


In [32]:
nodes = list(G.nodes())
number_of_walks = 1
length = 5

# Make validation split

In [33]:
node_indexes = np.arange(len(nodes))
val_size = .15
nodes_train, nodes_val = train_test_split(node_indexes, test_size = val_size, random_state = 42)

In [34]:
unsupervised_samples_train = UnsupervisedSampler(
    G, nodes=nodes_train, length=length, number_of_walks=number_of_walks)

In [21]:
unsupervised_samples_val = UnsupervisedSampler(
    G, nodes=nodes_val, length=length, number_of_walks=number_of_walks)

In [20]:
print(nodes_train[:10])
print(nodes_val[:10])

[  66946  292836  120815 1076367 1971989  684678   30869  464270 1365193
 1027760]
[1473523  331569  446851 2060933   60144  139159  104776 1531393 1462210
  501768]


# More parameters

In [35]:
batch_size = 50
epochs = 1
num_samples = [12, 6]

In [36]:
generator = GraphSAGELinkGenerator(G, batch_size, num_samples)
train_gen = generator.flow(unsupervised_samples_train)
#val_gen = generator.flow(unsupervised_samples_val)

# Create GraphSAGE model

In [37]:
layer_sizes = [64, 64]
graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.25, normalize="l2")

In [38]:
x_inp, x_out = graphsage.in_out_tensors()

In [39]:
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [40]:
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=2e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# Its time to train!

In [None]:
with tf.device("/GPU:0"):
    history = model.fit(
        train_gen,
        epochs=epochs,
        verbose=1,
        use_multiprocessing=False,
        workers=4,
        shuffle=True)



In [41]:
x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

In [42]:
embedding_model.save("sage_v0")
print("Model saved bro.")

INFO:tensorflow:Assets written to: sage_v0/assets
Model saved bro.
