In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
import random
import numpy as np
import scipy as sp
import pandas as pd
import networkx
from collections import defaultdict

from src.utils.logger import logger
from src.utils.io_utils import save_model, load_model
from src.prep.prep_edgelist import create_product_pair, get_relationship_weights, split_product_pair

ModuleNotFoundError: No module named 'src.prep.prep_edgelist'

In [None]:
N_NODES = 5
SAMPLES_PER_NODE = 10
SEQUENCE_LEN = 10

In [None]:
G = networkx.gnp_random_graph(5, 0.5)
G.add_edge(1, 2, weight=7)

In [None]:
G.edges

In [None]:
G.nodes

In [None]:
A = networkx.adj_matrix(G)

In [None]:
A = A.todense()

In [None]:
A = np.array(A, dtype=np.float64)

In [None]:
A

In [None]:
np.sum(A, axis=1)

In [None]:
D = np.diag(np.sum(A, axis=0))
D

In [None]:
T = np.dot(np.linalg.inv(D), A)
T

In [None]:
T[0]

In [None]:
networkx.write_weighted_edgelist(G, 'test.edgelist')

### Create graph

In [3]:
# Weight for each relationship
BOUGHT_TOGETHER_WEIGHT = 1.2
ALSO_BOUGHT_WEIGHT = 1
ALSO_VIEWED_WEIGHT = 0.5

In [56]:
df = pd.read_csv('../data/books_nodes_and_edges.csv')
logger.info('Nodes and edges shape: {}'.format(df.shape))

2019-11-11 17:45:03,647 - Nodes and edges shape: (53783446, 3)


#### Sampling for development

In [57]:
sample_idx = np.random.choice(df.shape[0], size=100000, replace=False)

In [58]:
df = df.iloc[sample_idx]
logger.info('Relationship distribution: \n{}'.format(df['relationship'].value_counts()))

2019-11-11 17:45:08,561 - Relationship distribution: 
also_bought        93761
also_viewed         4736
bought_together     1503
Name: relationship, dtype: int64


#### End sampling

In [59]:
x = df[['asin', 'related']].values

In [60]:
x

array([['B00AK3MY8I', '1401604730'],
       ['B009XLZIUE', 'B00AJQ2YUO'],
       ['2090338490', '2218065916'],
       ...,
       ['B00FDZK4MG', 'B00F3HJFBA'],
       ['1571103333', '0805845143'],
       ['1490936653', '1500159050']], dtype=object)

In [61]:
x.sort(axis=1)

In [62]:
df['product_pair'] = ['|'.join(arr) for arr in x]

### Update weights based on manual input

In [63]:
df['weight'] = 0
df.loc[df['relationship'] == 'bought_together', 'weight'] += BOUGHT_TOGETHER_WEIGHT
df.loc[df['relationship'] == 'also_bought', 'weight'] += ALSO_BOUGHT_WEIGHT
df.loc[df['relationship'] == 'also_viewed', 'weight'] += ALSO_VIEWED_WEIGHT

In [64]:
df_agg = df.groupby('product_pair').agg({'weight': 'sum'}).reset_index()

In [65]:
def split_product_pair(product_pair):
    result = product_pair.split('|')
    return result[0], result[1]

In [66]:
df_agg['product1'], df_agg['product2'] = zip(*df_agg['product_pair'].apply(split_product_pair))

In [67]:
df_agg = df_agg[['product1', 'product2', 'weight']]

In [68]:
df_agg.shape

(99960, 3)

In [69]:
df_agg.to_csv('../data/books_edges.edgelist', sep=' ', index=False, header=False)

### Check

In [3]:
df = pd.read_csv('../data/books_edges_train.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,product1,product2,weight
0,0,0000013714,0002877813,2.0
1,1,0000013714,0005064295,3.2
2,2,0000013714,0005064309,2.0
3,3,0000013714,0005064341,3.2
4,4,0000013714,0005080789,4.4
...,...,...,...,...
21277474,26596843,B00L3BZ410,B00L3D35KU,1.0
21277475,26596844,B00L3BZ410,B00LH1APNI,2.0
21277476,26596845,B00L6GYHUQ,B00L6H1XIE,1.0
21277477,26596846,B00L6GYYCC,B00L6H1XIE,1.0


### Load graph

In [70]:
G = networkx.read_weighted_edgelist('../data/books_edges.edgelist')

In [71]:
logger.info('No of nodes ({:,}) and edges ({:,})'.format(G.number_of_nodes(), G.number_of_edges()))

2019-11-11 17:45:10,405 - No of nodes (170,847) and edges (99,960)


In [72]:
node_dict = {i: key for i, key in enumerate(G.nodes.keys())}

### Create transition matrix

In [73]:
adjacency_mat = networkx.adj_matrix(G)

In [74]:
adjacency_mat.shape

(170847, 170847)

In [75]:
adjacency_mat

<170847x170847 sparse matrix of type '<class 'numpy.float64'>'
	with 199920 stored elements in Compressed Sparse Row format>

In [76]:
degree_vector = sp.sparse.csr_matrix(1/np.sum(adjacency_mat, axis=0))

In [77]:
degree_vector

<1x170847 sparse matrix of type '<class 'numpy.float64'>'
	with 170847 stored elements in Compressed Sparse Row format>

In [78]:
transition_matrix = adjacency_mat.multiply(degree_vector).T  # Need to transpose so each row probability sum to 1

### Create transition dict

In [27]:
transition_dict = defaultdict(lambda: defaultdict(list))

In [28]:
rows, cols = transition_matrix.nonzero()

In [29]:
for row, col in zip(rows, cols):
    transition_dict[row]['product'].append(col)
    transition_dict[row]['probability'].append(transition_matrix[row, col])

In [50]:
def run_defaultdict():
    transition_dict = defaultdict(lambda: defaultdict(list))
    rows, cols = transition_matrix.nonzero()
    
    for row, col in zip(rows, cols):
        transition_dict[row]['product'].append(col)
        transition_dict[row]['probability'].append(transition_matrix[row, col])

In [51]:
def run_setdefault():
    transition_dict = defaultdict(dict)
    rows, cols = transition_matrix.nonzero()
    
    prev_row = -1
    for row, col in zip(rows, cols):
        if row != prev_row:
            transition_dict[row].setdefault('product', [])
            transition_dict[row].setdefault('probability', [])

        transition_dict[row]['product'].append(col)
        transition_dict[row]['probability'].append(transition_matrix[row, col])

In [47]:
transition_dict = defaultdict(dict)

In [48]:
rows, cols = transition_matrix.nonzero()

In [49]:
prev_row = -1
for row, col in zip(rows, cols):
    if row != prev_row:
        transition_dict[row].setdefault('product', [])
        transition_dict[row].setdefault('probability', [])
        
    transition_dict[row]['product'].append(col)
    transition_dict[row]['probability'].append(transition_matrix[row, col])

In [81]:
def run_defaultdict():
    transition_dict = defaultdict(lambda: defaultdict(list))
    rows, cols = transition_matrix.nonzero()
    
    for row, col in zip(rows, cols):
        transition_dict[row]['product'].append(col)
        transition_dict[row]['probability'].append(transition_matrix[row, col])
        
    return transition_dict

In [114]:
def run_setdefault():
    transition_dict = {}
    rows, cols = transition_matrix.nonzero()
    
    prev_row = -1
    for row, col in zip(rows, cols):
        if row != prev_row:
            transition_dict.setdefault(row, {})
            transition_dict[row].setdefault('product', [])
            transition_dict[row].setdefault('probability', [])

        transition_dict[row]['product'].append(col)
        transition_dict[row]['probability'].append(transition_matrix[row, col])
        prev_row = row
        
    return transition_dict

In [83]:
%timeit run_defaultdict()

4.01 s ± 86.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [115]:
%timeit run_setdefault()

3.88 s ± 55.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [116]:
transition_dict = run_setdefault()

In [99]:
idx = 25
df_agg[(df_agg['product1'] == node_dict[idx]) | (df_agg['product2'] == node_dict[idx])]

Unnamed: 0,product1,product2,weight
12,000481259X,471076333,1.0
25587,0471076333,781760038,1.0


### Random walk sampling

In [121]:
N_NODES = len(node_dict)
SAMPLES_PER_NODE = 5
SEQUENCE_LEN = 5

In [122]:
len(node_dict)

170847

In [123]:
sample_array = np.zeros((N_NODES * SAMPLES_PER_NODE, SEQUENCE_LEN), dtype=int)
logger.info('Samples shape: {}'.format(sample_array.shape))

2019-11-11 17:58:48,941 - Samples shape: (854235, 5)


In [124]:
# def np_rand():
#     for node_idx in range(N_NODES):
#         for sample_idx in range(SAMPLES_PER_NODE):
#             node = node_idx
#             for seq_idx in range(SEQUENCE_LEN):
#                 sample_array[node_idx*SAMPLES_PER_NODE + sample_idx, seq_idx] = node
#                 node = np.random.choice(a=transition_dict[node]['product'], p=transition_dict[node]['probability'])

In [125]:
# Much much faster
# np.random.choice: 10.4 s ± 95.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# random.choices: 1.65 s ± 33.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
def get_sample_array():
    sample_array = np.zeros((N_NODES * SAMPLES_PER_NODE, SEQUENCE_LEN), dtype=int)
    logger.info('Samples shape: {}'.format(sample_array.shape))

    for node_idx in range(N_NODES):
        if node_idx % 50000 == 0:
            logger.info('Getting samples for node: {:,}/{:,}'.format(node_idx, N_NODES))
        for sample_idx in range(SAMPLES_PER_NODE):
            node = node_idx
            for seq_idx in range(SEQUENCE_LEN):
                sample_array[node_idx*SAMPLES_PER_NODE + sample_idx, seq_idx] = node
                node = random.choices(population=transition_dict[node]['product'], weights=transition_dict[node]['probability'], k=1)[0]
                
    return sample_array

In [126]:
sample_array = get_sample_array()

2019-11-11 17:58:50,353 - Samples shape: (854235, 5)
2019-11-11 17:58:50,354 - Getting samples for node: 0/170,847
2019-11-11 17:58:54,586 - Getting samples for node: 50,000/170,847
2019-11-11 17:58:58,945 - Getting samples for node: 100,000/170,847
2019-11-11 17:59:03,487 - Getting samples for node: 150,000/170,847


In [36]:
np.save('../data/books_sequences.npy', sample_array)

In [37]:
save_model(node_dict, '../data/books_node_dict.tar.gz')

2019-11-11 17:29:36,832 - Model saved to ../data/books_node_dict.tar.gz


In [100]:
save_model(transition_dict, '../data/books_transition_dict.tar.gz')

2019-11-11 17:50:18,294 - Model saved to ../data/books_transition_dict.tar.gz


In [None]:
sample_array

### Checks

In [118]:
check_arr = np.load('../data/books_sequences.npy')

In [120]:
check_arr

array([[     0,      1,      0, ...,      1,      0,      1],
       [     0,      1,      0, ...,      1,      0,      1],
       [     0,      1,      0, ...,      1,      0,      1],
       ...,
       [170846, 170845, 170846, ..., 170845, 170846, 170845],
       [170846, 170845, 170846, ..., 170845, 170846, 170845],
       [170846, 170845, 170846, ..., 170845, 170846, 170845]])

In [128]:
(check_arr == sample_array).sum()

  """Entry point for launching an IPython kernel.


AttributeError: 'bool' object has no attribute 'sum'

In [130]:
sample_array.shape[0] * sample_array.shape[1]

4271175

In [131]:
node_check = load_model('../data/books_node_dict.tar.gz')

2019-11-11 18:00:19,768 - Model loaded from: ../data/books_node_dict.tar.gz (Size: 20106661 bytes)


In [134]:
len(node_check)

170847

In [140]:
transition_check = load_model('../data/books_transition_dict.tar.gz')

2019-11-11 18:01:45,490 - Model loaded from: ../data/books_transition_dict.tar.gz (Size: 96295336 bytes)


In [138]:
transition_check = None

In [141]:
transition_check

{0: {'product': [1], 'probability': [1.0]},
 1: {'product': [0], 'probability': [1.0]},
 2: {'product': [3], 'probability': [1.0]},
 3: {'product': [2], 'probability': [1.0]},
 4: {'product': [5], 'probability': [1.0]},
 5: {'product': [4], 'probability': [1.0]},
 6: {'product': [7], 'probability': [1.0]},
 7: {'product': [6], 'probability': [1.0]},
 8: {'product': [9], 'probability': [1.0]},
 9: {'product': [8], 'probability': [1.0]},
 10: {'product': [11], 'probability': [1.0]},
 11: {'product': [10, 12879],
  'probability': [0.3333333333333333, 0.6666666666666666]},
 12: {'product': [13], 'probability': [1.0]},
 13: {'product': [12], 'probability': [1.0]},
 14: {'product': [15], 'probability': [1.0]},
 15: {'product': [14], 'probability': [1.0]},
 16: {'product': [17], 'probability': [1.0]},
 17: {'product': [16], 'probability': [1.0]},
 18: {'product': [19], 'probability': [1.0]},
 19: {'product': [18], 'probability': [1.0]},
 20: {'product': [21], 'probability': [1.0]},
 21: {'pro