In [1]:
import numpy as np
import networkx as nx
from networkx.generators.random_graphs import erdos_renyi_graph, barabasi_albert_graph
import matplotlib.pyplot as plt
from pylab import *
import pandas as pd
from tqdm import tqdm

In [2]:
def gen_network(p, n, m, net_type, drop_prob=None, seed=34) :
    if net_type == 'erdos':
        g = erdos_renyi_graph(n=n, p=p, seed=seed)
    elif net_type == 'barabasi':
        g = barabasi_albert_graph(n=n, m=m, seed=seed)
    if drop_prob is not None:
        np.random.seed(seed)
        for e in g.edges:
            if np.random.random(size=1)<=drop_prob:
                g.remove_edge(e[0],e[1])
#    N_i_j = nx.to_numpy_matrix(g, dtype=np.int0)

    return [], g


In [16]:
NUMBER_OF_CUSTOMERS = 4121
NUMBER_OF_PRODUCTS = 6
SALES_PER_DAY = 202

In [37]:
%%time
net,grp = gen_network(p=0.0001,n=NUMBER_OF_CUSTOMERS, m=1, net_type='erdos', drop_prob=None, seed=34)


CPU times: user 349 ms, sys: 1.6 ms, total: 351 ms
Wall time: 353 ms


id, product, date

- For each Day
    - Select a bunch of Random customer (np.choice())
    - Select a bunch of Random products (np.choice())

In [38]:
grp.number_of_edges()

826

In [36]:
customers = np.arange(NUMBER_OF_CUSTOMERS)
products = np.arange(NUMBER_OF_PRODUCTS)
start_date = np.datetime64('2017-01-01')
end_date = np.datetime64('2018-01-01')
theCalender = np.arange(start_date, end_date)

In [37]:
start_date = np.datetime64('2017-01')
end_date = np.datetime64('2018-01')
m_delta = np.timedelta64(1, 'M')
d_delta = np.timedelta64(1, 'D')
np.random.seed(34)
y_d = 1
data = []
for m in np.arange(start_date, end_date, m_delta):
    diff_range = np.arange(m, m+m_delta, d_delta)
    diff_range_size = diff_range.size
    selectedCustomers = np.random.choice(customers, diff_range_size*SALES_PER_DAY, replace=False)
    selectedProducts = np.random.choice(products, diff_range_size*SALES_PER_DAY, replace=True)
    d_i = 1
    for d in diff_range:
        cust = selectedCustomers[(d_i-1)*SALES_PER_DAY:d_i*SALES_PER_DAY]
        prd = selectedProducts[(d_i-1)*SALES_PER_DAY:d_i*SALES_PER_DAY]
        o = np.vstack((cust, prd, np.repeat(y_d, SALES_PER_DAY))).T
        data.append(o)
        d_i += 1
        y_d += 1

data = np.concatenate(data)
df = pd.DataFrame(data, columns=("cust","prd","date"))


(0, 1) ->
5 common preferences out of 3 seems to be affected (<10 days, 3-8-8)

	cust-0 ~ cust-1 ~ prod-2 ==> size: 1, diffs:[207]
	cust-0 ~ cust-1 ~ prod-3 ==> size: 1, diffs:[10]
	cust-0 ~ cust-1 ~ prod-4 ==> size: 1, diffs:[65]
	cust-0 ~ cust-1 ~ prod-6 ==> size: 2, diffs:[ 25 258]
	cust-0 ~ cust-1 ~ prod-8 ==> size: 9, diffs:[ 2 10 27]
(0, 2) ->

4 common preferences out of 0 seems to be affected

	cust-0 ~ cust-2 ~ prod-1 ==> size: 6, diffs:[14 25 78]
	cust-0 ~ cust-2 ~ prod-3 ==> size: 2, diffs:[ 49 112]
	cust-0 ~ cust-2 ~ prod-4 ==> size: 1, diffs:[221]
	cust-0 ~ cust-2 ~ prod-8 ==> size: 3, diffs:[140 209 246]


In [31]:
df

Unnamed: 0,cust,prd,date
0,945321,2,1
1,3170671,13,1
2,2404315,13,1
3,3679524,11,1
4,3816113,2,1
...,...,...,...
7295,5164296,1,365
7296,1612885,5,365
7297,3431773,10,365
7298,5008232,2,365


In [33]:
G = nx.Graph()

for u,v in tqdm(grp.edges()):
    for p in range(0,NUMBER_OF_PRODUCTS):
        c1_sales = df[(df['cust']==u) & (df['prd']==p)]['date'].ravel()
        c2_sales = df[(df['cust']==v) & (df['prd']==p)]['date'].ravel()
        if c1_sales.size>0 and c2_sales.size>0:
            sales_likely = [diff for diff in [np.abs(c1-c2) for c1 in c1_sales for c2 in c2_sales] if diff < 64]
            likely_size = len(sales_likely)
            if likely_size>=1:
                print(f"{u},{v} added")
                G.add_edge(u,v)

nx.draw(G, with_labels=True)

  0%|                                                                                                                                                                          | 4431/6999999 [01:49<47:51:55, 40.60it/s]


KeyboardInterrupt: 

In [11]:
print(len(grp.edges()),"==>",len(G.edges()))
for u,v in tqdm(G.edges()):
    print(f"{u} <==> {v}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 153/153 [00:00<00:00, 6375.27it/s]

6999 ==> 153
2 <==> 19
19 <==> 100
19 <==> 1349
19 <==> 3839
3 <==> 105
3 <==> 732
3 <==> 1453
3 <==> 2767
3 <==> 6314
7 <==> 48
7 <==> 197
7 <==> 2475
11 <==> 383
11 <==> 1043
11 <==> 1426
12 <==> 633
17 <==> 77
77 <==> 4045
24 <==> 111
24 <==> 559
24 <==> 1562
36 <==> 118
36 <==> 720
36 <==> 767
36 <==> 4512
37 <==> 4744
39 <==> 3826
44 <==> 6464
56 <==> 871
60 <==> 6862
62 <==> 2390
66 <==> 124
82 <==> 3501
85 <==> 1225
93 <==> 832
114 <==> 5527
114 <==> 6681
120 <==> 1750
152 <==> 352
152 <==> 5810
164 <==> 5195
176 <==> 623
183 <==> 4242
201 <==> 3641
210 <==> 834
227 <==> 1029
228 <==> 2286
251 <==> 5534
255 <==> 1434
260 <==> 5531
276 <==> 5666
280 <==> 2570
283 <==> 5627
286 <==> 307
291 <==> 803
300 <==> 4101
300 <==> 6953
304 <==> 861
312 <==> 3246
320 <==> 5673
322 <==> 6238
326 <==> 3729
336 <==> 3195
340 <==> 610
610 <==> 1189
343 <==> 1165
350 <==> 4058
350 <==> 4537
393 <==> 1249
410 <==> 1800
414 <==> 3374
414 <==> 4065
474 <==> 4637
513 <==> 5471
524 <==> 6233
552 <==>




# Multiprocessor

In [38]:
%%time
from joblib import Parallel, delayed
import numpy as np
import time

def rate_customers(edges,product_size,sales_df, G):
    G = nx.Graph()
    for u,v in tqdm(edges):
        for p in range(0,product_size):
            c1_sales = sales_df[(sales_df['cust']==u) & (sales_df['prd']==p)]['date'].ravel()
            c2_sales = sales_df[(sales_df['cust']==v) & (sales_df['prd']==p)]['date'].ravel()
            if c1_sales.size>0 and c2_sales.size>0:
                sales_likely = [diff for diff in [np.abs(c1-c2) for c1 in c1_sales for c2 in c2_sales] if diff < 64]
                likely_size = len(sales_likely)
                if likely_size>=1:
                    G.add_edge(u,v)
    return G

all_edges=[(u,v) for u,v in grp.edges()]

start = time.perf_counter()
edge_clusters = [all_edges[i*1000: (i+1)*1000] for i in range(7)]
results = Parallel(n_jobs=8,verbose=10)\
    (delayed(rate_customers)(edge_cluster, NUMBER_OF_PRODUCTS, df, G) for edge_cluster in edge_clusters)
H = nx.compose_all(results)
end = time.perf_counter()
print(f'Finished with pool in {round(end-start, 2)} second(s)') 

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed: 55.8min
[Parallel(n_jobs=8)]: Done   2 out of   7 | elapsed: 55.8min remaining: 139.5min
[Parallel(n_jobs=8)]: Done   3 out of   7 | elapsed: 55.9min remaining: 74.5min
[Parallel(n_jobs=8)]: Done   4 out of   7 | elapsed: 55.9min remaining: 41.9min
[Parallel(n_jobs=8)]: Done   5 out of   7 | elapsed: 55.9min remaining: 22.4min


Finished with pool in 3358.56 second(s)
Wall time: 56min 34s


[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed: 56.0min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed: 56.0min finished


In [39]:
print(len(grp.edges()),"==>",len(H.edges()))
for u,v in tqdm(H.edges()):
    print(f"{u} <==> {v}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 6106.02it/s]

6999999 ==> 137
1 <==> 4850
1 <==> 5779
1 <==> 13775
1 <==> 40708
1 <==> 46466
1 <==> 61327
1 <==> 70424
1 <==> 72250
1 <==> 76753
1 <==> 93491
1 <==> 117969
1 <==> 127005
1 <==> 130595
1 <==> 170469
1 <==> 186892
1 <==> 189427
1 <==> 194092
1 <==> 202040
1 <==> 249575
1 <==> 274509
1 <==> 323876
1 <==> 361660
1 <==> 399525
1 <==> 420710
1 <==> 459213
1 <==> 529248
1 <==> 543024
1 <==> 557373
1 <==> 561523
1 <==> 597083
1 <==> 682537
1 <==> 784520
1 <==> 809585
1 <==> 898041
1 <==> 1069368
1 <==> 1144120
1 <==> 1147077
1 <==> 1164442
1 <==> 1210829
1 <==> 1270800
1 <==> 1365848
1 <==> 1444751
1 <==> 1693278
1 <==> 1729239
1 <==> 1823870
1 <==> 1837944
1 <==> 1886913
1 <==> 1901337
1 <==> 1942128
1 <==> 2358206
1 <==> 2630221
1 <==> 2716583
1 <==> 2744285
1 <==> 2774531
1 <==> 2790370
1 <==> 3011031
1 <==> 3050028
1 <==> 3080679
1 <==> 3213327
1 <==> 3256780
1 <==> 3351670
1 <==> 3423921
1 <==> 3464962
1 <==> 3718716
1 <==> 4082197
1 <==> 4084805
1 <==> 4441692
1 <==> 4459387
1 <==> 455


