# Data Science for Networks

In [None]:
%matplotlib inline

import os
import pandas as pd
import matplotlib.pyplot as plt
import time

#### Stock pre-processing

In [None]:
def _date_parser(x):
    """Convert ``x`` to pandas datetime(s), expecting the ``YYYY-MM-DD`` layout.

    ``x`` is handed straight to ``pd.to_datetime``; the return value is
    whatever that call produces for the given input.
    """
    iso_day_format = '%Y-%m-%d'
    parsed = pd.to_datetime(x, format=iso_day_format, yearfirst=True)
    return parsed

# Directory that holds one price file per stock.
data_dir = 'historical-prices'
# Directory listing in sorted order, so later positional ids are reproducible.
files = sorted(os.listdir(data_dir))
# Number of files found (displayed as the cell output).
len(files)

Retain only the stocks whose closing-price series is complete over the selected period (the code below keeps 2017 only, i.e. series with exactly 251 rows).

In [None]:
# Number of rows a series must have to count as complete (503 for 2016+2017).
desired_size = 251
stock_series = []
for fname in files:
    csv_path = os.path.join(data_dir, fname)
    prices = pd.read_csv(csv_path, header=0, index_col=0, parse_dates=True,
                         date_parser=_date_parser)
    # 'Close' column restricted to the rows labelled 2017, named after the
    # part of the file name that precedes the first underscore.
    closes = prices.loc['2017':'2017', 'Close']
    closes.name = fname.split('_')[0]
    # Skip incomplete series.
    if closes.size != desired_size:
        continue
    stock_series.append(closes)
# Number of series retained (displayed as the cell output).
len(stock_series)

Compute each stock's relative change with respect to the previous day.

In [None]:
# Absolute day-over-day change of every retained series.
# NOTE(review): `diffs` is not read again in the visible notebook.
diffs = [series.diff() for series in stock_series]

# Relative day-over-day change; the first entry of each series has no
# predecessor, so it is dropped.
pdiffs = []
for series in stock_series:
    relative = series.diff() / series.shift(1)
    pdiffs.append(relative[1:])
# NOTE(review): this discards the last series in the list - confirm that
# dropping one stock here is intended.
pdiffs = pdiffs[:-1]

# Two-way lookup between a series name and an integer id. Ids are taken from
# the current size of `stock_to_id`, so they follow the order of `pdiffs` as
# long as every name is distinct.
stock_to_id, id_to_stock = {}, {}
for series in pdiffs:
    ticker = series.name
    stock_to_id[ticker] = len(stock_to_id)
    id_to_stock[len(stock_to_id) - 1] = ticker

#### Define tensor and decompose

In [None]:
# Tensor shape: one row and one column per series in `pdiffs`, and one slice
# per entry of the first series.
n_series = len(pdiffs)
r_len = n_series
c_len = n_series
t_len = pdiffs[0].size
(r_len, c_len, t_len)

Dense NumPy implementation. Open question: does the full `r_len × c_len × t_len` array consume too much memory?

In [None]:
import numpy as np

In [None]:
# Dense co-movement tensor: X[i, j, t] is 1 when series i and series j
# (i != j) both have a relative change greater than `tr` at position t, and 0
# otherwise. The tensor is symmetric in (i, j) and its diagonal is zero.
# NOTE(review): `tr` (the threshold each relative change is compared against)
# is not assigned anywhere in the visible notebook; it must be defined in an
# earlier cell before this one is run.
X = np.zeros(shape=(r_len, c_len, t_len))

start_time = time.time()

# Stack the series into one (n_series, t_len) array and threshold it once,
# instead of doing two scalar `.iloc` lookups for every (i, j, t) triple.
above = np.vstack([np.asarray(series) for series in pdiffs]) > tr

for t in range(t_len):
    # Outer product of the boolean column with itself is True exactly where
    # both series exceed the threshold; assignment stores it as 0.0 / 1.0.
    X[:, :, t] = np.outer(above[:, t], above[:, t])

# A series is never paired with itself.
diag = np.arange(r_len)
X[diag, diag, :] = 0

print('elapsed {} secs'.format(int(time.time() - start_time)))

In [None]:
import tensorly as tl
from tensorly.decomposition import parafac as par
from tensorly.decomposition import non_negative_parafac as nnpar

# Select tensorly's 'numpy' backend for the dense experiment.
tl.set_backend('numpy')

# Wrap the dense array X as a tensorly tensor; the decomposition loop reads NPX.
NPX = tl.tensor(X)

In [None]:
# For every r from 1 to t_len, time the plain decomposition and then the
# non-negative one. Both timings are printed with the same label, in that
# order, and A, B, C keep only the result of the most recent call.
# NOTE(review): unpacking into A, B, C assumes each call returns exactly three
# items - confirm against the installed tensorly version.
for r in range(1, t_len + 1):
    for decompose in (par, nnpar):
        start_time = time.time()
        A, B, C = decompose(NPX, r)
        print('elapsed: ', int(time.time() - start_time))

Create a sparse Dictionary Of Keys (DOK) tensor (see also `scipy.sparse.dok_matrix`).

In [None]:
import sparse

# The string literal below is never assigned or used; it is kept only as a
# usage example of filling a DOK tensor and converting it to COO.
'''
EXAMPLE
X = sparse.DOK(shape=(len(filtered_users), len(filtered_tags), len(time)))
for user, tag, ts in filtered_mentions:
    X[filtered_users[user], filtered_tags[tag], time[ts]] = filtered_mentions[user, tag, ts]
X = X.to_coo()
'''

# Empty DOK tensor with shape (r_len, c_len, t_len). This rebinds the name X.
# NOTE(review): the population cell creates X again with the same call.
X = sparse.DOK(shape=(r_len, c_len, t_len))

Populate the sparse tensor (sequentially).

In [None]:
# Sparse co-movement tensor: X[i, j, t] is 1 when series i and series j
# (i != j) both have a relative change greater than `tr` at position t. Every
# other entry is left unset, which a DOK tensor already treats as zero, so no
# explicit zeros are written.
# NOTE(review): `tr` (the threshold each relative change is compared against)
# is not assigned anywhere in the visible notebook; it must be defined in an
# earlier cell before this one is run.
X = sparse.DOK(shape=(r_len, c_len, t_len))
start_time = time.time()

# Threshold all series once: boolean array of shape (n_series, t_len).
above = np.vstack([np.asarray(series) for series in pdiffs]) > tr

for t in range(t_len):
    # Ids of the series above the threshold at position t, as plain ints.
    movers = np.flatnonzero(above[:, t]).tolist()
    # Visit each unordered pair of movers once and store both orientations.
    for pos, i in enumerate(movers):
        for j in movers[:pos]:
            X[i, j, t] = 1
            X[j, i, t] = 1

print('elapsed {} secs'.format(int(time.time() - start_time)))

# COO copy of the populated tensor, used by the decomposition cells.
XCOO = X.to_coo()

In [None]:
# Convert the DOK tensor to COO; this repeats the conversion already done at
# the end of the population cell.
XCOO = X.to_coo()

In [None]:
# Display the shape of the COO tensor.
XCOO.shape

In [None]:
import tensorly as tl
# NOTE(review): confirm that 'sparse' is a backend name accepted by the
# installed tensorly version.
tl.set_backend('sparse')

import tensorly.contrib.sparse as tlsp
from tensorly.contrib.sparse.decomposition import parafac
#from tensorly.sparse.decomposition import parafac

# Wrap the COO tensor for tensorly's sparse routines. This rebinds X, which
# until now referred to the DOK tensor.
X = tlsp.tensor(XCOO)
X

Decompose the tensor, evaluating the running time and the MSE for a given number **R** of components.

In [None]:
# For every r from 1 to t_len, time one sparse decomposition. A, B, C are
# overwritten on each pass, so after the loop they hold the r == t_len result.
# NOTE(review): unpacking into A, B, C assumes the call returns exactly three
# items - confirm against the installed tensorly version.
for r in range(1, t_len + 1):
    start_time = time.time()
    factors = parafac(X, r)
    A, B, C = factors
    elapsed = time.time() - start_time
    print('elapsed: ', int(elapsed))

In [None]:
# For each of the first five factors: plot column `factor` of C, then list
# five entries from the matching column of A and of B.
for factor in range(5):
    plt.plot(list(C[:, factor]))
    plt.title('Factor {}'.format(factor))
    # The A listing is followed by a '---' line, the B listing by an empty one.
    for loadings, separator in ((A, '---'), (B, '')):
        # First five indices of the descending argsort of the dense column
        # (presumably the five largest loadings - assumes a 1-D column).
        strongest = loadings[:, factor].todense().argsort()[::-1][:5]
        for idx in strongest:
            name = id_to_stock[idx]
            # NOTE(review): the name is printed twice per entry (once on the
            # detail line, once on the running line) - confirm this is wanted.
            print(idx, name, loadings[idx, factor])
            print(name, end=' ')
        print(separator)
    plt.show()
    print()
    print()