# LN - Data PP - Stability and efficiency calculations

## Import libraries and parameters

In [None]:
import sqlite3
import numpy as np
import pandas as pd
import networkx as nx
import itertools
#import matplotlib.pyplot as plt
import time
import pickle

import os
import re
import sys
import io
import random
from itertools import islice
import math

from tqdm.notebook import trange, tqdm
#from tqdm.notebook import trange
#from tqdm import tqdm_notebook as tqdm
from time import sleep

from dask_cloudprovider import FargateCluster
from dask.distributed import Client
import dask.array as da
import dask
dask.config.set({'distributed.scheduler.allowed-failures': 50}) 


import boto3


In [2]:
# Define parameters

bucket='ln-strategy-data'
extraction_id=1587447789
#extraction_id=1585344554

## Connection to AWS Resources

### S3

In [3]:
# Initiate s3 resource

session = boto3.session.Session()
s3 = session.resource('s3')

### Fargate Clusters

In [146]:
#cluster = FargateCluster(n_workers=100,scheduler_timeout='20 minutes',image='dsrincon/dask-graph:nx-scipy-v1',scheduler_cpu=4096,scheduler_mem=16384,worker_mem=32768)
cluster = FargateCluster(n_workers=20,scheduler_timeout='10 minutes',image='dsrincon/dask-graph:nx-scipy-v1',scheduler_cpu=4096,scheduler_mem=16384)

In [147]:
cluster

VBox(children=(HTML(value='<h2>FargateCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n  …

In [149]:
client = Client(cluster)
#cluster=Client('tcp://18.234.80.68:8786')


python
+---------------------------+---------------+
|                           | version       |
+---------------------------+---------------+
| client                    | 3.7.3.final.0 |
| scheduler                 | 3.7.4.final.0 |
| tcp://172.31.10.138:46825 | 3.7.4.final.0 |
| tcp://172.31.11.169:42625 | 3.7.4.final.0 |
| tcp://172.31.19.210:41527 | 3.7.4.final.0 |
| tcp://172.31.2.56:33483   | 3.7.4.final.0 |
| tcp://172.31.20.202:34557 | 3.7.4.final.0 |
| tcp://172.31.21.138:37815 | 3.7.4.final.0 |
| tcp://172.31.22.25:38343  | 3.7.4.final.0 |
| tcp://172.31.26.50:41149  | 3.7.4.final.0 |
| tcp://172.31.27.64:37437  | 3.7.4.final.0 |
| tcp://172.31.29.168:42035 | 3.7.4.final.0 |
| tcp://172.31.29.26:37969  | 3.7.4.final.0 |
| tcp://172.31.29.41:44035  | 3.7.4.final.0 |
| tcp://172.31.30.139:36575 | 3.7.4.final.0 |
| tcp://172.31.32.196:40005 | 3.7.4.final.0 |
| tcp://172.31.32.50:46365  | 3.7.4.final.0 |
| tcp://172.31.33.2:45037   | 3.7.4.final.0 |
| tcp://172.31.37.153:4649

## Define Utility functions

### Save Data

**FUNCTION: Write output to DataFrame**

In [4]:
# Function write output to DataFrame

"""
Function
--------
add_columns
    Function that takes an output from a decision comparisson computation and adds it's results for nodes 1 and 0 in the main DataFrame

Parameters
----------
output_diclist : list
    Dictionary of the form (node0_dic_i,node1_dic_i) where i runs for all of the blocks being compared. 

original_df: Pandas DataFrame
    Original DataFrame containing the opening and closure information for each channel, with a column named 'short_channel_id' to denote 
    id of channel. 

column_name_node0: string
    Name for column in dataframe where the results will be stored for node 0
    
column_name_node1: string
    Name for column in dataframe where the results will be stored for node 1

    
Returns
-------
no_changes: list
    List with the 'short_channel_id' of the channels edited. 
 
"""


def add_columns(output_diclist,original_df,column_name_node0,column_name_node1):


    # Merge individual dictionaries into one for each node
    node0_dic={}
    node1_dic={}
    for dic_tuple in output_diclist:
        node0_dic.update(dic_tuple[0])
        node1_dic.update(dic_tuple[1])
    
    # Add to DataFrame

    # Create empty columns
    original_df[column_name_node0]=np.nan
    original_df[column_name_node1]=np.nan

    # Populate df with values
    original_df[column_name_node0]=original_df['short_channel_id'].map(node0_dic)
    original_df[column_name_node1]=original_df['short_channel_id'].map(node1_dic)
    
    # Calculate values changed
    rows_edited=(original_df[original_df[column_name_node0].notnull()]['short_channel_id']).tolist()
    
    return rows_edited


**FUNCTION: Save python object to S3 using pickle**

In [5]:
# Function to write pickle data to S3 bucket

"""
Function
--------
pickle_save_s3
    Saves Python object to S3 as pickle object

Parameters
----------
obj : <any>
    Python Object

blocks: list
    List of extracted blocks

extraction_id: int
    Number of block extraction
    
name: string
    Name of object to add to filename in S3

    
Returns
-------
response: int
    HTTP response code from S3 
 
"""


def pickle_save_s3(obj,blocks,extraction_id,name):


    # Define number of blocks
    start_block=np.min(np.array(blocks))
    end_block=np.max(np.array(blocks))
    no_blocks=len(blocks)

    # Load S3 and bucket details
    session = boto3.session.Session()
    s3 = session.resource('s3')

    # File path and name ([extraction_id][name]-[no_blocks]-[start_block]-[end_block])
    key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+str(extraction_id)+name+'-'+str(no_blocks)+'-'+str(start_block)+'-'+str(end_block)+'.pkl'

    # Create pickle object and send to S3
    pickle_byte_obj = pickle.dumps(obj) 
    response=s3.Object(bucket,key).put(Body=pickle_byte_obj)
    
    return response['ResponseMetadata']

In [10]:
# Function to write pickle data to S3 bucket

"""
Function
--------
simple_psave_s3
    Saves Python object to S3 as pickle object

Parameters
----------
input_tuple: tuple
    
    bucket: str
        S3 bucket to save
    key: str
        S3 key to save
    obj: obj
        python object to save

Returns
-------
response: int
    HTTP response code from S3 
 
"""
def simple_psave_s3(input_tuple):
    
    bucket=input_tuple[0]
    key=input_tuple[1]
    obj=input_tuple[2]
    
    # Start S3 session
    session = boto3.session.Session()
    s3 = session.resource('s3')
    
    # Save to S3
    pickle_byte_obj = pickle.dumps(obj) 
    response=s3.Object(bucket,key).put(Body=pickle_byte_obj)['ResponseMetadata']['HTTPStatusCode']
    
    return response
    

### Load Data

**FUNCTION: Load single graph**

In [4]:


"""
Function
--------
load_graph
    Loads networkX (pickle serialized) object from S3

Parameters
----------
key : str
    Path in S3 bucket for individual pickled serialized networkX graph object 

    
Returns
-------
response: networkX graph
    Graph object
 
"""

def load_graph(key):
    session = boto3.session.Session()
    s3 = session.resource('s3')
    response = s3.Object(bucket_name=bucket, key=key).get()
    G=pickle.loads(response['Body'].read())
    
    return G
   

### Testing

**FUNCTION: Run graph measurement**

In [6]:


"""
Function
--------
load_graph_measurement:
    Runs graph measurement for every node in a specific block and loads the created dictionary from S3

Parameters
----------

extraction_id: int
    Timestamp of block extraction

test_ix: int
    Index of block to test. Can be negative to move backwards in array

measurement: str
    Type of graph measurement to perform

weight: str
    Node attribute to use for weighted calculations

blocks: list
    List of (ints) blocks extracted 

graph_keys: list
    List of (str) graph paths in S3

    
Returns
-------
g_test: NetworkX graph
    NetworkX graph extracted for the given ix

nodes_test: list
    List of nodes in g_test

g_dic_test: dict
    Dictionary of graph measurment for each node in g_test

block: int
    Block selected for test
 
"""

def load_graph_measurement(extraction_id,measurement,weight,blocks,graph_keys,test_ix=None):

    if test_ix==None: # If no index provided choose one at random
        test_ix=random.choice(range(len(blocks)))
        
        
    
    # Define block and graph key
    test_block=blocks[test_ix]
    g_key=graph_keys[test_ix]
    print('Block selected:{}'.format(test_block))
    
    # Load graph and nodes
    response = s3.Object(bucket_name=bucket, key=g_key).get()
    g_test=pickle.loads(response['Body'].read())
    nodes_test=list(g_test.nodes())

    # Run function
    block,response_test=graph_measurement((g_key,measurement,weight,bucket))


    if response_test==200:
        # Load created dictionary with calculation from S3
        
        key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(test_block)+'.pkl'
        g__test_load = s3.Object(bucket_name=bucket, key=key_test).get()
        g_dic_test = pickle.loads(g__test_load['Body'].read())
        print('Dic was saved correctly. Sample below:')
        print(list(g_dic_test.items())[:10])
        
    
    else:
        print('Measurement was not saved correctly')
        return
    
    return g_test,nodes_test,g_dic_test,test_block

## Load Data

In [5]:
# Load objects form S3
# Dataframe

decisions_load = s3.Object(bucket_name=bucket, key='decisions_df.csv').get()
decisions_df=pd.read_csv(io.BytesIO(decisions_load['Body'].read()))

# Channel closures
closure_file = s3.Object(bucket_name=bucket, key='channel_closures.p').get()
channel_closures = pickle.loads(closure_file['Body'].read())
    
    
# Channel openings 
opens_file = s3.Object(bucket_name=bucket, key='channel_opens.p').get()
channel_opens = pickle.loads(opens_file['Body'].read())

    

# Create list with graph keys

#TODO: Save graphs as numpy array in single H5 file to reduce. Test if creating graphs takes longer than reading from S3

# graph_dir='./data/graph_snapshots' - For local tests


graph_keys = [obj.key 
        for obj in s3.Bucket(name=bucket).objects.all()
        if re.match(".*"+str(extraction_id)+"_connected/.*\.gpickle",obj.key)]





In [124]:
len(blocks)

36536

In [62]:
# Define Blocks to be extracted and define graph


# Base lists to be populated
graph_snapshots=[]
blocks=[]
base_ix=6


extract_keys=graph_keys[base_ix:] # Blocks below 6th index are <3 and affect some graph metrics

for key in extract_keys: # Change to [700:] for full range
    
    # Create block list from file_names
    block_i=int(key.split(".")[0].split("/")[-1]) 
    blocks.append(block_i)
    
    #Extract graphs - UNCOMMENT TO have them out of function
    #G=dask.delayed(load_graph)(key)
    #graph_snapshots.append(G)
    
   
start_block=np.min(np.array(blocks))
end_block=np.max(np.array(blocks))
no_blocks=len(blocks)

    

**SCRIPT: Extract nodes and calculate age of nodes**

In [21]:
# Define nodes and calculate details

nodes=list(set(decisions_df['node0_id'].tolist()).union(set(decisions_df['node1_id'].tolist())))

# DETAIL_1:Calculate birth block

# Create list of sets (node pairs )
opens_list=sorted(list(channel_opens.items()))
open_list_sets=[(opens[0],[{t[0],t[1]} for t in opens[1]]) for opens in opens_list]

# Dic to store details per node 
node_details={}


with tqdm(total=len(nodes)) as pbar:
    for node in nodes:

        for opens in open_list_sets:
            if opens[1] and node in set.union(*opens[1]):
                birth_block=opens[0]
                node_details[node]={'birth_block':birth_block}
                break
    
        pbar.update(1)
        

# SAVE to S3

key='node_details.p'
pickle_byte_obj = pickle.dumps(node_details) 
response=s3.Object(bucket,key).put(Body=pickle_byte_obj)['ResponseMetadata']

if response['HTTPStatusCode']==200:
    print('Details extracted and save to S3 succesful')
else: 
    print(response)



HBox(children=(FloatProgress(value=0.0, max=7735.0), HTML(value='')))


Details extracted and save to S3 succesful


In [22]:
# TESTS
len_nodes=len(nodes)
len_details=len(list(node_details.items()))


if len_nodes==len_details and response['HTTPStatusCode']==200:
    print('Details extracted and save to S3 succesful')
else:
    print('Details for {} nodes were not extracted'.format(len_nodes-len_details))   
    print(response)

Details extracted and save to S3 succesful


In [20]:
# Test: extracted formats
print("Number of total graph keys:{}".format(len(graph_keys)))
print("Number of blocks to be processed:{}".format(len(extract_keys)))
print("---Sample graph keys---")
print(graph_keys[0])
print("---Sample channel opens---")
print(channel_opens[508090])
print("---Sample channel closures---")
print(channel_closures[592638])



Number of total graph keys:36542
Number of blocks to be processed:94
---Sample graph keys---
graph_snapshots/1587447789_connected/505149.gpickle
---Sample channel opens---
[(2378, 4223, {'capacity': 400000, 'open_fee': 4557, 'dec_id': 58766, 'channel_id': '508090x1515x1', 'no_channels': 0})]
---Sample channel closures---
[(2643, 6038, {'close_type': 'force', 'dec_id': 26620, 'channel_id': '570913x720x1', 'capacity': 300000}), (6038, 5314, {'close_type': 'mutual', 'dec_id': 0, 'channel_id': '505149x622x0', 'capacity': 300000})]


In [35]:
# Sort and visualize DataFrame

decisions_df.sort_values(by=['open_block'],inplace=True,ascending=True)
print('Size of DataFrame in Memory:{}'.format(sys.getsizeof(decisions_df)))
# Check specific channel id
#decisions_df[decisions_df['short_channel_id']=='513675x2245x0'].head()

decisions_df.sort_values(by=['close_block'],ascending=True).head(20)

Size of DataFrame in Memory:64821774


Unnamed: 0.1,Unnamed: 0,short_channel_id,open_block,open_transaction,address,close_block,close_transaction,node0,node1,satoshis,...,close_fee,last_update,close_type,close_htlc_count,close_balance_a,close_balance_b,dec_id,node0_id,node1_id,node_pair
70648,1027,535029x2012x1,535029,d01928d350e1ba04d7335a91e6dd54f5dbf94859e0c59b...,bc1qszamn0la3yqrqhjj8yepdxkl9qlr84zfwgg9zrkccl...,535029.0,d01928d350e1ba04d7335a91e6dd54f5dbf94859e0c59b...,022a7809052db05fde648391a53aba82286e4a517cff1d...,031b71cbad0cb4e22141e45f16c83c332f755e1ba68195...,462124,...,1989.0,,unused,0.0,275630.0,0.0,1027,4091,3578,14637598
70625,1045,535177x446x1,535177,7376d5bc0c18bbff8f644d0827e759a1518b38e1e95a08...,bc1qauzljedtlva73ngg7suqketlvn5gnnuemxpeuevcqt...,535177.0,7376d5bc0c18bbff8f644d0827e759a1518b38e1e95a08...,02272bd12e59324d0f2b231fb88f134b57eb26dd100d2c...,031b71cbad0cb4e22141e45f16c83c332f755e1ba68195...,257307,...,767.0,,unused,0.0,218405.0,0.0,1045,3604,3578,12895112
68629,2745,549037x2738x0,549037,b7128bbbe422b4f18fad71b091eed1f9e4b0d231be8117...,bc1q95fytjzs8f7fma2nf66gcva7c3w7hnkdwrkef9pu33...,549037.0,b7128bbbe422b4f18fad71b091eed1f9e4b0d231be8117...,028b892b15f5cabcea5165b236db0e36dc06553c323c84...,038b36a43c38f75cd15bb25394f1cd162f717df0055852...,400000,...,1991.0,1547494000.0,unused,0.0,400000.0,0.0,2745,1781,4968,8848008
68636,2744,549037x2737x0,549037,0825da5e96cd45fced3233ebe615721b687285839d3036...,bc1q5mqzhw5e42rfqh250zalwu47ru8gvz4g4k968me0mg...,549037.0,0825da5e96cd45fced3233ebe615721b687285839d3036...,02b95713bbe4609a337f3ca5aab3a75674083ddf5331a4...,038b36a43c38f75cd15bb25394f1cd162f717df0055852...,400000,...,1992.0,1547503000.0,unused,0.0,400000.0,0.0,2744,210,4968,1043280
68461,40227,549489x1194x1,549489,58dafe493648fbdd69143c26e0cf8a66ae11a272c2739d...,bc1q25j5l6crv4mrjkjjjw4rzyv890cwwnyyw9dezcqs5x...,549489.0,58dafe493648fbdd69143c26e0cf8a66ae11a272c2739d...,02574ffa55d394b9326f6e5c15992cc0516b0d6e6a79a1...,03a5927b64b1ea8657d5b770d61a3e2d0554fdb5d56877...,2500000,...,2363.0,,unused,0.0,8974.0,0.0,40227,7558,4495,33973210
59540,37095,562592x1695x1,562592,06b4d9b3cfa10bd2cd33131d034a6b38c1651eee49018a...,bc1q9jnkm78y45kyasnu43p52gc8sqazwy4yfjtqzkwqx5...,562592.0,06b4d9b3cfa10bd2cd33131d034a6b38c1651eee49018a...,029b71b8186914267ea59cb081c43ad1aeb874b5a185a4...,03864ef025fde8fb587d989186ce6a4a186895ee44a926...,5000000,...,2136.0,,unused,0.0,4995006.0,0.0,37095,3210,7259,23301390
55969,37213,564495x455x1,564495,8a5764e1f0cb659b687a0675cd88983526bd9213665986...,bc1qqgcvu4nl3vjm4vvr9r5l2f4ufppph5fn070fzscq4r...,564510.0,655c3d44c09055e9af9f8d13d55c99979e0b0b306230b9...,024655b768ef40951b20053a5c4b951606d4d86085d512...,0375a154b8f94eb0556566d60d96acc47f99f2f0d74ef9...,400000,...,4889.0,,unused,0.0,395111.0,0.0,37213,4927,1014,4995978
56020,37209,564476x2292x0,564476,0e00d6dc5cf2232d15750bd3177c57521cdff678a5666a...,bc1qec377ms3a79e3v3pe8gfjrzp6syfqykeqkw2lsh86f...,564511.0,58fea309da14892858be78c8c45a7d06fa6796e77e51d8...,028303182c9885da93b3b25c9621d22cf34475e63c1239...,03820e3b7bdbf7ccafe67791088de15df162b352f3b7ba...,20000,...,2889.0,,unused,0.0,17111.0,0.0,37209,642,4037,2591754
55208,37256,564948x2100x0,564948,e1b4de87949168dafae980bc1f467b2bee878c2667383a...,bc1ql47jp0hprvcq4dz8y5dpape60vhf532uea2rv9v20q...,565057.0,af2a36eb0782958a3f2e7aacb89dfdb92defeaed44df3c...,02529db69fd2ebd3126fb66fafa234fc3544477a23d509...,02e63d3e5a2351cc8de6c63b0d0784d1940406c5addce4...,47882,...,3643.0,,force,0.0,44239.0,0.0,37256,6568,5977,39256936
55207,37255,564948x2082x0,564948,eb12d66c34e8009c408fa56d948bf87baec4888caddd2d...,bc1q4ksyf7c7jphsmwdj8n944y0mypy7n28mts5tt9zrph...,565057.0,af557d74c148c434156edc034b0a94d97afcda20e8c2b6...,02e63d3e5a2351cc8de6c63b0d0784d1940406c5addce4...,039edc94987c8f3adc28dab455efc00dea876089a120f5...,47882,...,3643.0,,force,0.0,44239.0,0.0,37255,5977,1392,8319984


In [70]:
decisions_df.columns

Index(['Unnamed: 0', 'short_channel_id', 'open_block', 'open_transaction',
       'address', 'close_block', 'close_transaction', 'node0', 'node1',
       'satoshis', 'last_seen', 'open_time', 'open_fee', 'close_time',
       'close_fee', 'last_update', 'close_type', 'close_htlc_count',
       'close_balance_a', 'close_balance_b', 'dec_id', 'node0_id', 'node1_id',
       'node_pair'],
      dtype='object')

**TESTS**

In [None]:
# Test Lazy Graph extract
blocks_att=[]
for i in range(len(graph_snapshots)):
    graph_i=dask.compute(graph_snapshots[i])
    block=graph_i.graph['block']
    blocks_att.append(block)

print(blocks_att)

#graph_snapshots=dask.compute(*graph_snapshots)
#block=graph_snapshots[0].graph['block']
    
#print(len(graph_snapshots[5]))
#print(graph_snapshots[3].graph['block'])

# Delayed testing
#results = dask.compute(*futures)
#graphs=dask.compute(*graph_snapshots)


# Comparative Analysis

In order to understand the potential motivations behind each decision we analyze each decission (opening or closure of a channel) independently from the perspective of each of the participants in the decission, which we'll call the node under analysis. For each decission we extract or compute the following information: 

Betweenness centrality measures how central is a network to the flow of information in a network. In the case of the Lightning Network the higher the betweenness centrality of a node, the more transactions (messages) that are routed through it. In particular, we will use a measure of betweenness centrality defined in (Brandes and Fleischer 2005 - https://link.springer.com/chapter/10.1007/978-3-540-31856-9_44) that models infomation through a network, as electric current, efficiently and not only considering shortest path. This allows us to account for the fact that not all transactions travel through shortes path given that there are fee and capacity considerations.  

## Baseline Measurments

**FUNCTION: Property measurement for a collection of graphs**

In [45]:
### NEW
# Function to calculate base measurement for every graph in snapshot

"""
Function
--------
collection_measure
    Iterates over graph snapshots and calculates measurement for every node in each of the graphs.

Parameters
----------
g_snapshots : list
    List of delayed nx graph elements contianing graph snapshots
    

measurement: string
    Name of the type of measurment that will applied to the graph. (See graph_measurement function for options)



Returns
-------
snapshot_mes_dic: dic
    Dictionary with blocks as keys and dic of measurements as values

"""
def collection_measure(bucket,graph_keys,measurement):
    
    snapshot_mes_list=[]
    
    
    
    with tqdm(total=len(range(1,len(graph_keys)))) as pbar:
        for i in (range(1,len(graph_keys))):
            
            key=graph_keys[i]
            prev_key=graph_keys[i-1]

            measurement_input=(key,measurement,'capacity',bucket,prev_key)

            b_g_tuple=dask.delayed(graph_measurement)(measurement_input)
            snapshot_mes_list.append(b_g_tuple)
            pbar.update(1)

    futures = dask.persist(*snapshot_mes_list)
    
    start=time.time()
    snapshot_mes_list = dask.compute(*futures)
    #snapshot_mes_list = dask.compute(*snapshot_mes_list)
    snapshot_mes_dic={record[0]:record[1] for record in snapshot_mes_list}
    end=time.time()
    print('Compute in seconds: {}'.format(end-start))
    
    return snapshot_mes_dic

**FUNCTION: Property measurment for a single graph**

In [145]:

"""
Function
--------
graph_measurement
    Performs specific graph measurement 

Parameters
----------
measurment_input: tuple
    g : nx graph
        NetworkX graph object over which measurment will be calculated for each node

    measurment: string
        Type of measurement to be performed on graph
        
    weight: string
        Edge attribute to be used as weight
    
Returns
-------
g_dir: dir
    Dictionary with measurment values for each node
"""

def graph_measurement(measurement_input):
    
    # Extract inputs
    key=measurement_input[0]
    measurement=measurement_input[1]
    weight=measurement_input[2]
    bucket=measurement_input[3]
    
    
    if len(measurement_input)>4:
        prev_key=measurement_input[4]
    
    # Download graph
    session = boto3.session.Session()
    s3 = session.resource('s3')
    response = s3.Object(bucket_name=bucket, key=key).get()
    g=pickle.loads(response['Body'].read())
    
    # Extract Block
    
    block=g.graph['block']
    score_type='/raw_score/'
   
    
    if measurement=='current_betweeness_full':
        g_dir=nx.algorithms.centrality.current_flow_betweenness_centrality(g,weight=weight)
    
    elif measurement=='current_betweeness':
        g_dir=nx.algorithms.centrality.approximate_current_flow_betweenness_centrality(g,weight=weight)
        
    elif measurement=='current_betweeness_unweighted': # for unweighted current betweeness
        g_dir=nx.algorithms.centrality.approximate_current_flow_betweenness_centrality(g)
    
    elif measurement=='current_closeness':
        g_dir=nx.algorithms.centrality.current_flow_closeness_centrality(g,weight=weight)
    
    elif measurement=='closeness':
        g_dir=nx.closeness_centrality(g)
        
    elif measurement=='clustering':
        g_dir=nx.clustering(g,weight=weight)
        
    elif measurement=='node_count':
        g_dir=len(g.nodes())
        
    elif measurement=='channels':
        g_dir=dict(list(g.degree(g.nodes())))
    
    elif measurement=='capacity':
        g_dir=dict(list(g.degree(g.nodes(),weight=weight)))
        
    elif measurement=='age': 
        
        # Get node_details from S3 
        opens_file = s3.Object(bucket_name=bucket, key='node_details.p').get()
        node_details = pickle.loads(opens_file['Body'].read())
        
        # Create dic with node's age in blocks
        g_dir={node:block-node_details[node]['birth_block'] for node in list(g.nodes())} 
        
        
    elif measurement=='capacity_growth':  
        g_dir=capacity_growth (weight,bucket,g,block,s3,block_frame=3600)
        
    elif measurement=='closeness_approx_rank':
        
        # Re-select previous block
        response = s3.Object(bucket_name=bucket, key=prev_key).get()
        g=pickle.loads(response['Body'].read())
        
        g_dir=closeness_approx_rank (s3,bucket,g,block,p=13.38,estimate_sample=50)
        score_type='/norm_rank/'
        
    elif measurement=='closeness_approx_rank_post': # Same measurement as above, just looking at the rank after block decisions happen
        
        g_dir=closeness_approx_rank (s3,bucket,g,block,p=13.38,estimate_sample=50)
        score_type='/norm_rank/'
        
    elif measurement=='avg_short_path':
        g_dir=nx.average_shortest_path_length(g)
        
    elif measurement=='min_nodes':
        g_dir=nx.minimum_node_cut(g)
        
        
    elif measurement=='robustness_eff_loss':
        attack_perc=0.01
        g_dir=robustness_eff_loss(s3,bucket,g,block,attack_perc)
        measurement=measurement+'_'+str(attack_perc*100)
    
        
    # Safe graph processing to S3
    
    extraction_id=key.split('/')[1].split('_')[0]
    key_out='graph_snapshots/'+extraction_id+'_connected/.data_transformations/'+measurement+score_type+str(block)+'.pkl'
    pickle_byte_obj = pickle.dumps(g_dir) 
    response=s3.Object(bucket,key_out).put(Body=pickle_byte_obj)['ResponseMetadata']['HTTPStatusCode']
    
    
    
    return (block,response)

In [228]:

"""
Function
--------
capacity_growth
    Calculates how much has capacity grown (or decreased) for all nodes in a graph

Parameters
----------
weight: str
    Node property that will be used to weight the calculation.
    
bucket: str
    S3 bucket where data is stored

g: NetworkX graph
    Graph for which the calculation will be computed
    
block: int
    Block number corresponding to the selected graph

s3: S3 session object
    S3 session object for the boto3 api
    
block_frame: int
    The amount of blocks into the past that will be considered to calculate growth
    
Returns
-------
g_dir: dir
    Dictionary with measurment values for each node
"""


def capacity_growth (weight,bucket,g,block,s3,block_frame=3600):
    
    # Initialize g_dir items and min_block
    
    g_dir={}
    min_block=block-block_frame
    
    
    # Get graph nodes
    nodes=list(g.nodes())
    
    # Load decisions DataFrame
    
    decisions_load = s3.Object(bucket_name=bucket, key='decisions_df.csv').get()
    decisions_df=pd.read_csv(io.BytesIO(decisions_load['Body'].read()))
    
    with tqdm(total=len(nodes),disable=True) as pbar:
        
        for node in nodes:

            # Find all channel creations and closerues in block frame
            opens_blockframe_node0=decisions_df[(decisions_df['node0_id']==node) & (decisions_df['open_block']>=min_block) & (decisions_df['open_block']<=block)]['satoshis']
            opens_blockframe_node1=decisions_df[(decisions_df['node1_id']==node) & (decisions_df['open_block']>=min_block) & (decisions_df['open_block']<=block)]['satoshis']
            closes_blockframe_node0=decisions_df[(decisions_df['node0_id']==node) & (decisions_df['close_block']>=min_block) & (decisions_df['close_block']<=block)]['satoshis']
            closes_blockframe_node1=decisions_df[(decisions_df['node1_id']==node) & (decisions_df['close_block']>=min_block) & (decisions_df['close_block']<=block)]['satoshis']

            # Calculate growth by adding capacity created in block frame and subtracting capacity lost
            if weight==1: #Unweighted calculation
                gain=opens_blockframe_node0.count()+opens_blockframe_node1.count()
                loss=closes_blockframe_node0.count()+closes_blockframe_node1.count()

            else:
                gain=opens_blockframe_node0.sum()+opens_blockframe_node1.sum()
                loss=closes_blockframe_node0.sum()+closes_blockframe_node1.sum()

            # Calculate growth and save to dir
            net_growth=gain-loss
            g_dir[node]=net_growth
            pbar.update(1)
        

    return g_dir

**TEST: Weighted capacity function**

In [220]:
# Define test parameters


measurement='capacity_growth'
weight='capacity'
g_test,nodes_test,g_dic_test,block=load_graph_measurement(extraction_id,measurement,weight,blocks,extract_keys,test_ix=-1)




Block selected:532022


HBox(children=(FloatProgress(value=0.0, max=683.0), HTML(value='')))


Dic was saved correctly. Sample below:
[(6038, 0), (5314, 0), (934, 0), (3023, 0), (3436, 3131000), (3310, 0), (422, 0), (1912, 0), (5154, 0), (4688, 0)]


In [216]:
positive=[t for t in list(g_dic_test.items()) if t[1]>0]
print(positive)

[(3436, 3131000), (1331, 5500000), (6156, 520000), (4998, 4485183), (4527, 200000), (2757, 1677721), (3065, 1200000), (346, 19061712), (2724, 1491661), (2460, 100000), (2476, 9564725), (1893, 300000), (6418, 15737856), (6599, 93799), (4534, 805272), (5641, 97000), (2295, 1200000), (6832, 792562), (7711, 1500000), (1410, 2700000), (1514, 89141), (6296, 1267024), (7608, 2541273), (7631, 182031), (3271, 21417), (2674, 9193000), (3382, 1872962), (1220, 20000), (4580, 500000), (6215, 11315423), (6363, 10000), (4620, 7124495), (7673, 520550), (4639, 20000), (4426, 553341), (5738, 1319859), (7259, 4490000), (6924, 343070), (448, 174997), (227, 675364), (326, 2000), (5372, 77107), (4490, 10000), (2739, 50000), (2881, 20000), (4819, 20000), (4427, 500000), (1172, 80000), (6378, 100000), (1120, 1777721), (5634, 5000), (2300, 500000), (2973, 34003), (6249, 60000), (3, 27529), (7606, 743866), (415, 389026), (5601, 3390969), (5406, 1500000), (5495, 600000), (1257, 500000), (2512, 350000), (7073, 44

In [245]:
def test_growth(g_test,g_dic_test,block,channel_opens,channel_closures):

    test_list=[]
    error_nodes=[]


    min_block=block-3600

    relevant_opens=[t[0] for t in list(channel_opens.items()) if (t[0]>=min_block and t[0]<=block)]
    relevant_closures=[t[0] for t in list(channel_closures.items()) if (t[0]>=min_block and t[0]<=block)]
    relevant_blocks=sorted(list(set(relevant_opens).union(set(relevant_closures))))


    # Get graph nodes
    nodes=list(g_test.nodes())

        # Calculate net change for each node in each of the relevant blocks
    with tqdm(total=len(nodes)) as pbar:
        for node in nodes:
            net_change=0
            for block in relevant_blocks:

                # Create list of nodes involved in channel opens and count how many times a node appears

                if weight==1:
                    list_block_opens=[[t[0],t[1]] for t in channel_opens[block] if (t[0]==node or t[1]==node)]
                    total_weights=len(list_block_opens)

                else:
                    list_block_opens=[[t[0],t[1],t[2][weight]] for t in channel_opens[block] if (t[0]==node or t[1]==node)]
                    total_weights=np.array([l[2] for l in list_block_opens]).sum()


                net_change+=total_weights


                # Create list of nodes involved in channel closures and count how many times a node appears

                if weight==1:
                    list_block_closures=[[t[0],t[1]] for t in channel_closures[block] if (t[0]==node or t[1]==node)]
                    total_weights=len(list_block_opens)

                else:
                    list_block_closures=[[t[0],t[1],t[2][weight]] for t in channel_closures[block] if (t[0]==node or t[1]==node)]
                    total_weights=np.array([l[2] for l in list_block_closures]).sum()


                net_change-=total_weights

            #Retrieve recorded growth for node

            recorded_growth=g_dic_test[node] 

            # Check if growth match and populate test_list accordingly
            if net_change==recorded_growth:
                test_list.append(1)
            else:
                test_list.append(0)
                error_nodes.append((node,recorded_growth,net_growth))

            pbar.update(1)

        # Add up all passed tests
        tests_passed=np.array(test_list).sum()

        # Print out statements based on test results
    if tests_passed==len(nodes):
        print('Growth calculated correctly for all nodes')


    else: 
        print('Growth failed to be correctly calculated for {} nodes'.format(len(list(g_test.nodes()))-tests_passed))
        if len(error_nodes)>10:
            print('Some nodes with errors (node,recorded,actual)')
            print(error_nodes[:10])

        else:
            print('Some nodes with errors (node,recorded,actual)')
            print(error_nodes[:len(error_nodes)])
        
    return


In [235]:
test_growth(g_test,g_dic_test,block,channel_opens,channel_closures)

HBox(children=(FloatProgress(value=0.0, max=683.0), HTML(value='')))


Growth calculated correctly for all nodes


**TEST: Test for age calculation**

In [81]:
test_ix=-100
test_block=blocks[test_ix]
measurement='age'
g_key=graph_keys[test_ix]
block,response_test=graph_measurement((g_key,'age',None,bucket))

#Load graph

response = s3.Object(bucket_name=bucket, key=g_key).get()
g_test=pickle.loads(response['Body'].read())
nodes_test=list(g_test.nodes())

if response_test==200:
    key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(test_block)+'.pkl'
    g_age_test_load = s3.Object(bucket_name=bucket, key=key_test).get()
    g_age_test = pickle.loads(g_age_test_load['Body'].read())
    
    test_list=[]
    error_nodes=[]
    # Test that each node age is calculated correctly by looking at the decisions_df
    with tqdm(total=len(nodes_test)) as pbar:
        
        for n in nodes_test:
            
            # Find creation block by looking at node0 and node1 columns
            firstseenas_node0=decisions_df[decisions_df['node0_id']==n]['open_block'].min()
            firstseenas_node1=decisions_df[decisions_df['node1_id']==n]['open_block'].min()
            
            # Correct for nan values, in case node is not present in either column (make it infinite)
            fs_list=[firstseenas_node0,firstseenas_node1]
            fs_list=[np.inf if np.isnan(i) else i for i in fs_list]
            
            # Calculate age
            actual_age=block-min(fs_list[0],fs_list[1])
            recorded_age=g_age_test[n]
            
            # Check if ages match and populate test_list accordingly
            if recorded_age==actual_age:
                test_list.append(1)
            else:
                test_list.append(0)
                error_nodes.append((n,recorded_age,actual_age))
                
            pbar.update(1)
            
        # Add up all passed tests
        tests_passed=np.array(test_list).sum()

    
# Print out statements based on test results
    if tests_passed==len(nodes_test):
        print('Age calculated correctly for all nodes')
   
    
    else: 
        print('Age failed to be correctly calculated for {} nodes'.format(len(list(g_test.nodes()))-tests_passed))
        if len(error_nodes)>10:
            print('Some nodes with errors (node,recorded_age,actual_age)')
            print(error_nodes[:10])
        
        else:
            print('Some nodes with errors (node,recorded_age,actual_age)')
            print(error_nodes[:len(error_nodes)])
        
    
    print('Example age for nodes in block {}'.format(test_block))
    print(list(g_age_test.items())[:10])
    

else:
    print('Age was not saved correctly')



HBox(children=(FloatProgress(value=0.0, max=5745.0), HTML(value='')))


Age calculated correctly for all nodes
Example age for nodes in block 617169
[(5314, 112020), (934, 110767), (3023, 110767), (3452, 110322), (576, 110322), (3436, 109094), (3310, 109094), (4223, 109079), (422, 108849), (1912, 108849)]


**FUNCTION: Aproximate Node closeness rank**

In [12]:
'''
---OLD VERSION---
def approx_node_closrank (bucket,key,block,prev_block,nodes,p=13.38,estimate_sample=50):
                             
    
    # Download graph and extract nodes
    session = boto3.session.Session()
    s3 = session.resource('s3')
    response = s3.Object(bucket_name=bucket, key=key).get()
    g=pickle.loads(response['Body'].read())
    g_nodes=list(g.nodes())
    n=len(g_nodes)
                             
    # Estimate c_mid for graph
    estimation_nodes=random.sample(g_nodes, estimate_sample)
    c_mid=np.array([nx.closeness_centrality(g,n) for n in estimation_nodes]).mean()

                             
    # Calculate closeness centrality for selected nodes in block
    
    clo_list=np.array([nx.closeness_centrality(g,n) for n in nodes])
    
    # Aproximate ranking using formula from: https://arxiv.org/pdf/1706.02083.pdf
    norm_rank=n+((1-n)/(1+np.power((clo_list/c_mid),p)))
    norm_rank=list(norm_rank)
    
    # Create dictionary with ranking per node
    g_dir={node:rank for node,rank in zip(nodes,norm_rank)}
    
    # Save to S3    
    measurement='closeness_approx'
    extraction_id=key.split('/')[1].split('_')[0]
    key_out='graph_snapshots/'+extraction_id+'_connected/.data_transformations/'+measurement+'/norm_rank/'+block+'.pkl'
    pickle_byte_obj = pickle.dumps(g_dir) 
    response=s3.Object(bucket,key_out).put(Body=pickle_byte_obj)['ResponseMetadata']['HTTPStatusCode']
    
    
    return response
'''

In [None]:
def closeness_approx_rank (s3,bucket,g,block,p=13.38,estimate_sample=50):
    
    g_nodes=list(g.nodes())
    n=len(g_nodes)
    
    # Download decisions_df
    decisions_load = s3.Object(bucket_name=bucket, key='decisions_df.csv').get()
    decisions_df=pd.read_csv(io.BytesIO(decisions_load['Body'].read()))
    
    # Extract nodes involved in decisions from the block 
    node0_open=decisions_df[decisions_df['open_block']==block]['node0_id'].tolist()
    node1_open=decisions_df[decisions_df['open_block']==block]['node1_id'].tolist()
    node0_close=decisions_df[decisions_df['close_block']==block]['node0_id'].tolist()
    node1_close=decisions_df[decisions_df['close_block']==block]['node1_id'].tolist()
    
    # Create list of nodes existing in previous graph
    nodes=node0_open+node1_open+node0_close+node1_close
    nodes=list(set(nodes).intersection(set(g_nodes)))
    
    # Create list of nodes and set closeness to 0 to nodes that are not present
    missing_nodes=list(set(nodes).difference(set(g_nodes)))
    missing_clo=list(np.zeros(len(missing_nodes)))
                             
    # Estimate c_mid for graph by averaging closeness for a sample of nodes
    
    if len(g_nodes)<=estimate_sample:
        estimation_nodes=g_nodes
    else:
        estimation_nodes=random.sample(g_nodes, estimate_sample)
    
    c_mid=np.array([nx.closeness_centrality(g,n) for n in estimation_nodes]).mean()

                             
    # Calculate closeness centrality for selected nodes in block
    clo_list=np.array([nx.closeness_centrality(g,n) for n in nodes])
    
    # Aproximate ranking using formula from: https://arxiv.org/pdf/1706.02083.pdf
    norm_rank_array=(n+((1-n)/(1+np.power((clo_list/c_mid),p))))/n
    norm_rank=list(norm_rank_array)+missing_clo
    
    # Update nodes list with missing nodes
    nodes=nodes+missing_nodes
    
    # Create dictionary with ranking per node
    g_dir={node:rank for node,rank in zip(nodes,norm_rank)}
    
    
    return g_dir

In [61]:
# TEST aprox node closeness rank

# Select testing Block and extract key/block
test_ix=random.choice(range(1,len(blocks))) 
measurement='closeness_approx_rank'
g_key=graph_keys[test_ix]
prev_key=graph_keys[test_ix-1]

rand_block=blocks[test_ix-1]


# Download graph associated to test
g_test_key='graph_snapshots/'+str(extraction_id)+'_connected/'+str(rand_block)+'.gpickle'
response = s3.Object(bucket_name=bucket, key=g_test_key).get()
g_test=pickle.loads(response['Body'].read())
nodes_test=list(g_test.nodes())

# Run node closeness function
start=time.time()
block,response_test=graph_measurement((g_key,measurement,None,bucket,prev_key))
end=time.time()


# Test if function saved result correctly and download result
if response_test==200:
    key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/norm_rank/'+str(block)+'.pkl'
   
    g_clo_test_load = s3.Object(bucket_name=bucket, key=key_test).get()
    g_clo_test = pickle.loads(g_clo_test_load['Body'].read())
    clo_values=[v for k,v in list(g_clo_test.items())]


# Test if no. items in dictionary are equal to nodes in graph
dic_items=list(g_clo_test.items())
print(len(dic_items))



# Test that values are >0 and <1
range_test=[]
for n,v in dic_items:
    if v>=0 and v<=1:
        range_test.append(1)
    else:
        range_test.append(0)

range_test_passed=np.array(range_test).sum()        

if range_test_passed==len(dic_items):
    print('Values for all nodes are between 0-1')
else:
    print('Values for some nodes are outside the [0,1] range')
    


# Download result from function

print('Test Block:{}'.format(rand_block))
print('Nodes in Block:{}'.format(len(nodes_test)))
print('Standard deviation of measurement: {}'.format(np.std(np.array(clo_values))))
print('Time elapse: {}'.format(end-start))
print(list(g_clo_test.items()))



2
Values for all nodes are between 0-1
Test Block:518195
Nodes in Block:316
Standard deviation of measurement: 0.07186561837374655
Time elapse: 1.0128343105316162
[(5424, 0.8409218063257561), (2537, 0.9846530430732492)]


**FUNCTION: Aproximate Node closeness rank**

In [121]:
def robustness_eff_loss(s3,bucket,g,block,attack_perc):
    
    
    # Get global base efficiency by extracting sp from S3
    
    print(block)
    avg_sp_key='graph_snapshots/1587447789_connected/.data_transformations/avg_short_path/raw_score/'+str(block)+'.pkl'
    response = s3.Object(bucket_name=bucket, key=avg_sp_key).get()
    avg_sp=pickle.loads(response['Body'].read())
    init_efficiency=1/avg_sp
    
    # Calculate 1% of highest degree nodes
    num_nodes=len(g.nodes())
    num_top_nodes=max(1,int(attack_perc*num_nodes))
    g_degrees=[deg for deg in g.degree()]
    top_degrees=sorted(g_degrees,key=lambda y: y[1],reverse=True)[:num_top_nodes]
    top_nodes=[n for n,d in top_degrees]
    
    # Remove top nodes 
    g_pruned=g.copy()
    g_pruned.remove_nodes_from(top_nodes)
    
    # Re-calculate efficiency for pruened graph and robustness
    final_efficiency=nx.global_efficiency(g_pruned)
    robustness=final_efficiency/init_efficiency
    
    
    return robustness
    


In [112]:
# TEST robustness_eff_loss

# Select testing Block and extract key/block
test_ix=random.choice(range(1,len(sample_keys))) 


measurement='robustness_eff_loss'
g_test_key=sample_keys[test_ix]
#prev_key=graph_keys[test_ix-1]

print(g_test_key)


# Download graph associated to test

response = s3.Object(bucket_name=bucket, key=g_test_key).get()
g_test=pickle.loads(response['Body'].read())
nodes_test=list(g_test.nodes())
rand_block=g_test.graph['block']

# Run robustness_eff_loss
start=time.time()
block,response_test=graph_measurement((g_test_key,measurement,None,bucket,None))
end=time.time()




graph_snapshots/1587447789_connected/591968.gpickle
591968


In [116]:
# Test if function saved result correctly and download result
if response_test==200:
    key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'_1.0/raw_score/'+str(block)+'.pkl'
    print(key_test)
    g_rob_test_load = s3.Object(bucket_name=bucket, key=key_test).get()
    g_rob_test = pickle.loads(g_rob_test_load['Body'].read())
    
    

print('Test Block:{}'.format(block))
print('Nodes in Block:{}'.format(len(nodes_test)))
print('Time elapse: {}'.format(end-start))   
print('Sample: {}'.format(g_rob_test))
 

graph_snapshots/1587447789_connected/.data_transformations/robustness_eff_loss_1.0/raw_score/591968.pkl
Test Block:591968
Nodes in Block:5184
Time elapse: 76.71983003616333
Sample: 0.4836248242892463


In [None]:
59

**FUNCTION: Property measurment for a couple of nodes in a graph**

In [15]:

"""
Function
--------
node_measurement
    Performs selected graph measurment on specific nodes in graph

Parameters
----------
g : nx graph
    NetworkX graph object over which the measurment will be performed

measurement: string
    Type of measurement to be performend in graph
    
node0: int
    Node id for node 0

node1: int
    Node id for node 1
    
Returns
-------
node_tuple: tuple
    Tuple of the form (node0_mes,node1_mes)
    
    node0_mes: float
        Graph measurement for node0
    node1_mes: float
        Graph measurement for node1
"""


        
    
    

def node_measurement(g,measurement,node0,node1):
    
    measurement_input=(g,measurement,'capacity')
    block,g_dir=graph_measurement(measurement_input)
    
    node0_mes=g_dir[node0]
    node1_mes=g_dir[node1]
        
    # Update marginal values for node0 and node1
        
    if (g.has_node(node0)): #If connected component of marginal graph contains node0 find betweeness
        node0_mes=g_dir[node0]
    else: # else update with fixed value
        node0_mes=0
            
    if (g.has_node(node1)): #If connected component of marginal graph contains node1 find betweeness
        node1_mes=g_dir[node1]
    else: # else update with fixed value
        node1_mes=0
    
    return (node0_mes,node1_mes)
    
    

In [347]:
def incremental_closeness(bucket,graph_keys,blocks,start_point):
    
    session = boto3.session.Session()
    s3 = session.resource('s3')
    counter=0
    extraction_id=graph_keys[0].split('/')[1].split('_')[0]
    responses=[]
    
    
    
    # Channel closures
    closure_file = s3.Object(bucket_name=bucket, key='channel_closures.p').get()
    channel_closures = pickle.loads(closure_file['Body'].read())
    
    
    # Channel openings 
    opens_file = s3.Object(bucket_name=bucket, key='channel_opens.p').get()
    channel_opens = pickle.loads(opens_file['Body'].read())
    snapshot_clo={}
    
    #Initialize graph with all nodes 
    lastgraph_block=blocks[-1]
    last_key='graph_snapshots/'+str(extraction_id)+'/'+str(lastgraph_block)+'.gpickle'
    response = s3.Object(bucket_name=bucket, key=last_key).get()
    G_final=pickle.loads(response['Body'].read())
    nodes_final=list(G_final.nodes())
    G=nx.Graph()
    G.add_nodes_from(nodes_final)
    prev_clo=None
    
    if start_point>0:
        # Get previous graph
        inigraph_block=blocks[start_point-1]
        ini_key='graph_snapshots/'+str(extraction_id)+'/'+str(inigraph_block)+'.gpickle'
        response = s3.Object(bucket_name=bucket, key=ini_key).get()
        G_ini=pickle.loads(response['Body'].read())
        
        # Get previous closeness centrality
        measurement='incremental_closeness'
        key_clo='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(inigraph_block)+'.pkl'
        response = s3.Object(bucket_name=bucket, key=key_clo).get()
        prev_clo=pickle.loads(response['Body'].read())
      
        G.add_edges_from(list(G_ini.edges(data=True)))
        
        
    
    
    
    
    with tqdm(total=len(range(start_point,len(graph_keys)))) as pbar:
        for i  in range(start_point,len(graph_keys)):

            
            block=blocks[i]
            new_edges=channel_opens[block]
            closed_edges=channel_closures[block]
        

            # Incremental closeness calculation for OPENS

            with tqdm(total=len(new_edges),disable=True) as pbar1:
                for edge in new_edges:

                    #If edge exists in previous graph closeness doesn't change
                    if G.has_edge(edge[0],edge[1]):
                        new_clo=prev_clo
                        G.edges[edge[0],edge[1]]['no_channels']+=1

                    else:

                        new_clo=nx.incremental_closeness_centrality(G,(edge[0],edge[1]),prev_cc=prev_clo,insertion=True)
                        G.add_edges_from([edge])
                

                    prev_clo=new_clo
                    pbar1.update(1)
                
            

            # Incremental closeness calculation for CLOSES

            with tqdm(total=len(closed_edges),disable=True) as pbar2:
                for edge in closed_edges:

                    # Verify if existing edges result from multiple channels, if so, only reduce capacity otherwise remove edge
                    no_channels=G.edges[edge[0],edge[1]]['no_channels']

                    #If edge exists in previous graph closeness doesn't change
                    if no_channels>1:
                        new_clo=prev_clo
                        G.edges[edge[0],edge[1]]['no_channels']-=1

                    else:                                    
                        new_clo=nx.incremental_closeness_centrality(G,(edge[0],edge[1]),prev_cc=prev_clo,insertion=False)                   
                        G.remove_edge(edge[0],edge[1])

                    prev_clo=new_clo
                    pbar2.update(1)
                


            # Safe outcome
            g_dir=new_clo


            # Safe graph processing to S3
            
            measurement='incremental_closeness'
            key_out='graph_snapshots/'+extraction_id+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(block)+'.pkl'
            pickle_byte_obj = pickle.dumps(g_dir) 
            response=s3.Object(bucket,key_out).put(Body=pickle_byte_obj)['ResponseMetadata']['HTTPStatusCode']
            #print((block,response))


            # Loop updates
            pbar1.close()
            pbar2.close()
            pbar.update(1)
            responses.append(response)
            
        
        output={b:res for b,res in zip(blocks,responses)}

           
    return output

In [None]:
# fit sigmoid for p by looking at sample of graphs
# estimate mid closeness by samplig closeness

In [391]:
def sp_closeness(G):
    
    nodes=list(G.nodes())
    n=len(nodes)
    sp_matrix=np.zeros((len(nodes),len(nodes)))
    sp_matrix=sp_matrix.astype(object)
    print(sp_matrix.dtype)
    
    dic_clo={}
    
    
    for i in range(n):

        for j in range(i,n):
            # Calculate delayed shortest path
            input_tuple=(G,nodes[i],nodes[j])
            sp=dask.delayed(len_shortest_path)(input_tuple)
            #print(type(sp))
            sp_matrix[i][j]=sp
            sp_matrix[j][i]=sp
        
        sp_sum=np.array(dask.compute(*sp_matrix[i].tolist())).sum()
        clo_i=n-1/sp_sum
        dic_clo[i]=clo_i
    return dic_clo

In [None]:


start=time.time()
dic_clo_test=sp_closeness(G_test)
end=time.time()
print(sp)
print('Compute in seconds: {}'.format(end-start))
print(list(dic_clo_test.items()))

In [32]:
snapshot_nodes=collection_measure(bucket,extract_keys,'node_count')

Compute in seconds: 10.438627481460571


In [None]:
sorted(snapshot_nodes.items())[:100]

**SCRIPT: Baseline Current Betweeness**

In [40]:
# Calculate Baseline betweeness
snapshot_bet=collection_measure(bucket,extract_keys,'current_betweeness')

Compute in seconds: 8009.403836488724


In [151]:
# Calculate Baseline unweighted betweeness
snapshot_bet_uw=collection_measure(bucket,extract_keys,'current_betweeness_unweighted')

HBox(children=(FloatProgress(value=0.0, max=36535.0), HTML(value='')))


Compute in seconds: 12896.576698541641


In [41]:
# Save baseline current betweeness to S3: For old calculation of Betweeness
response=pickle_save_s3(snapshot_bet,blocks,extraction_id,'snapshot_bet')
if response['HTTPStatusCode']==200:
    print('Save to S3 succesful')
else:
    print(response)

Save to S3 succesful


In [8]:
# Convert to single file per graph format
# Load large dic from S3
key='graph_snapshots/1587447789_connected/.data_transformations/1587447789snapshot_bet-36536-508400-617297.pkl'
response = s3.Object(bucket_name=bucket, key=key).get()
snapshot_bet=pickle.loads(response['Body'].read())

In [None]:
# Delayed store function

responses=[]
with tqdm(total=len(blocks)) as pbart:
    for block in blocks:
        dic=snapshot_bet[block]
        #create save key
        measurement='betweeness_curr_aprox'
        key_out='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(block)+'.pkl'

        #run save function
        input_tuple=(bucket,key_out,dic)
        response=simple_psave_s3(input_tuple)
        responses.append(response)
        pbart.update(1)
        

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))

In [8]:
# TEST all saved correctly
#responses=np.array([r for b,r in snapshot_capgrowth.items()])

measurement='betweeness_curr_aprox' 
# Retrieve saved items

bet_cur_keys = [obj.key 
        for obj in s3.Bucket(name=bucket).objects.all()
        if re.match(".*"+str(extraction_id)+"_connected/\.data_transformations/betweeness_curr_aprox/.*\.pkl",obj.key)]

if len(bet_cur_keys)==len(blocks):
    print('{} was calculated for ALL {} blocks correctly'.format(measurement,len(bet_cur_keys)))
else:
    print('Error saving to S3')
    
  
rand_block=str(random.choice(blocks))    
test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
test_file = s3.Object(bucket_name=bucket, key=test_key).get()
test_object = pickle.loads(test_file['Body'].read())
print('Example of {} saved for block {}:'.format(measurement,rand_block))
print(list(test_object.items())[:10])
print('Total entries: {}'.format(len(list(test_object.items()))))

# Get test graph
g_test_key='graph_snapshots/'+str(extraction_id)+'_connected/'+str(rand_block)+'.gpickle'
response = s3.Object(bucket_name=bucket, key=g_test_key).get()
g_test=pickle.loads(response['Body'].read())

# Run test function on test graph: Check that dic stored has same number of nodes than graph

nodes_gtest=list(g_test.nodes())
items_dict_test=list(test_object.items())

if len(nodes_gtest)==len(items_dict_test):
    print('Number of nodes match')
else:
    print('Number of nodes does NOT match')

betweeness_curr_aprox was calculated for ALL 36536 blocks correctly
Example of betweeness_curr_aprox saved for block 607627:
[(5314, 2.7627499157415683e-05), (934, 3.5404153621536765e-05), (3023, 7.752852808574474e-15), (3452, 4.186574492807752e-06), (576, 2.7889615061953227e-16), (3436, 0.0013869522957864274), (3310, 1.6268942119472713e-15), (2378, 5.268038400591164e-15), (4223, 0.0007224518380271746), (422, 0.0005759993487047296)]
Total entries: 5506
Number of nodes match


**SCRIPT: Baseline Current Closeness**

In [None]:
# Calculate Baseline current closeness
#snapshot_clo=collection_measure(bucket,extract_keys,'current_closeness')

In [None]:
# Calculate incremental closeness
snapshot_clo=incremental_closeness(bucket,extract_keys,blocks,1425+3073)

In [254]:
extract_keys[1402]

#channel_opens[1407]

'graph_snapshots/1587447789_connected/535029.gpickle'

In [360]:

key='graph_snapshots/1587447789_connected/'+str(blocks[-100])+'.gpickle'
response = s3.Object(bucket_name=bucket, key=key).get()
G_test=pickle.loads(response['Body'].read())

node1=random.choice(list(G.nodes()))
node2=random.choice(list(G.nodes()))

In [364]:
print((node1,node2))
#len(G_test.nodes())
start=time.time()
sp=nx.shortest_path_length(G_test, source=node1, target=node1)
end=time.time()
print(sp)
print('Compute in seconds: {}'.format(end-start))

(6306, 2470)
0
Compute in seconds: 9.965896606445312e-05


In [193]:
G=nx.Graph()
G.add_nodes_from([1,2,3,4,5,6])
#G.add_edge(2,1)
new_clo=nx.incremental_closeness_centrality(G,(2,1),prev_cc=None,insertion=True)
G.add_edge(2,1)
print(new_clo)
print(G.edges())


new_clo=nx.incremental_closeness_centrality(G,(3,4),prev_cc=new_clo,insertion=True)
G.add_edge(3,4)
print(new_clo)
print(G.edges())


new_clo=nx.incremental_closeness_centrality(G,(5,6),prev_cc=new_clo,insertion=True)
G.add_edge(5,6)
print(new_clo)
print(G.edges())


new_clo=nx.incremental_closeness_centrality(G,(5,6),prev_cc=new_clo,insertion=False)
G.remove_edge(5,6)

print(new_clo)
print(G.edges())

{1: 0.2, 2: 0.2, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0}
[(1, 2)]
{1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.0, 6: 0.0}
[(1, 2), (3, 4)]
{1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.2, 6: 0.2}
[(1, 2), (3, 4), (5, 6)]
{1: 0.2, 2: 0.2, 3: 0.2, 4: 0.2, 5: 0.0, 6: 0.0}
[(1, 2), (3, 4)]


In [267]:
# TEST all saved correctly
responses=np.array([r for b,r in snapshot_clo.items()])

if np.amax(responses)==np.amin(responses): #and len(responses) == len(extract_keys):
    print('Save to S3 succesful')
else:
    print('Error saving to S3')

range_test=[]
no_blocks=1400
    
with tqdm(total=no_blocks) as pbar:
    for i in range(no_blocks):

        measurement='incremental_closeness'    
        #rand_block=str(random.choice(blocks[:1200]))    
        rand_block=str(blocks[i])

        test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
        test_file = s3.Object(bucket_name=bucket, key=test_key).get()
        test_object = pickle.loads(test_file['Body'].read())

        graph_key='graph_snapshots/'+str(extraction_id)+'_connected/'+str(rand_block)+'.gpickle'
        graph_response = s3.Object(bucket_name=bucket, key=graph_key).get()
        G=pickle.loads(graph_response['Body'].read())


        #print('Example of {} saved for block {}:'.format(measurement,rand_block))
        #print(list(test_object.items())[:100]) 
        #print('Total nodes graph: {}'.format(len(G.nodes())))

        connected_nodes=[(b,cent) for b,cent in list(test_object.items()) if cent>0]
        #print('Total entries: {}'.format(len(connected_nodes)))

        node_cons=[]
        for node in list(G.nodes()):

            node_con=test_object[node]
            
          
            if node_con>0:
                node_cons.append(1)
            else:
                node_cons.append(0)
            
            

        block_test=np.array(node_cons).sum()/len(G.nodes)          

        '''
        if block_test==1:
            test=1
        else:
            test=0
        '''

        range_test.append(block_test)
        #range_test.append(test)
        pbar.update(1)

print('% of nodes with positive closeness per Block : {}'.format(np.array(range_test).mean()))
#print('Blocks with closeness for all nodes: {}'.format(np.array(range_test).sum()))
print('Total Blocks: {}'.format(no_blocks))

Save to S3 succesful


HBox(children=(FloatProgress(value=0.0, max=1400.0), HTML(value='')))


% of nodes with positive closeness per Block : 0.9888914970559514
Total Blocks: 1400


In [275]:
np.array(range_test)[1399]

0.9950617283950617

**SCRIPT: Baseline Channels**

In [44]:
# Calculate Baseline degree
snapshot_channels=collection_measure(bucket,extract_keys,'channels')

Compute in seconds: 186.33308386802673


In [None]:
# TEST all saved correctly
responses=np.array([r for b,r in snapshot_channels.items()])

if np.amax(responses)==np.amin(responses) and len(responses) == len(extract_keys):
    print('Save to S3 succesful')
else:
    print('Error saving to S3')
    
measurement='channels'    
rand_block=str(random.choice(blocks))    
test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
test_file = s3.Object(bucket_name=bucket, key=test_key).get()
test_object = pickle.loads(test_file['Body'].read())
print('Example of {} saved for block {}:'.format(measurement,rand_block))
print(list(test_object.items())[:10])    
print('Total entries: {}'.format(len(list(test_object.items()))))

**SCRIPT: Baseline Capacity**

In [45]:
# Calculate Baseline degree
snapshot_capacity=collection_measure(bucket,extract_keys,'capacity')

Compute in seconds: 205.32206177711487


In [51]:
# TEST all saved correctly
responses=np.array([r for b,r in snapshot_capacity.items()])

if np.amax(responses)==np.amin(responses) and len(responses) == len(extract_keys):
    print('Save to S3 succesful')
else:
    print('Error saving to S3')
    
measurement='capacity'    
rand_block=str(random.choice(blocks))    
test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
test_file = s3.Object(bucket_name=bucket, key=test_key).get()
test_object = pickle.loads(test_file['Body'].read())
print('Example of {} saved for block {}:'.format(measurement,rand_block))
print(list(test_object.items())[:10])
print('Total entries: {}'.format(len(list(test_object.items()))))

Save to S3 succesful
Example of capacity saved for block 586042:
[(6038, 8878679), (5314, 14026340), (934, 1171934), (3023, 1111934), (3452, 2063908), (576, 40000), (3436, 6948131), (3310, 100000), (2378, 400000), (4223, 24298841)]
Total entries: 4920


**SCRIPT: Baseline Age**

In [94]:
# Calculate Baseline age
snapshot_age=collection_measure(bucket,extract_keys,'age')

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 511.2060635089874


In [95]:
# TEST all saved correctly
responses=np.array([r for b,r in snapshot_age.items()])

if np.amax(responses)==np.amin(responses) and len(responses) == len(extract_keys):
    print('Save to S3 succesful')
else:
    print('Error saving to S3')
    
measurement='age'    
rand_block=str(random.choice(blocks))    
test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
test_file = s3.Object(bucket_name=bucket, key=test_key).get()
test_object = pickle.loads(test_file['Body'].read())
print('Example of {} saved for block {}:'.format(measurement,rand_block))
print(list(test_object.items())[:10])
print('Total entries: {}'.format(len(list(test_object.items()))))

Save to S3 succesful
Example of age saved for block 568893:
[(6038, 63744), (5314, 63744), (934, 62491), (3023, 62491), (3452, 62046), (576, 62046), (3436, 60818), (3310, 60818), (2378, 60803), (4223, 60803)]
Total entries: 3872


**SCRIPT: Baseline Growth**

In [None]:
# Calculate Baseline capacity growth
snapshot_capgrowth=collection_measure(bucket,extract_keys,'capacity_growth')

In [262]:
# TEST all saved correctly
responses=np.array([r for b,r in snapshot_capgrowth.items()])

if np.amax(responses)==np.amin(responses) and len(responses) == len(extract_keys):
    print('Save to S3 succesful')
else:
    print('Error saving to S3')
    
measurement='capacity_growth'    
rand_block=str(random.choice(blocks))    
test_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+rand_block+'.pkl'
test_file = s3.Object(bucket_name=bucket, key=test_key).get()
test_object = pickle.loads(test_file['Body'].read())
print('Example of {} saved for block {}:'.format(measurement,rand_block))
print(list(test_object.items())[:10])
print('Total entries: {}'.format(len(list(test_object.items()))))

# Get test graph
g_test_key='graph_snapshots/'+str(extraction_id)+'_connected/'+str(rand_block)+'.gpickle'
response = s3.Object(bucket_name=bucket, key=g_test_key).get()
g_test=pickle.loads(response['Body'].read())

# Run test function on test graph
test_growth(g_test,test_object,int(rand_block),channel_opens,channel_closures)

Save to S3 succesful
Example of capacity_growth saved for block 603179:
[(5314, 0), (934, 0), (3023, 0), (3452, 0), (576, 0), (3436, -12000), (3310, 0), (2378, 0), (4223, 500000), (422, 0)]
Total entries: 5338


HBox(children=(FloatProgress(value=0.0, max=5338.0), HTML(value='')))


Growth calculated correctly for all nodes


**SCRIPT: Baseline Closeness rank**

In [None]:
# Calculate closeness rank before making block decisions
snapshot_clorank=collection_measure(bucket,extract_keys,'closeness_approx_rank')

In [None]:
# Calculate closeness rank after making block decisions
snapshot_clorank=collection_measure(bucket,extract_keys,'closeness_approx_rank_post')

HBox(children=(FloatProgress(value=0.0, max=36535.0), HTML(value='')))




**Sample Blocks**

In [33]:
sample_keys=[extract_keys[10*i] for i in range(int(len(extract_keys)/10)+1)]
print(len(sample_keys))

3654


In [105]:
sample_keys[:10]

['graph_snapshots/1587447789_connected/508400.gpickle',
 'graph_snapshots/1587447789_connected/509496.gpickle',
 'graph_snapshots/1587447789_connected/511852.gpickle',
 'graph_snapshots/1587447789_connected/513758.gpickle',
 'graph_snapshots/1587447789_connected/514060.gpickle',
 'graph_snapshots/1587447789_connected/514345.gpickle',
 'graph_snapshots/1587447789_connected/514412.gpickle',
 'graph_snapshots/1587447789_connected/514703.gpickle',
 'graph_snapshots/1587447789_connected/514850.gpickle',
 'graph_snapshots/1587447789_connected/515162.gpickle']

**SCRIPT: Average Shortest path**

In [None]:
snapshot_avg_shortpath=collection_measure(bucket,sample_keys,'avg_short_path')


**SCRIPT: Robustness**

In [None]:
snapshot_robustness=collection_measure(bucket,sample_keys,'robustness_eff_loss')

## Comparissons

**FUNCTION: Compare property changes for nodes in a collection of graphs**

In [None]:
###NEW 

"""
Function
--------
collection_compare
    Iterates over blocks to calculate marginal change in metric for nodes that made decisions (opens/closures)

Parameters
----------
blocks : list
    List of blocks extracted when reading graphs
    
    
dec_dic: dic
    Dictionary with channel decisions (open or closure) per block
    
    
graph_snapshots: list
    List of delayed dask objects each pointing to a graph snapshot to be loaded from S3

snapshots_base: dic
    Dictionary of dictionaries containing the base measurments per block per node

measurement: string
    Name of the type of measurment that will applied to the graph
    
type_dec: string
    The type of decisions that will be analyzed 'opens' or 'closures'
    

Returns
-------
futures: list
    List of tuples of the form (dic_node0,dic_node1) where dic_node0/1 is a dictionary containing the marginal changes for node0/1 
    for every decision in a given block. The dictionaries are future dask objects that still need to be explicitly computed. 

"""
def collection_compare(blocks,dec_dic,graph_keys,snapshots_base,measurement,type_dec):

    futures_list=[] # list to populate with futures per block
    
    with tqdm(total=len(range(1,len(graph_keys)))) as pbar:
        for i in range(1,len(graph_keys)): # iterate through blocks

            # extract information from parameters and construct input tuple to delayed function
            block=blocks[i]
            block_prev=blocks[i-1]
            block_dec=dec_dic[block]
            key=graph_keys[i-1] # Pass previous graph
            block_base=snapshots_base[block_prev]
            block_res=snapshots_base[block]
            input_tuple=(block,g,block_dec,block_base,measurement,type_dec,block_res)
            
            # submit to delayed function and append to list
            output_tuple=dask.delayed(graph_compare)(input_tuple)
            futures_list.append(output_tuple)
            pbar.update(1)

    # persist to disk and return
    futures = dask.persist(*futures_list)
    return futures
    

In [None]:

"""
Function
--------
collection_compare
    Iterates over blocks to calculate marginal change in metric for nodes that made decisions (opens/closures)

Parameters
----------
blocks : list
    List of blocks extracted when reading graphs
    
    
dec_dic: dic
    Dictionary with channel decisions (open or closure) per block
    
    
graph_snapshots: list
    List of delayed dask objects each pointing to a graph snapshot to be loaded from S3

snapshots_base: dic
    Dictionary of dictionaries containing the base measurments per block per node

measurement: string
    Name of the type of measurment that will applied to the graph
    
type_dec: string
    The type of decisions that will be analyzed 'opens' or 'closures'
    

Returns
-------
futures: list
    List of tuples of the form (dic_node0,dic_node1) where dic_node0/1 is a dictionary containing the marginal changes for node0/1 
    for every decision in a given block. The dictionaries are future dask objects that still need to be explicitly computed. 


def collection_compare(blocks,dec_dic,graph_snapshots,snapshots_base,measurement,type_dec):

    futures_list=[] # list to populate with futures per block
    
    with tqdm(total=len(range(1,len(graph_snapshots)))) as pbar:
        for i in range(1,len(graph_snapshots)): # iterate through blocks

            # extract information from parameters and construct input tuple to delayed function
            block=blocks[i]
            block_prev=blocks[i-1]
            block_dec=dec_dic[block]
            g=graph_snapshots[i-1] # Pass previous graph
            block_base=snapshots_base[block_prev]
            block_res=snapshots_base[block]
            input_tuple=(block,g,block_dec,block_base,measurement,type_dec,block_res)
            
            # submit to delayed function and append to list
            output_tuple=dask.delayed(graph_compare)(input_tuple)
            futures_list.append(output_tuple)
            pbar.update(1)

    # persist to disk and return
    futures = dask.persist(*futures_list)
    return futures
"""    

**FUNCTION: Compare property changes for nodes in a graph**

In [None]:

"""
Function
--------
graph_compare
    Calculates marginal change in metric for node0, node1 make decisions (open/close channels) in a single block

Parameters
----------
input_tuple : tuple
    
    block: int
        Block number
    g: nx_graph 
        Graph snapshot (as dask delayed object)
    block_dec: list
        List of tuples in nx edge format (u,v,att_dic) for all the decisions (channel opens/closures) made in that block  
    block_base: dic
        Dictionary of base measurements for each node in the graph snapshot corresponding to that block
    measurement: string
        Name of measurement to be computed
    type_dec: string
    The type of decisions that will be analyzed 'opens' or 'closures'
    
    block_res: dic
        Dictionary of base measurements for each node in the graph snapshot corresponding to the next block
    

Returns
-------
nodes_mar_dic: tuple
    Tuples of the form (mar_node0_dic_i,mar_node0_dic_i) where each element in the tuple is a dictionary containing the marginal changes for node0/1 
    for every node0 and node1 involved in a decision (channel open/closures) in the block.
"""


def graph_compare(input_tuple):
    
    block=input_tuple[0]
    key=input_tuple[1]
    block_dec=input_tuple[2]
    block_base=input_tuple[3]
    measurement=input_tuple[4]
    type_dec=input_tuple[5]
    block_res=input_tuple[6]
   
    mar_node0_dic_i={} # dictionary to story function output
    mar_node1_dic_i={} 
    
    
    # Load data
    session = boto3.session.Session()
    s3 = session.resource('s3')
    response = s3.Object(bucket_name=bucket, key=key).get()
    G=pickle.loads(response['Body'].read())
    
    ###########################---------------------
    ##if decisiono 
    
    
    
    
    
    
    
    # For each decision calculate marginal change in measure for node0 and node1
    for edge in block_dec:
        
        # Extract info about channel
        
        node0=edge[0]
        node1=edge[1]
        channel_id=edge[2]['channel_id']
        capacity=edge[2]['capacity']

        
        # Copy original graph
        g_mar=G.copy()   
        old_nodes=False
        
        
        
        # Retrieve base measurement before channel if nodes existed, else define base measure as 0
        if (g_mar.has_node(node0)):
            node0_base=block_base[node0]
            old_nodes=True
        else:
            node0_base=0
            
        if (g_mar.has_node(node1)):
            node1_base=block_base[node1]
            old_nodes=True
        else:
            node1_base=0
        
            
        if old_nodes: # If at least one node is old (part of the connected graph)
            
            if type_dec=='mar_opens': # marginal calculation for opens
                
                
                # Define and add edges and calculate betweeness if at least one of the nodes is in graph 
                edge_list=[edge]
                
                
                # If channel exists increase capacity
                
                if g_mar.has_edge(node0,node1):
                   
                    g_mar.edges[node0,node1]['capacity']+=capacity
                    g_mar.edges[node0,node1]['no_channels']+=1

                else:
                    g_mar.add_edges_from(edge_list)
                
                
                g_mar_mes=node_measurement(g_mar,measurement,node0,node1)
                
                # Update measurement values after marginal change
                node0_new_mes=g_mar_mes[0]
                node1_new_mes=g_mar_mes[1]
            
            elif type_dec=='mar_closures': # marginal calculation for closes
                
                # Define and remove edges, define new connected graph and calculate betweeness 
                edge_list=[(node0,node1)]
                
                
                # If channel exists decrease capacity
                if g_mar.edges[node0,node1]['no_channels']>1:
                    g_mar.edges[node0,node1]['capacity']-=capacity
                    g_mar.edges[node0,node1]['no_channels']-=1
                
                else: 
                    g_mar.remove_edges_from(edge_list) 
                    connected_components=[c for c in nx.algorithms.components.connected_components(g_mar)]
                    g_mar=g_mar.subgraph(connected_components[0]).copy()
                    
                g_mar_mes=node_measurment(g_mar,measurement,node0,node1)
                node0_new_mes=g_mar_mes[0]
                node1_new_mes=g_mar_mes[1]
                
            elif type_dec=='actual': # actual calculation for both opens and closures
                
                # Check individualy if in the graph for the resulting block the node is present (in the connected component, 
                # if not assign measurment to 0. 
                
                try:
                    node0_new_mes=block_res[node0]

                except KeyError:
                    node0_new_mes=0

                try:
                    node1_new_mes=block_res[node1]

                except KeyError:
                    node1_new_mes=0

                
                   
            node0_mar=(node0_new_mes-node0_base)
            node1_mar=(node1_new_mes-node1_base) 
        
        
        else: # If both nodes are new (outside of connected graph) their marginal decision outcome is 0
            node0_mar=0
            node1_mar=0

        
        # Update dictionary - new betweenness
        mar_node0_dic_i[channel_id]=node0_mar
        mar_node1_dic_i[channel_id]=node1_mar
        
    
    return (mar_node0_dic_i,mar_node1_dic_i)

### MARGINAL - CURRENT BETWEENESS

**SCRIPT: Marginal current betweeness for opens**

In [None]:
# Compute marginal betweeness for channel openings

futures_bet_maropen=collection_compare(blocks,channel_opens,graph_snapshots,snapshot_bet,measurement='current_betweeness',type_dec='mar_opens')
start=time.time()
bet_maropen_diclist = dask.compute(*futures_bet_maropen)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(bet_maropen_diclist)))



In [None]:
# Write to marginal current betweeness for opens into decisions Dataframe

bet_maropen_channels=add_columns(bet_maropen_diclist,decisions_df,'bet_maropen_node0','bet_maropen_node1')

# Updated DataFrame

print('Rows edited with marginal current betweeness for opens: {}'.format(len(bet_maropen_channels)))
decisions_df[decisions_df['short_channel_id'].isin(bet_maropen_channels)].head(2)


**SCRIPT: Marginal current betweeness for closures**

In [None]:
# Compute marginal betweeness for channel closures

futures_bet_marclose=collection_compare(blocks,channel_closures,graph_snapshots,snapshot_bet,measurement='current_betweeness',type_dec='mar_closures')
start=time.time()
bet_marclose_diclist = dask.compute(*futures_bet_marclose)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(bet_marclose_diclist)))


In [None]:
# Write to marginal current betweeness for closures into decisions Dataframe

bet_marclose_channels=add_columns(bet_marclose_diclist,decisions_df,'bet_marclose_node0','bet_marclose_node1')

# Updated DataFrame

print('Rows edited with marginal current betweeness for closures: {}'.format(len(bet_marclose_channels)))
decisions_df[decisions_df['short_channel_id'].isin(bet_marclose_channels)].head(2)

### MARGINAL - CURRENT CLOSENESS

**SCRIPT: Marginal current closeness for opens**

In [None]:
# Compute marginal betweeness for channel openings

futures_clo_maropen=collection_compare(blocks,channel_opens,graph_snapshots,snapshot_clo,measurement='current_closeness',type_dec='mar_opens')
start=time.time()
clo_maropen_diclist = dask.compute(*futures_clo_maropen)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(clo_maropen_diclist)))


In [None]:
# Write to marginal current betweeness for opens into decisions Dataframe

clo_maropen_channels=add_columns(clo_maropen_diclist,decisions_df,'clo_maropen_node0','clo_maropen_node1')

# Updated DataFrame

print('Rows edited with marginal current closeness for opens: {}'.format(len(clo_maropen_channels)))
decisions_df[decisions_df['short_channel_id'].isin(clo_maropen_channels)].head(2)


**SCRIPT: Marginal current closeness for closures**

In [None]:
# Compute marginal current closeness for channel closures

futures_clo_marclose=collection_compare(blocks,channel_closures,graph_snapshots,snapshot_clo,measurement='current_closeness',type_dec='mar_closures')
start=time.time()
clo_marclose_diclist = dask.compute(*futures_clo_marclose)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(clo_marclose_diclist)))

# Write to marginal current betweeness for opens into decisions Dataframe

clo_marclose_channels=add_columns(clo_marclose_diclist,decisions_df,'clo_marclose_node0','clo_marclose_node1')

# Updated DataFrame

print('Rows edited with marginal current closeness for closures: {}'.format(len(clo_marclose_channels)))
decisions_df[decisions_df['short_channel_id'].isin(clo_marclose_channels)].head()

### ACTUAL - CURRENT BETWEENESS

**SCRIPT: Actual current betweeness for opens**

In [None]:
# Compute actual current betweeness for channel opens

futures_bet_actopen=collection_compare(blocks,channel_opens,graph_snapshots,snapshot_bet,measurement='current_betweeness',type_dec='actual')
start=time.time()
bet_actopen_diclist = dask.compute(*futures_bet_actopen)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(bet_actopen_diclist)))

In [None]:
# Write to marginal current betweeness for opens into decisions Dataframe

bet_actopen_channels=add_columns(bet_actopen_diclist,decisions_df,'bet_actopen_node0','bet_actopen_node1')

# Updated DataFrame

print('Rows edited with actual current betweeness for opens: {}'.format(len(bet_actopen_channels)))
decisions_df[decisions_df['short_channel_id'].isin(bet_actopen_channels)].head(2)

**SCRIPT: Actual current betweeness for closures**

In [None]:
# Compute actual current betweeness for channel closures

futures_bet_actclose=collection_compare(blocks,channel_closures,graph_snapshots,snapshot_bet,measurement='current_betweeness',type_dec='actual')
start=time.time()
bet_actclose_diclist = dask.compute(*futures_bet_actclose)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(bet_actclose_diclist)))

In [None]:
# Write to marginal current betweeness for opens into decisions Dataframe

bet_actclose_channels=add_columns(bet_actclose_diclist,decisions_df,'bet_actclose_node0','bet_actclose_node1')

# Updated DataFrame

print('Rows edited with actual current betweeness for closures: {}'.format(len(bet_actclose_channels)))
decisions_df[decisions_df['short_channel_id'].isin(bet_actclose_channels)].head(2)

### ACTUAL - CURRENT CLOSENESS

**SCRIPT: Actual current closeness for opens**

In [None]:
# Compute actual current closeness for channel opens

futures_clo_actopen=collection_compare(blocks,channel_opens,graph_snapshots,snapshot_bet,measurement='current_closeness',type_dec='actual')
start=time.time()
clo_actopen_diclist = dask.compute(*futures_clo_actopen)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(bet_actopen_diclist)))

In [None]:
# Write to marginal current closeness for opens into decisions Dataframe

clo_actopen_channels=add_columns(clo_actopen_diclist,decisions_df,'clo_actopen_node0','clo_actopen_node1')

# Updated DataFrame

print('Rows edited with actual current closeness for opens: {}'.format(len(clo_actopen_channels)))
decisions_df[decisions_df['short_channel_id'].isin(clo_actopen_channels)].head(2)

**SCRIPT: Actual current closeness for closures**

In [None]:
# Compute actual current closeness for channel closures

futures_clo_actclose=collection_compare(blocks,channel_closures,graph_snapshots,snapshot_bet,measurement='current_closeness',type_dec='actual')
start=time.time()
clo_actclose_diclist = dask.compute(*futures_clo_actclose)
end=time.time()
print('Compute in seconds: {}'.format(end-start))
print('Size in memory: {}'.format(sys.getsizeof(clo_actclose_diclist)))

In [None]:
# Write to marginal current closeness for opens into decisions Dataframe

clo_actclose_channels=add_columns(clo_actclose_diclist,decisions_df,'clo_actclose_node0','clo_actclose_node1')

# Updated DataFrame

print('Rows edited with actual current closeness for closures: {}'.format(len(clo_actclose_channels)))
decisions_df[decisions_df['short_channel_id'].isin(clo_actclose_channels)].head(2)

# -------------------------------

### Pairwise stability 

- **Marginal betweenness (bet_mar_nodei)**: The % change between the betweenness centrality, for the node under analysis, given the graph from the previous block and the betweenness centrality of the resulting graph after enacting the decission (adding or removing a channel). Weighted current betweenness centrality is used for this measure.

> **Marginal betweenness for opens** 

**TODO**: Why is length of Dataframe longer than the number of snapshots extracted? Could it be that some channels appear more than once in dataframe?

> **Marginal betweenness for closures** 

- **Actual change in betweenness (bet_act_nodei)**: The % change between the betweenness centrality, for the node under analysis, given the graph from the previous block and the betweenness centrality of the resulting graph after enacting **all** the decissions (adding or removing a channels) in the current block. Weighted current betweenness centrality is used for this measure.

- **Marginal betweeness pairwise stability (bet_mar_pairst/open/close)**: Evaluates if given the marginal graph that results from just enacting this decission is consistent with pairwise stability, from a betweenness perspective.

In [None]:
# MARGINAL - Add column with check for pairwise stability compatability using marginal outcomes

# OPEN - Channel is opened if both nodes gain

# Define function 
def bet_pairst_maropen(row):
    if not math.isnan(row['bet_mar_node0']):
        pairst=(row['bet_maropen_node0']>=0 and row['bet_maropen_node1']>=0)
    else:
        pairst=row['bet_maropen_node0']
    return pairst

# Apply function
decisions_df['bet_mar_pairstopen']=decisions_df.apply(bet_pairst_maropen,axis=1)

# CLOSE - Channel is closed if at least one node is better off

# Define function 
def bet_pairst_marclose(row):
    if not math.isnan(row['bet_marclose_node0']):
        pairst=(row['bet_marclose_node0']>0 or row['bet_marclose_node1']>0)
    else:
        pairst=row['bet_marclose_node0']
    return pairst

# Apply function
decisions_df['bet_mar_pairstclose']=decisions_df.apply(bet_pairst_marclose,axis=1)

In [None]:
# Test MARGINAL OPEN
decisions_df[decisions_df['bet_mar_node0'].notnull()][['bet_mar_node0','bet_mar_node1','bet_mar_pairstopen']].head()

In [None]:
# Test MARGINAL CLOSE
decisions_df[decisions_df['bet_marclose_node0'].notnull()][['bet_marclose_node0','bet_marclose_node1','bet_mar_pairstclose']].head()

- **Actual betweeness pairwise stability (bet_act_pairstopen/close)**: Evaluates if given the marginal graph that results from all the decisions in the block is consitend with pairwise stability, from a betweenness perspective. 

In [None]:
# ACTUAL - Add column with check for pairwise stability compatability using marginal outcomes

# OPEN - Channel is opened if both nodes gain

# Define function 
def bet_pairst_actopen(row):
    if not math.isnan(row['bet_actopen_node0']):
        pairst=(row['bet_actopen_node0']>=0 and row['bet_actopen_node1']>=0)
    else:
        pairst=row['bet_actopen_node0']
    return pairst

# Apply function
decisions_df['bet_act_pairstopen']=decisions_df.apply(bet_pairst_actopen,axis=1)

# CLOSE - Channel is closed if at least one node is better off

# Define function 
def bet_pairst_actclose(row):
    if not math.isnan(row['bet_actclose_node0']):
        pairst=(row['bet_actclose_node0']>0 or row['bet_actclose_node1']>0)
    else:
        pairst=row['bet_actclose_node0']
    return pairst

# Apply function
decisions_df['bet_act_pairstclose']=decisions_df.apply(bet_pairst_actclose,axis=1)

In [None]:
# Test ACTUAL OPEN
decisions_df[decisions_df['bet_actopen_node0'].notnull()][['bet_actopen_node0','bet_actopen_node1','bet_act_pairstopen']].head()

In [None]:
# Test ACTUAL CLOSE
decisions_df[decisions_df['bet_actclose_node0'].notnull()][['bet_actclose_node0','bet_actclose_node1','bet_act_pairstclose']].head()

In [None]:
# Save Updated DataFrame to S3

# Create S3 resource and define values
session = boto3.session.Session()
s3 = session.resource('s3')
csv_buffer = io.StringIO()

# File path and name ([extraction_id]snapshot_bet-[no_blocks]-[start_block]-[end_block])
key_decisions_df='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+str(extraction_id)+'decisions_df_bet-'+str(no_blocks)+'-'+str(start_block)+'-'+str(end_block)+'.csv'


# Safe DataFrame
decisions_df.to_csv(csv_buffer)
s3.Object(bucket, key_decisions_df).put(Body=csv_buffer.getvalue())


In [None]:
# Test Save
decisions_df_test_load = s3.Object(bucket_name=bucket, key=key_decisions_df).get()
decisions_df_test=pd.read_csv(io.BytesIO(decisions_df_test_load['Body'].read()),index_col=0)
decisions_df_test==decisions_df

In [None]:
decisions_df_test.head()

In [None]:
decisions_df.head()

### Efficiency
- **Average betweeness per block (bet_effic)**: Average betweenness centrality for all the nodes. 

### Nash stability 

- **% Change with respect to not making decision (bet_binstat_deltai)**: The % change in betwewnness centrality, for the node under analysis, given the resulting graph after all of the decissions have been executed. 
- **Nash compatible - binary strategy (bet_binstat_nash)**: Returns true if given the other decissions enacted in the block not making decision would have NOT have resulted in higher betweenness centrality. This tells me if my strategy helped me be better off (took into account what others were doing)

(Optional approaches - Check for tracktability)
- **Nash compatible - close only strategy (bet_closestat_nash)**: Returns true if given the other decissions enacted in the block, closing any other channels would NOT have not resulted in higher betwneenness centrality. (NOTE: Check if there are combinatorial considerations, if so just look at closings up to x) 
- **Nash compatible - close/open (bet_allstat_nash)**: Returns true if given the other decissions enacted in the block, closing any other channels (with any node) or opening a channel with one of the round participants would NOT have not resulted in lower betwneenness centrality. (NOTE: To make it reasonable and constraint the strategy space only consider 'similar nodes' or with relationships in the past?).






## Connectivity

### Pairwise stability 

- **Marginal % change in connectivity (con_mar_deltai)**: The % change between the shortest path average, for the node under analysis, given the graph from the previous block and the shortest path average of the resulting graph after enacting the decission (adding or removing a channel). Weighted shortest path (_single_source_dijkstra_path_) is used for this measure.

- **Actual % change in connectivity (con_act_deltai)**: The % change between the shortest path average, for the node under analysis, given the graph from the previous block and the shortest path average of the resulting graph after enacting **all** the decissions (adding or removing a channels) in the current block. Weighted shortest path (_single_source_dijkstra_path_) is used for this measure.

- **Marginal connectivity pairwise stability (con_mar_pairstab)**: Evaluates if given the marginal graph that results from just enacting this decission is consistent with pairwise stability, from a connectivity perspective.

- **Actual connectivity pairwise stability (con_act_pairstab)**: Evaluates if given the marginal graph that results from all the decisions in the block is consitend with pairwise stability, from a connectivity perspective.  



### Nash stability 

- **% Change with respect to not making decision (con_binstat_deltai)**: The % change in shortest path average, for the node under analysis, given the resulting graph after all of the decissions have been executed. 
- **Nash compatible - binary strategy (con_binstat_nash)**: Returns true if given the other decissions enacted in the block not making decision would have NOT have resulted in higher shortest path average. NOTE: This indicates if the strategy selected made the node better off (took into account what others were doing)

(Optional approaches - Check for tracktability)
- **Nash compatible - close only strategy (con_closestat_nash)**: Returns true if given the other decissions enacted in the block, closing any other channels would NOT have not resulted in higher shortest path average. (NOTE: Check if there are combinatorial considerations, if so just look at closings up to x) 
- **Nash compatible - close/open (con_allstat_nash)**: Returns true if given the other decissions enacted in the block, closing any other channels (with any node) or opening a channel with one of the round participants would NOT have not resulted in lower shortest path average. (NOTE: To make it reasonable and constraint the strategy space only consider 'similar nodes' or with relationships in the past?).



### Efficiency
- **Average betweeness per block (bet_effic)**: Average shortest path average for all the nodes. 


## Utility Functions

In [None]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))