# LN - Data Pre-Processing - Normalized rankings

## Import Libraries

In [258]:
import numpy as np
import pandas as pd
import networkx as nx
import itertools
#import matplotlib.pyplot as plt
import time
import pickle

import os
import re
import sys
import io
import random
import itertools
import math

from tqdm.notebook import trange, tqdm
#from tqdm.notebook import trange
#from tqdm import tqdm_notebook as tqdm
from time import sleep

from dask_cloudprovider import FargateCluster
from dask.distributed import Client
import dask.array as da
import dask
dask.config.set({'distributed.scheduler.allowed-failures': 50}) 


import boto3

In [62]:
# Define parameters shared by the cells below

# Name of the S3 bucket that the objects in this notebook are read from and written to
bucket='ln-strategy-data'
# Identifier embedded in the 'graph_snapshots/<extraction_id>_connected/' key prefix
extraction_id=1587447789


In [297]:
# Initiate s3 resource
# The resource is created from an explicit boto3 session and reused by the notebook-level cells

session = boto3.session.Session()
s3 = session.resource('s3')

## Load Data

In [366]:
# Load objects from S3
# Dataframe

# The CSV body is read fully into memory and parsed with pandas
decisions_load = s3.Object(bucket_name=bucket, key='decisions_df.csv').get()
decisions_df=pd.read_csv(io.BytesIO(decisions_load['Body'].read()))

# Full Dataframe

# NOTE: decisions_load is rebound here, so the previous response object is discarded
decisions_load = s3.Object(bucket_name=bucket, key='full_decisions_df.csv').get()
full_decisions_df=pd.read_csv(io.BytesIO(decisions_load['Body'].read()))

# Channel closures
# NOTE(review): pickle.loads can execute arbitrary code; only safe while the bucket contents are trusted
closure_file = s3.Object(bucket_name=bucket, key='channel_closures.p').get()
channel_closures = pickle.loads(closure_file['Body'].read())
    
    
# Channel openings 
# NOTE(review): same pickle caveat as for the closures above
opens_file = s3.Object(bucket_name=bucket, key='channel_opens.p').get()
channel_opens = pickle.loads(opens_file['Body'].read())

In [195]:
# Transform data: for every block build (block, [nodes]) where the list holds the
# de-duplicated first two elements (the two node ids) of each open/closure record.
# Entries are ordered by block.
open_nodes_list=[(block,list({node for record in records for node in record[:2]})) for block,records in sorted(channel_opens.items())]
closure_nodes_list=[(block,list({node for record in records for node in record[:2]})) for block,records in sorted(channel_closures.items())]

# Preview the lists that were just built (the names printed must match the names assigned above)
print('--OPENS---')
print(open_nodes_list[:10])
print('--CLOSURES---')
print(closure_nodes_list[:10])


--OPENS---
[(505149, [5314, 6038]), (506402, [934, 3023]), (506847, [576, 3452]), (508075, [3436, 3310]), (508090, [2378, 4223]), (508320, [1912, 422]), (508400, [1912, 5154]), (508447, [6656, 6595, 2120, 4688, 4119]), (508503, [422, 2953, 5426, 7059, 3957, 7478, 2518, 5725]), (508666, [422, 5294])]
--CLOSURES---
[(505149, []), (506402, []), (506847, []), (508075, []), (508090, []), (508320, []), (508400, []), (508447, []), (508503, []), (508666, [])]


In [None]:
# Keys of all graph snapshots (.gpickle) stored under '<extraction_id>_connected/'.
# The pattern is a raw string: "\." inside a normal literal is an invalid escape
# sequence and triggers a warning on modern Python. It is compiled once instead of
# being rebuilt for every object in the bucket.
snapshot_pattern = re.compile(r".*" + str(extraction_id) + r"_connected/.*\.gpickle")
graph_keys = [obj.key
        for obj in s3.Bucket(name=bucket).objects.all()
        if snapshot_pattern.match(obj.key)]

In [279]:
# Snapshot window used by the rest of the notebook
base_ix=6 # From this index onward the connected component has more than 3 items. 
#final_ix=2000
final_ix=len(graph_keys)
extract_keys=graph_keys[base_ix:final_ix] # Blocks below 6th index are <3 and affect some graph metrics

# Block number of each snapshot, taken from the file name ('.../<block>.gpickle')
blocks=[int(snapshot_key.split(".")[0].split("/")[-1]) for snapshot_key in extract_keys]

# Node lists restricted to the same index window
open_nodes=open_nodes_list[base_ix:final_ix]
closure_nodes=closure_nodes_list[base_ix:final_ix]

# Blocks in which at least one node is involved in an open / a closure
open_blocks=[block for block,nodes in open_nodes if len(nodes)>0]
close_blocks=[block for block,nodes in closure_nodes if len(nodes)>0]

In [None]:
len(blocks)

In [371]:
# Start a Dask cluster on AWS Fargate: 20 workers running the 'dsrincon/dask-graph:nx-scipy-v1' image,
# with explicit scheduler CPU/memory sizing and a 15 minute scheduler timeout
cluster = FargateCluster(n_workers=20,scheduler_timeout='15 minutes',image='dsrincon/dask-graph:nx-scipy-v1',scheduler_cpu=4096,scheduler_mem=16384)

  next(self.gen)


In [372]:
cluster

VBox(children=(HTML(value='<h2>FargateCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n  …

In [373]:
# Connect a dask.distributed client to the Fargate cluster created earlier in the notebook
client=Client(cluster)
# NOTE(review): presumably an alternative that attaches to an already running scheduler by address — confirm before re-enabling
#cluster=Client('tcp://18.234.80.68:8786')


python
+---------------------------+---------------+
|                           | version       |
+---------------------------+---------------+
| client                    | 3.7.3.final.0 |
| scheduler                 | 3.7.4.final.0 |
| tcp://172.31.11.168:37195 | 3.7.4.final.0 |
| tcp://172.31.19.135:36337 | 3.7.4.final.0 |
| tcp://172.31.26.103:46345 | 3.7.4.final.0 |
| tcp://172.31.3.207:36345  | 3.7.4.final.0 |
| tcp://172.31.41.171:37527 | 3.7.4.final.0 |
| tcp://172.31.44.189:46683 | 3.7.4.final.0 |
| tcp://172.31.48.231:34031 | 3.7.4.final.0 |
| tcp://172.31.57.192:35531 | 3.7.4.final.0 |
| tcp://172.31.59.157:40685 | 3.7.4.final.0 |
| tcp://172.31.62.234:33779 | 3.7.4.final.0 |
| tcp://172.31.63.169:42943 | 3.7.4.final.0 |
| tcp://172.31.70.86:44591  | 3.7.4.final.0 |
| tcp://172.31.71.55:43301  | 3.7.4.final.0 |
| tcp://172.31.72.0:41117   | 3.7.4.final.0 |
| tcp://172.31.74.105:35869 | 3.7.4.final.0 |
| tcp://172.31.75.162:37533 | 3.7.4.final.0 |
| tcp://172.31.78.196:3896

In [24]:


def graph_ranking(input_tuple):
    """Normalize the raw scores of one snapshot by their maximum and store them in S3.

    Parameters
    ----------
    input_tuple : tuple
        ``(block_num, measurement, extraction_id, key_rawscore, bucket)``.
        ``key_rawscore`` is the S3 key of the pickled ``{node: score}`` dict and
        ``bucket`` the bucket it lives in. A single tuple is used so the function
        can be handed to ``dask.delayed`` with one argument.

    Returns
    -------
    int
        HTTP status code of the S3 ``put`` that stores the normalized dict under
        ``.../<measurement>/norm_rank/<block_num>.pkl``.

    Notes
    -----
    When the snapshot is empty or every score is 0 the maximum cannot be used as
    a divisor; in that case every node gets 0.0 (an empty snapshot yields an
    empty dict) instead of raising. A snapshot whose maximum is 0 but that also
    contains non-zero (negative) scores still raises ``ZeroDivisionError``.
    """
    # Unpacking input (extra trailing elements are ignored, as with index access)
    block_num, measurement, extraction_id, key_rawscore, bucket = input_tuple[:5]

    # Retrieve snapshot from S3; a fresh session is created inside the task
    s3 = boto3.session.Session().resource('s3')
    response = s3.Object(bucket_name=bucket, key=key_rawscore).get()
    # NOTE(review): pickle.loads can execute arbitrary code; only safe while the bucket is trusted
    snapshot = pickle.loads(response['Body'].read())

    # Calculate ranking for snapshot
    if not any(snapshot.values()):
        # Empty snapshot or all scores equal to 0: avoid max() on an empty
        # sequence and a 0/0 division
        norm_rank = {node: 0.0 for node in snapshot}
    else:
        max_value = max(snapshot.values())
        norm_rank = {node: score / max_value for node, score in snapshot.items()}

    # Write output into S3
    key_out = 'graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/norm_rank/'+str(block_num)+'.pkl'
    put_result = s3.Object(bucket, key_out).put(Body=pickle.dumps(norm_rank))

    return put_result['ResponseMetadata']['HTTPStatusCode']

In [None]:
# TEST graph_ranking on a single block

test_block=516790
measurement='channels'
# graph_ranking reads five elements: (block, measurement, extraction_id, raw-score key, bucket).
# The raw-score key follows the layout used by collection_ranking.
key_raw_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(test_block)+'.pkl'
input_tuple=(test_block,measurement,extraction_id,key_raw_test,bucket)
response_test=graph_ranking(input_tuple)

# Test if function saved result correctly and download result
if response_test==200:
    key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/norm_rank/'+str(test_block)+'.pkl'
    g_rank_test_load = s3.Object(bucket_name=bucket, key=key_test).get()
    g_rank_test = pickle.loads(g_rank_test_load['Body'].read())
    g_rank_values=sorted(g_rank_test.values())
    #print(g_rank_values)
    #print('The dic saved has these first items: {}'.format(list(g_rank_test.items())))



In [50]:
def collection_ranking(extraction_id,blocks,measurement,bucket):
    """Run graph_ranking for every block through dask and collect the results.

    Parameters
    ----------
    extraction_id : value embedded in the 'graph_snapshots/<id>_connected/' key prefix.
    blocks : iterable with ``len``; one task is created per element.
    measurement : str, name of the measurement folder holding 'raw_score/<block>.pkl'.
    bucket : str, S3 bucket name forwarded to graph_ranking.

    Returns
    -------
    tuple
        One graph_ranking result (HTTP status code of the S3 put) per block,
        in the order of ``blocks``.

    Notes
    -----
    The progress bar only tracks the creation of the delayed tasks; the elapsed
    time printed at the end covers the ``dask.compute`` call.
    """
    # No S3 access happens here: every task opens its own session inside graph_ranking
    delayed_responses=[]
    with tqdm(total=len(blocks)) as pbar:
        for block_num in blocks:

            # Create key of the raw scores for this block
            key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(block_num)+'.pkl'

            # Run delayed function using dask (single tuple argument)
            input_tuple=(block_num,measurement,extraction_id,key,bucket)
            delayed_responses.append(dask.delayed(graph_ranking)(input_tuple))

            # Update progress bar
            pbar.update(1)

    # Collect futures
    futures = dask.persist(*delayed_responses)

    # Run parallel computations
    start=time.time()
    final_responses = dask.compute(*futures)
    end=time.time()
    print('Compute in seconds: {}'.format(end-start))

    return final_responses






In [56]:
# Test collection_ranking
test_responses=collection_ranking(extraction_id,blocks,'channels',bucket)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 591.6712484359741


In [54]:
print(test_responses[:20])

(200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200)


## Calculate normalized rankings for measures

**Age ranking**

In [57]:
age_responses=collection_ranking(extraction_id,blocks,'age',bucket)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 384.95358204841614


**Capacity ranking**

In [58]:
capacity_responses=collection_ranking(extraction_id,blocks,'capacity',bucket)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 360.1695795059204


**Betweenness ranking**

In [59]:
betweeness_responses=collection_ranking(extraction_id,blocks,'betweeness_curr_aprox',bucket)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 363.8422577381134


**Growth ranking**

In [None]:
growth_responses=collection_ranking(extraction_id,blocks,'capacity_growth',bucket)

**Channels ranking**

In [None]:
channels_responses=collection_ranking(extraction_id,blocks,'channels',bucket)

**Betweenness Unweighted ranking**

In [None]:
betweeness_uw_responses=collection_ranking(extraction_id,blocks,'current_betweeness_unweighted',bucket)

**TEST: Check that the norm_rank files were created correctly**

In [376]:
# Compare the raw scores and the normalized ranking stored for one block
test_block=blocks[-100]
measurement='current_betweeness_unweighted'

# Both files live under the same measurement folder and differ only in the score-type folder
measurement_prefix='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement
key_raw=measurement_prefix+'/raw_score/'+str(test_block)+'.pkl'
key_norm=measurement_prefix+'/norm_rank/'+str(test_block)+'.pkl'

# Download and unpickle both dictionaries
raw_load = s3.Object(bucket_name=bucket, key=key_raw).get()
dic_raw = pickle.loads(raw_load['Body'].read())
norm_load = s3.Object(bucket_name=bucket, key=key_norm).get()
dic_norm = pickle.loads(norm_load['Body'].read())

items_raw=list(dic_raw.items())
items_norm=list(dic_norm.items())

# Show the first entries side by side and check both dicts have the same number of entries
print(items_raw[:10])
print(items_norm[:10])
print(len(items_raw)==len(items_norm))

[(5314, 7.885188797837179e-05), (934, 0.00012824277473443815), (3023, 0.0), (3452, 5.784695593887777e-05), (576, 0.0), (3436, 0.0021980450863651082), (3310, 0.0), (4223, 0.0062914361259963835), (422, 0.0020213538805751515), (1912, 0.0003722872728578278)]
[(5314, 0.00042371385522347026), (934, 0.0006891178116393132), (3023, 0.0), (3452, 0.00031084299110919317), (576, 0.0), (3436, 0.011811285453992206), (3310, 0.0), (4223, 0.03380729015098958), (422, 0.010861828010311452), (1912, 0.0020005009350757648)]
True


## Extract normalized rankings for decision nodes

In [356]:
def extract_values(input_tuple):
    """Look up metric values for lists of nodes and store the result in S3.

    ``input_tuple`` holds, in order: bucket, extraction_id, measurement,
    score_type, prev_block, act_block, nodes_lists, decision_type, actual_bool.

    The metric dict ``<measurement>/<score_type>/<block>.pkl`` and the graph
    snapshot ``<prev_block>.gpickle`` are loaded from S3. For every list in
    ``nodes_lists`` a list of ``(node, value)`` tuples is produced, where the
    value is the dict entry when the node is in the graph and 0 otherwise.
    The list of lists is pickled to
    ``<measurement>/<decision_type>_<score_type>[post]/<act_block>.pkl``
    (the ``post`` suffix is added when ``actual_bool`` is true).

    Returns the HTTP status code of the S3 put.
    """
    (bucket, extraction_id, measurement, score_type, prev_block,
     act_block, nodes_lists, decision_type, actual_bool) = input_tuple[:9]

    # Extra tag to account for same block calculations
    extra_tag = 'post' if actual_bool else ''

    # Start S3 session
    s3 = boto3.session.Session().resource('s3')

    # Common key prefix of everything stored for this measurement
    measurement_prefix = 'graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/'

    # 'closeness_approx_rank' is read from the act_block file instead of prev_block
    # (per the original comment these approximated measurements already account for index -1)
    dic_block = act_block if measurement=='closeness_approx_rank' else prev_block
    dic_key = measurement_prefix+score_type+'/'+str(dic_block)+'.pkl'
    g_key = 'graph_snapshots/'+str(extraction_id)+'_connected/'+str(prev_block)+'.gpickle'

    # Load the metric dict first, then the graph
    dic = pickle.loads(s3.Object(bucket_name=bucket, key=dic_key).get()['Body'].read())
    g = pickle.loads(s3.Object(bucket_name=bucket, key=g_key).get()['Body'].read())

    # One list of (node, value) per input list; nodes missing from the graph get 0
    values_lists = [
        [(node, dic[node] if g.has_node(node) else 0) for node in nodes]
        for nodes in nodes_lists
    ]

    # Save value to S3
    key_out = measurement_prefix+decision_type+'_'+score_type+extra_tag+'/'+str(act_block)+'.pkl'
    stored = s3.Object(bucket,key_out).put(Body=pickle.dumps(values_lists))

    return stored['ResponseMetadata']['HTTPStatusCode']
                

**TEST before block decisions**

In [352]:
# TEST extract_values function (values taken from the block BEFORE the decision block)

# set parameters
test_ix=-3000
test_act_block=blocks[test_ix]
test_prev_block=blocks[test_ix-1]
test_nodes=[open_nodes[test_ix][1],closure_nodes[test_ix][1]]
measurement='betweeness_curr_aprox'
score_type='norm_rank'
decision_type='open'
# This test reads the previous block, which collection_extract_values pairs with actual=False.
# With True the previous-block values would be written under the '...post' key.
actual_bool=False
extra_tag='post' if actual_bool else ''

print('Test Block:{}'.format(test_act_block))
print('Test Nodes:{}'.format(test_nodes))
print('Open blocks:{}'.format(channel_opens[test_act_block]))

# NOTE(review): extract_values writes to the same S3 key layout the full extraction uses,
# but with [open nodes, closure nodes] instead of [node0 ids, node1 ids]; re-run the
# extraction for this block afterwards if the stored file matters.

# run function and print results
start=time.time()
#Test for prev block
test_input_tuple=(bucket,extraction_id,measurement,score_type,test_prev_block,test_act_block,test_nodes,decision_type,actual_bool)
#Test for act block=prev_block
#test_input_tuple=(bucket,extraction_id,measurement,score_type,test_act_block,test_act_block,test_nodes,decision_type,actual_bool)
response=extract_values(test_input_tuple)

end=time.time()


if response==200:

    # Check value recorded to s3
    values_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/'+decision_type+'_'+score_type+extra_tag+'/'+str(test_act_block)+'.pkl'
    test_load = s3.Object(bucket_name=bucket, key=values_key).get()
    values = pickle.loads(test_load['Body'].read())

    # Print results
    print('Elapsed time seconds:{}'.format(end-start))
    print('Values:{}'.format(values))

else:
    print('Did not save to S3 correctly')
    


Test Block:612260
Test Nodes:[[545, 2401, 4803, 4548, 4292, 5990, 4998, 7659, 6220, 4175, 2865, 3315, 4086, 4249, 4927], [4086, 5990, 4927]]
Open blocks:[(4998, 5990, {'capacity': 16777215, 'open_fee': 222, 'dec_id': 67218, 'channel_id': '612260x2185x1', 'no_channels': 0}), (3315, 4249, {'capacity': 2000000, 'open_fee': 154, 'dec_id': 67221, 'channel_id': '612260x2327x0', 'no_channels': 0}), (545, 6220, {'capacity': 8000000, 'open_fee': 361, 'dec_id': 67213, 'channel_id': '612260x2179x0', 'no_channels': 0}), (5990, 2865, {'capacity': 2000000, 'open_fee': 223, 'dec_id': 67219, 'channel_id': '612260x2311x1', 'no_channels': 0}), (4548, 4803, {'capacity': 500000, 'open_fee': 177, 'dec_id': 67222, 'channel_id': '612260x2358x1', 'no_channels': 0}), (5990, 4086, {'capacity': 2000000, 'open_fee': 292, 'dec_id': 67214, 'channel_id': '612260x2181x1', 'no_channels': 0}), (2401, 5990, {'capacity': 16777215, 'open_fee': 223, 'dec_id': 67215, 'channel_id': '612260x2182x1', 'no_channels': 0}), (4292,

**TEST after block decisions**

In [390]:
# TEST extract_values function (values taken from the decision block itself)

# set parameters
test_ix=blocks.index(536475)
#test_ix=-5000
test_act_block,test_prev_block=blocks[test_ix],blocks[test_ix-1]
test_nodes=[open_nodes[test_ix][1],closure_nodes[test_ix][1]]
measurement='current_betweeness_unweighted'
score_type='norm_rank'
decision_type='open'
actual_bool=True
# Same suffix rule extract_values applies to its output folder
extra_tag='post' if actual_bool else ''

print('Test Block:{}'.format(test_act_block))
print('Test Nodes:{}'.format(test_nodes))

# run function and time it
start=time.time()
#Test for prev block
#test_input_tuple=(bucket,extraction_id,measurement,score_type,test_prev_block,test_act_block,test_nodes,decision_type,actual_bool)
#Test for act block=prev_block
test_input_tuple=(bucket,extraction_id,measurement,score_type,test_act_block,test_act_block,test_nodes,decision_type,actual_bool)
response=extract_values(test_input_tuple)
end=time.time()

if response!=200:
    print('Did not save to S3 correctly')
else:
    # Download what was just written and show it
    values_key='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/'+decision_type+'_'+score_type+extra_tag+'/'+str(test_act_block)+'.pkl'
    test_load = s3.Object(bucket_name=bucket, key=values_key).get()
    values = pickle.loads(test_load['Body'].read())

    print('Elapsed time seconds:{}'.format(end-start))
    print('Values:{}'.format(values))

Test Block:536475
Test Nodes:[[2766, 4686], []]
Elapsed time seconds:0.3497591018676758
Values:[[(2766, 0.027421155243756427), (4686, 0.05200834996754011)], []]


In [357]:
def collection_extract_values(decisions_df,blocks,decision_type,measurement,score_type,actual=False):
    """Run extract_values for every decision block through dask.

    Parameters
    ----------
    decisions_df : DataFrame with 'open_block' / 'close_block', 'node0_id' and
        'node1_id' columns.
    blocks : list of snapshot blocks; used to find the block preceding each
        decision block.
    decision_type : 'open' or 'close'.
    measurement, score_type : forwarded to extract_values to build the S3 keys.
    actual : when True the metric is read from the decision block itself
        instead of the preceding block.

    Returns
    -------
    tuple
        One extract_values result (HTTP status code) per processed block.

    Raises
    ------
    ValueError
        If ``decision_type`` is not 'open' or 'close', or a decision block is
        not present in ``blocks``.

    Notes
    -----
    Reads the notebook globals ``open_nodes``, ``closure_nodes``, ``bucket``
    and ``extraction_id``. The first decision block is skipped.
    """
    # Set list of blocks and column names
    if decision_type=='open':
        column_name='open_block'
        node_entries=open_nodes
    elif decision_type=='close':
        column_name='close_block'
        node_entries=closure_nodes
    else:
        # Previously this fell through and failed later with an UnboundLocalError
        raise ValueError("decision_type must be 'open' or 'close', got {!r}".format(decision_type))
    dec_blocks=[b for b,o in node_entries if len(o)>0]

    # Position of every block, built once instead of calling blocks.index per iteration.
    # setdefault keeps the first occurrence, matching list.index.
    block_position={}
    for position,block in enumerate(blocks):
        block_position.setdefault(block,position)

    delayed_responses=[]

    # Loop over blocks in decision type, starting from the 2nd one
    print(len(dec_blocks))
    with tqdm(total=max(len(dec_blocks)-1,0)) as pbar:
        for dec_block_i in dec_blocks[1:]:

            # Block preceding the decision block (same lookup error type as list.index)
            if dec_block_i not in block_position:
                raise ValueError('{} is not in blocks'.format(dec_block_i))
            prev_block=blocks[block_position[dec_block_i]-1]

            # Select nodes in node0,node1 and create list of lists (filter the frame once)
            block_rows=decisions_df[decisions_df[column_name]==dec_block_i]
            nodes_lists=[block_rows['node0_id'].tolist(),block_rows['node1_id'].tolist()]

            # With actual=True the metric is taken after the block decisions, i.e. from the
            # decision block itself; otherwise from the preceding block
            source_block=dec_block_i if actual else prev_block
            input_tuple=(bucket,extraction_id,measurement,score_type,source_block,dec_block_i,nodes_lists,decision_type,actual)

            delayed_responses.append(dask.delayed(extract_values)(input_tuple))

            # Update progress bar
            pbar.update(1)

    # Collect futures
    futures = dask.persist(*delayed_responses)

    # Run parallel computations
    start=time.time()
    final_responses = dask.compute(*futures)
    end=time.time()
    print('Compute in seconds: {}'.format(end-start))

    return final_responses
    

**Age: DF update extraction**

In [247]:
open_age_rank_res=collection_extract_values(decisions_df,blocks,'open','age','norm_rank')

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 198.8955729007721



In [248]:
close_age_rank_res=collection_extract_values(decisions_df,blocks,'close','age','norm_rank')

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 79.43323612213135



**Betweenness: DF update extraction**

In [249]:
open_betweeness_rank_res=collection_extract_values(decisions_df,blocks,'open','betweeness_curr_aprox','norm_rank')

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 202.19830679893494



In [250]:
close_betweeness_rank_res=collection_extract_values(decisions_df,blocks,'close','betweeness_curr_aprox','norm_rank')

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 78.17207503318787



**Capacity: DF update extraction**

In [251]:
open_capacity_rank_res=collection_extract_values(decisions_df,blocks,'open','capacity','norm_rank')

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 203.74693655967712



In [252]:
close_capacity_rank_res=collection_extract_values(decisions_df,blocks,'close','capacity','norm_rank')

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 80.01387286186218



**Capacity Growth: DF update extraction**

In [253]:
open_capacitygrowth_rank_res=collection_extract_values(decisions_df,blocks,'open','capacity_growth','norm_rank')

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 212.46921563148499



In [254]:
close_capacitygrowth_rank_res=collection_extract_values(decisions_df,blocks,'close','capacity_growth','norm_rank')

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 83.05098509788513



**Channels: DF update extraction**

In [None]:
open_channels_rank_res=collection_extract_values(decisions_df,blocks,'open','channels','norm_rank')

In [None]:
close_channels_rank_res=collection_extract_values(decisions_df,blocks,'close','channels','norm_rank')

**Closeness: DF update extraction**

In [282]:
open_closeness_rank_res=collection_extract_values(decisions_df,blocks,'open','closeness_approx_rank','norm_rank')

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 133.6988923549652



In [None]:
close_closeness_rank_res=collection_extract_values(decisions_df,blocks,'close','closeness_approx_rank','norm_rank')

**Closeness_post: DF update extraction**

In [361]:
open_closeness_rankpost_res=collection_extract_values(decisions_df,blocks,'open','closeness_approx_rank_post','norm_rank',True)

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 350.180921792984



In [362]:
close_closeness_rankpost_res=collection_extract_values(decisions_df,blocks,'close','closeness_approx_rank_post','norm_rank',True)

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 141.4614691734314



**Betweenness_post: DF update extraction**

In [363]:
open_betweeness_rankpost_res=collection_extract_values(decisions_df,blocks,'open','betweeness_curr_aprox','norm_rank',True)

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 367.6419939994812



In [None]:
close_betweeness_rankpost_res=collection_extract_values(decisions_df,blocks,'close','betweeness_curr_aprox','norm_rank',True)

**Betweenness Unweighted: DF update extraction before and after decision**

In [None]:
open_betweenessuw_rank_res=collection_extract_values(decisions_df,blocks,'open','current_betweeness_unweighted','norm_rank',False)

In [378]:
close_betweenessuw_rank_res=collection_extract_values(decisions_df,blocks,'close','current_betweeness_unweighted','norm_rank',False)

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 189.29941248893738



In [379]:
open_betweenessuw_rankpos_res=collection_extract_values(decisions_df,blocks,'open','current_betweeness_unweighted','norm_rank',True)

30537


HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))

Compute in seconds: 384.76038432121277



In [380]:
close_betweenessuw_rankpos_res=collection_extract_values(decisions_df,blocks,'close','current_betweeness_unweighted','norm_rank',True)

11022


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))

Compute in seconds: 151.33098697662354



## Add normalized rankings to DataFrame

In [393]:
def add_to_dataframe(s3,bucket,df,decision_blocks,decision_type,measurement,score_type,actual_bool=False):
    """Copy the per-block values stored in S3 into two new columns of ``df``.

    Parameters
    ----------
    s3 : boto3 S3 resource used for listing and downloading.
    bucket : str, bucket name.
    df : DataFrame modified in place; needs '<decision_type>_block',
        'node0_id' and 'node1_id' columns.
    decision_blocks : sequence of blocks; the first element is skipped.
    decision_type, measurement, score_type : used to build both the S3 key
        and the column names
        '<measurement>_n0_<decision_type>_<score_type>[post]' (and '_n1_').
    actual_bool : adds the 'post' suffix to the key folder and column names.

    Returns
    -------
    int
        Number of (node0, node1) value pairs processed. The number of blocks
        without a stored file is printed, not returned.

    Notes
    -----
    Reads the notebook global ``extraction_id`` to build the S3 keys.
    """
    # Define tag
    extra_tag='post' if actual_bool else ''

    # Names and key prefix that do not change inside the loop
    suffix=decision_type+'_'+score_type+extra_tag
    col_n0=measurement+'_n0_'+suffix
    col_n1=measurement+'_n1_'+suffix
    block_col=decision_type+'_block'
    key_prefix='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/'+suffix+'/'

    # Create new empty columns for measurement for node0 and node1
    df[col_n0]=np.nan
    df[col_n1]=np.nan

    updated_rows=0   # value pairs written
    missing=0        # blocks without a stored file

    bucket_check=s3.Bucket(bucket)

    with tqdm(total=max(len(decision_blocks)-1,0)) as pbar:
        for block in decision_blocks[1:]:

            # Key of the values extracted for this block
            values_key=key_prefix+str(block)+'.pkl'

            # Check if file exists: list by prefix and require an exact key match
            objs=list(bucket_check.objects.filter(Prefix=values_key))
            if len(objs)>0 and objs[0].key==values_key:

                values_load=s3.Object(bucket_name=bucket, key=values_key).get()
                # values[0] / values[1] hold (node, value) tuples for node0 / node1
                values=pickle.loads(values_load['Body'].read())

                # Rows of this block; the assignments below never touch this column,
                # so the mask stays valid for all pairs
                block_mask=df[block_col]==block

                # Update values in DataFrame
                for (node0,value0),(node1,value1) in zip(values[0],values[1]):
                    df.loc[block_mask & (df['node0_id']==node0),col_n0]=value0
                    df.loc[block_mask & (df['node1_id']==node1),col_n1]=value1
                    updated_rows+=1
            else:
                missing+=1

            pbar.update(1)

    print('Missing items:{}'.format(missing))
    return updated_rows



In [236]:
# Test add_to_dataframe on a copy so decisions_df itself is not modified
test_df=decisions_df.copy()
# add_to_dataframe takes the bucket name as its second positional argument
test_rows=add_to_dataframe(s3,bucket,test_df,open_blocks,'open','age','norm_rank')

HBox(children=(FloatProgress(value=0.0, max=1993.0), HTML(value='')))




In [243]:
test_df[test_df['open_block'].isin(open_blocks)].head()

Unnamed: 0.1,Unnamed: 0,short_channel_id,open_block,open_transaction,address,close_block,close_transaction,node0,node1,satoshis,...,close_type,close_htlc_count,close_balance_a,close_balance_b,dec_id,node0_id,node1_id,node_pair,age_n0_open_norm_rank,age_n1_open_norm_rank
69871,1661,540687x2074x0,540687,e339d1d2ce9aadb2e8afbe58d65dbd0151a69336f8ae74...,bc1qku4zsccymmvuvjtmgxpvtc0sn8pv8hwfzlej0fejf9...,592327.0,c8615eb9e4f8f9087f7f7ebb755f0f7e85fef65a26fda3...,024c8c764d8e3657d3fd3bafca31d6733f2bdf6b10e9c4...,039f01ad62e5208940faff11d0bbc997582eafad7642aa...,30000,...,force,0.0,7920.0,7920.0,1661,7569,4787,36232803,0.104373,0.040438
69872,1660,540685x614x1,540685,8645bec46da9a3a53d204a4a8c0cded916c643147f6685...,bc1qn056cvaxuf2zg3qrmglk7ju72aql63zkrzj48vhweq...,,,03295d2e292565743a40bd44da227a820f8730877bc3df...,036265cf7c7356b06b9d64a09dad1c7f7519971be47510...,323078,...,,,,,1660,1182,3755,4438410,0.334112,0.063158
69873,1659,540679x1544x0,540679,f63a898977af9cf30358a6af9d563a669edf0b33d3e82f...,bc1qtrklnue4cvurj8rm4kwum2hkjwmvejgmgxpr2zallg...,592593.0,0173c1b539f5978a760965573e094652afdd46936328b7...,02529db69fd2ebd3126fb66fafa234fc3544477a23d509...,039f01ad62e5208940faff11d0bbc997582eafad7642aa...,2993787,...,mutual,0.0,190517.0,2802647.0,1659,6568,4787,31441016,0.271993,0.040087
69874,1658,540672x846x1,540672,0fc988844c1c790a93062b5bbcb068d9fd6cba9e96c0b1...,bc1quwdh45r2509suags3he9fnazrh0jg9h47v74tcwd8v...,,,021f05bd7bec2dacaafa9eea30e3c2dd64a1eee699c3aa...,03984d92cdc95dfc197c190f8d99af095089ce70e00757...,458300,...,,,,,1658,5813,7566,43981158,0.000141,0.437437
69875,36488,540649x1336x0,540649,db82e7a38005c6d56f6be8f7aef65b38948b657ad82e15...,bc1qcxups0x2ccme9jvy60lwkj88ae595rplc6z74zgp8c...,596917.0,44932f2eedfbb5766e1326cd655f628fd58e99c584dabd...,0258f3c375d6e2805db7471993d35395f075b52d287255...,032271efcb35188ef00e3f28469a2bb18b50a5f2f325bd...,43152,...,unused,0.0,41784.0,0.0,36488,492,1410,693720,0.100876,0.714217


In [None]:
full_df=decisions_df.copy() 

In [394]:
# Add the open and close columns of every listed measurement to full_df
#measurements=['age','betweeness_curr_aprox','capacity','capacity_growth','channels']
measurements=['current_betweeness_unweighted']
for m in measurements:
    # Opens first, then closes
    for decision_blocks_m,decision_type_m in ((open_blocks,'open'),(close_blocks,'close')):
        add_to_dataframe(s3,bucket,full_df,decision_blocks_m,decision_type_m,m,'norm_rank',actual_bool=False)
    print('{} Succesfully added'.format(m))

HBox(children=(FloatProgress(value=0.0, max=30536.0), HTML(value='')))


Missing items:1


HBox(children=(FloatProgress(value=0.0, max=11021.0), HTML(value='')))


Missing items:0
current_betweeness_unweighted Succesfully added


In [396]:
full_df.sort_values(by=['open_block'],ascending=True).head(200)

Unnamed: 0.1,Unnamed: 0,short_channel_id,open_block,open_transaction,address,close_block,close_transaction,node0,node1,satoshis,...,closeness_approx_rank_post_n0_close_norm_rankpost,closeness_approx_rank_post_n1_close_norm_rankpost,current_betweeness_unweighted_n0_open_norm_rank,current_betweeness_unweighted_n1_open_norm_rank,current_betweeness_unweighted_n0_close_norm_rank,current_betweeness_unweighted_n1_close_norm_rank,current_betweeness_unweighted_n0_open_norm_rankpost,current_betweeness_unweighted_n1_open_norm_rankpost,current_betweeness_unweighted_n0_close_norm_rankpost,current_betweeness_unweighted_n1_close_norm_rankpost
72475,0,505149x622x0,505149,f6bc767df9148ebf76d2b9baf4eb46e3230712c2bf5a51...,bc1qjmg6ev344fenh3zhg0yjl6hyvxpxluw6x9nn2a5lv4...,592638.0,82cb2ea2a06c8c453d8b9ca08e17bbefe87225aa380b2d...,0250373555232cec757ea141273e75381c84cc3ab22f1e...,02ef61a252f9504a42fc264a28476f44cea0711a44b2da...,300000,...,0.792552,0.665954,,,0.003693,0.001780,,,0.061808,0.001224
72474,38787,506402x1391x1,506402,2cdfc4fec2049d66a04fa5bdf468efb19c0354c60b8cf2...,bc1qvjx5t8y7j83udzuj38ukmqecv5d9jn762mchxkgvaf...,,,0313f9449cdb528dc9707c02da507cc9306eedc415091c...,035f1498c929d4cefba4701ae36a554691f526ff60b176...,1111934,...,,,,,,,,,,
72473,38788,506847x1633x0,506847,19ee11ce977facd380b92126834a3aca318f3cb905d99b...,bc1q29g43xrz9gujgt60gykzq3vh0ewfav7vmfqcnmf50u...,,,023d280ae29f84dcfd289eb66b57227fea3a7bde97ec28...,0273081ce642554d5a68a5236564fe88a3783457dc09e5...,40000,...,,,,,,,,,,
72472,38789,508075x1694x1,508075,e267e54872053a7618567f31a9d27e38cdbff0e4176144...,bc1qpxzqp2xyy0gzn8xwu6lqg3a66tuhsg5w849t0j5rdr...,,,03557fd11b58cb93d2ad4fab4dd4cff7462a97e21e8f6b...,03cbf298b068300be33f06c947b9d3f00a0f0e8089da32...,100000,...,,,,,,,,,,
72471,58766,508090x1515x1,508090,33d645657de8a587137b8039e52452557d4279a3f47366...,bc1qneudwey0dpgy9nj2g8ech0lqqrhz52agcj984rs6zh...,616838.0,123777e4dfadc7c008a54c2d55b670067a58cdcbc8b2ec...,028314f021602092779aedd4ef39f3b5809f9b6046f8bc...,02d4531a2f2e6e5a9033d37d548cff4834a3898e74c3ab...,400000,...,0.000000,0.859265,,,0.000000,0.011709,,,0.000000,0.003707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72280,84,516116x816x0,516116,70b55eae6f818eff5cb26cb245810c2fb4031b5043df88...,bc1q4gw9e8j53kx2ha32knvup35xtzld32zgzsa5c3kvfk...,,,0331a2398d2787f39af6d958994dd4edc3e809f567b717...,03a8334aba5660e241468e2f0deb2526bfd50d0e3fe808...,2222,...,,,0.000000,0.231237,,,0.174754,0.216062,,
72279,38871,516116x1085x0,516116,7fb66e26a7ccd0c6eaf1b9e4a997e5ea1b5c14df16e84f...,bc1q3r8npjt4z4y0u8kx7aug4jyjt38zr3d3fca7uw0fyj...,,,02d0f53a403a0b16efdec1983a082ab7645e35858f6a85...,0331a2398d2787f39af6d958994dd4edc3e809f567b717...,2222,...,,,0.475259,0.000000,,,0.501804,0.174754,,
72278,85,516121x1572x0,516121,e9d372ef0b6509119f29836f6494ee4b03d8180afe2b30...,bc1qkc96d7fadsrl302hkgrwhjg09jrtpjl8zy24nw2zr9...,616967.0,2a6d49377016c46cb30870746a7081d9093b878a4580ff...,0327049d8d63f0c40193cdf3afc61817c8647808a4e482...,03341b688f09301a8c0f7528b587bd59db9f8e3f9aec46...,10000,...,0.670436,0.042963,0.241681,0.078710,0.028468,0.016082,0.308156,0.065629,0.074452,0.021378
72277,38873,516126x918x0,516126,3237782d2ac33d01e299008485f8f7bad76896a6087297...,bc1qjdcdrls9glk3x65pk2fxy482g5neee22vv38ere07s...,,,032b4be34fa4944c3a3ae55c3bdeed2b2d902bd02aa583...,038d108e6cd9b8e84fa0b89d018c2e254324d3674b2bb3...,50000,...,,,0.000000,0.153419,,,0.000000,0.144921,,


In [397]:
# Save DataFrame to S3 as CSV (serialized into an in-memory text buffer; nothing is written to local disk)
csv_buffer = io.StringIO()
# NOTE(review): to_csv also writes the index as an unnamed first column, which pd.read_csv
# turns into an extra 'Unnamed: 0' column on reload — confirm this is intended
full_df.to_csv(csv_buffer)
# Same key that is loaded into full_decisions_df at the top of the notebook, so this overwrites that object
key='full_decisions_df.csv'
s3.Object(bucket, key).put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'BCF61FE4A09473D3',
  'HostId': 'g29DVIZge5ccBSzzUFsqToPWAGwV9OfNMmu0dEtLMKJC/cB6Hu+E3MIsOeOjyCWKsLwJouZqPwc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'g29DVIZge5ccBSzzUFsqToPWAGwV9OfNMmu0dEtLMKJC/cB6Hu+E3MIsOeOjyCWKsLwJouZqPwc=',
   'x-amz-request-id': 'BCF61FE4A09473D3',
   'date': 'Sun, 10 May 2020 06:40:27 GMT',
   'etag': '"e91c42c938d71299672c70b11ca9cf36"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"e91c42c938d71299672c70b11ca9cf36"'}

In [391]:
full_df.columns

Index(['Unnamed: 0', 'short_channel_id', 'open_block', 'open_transaction',
       'address', 'close_block', 'close_transaction', 'node0', 'node1',
       'satoshis', 'last_seen', 'open_time', 'open_fee', 'close_time',
       'close_fee', 'last_update', 'close_type', 'close_htlc_count',
       'close_balance_a', 'close_balance_b', 'dec_id', 'node0_id', 'node1_id',
       'node_pair', 'age_n0_open_norm_rank', 'age_n1_open_norm_rank',
       'age_n0_close_norm_rank', 'age_n1_close_norm_rank',
       'betweeness_curr_aprox_n0_open_norm_rank',
       'betweeness_curr_aprox_n1_open_norm_rank',
       'betweeness_curr_aprox_n0_close_norm_rank',
       'betweeness_curr_aprox_n1_close_norm_rank',
       'capacity_n0_open_norm_rank', 'capacity_n1_open_norm_rank',
       'capacity_n0_close_norm_rank', 'capacity_n1_close_norm_rank',
       'capacity_growth_n0_open_norm_rank',
       'capacity_growth_n1_open_norm_rank',
       'capacity_growth_n0_close_norm_rank',
       'capacity_growth_n1_close_n

In [288]:
# Drop old columns: the n0/n1 norm_rank columns of every listed measurement and decision type
measurements=['closeness_approx_rank','channels']
decision_types=['open','close']

for m in measurements:
    for decision_type in decision_types:
        # Column names must be built from the loop variable m, not the stale global 'measurement'
        old_columns=[m+'_n0_'+decision_type+'_norm_rank', m+'_n1_'+decision_type+'_norm_rank']
        # DataFrame.drop returns a new frame unless inplace=True, so the result is assigned back.
        # A missing column raises KeyError, which surfaces typos in the lists above.
        full_df=full_df.drop(old_columns, axis=1)