# LN - Data Pre-Processing - Normalized rankings

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import itertools
#import matplotlib.pyplot as plt
import time
import pickle

import os
import re
import sys
import io
import random
from itertools import islice
import math

from tqdm.notebook import trange, tqdm
#from tqdm.notebook import trange
#from tqdm import tqdm_notebook as tqdm
from time import sleep

from dask_cloudprovider import FargateCluster
from dask.distributed import Client
import dask.array as da
import dask
dask.config.set({'distributed.scheduler.allowed-failures': 50}) 


import boto3

In [4]:
# Run parameters: the S3 bucket to work in and the extraction id used to build S3 keys
bucket = 'ln-strategy-data'
extraction_id = 1587447789


In [2]:
# Initiate s3 resource
# A boto3 session is opened first and the S3 resource handle is created from it;
# both names stay at notebook (module) level for the cells that follow.

session = boto3.session.Session()
s3 = session.resource('s3')

## Load Data

In [None]:
# Collect the key of every object in the bucket that looks like a graph snapshot
# ("...<extraction_id>_connected/....gpickle").
# The pattern is written with raw strings so that "\." reaches the regex engine as an
# escaped dot instead of relying on Python's deprecated handling of unknown string escapes.
graph_key_pattern = re.compile(r".*" + str(extraction_id) + r"_connected/.*\.gpickle")

graph_keys = [obj.key
        for obj in s3.Bucket(name=bucket).objects.all()
        if graph_key_pattern.match(obj.key)]

In [55]:
# Index of the first snapshot key to keep
base_ix = 6

# Keys before base_ix are dropped (original note: those blocks are <3 and affect some graph metrics)
extract_keys = graph_keys[base_ix:]

# The block number is the file name without its extension: take everything before the
# first "." in the key, then the last "/"-separated component of that.
blocks = [int(snapshot_key.split(".")[0].split("/")[-1]) for snapshot_key in extract_keys]

In [40]:
# Launch the Dask cluster on Fargate: 10 workers running the custom image below
cluster = FargateCluster(
    n_workers=10,
    scheduler_timeout='20 minutes',
    image='dsrincon/dask-graph:nx-scipy-v1',
    scheduler_cpu=4096,
    scheduler_mem=16384,
)

In [41]:
# Bare expression: display the cluster object so the notebook renders its widget
cluster

VBox(children=(HTML(value='<h2>FargateCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n  …

In [42]:
# Connect a Dask distributed client to the Fargate cluster.
# NOTE(review): the recorded output of this cell reports version mismatches between the
# client and the scheduler/workers (python 3.7.3 vs 3.7.4, tornado 6.0.3 vs 6.0.4).
client=Client(cluster)


python
+---------------------------+---------------+
|                           | version       |
+---------------------------+---------------+
| client                    | 3.7.3.final.0 |
| scheduler                 | 3.7.4.final.0 |
| tcp://172.31.13.174:35387 | 3.7.4.final.0 |
| tcp://172.31.39.80:39373  | 3.7.4.final.0 |
| tcp://172.31.44.128:44783 | 3.7.4.final.0 |
| tcp://172.31.47.113:41259 | 3.7.4.final.0 |
| tcp://172.31.56.45:45011  | 3.7.4.final.0 |
| tcp://172.31.57.159:36223 | 3.7.4.final.0 |
| tcp://172.31.60.5:38091   | 3.7.4.final.0 |
| tcp://172.31.66.63:39963  | 3.7.4.final.0 |
| tcp://172.31.7.172:34479  | 3.7.4.final.0 |
| tcp://172.31.87.75:39141  | 3.7.4.final.0 |
+---------------------------+---------------+

tornado
+---------------------------+---------+
|                           | version |
+---------------------------+---------+
| client                    | 6.0.3   |
| scheduler                 | 6.0.4   |
| tcp://172.31.13.174:35387 | 6.0.4   |
| tcp:/

In [24]:


def graph_ranking(input_tuple):
    """Normalize one snapshot's raw scores by their maximum and store the result in S3.

    The raw scores are read from a pickled mapping in S3, every value is divided by the
    largest value in the mapping, and the resulting mapping is pickled back to S3 under
    ``graph_snapshots/<extraction_id>_connected/.data_transformations/<measurement>/norm_rank/<block_num>.pkl``.

    Parameters
    ----------
    input_tuple : tuple
        ``(block_num, measurement, extraction_id, key_rawscore, bucket)`` where
        ``key_rawscore`` is the S3 key of the pickled raw-score mapping and ``bucket``
        is the bucket that is both read from and written to. The other three items are
        only used to build the output key.

    Returns
    -------
    int
        The HTTP status code reported by the S3 ``put`` call.

    Notes
    -----
    * An empty snapshot produces an empty normalized mapping instead of failing in ``max``.
    * When the maximum score is 0 there is no valid scale factor, so the scores are
      divided by 1 (an all-zero snapshot therefore yields 0.0 for every key) instead of
      raising ``ZeroDivisionError``.
    """

    # Unpacking input
    block_num = input_tuple[0]
    measurement = input_tuple[1]
    extraction_id = input_tuple[2]
    key_rawscore = input_tuple[3]
    bucket = input_tuple[4]

    # Retrieve snapshot from S3. The resource is created inside the function so the
    # function is self-contained when it is wrapped with dask.delayed.
    s3 = boto3.session.Session().resource('s3')
    raw_object = s3.Object(bucket_name=bucket, key=key_rawscore).get()
    # NOTE: pickle.loads can execute arbitrary code; only point this at trusted objects.
    snapshot = pickle.loads(raw_object['Body'].read())

    # Calculate ranking for snapshot
    if snapshot:
        max_value = max(snapshot.values())
        # Guard against a zero maximum: fall back to a neutral scale factor
        scale = max_value if max_value != 0 else 1
        norm_rank = {k: v / scale for k, v in snapshot.items()}
    else:
        norm_rank = {}

    # Write output into S3
    key_out = 'graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/norm_rank/'+str(block_num)+'.pkl'
    pickle_byte_obj = pickle.dumps(norm_rank)
    response = s3.Object(bucket, key_out).put(Body=pickle_byte_obj)['ResponseMetadata']['HTTPStatusCode']

    return response

In [None]:
# TEST graph_ranking on a single block

test_block=516790
measurement='channels'
# graph_ranking reads five items from its input tuple, including the S3 key of the raw
# scores and the bucket; the key follows the same layout collection_ranking builds.
key_rawscore_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'+str(test_block)+'.pkl'
input_tuple=(test_block,measurement,extraction_id,key_rawscore_test,bucket)
response_test=graph_ranking(input_tuple)

# Test if function saved result correctly and download result
if response_test==200:
    key_test='graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/norm_rank/'+str(test_block)+'.pkl'
    g_rank_test_load = s3.Object(bucket_name=bucket, key=key_test).get()
    g_rank_test = pickle.loads(g_rank_test_load['Body'].read())
    # Sorted normalized values, kept for manual inspection
    g_rank_values=sorted([v for k,v in g_rank_test.items()])
    #print(g_rank_values)
    #print('The dic saved has these first items: {}'.format(list(g_rank_test.items())))



In [50]:
def collection_ranking(extraction_id,blocks,measurement,bucket):
    """Run graph_ranking for every block of one measurement through dask.

    For each block number a raw-score key of the form
    ``graph_snapshots/<extraction_id>_connected/.data_transformations/<measurement>/raw_score/<block>.pkl``
    is built and a delayed ``graph_ranking`` call is created for it. All delayed calls
    are then persisted and computed together.

    Parameters
    ----------
    extraction_id : value converted with ``str`` when building keys
    blocks : sized iterable of block numbers (``len(blocks)`` is used for the progress bar)
    measurement : str
        Name of the measurement folder, used verbatim in the key.
    bucket : str
        Bucket name forwarded to ``graph_ranking``.

    Returns
    -------
    tuple
        One ``graph_ranking`` return value per block, in the order of ``blocks``.

    Side effects
    ------------
    Prints the elapsed wall-clock time of the persist + compute step.
    """
    key_prefix = 'graph_snapshots/'+str(extraction_id)+'_connected/.data_transformations/'+measurement+'/raw_score/'

    # Build the task list; this only creates delayed objects, nothing runs yet,
    # so the progress bar tracks task creation rather than task completion.
    delayed_responses = []
    for block_num in tqdm(blocks, total=len(blocks)):
        key = key_prefix + str(block_num) + '.pkl'
        input_tuple = (block_num, measurement, extraction_id, key, bucket)
        delayed_responses.append(dask.delayed(graph_ranking)(input_tuple))

    # Start timing before persist: persist is the call that launches the work,
    # compute then gathers the results.
    start = time.time()
    futures = dask.persist(*delayed_responses)
    final_responses = dask.compute(*futures)
    end = time.time()
    print('Compute in seconds: {}'.format(end-start))

    return final_responses






In [56]:
# Try collection_ranking end to end on the 'channels' measurement (runs over every block in `blocks`)
test_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='channels',
    bucket=bucket,
)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 591.6712484359741


In [54]:
# Show the first 20 values returned by the test run
print(test_responses[:20])

(200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200)


## Calculate normalized rankings for measures

**Age ranking**

In [57]:
# Normalized ranking for the 'age' measurement across all blocks
age_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='age',
    bucket=bucket,
)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 384.95358204841614


**Capacity ranking**

In [58]:
# Normalized ranking for the 'capacity' measurement across all blocks
capacity_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='capacity',
    bucket=bucket,
)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 360.1695795059204


**Betweenness ranking**

In [59]:
# Normalized ranking for the 'betweeness_curr_aprox' measurement across all blocks
betweeness_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='betweeness_curr_aprox',
    bucket=bucket,
)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 363.8422577381134


**Growth ranking**

In [60]:
# Normalized ranking for the 'capacity_growth' measurement across all blocks
growth_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='capacity_growth',
    bucket=bucket,
)

HBox(children=(FloatProgress(value=0.0, max=36536.0), HTML(value='')))


Compute in seconds: 408.5428283214569


distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError


**Channels ranking**

In [None]:
# Normalized ranking for the 'channels' measurement across all blocks
channels_responses = collection_ranking(
    extraction_id=extraction_id,
    blocks=blocks,
    measurement='channels',
    bucket=bucket,
)