In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import os
from tqdm import tqdm
import json

import collections
from random import choice
import copy

from graph_tool.all import *

threshold: 1, 2, 4, 8

#### Locate folder

In [2]:
# Folder holding the daily edge-list files. The trailing slash matters: paths are
# built further down by plain string concatenation (source+<file name>).
#source = '/mnt/lynxkite/data/kite_data/upload/'
source = '/mnt/processed_data/day_graphs_cleaned/'

#### Read all filenames and sort

In [3]:
# Keep only the gzipped daily outputs (names containing both '.csv.gz' and
# 'output_'), sorted by name.
files = np.array( sorted(
    name for name in os.listdir(source)
    if '.csv.gz' in name and 'output_' in name
) )
files.shape, files[:3], files[-4:]

((365,),
 array(['output_20181201.csv.gz', 'output_20181202.csv.gz',
        'output_20181203.csv.gz'], dtype='<U22'),
 array(['output_20191127.csv.gz', 'output_20191128.csv.gz',
        'output_20191129.csv.gz', 'output_20191130.csv.gz'], dtype='<U22'))

#### Select the one-year period starting in December 2018 (2018-12-01 to 2019-11-30)

In [4]:
# 2018.12.01-2019.11.30

In [5]:
# Sanity check: first and last file names plus the number of daily files.
files[:3], files[-3:], files.shape

(array(['output_20181201.csv.gz', 'output_20181202.csv.gz',
        'output_20181203.csv.gz'], dtype='<U22'),
 array(['output_20191128.csv.gz', 'output_20191129.csv.gz',
        'output_20191130.csv.gz'], dtype='<U22'),
 (365,))

In [6]:
# Load the first day's edge list to inspect its layout.
df = pd.read_csv( source+files[0], delimiter=',' )
df.head()

Unnamed: 0,src,dst,traffic
0,290500,282186,1.052632
1,539818,541729,1.291901
2,171554,175387,2.653278
3,528854,531414,1.1349
4,303912,289218,1.017857


In [7]:
# (rows, columns) of the first day's edge list
df.shape

(735332, 3)

#### Preprocess data to be able to parse it

In [8]:
# Build the three textual pieces of a networkx edge-list line: source id,
# destination id and a "{'weight': w}" attribute dictionary.
src_part = df.src.values
dst_part = df.dst.values
# Round to 4 decimals and use the complete string form of every value. Cutting
# the string to a fixed width would silently drop digits (or the exponent) of
# large weights, so no slicing is applied. Iterating over .values also avoids
# one label-based Series lookup per row.
weight_part = np.array( [ '{\'weight\': '+str( w )+'}'
                          for w in np.round( df.traffic.values, 4 ) ] )
src_part[:3], dst_part[:3], weight_part[:3]

(array([290500, 539818, 171554]),
 array([282186, 541729, 175387]),
 array(["{'weight': 1.0526}", "{'weight': 1.2919}", "{'weight': 2.6533}"],
       dtype='<U19'))

In [9]:
# One edge-list line per row: "<src> <dst> {'weight': w}".
grap_to_parse = [ ' '.join( ( str(s), str(d), w ) )
                  for s, d, w in zip( src_part, dst_part, weight_part ) ]

In [10]:
# Preview the first generated edge-list lines.
grap_to_parse[:5]

["290500 282186 {'weight': 1.0526}",
 "539818 541729 {'weight': 1.2919}",
 "171554 175387 {'weight': 2.6533}",
 "528854 531414 {'weight': 1.1349}",
 "303912 289218 {'weight': 1.0179}"]

#### Creation of graph

In [11]:
# Parse the textual edge list into an undirected graph; node labels are converted
# with int (nodetype=int).
G = nx.parse_edgelist( grap_to_parse , delimiter=' ', nodetype=int, create_using=nx.Graph())

#### Check if graph is correct

In [12]:
# Spot check: 282186 has to be among the neighbours of 290500 (the first CSV row);
# .index raises ValueError if it is not.
list( nx.all_neighbors( G, 290500 ) ).index(282186)

0

#### Automation of this process

In [13]:
def create_day_graph_from_csv( path_to_file, boxids_path=None ):
    """
    Build the undirected, weighted graph of one day from its edge-list CSV.

    Parameters
    ----------
    path_to_file : str
        CSV file with the columns `src`, `dst` and `traffic`.
    boxids_path : str, optional
        CSV file describing the cells. When omitted, `source+'../boxids_500.csv'`
        is used, where `source` is the notebook-level data folder.

    Returns
    -------
    G : networkx.Graph
        Graph with integer node ids and the traffic, rounded to 4 decimals, as
        the edge attribute 'weight'. When the same node pair occurs more than
        once, the last occurrence in the file determines the weight.
    G_nodes_id : numpy.ndarray
        Rows of the cell table selected for the nodes of `G`, in node order.
    G_nodes_df_header : list of str
        Column names belonging to `G_nodes_id`.

    Raises
    ------
    ValueError
        If `src`, `dst` or `traffic` contains missing values.
    """
    df = pd.read_csv( path_to_file, delimiter=',' )

    # Missing values cannot be turned into valid edges; fail loudly instead of
    # letting NaN ids/weights slip into the graph.
    if df[['src', 'dst', 'traffic']].isna().any().any():
        raise ValueError( 'missing src/dst/traffic values in ' + str(path_to_file) )

    # .tolist() yields native Python int/float objects for node ids and weights.
    src_part = df.src.values.astype(int).tolist()
    dst_part = df.dst.values.astype(int).tolist()
    # Keep the 4-decimal rounding, but pass the numbers directly: formatting them
    # as text and cutting the text to a fixed width silently truncated weights
    # whose string form is longer than the cut (and would corrupt exponents).
    weight_part = np.round( df.traffic.values.astype(float), 4 ).tolist()

    G = nx.Graph()
    G.add_weighted_edges_from( zip( src_part, dst_part, weight_part ) )

    # graph node id based on cell id
    G_nodes_id = np.array( list( G.nodes ) ).astype(int)
    if boxids_path is None:
        boxids_path = source+'../boxids_500.csv'
    # NOTE(review): node ids are used as *positional* indices (iloc), i.e. this
    # assumes row k of the cell table describes cell id k - confirm.
    G_nodes_df = pd.read_csv( boxids_path ).iloc[ G_nodes_id ]
    G_nodes_df_header = list( G_nodes_df )
    G_nodes_id = G_nodes_df.values

    return G, G_nodes_id, G_nodes_df_header

In [14]:
# Build the graph of the first day with the helper defined above.
G, G_nodes_id, G_nodes_df_header = create_day_graph_from_csv( source+files[0] )

In [15]:
# (number of nodes, number of edges) of the first day's graph
len(G.nodes), len(G.edges)

(98214, 460521)

In [16]:
# First two rows of the per-node cell table
G_nodes_id[:2]

array([[2.90500000e+05, 2.41194000e+05, 6.53591000e+05, 4.75146629e+01,
        1.90951229e+01],
       [2.82186000e+05, 2.37694000e+05, 6.47091000e+05, 4.74831838e+01,
        1.90088470e+01]])

In [17]:
# Column names belonging to the rows of G_nodes_id
G_nodes_df_header

['boxid', 'eovx', 'eovy', 'lat', 'lon']

#### Convert nx graph to Graph-tool graph to get some processing speed increase

https://bbengfort.github.io/snippets/2016/06/23/graph-tool-from-networkx.html (outdated)

https://gist.github.com/tomshaffner/7a2df7f9ec6b1be33dd0413897125683 (updated, works with nx 2.5)

In [18]:
def get_prop_type(value, key=None):
    """
    Translate a Python value (and, optionally, a property name) into what the
    graph_tool PropertyMap class is fed with in this notebook.

    Returns the triple (type name, converted value, key):

      * bool          -> 'bool',   value unchanged
      * int           -> 'float',  value cast to float
      * float         -> 'float',  value unchanged
      * str           -> 'string', value encoded to UTF-8 bytes
      * dict          -> 'object', value unchanged
      * anything else -> 'string', value replaced by str(value)

    A str key is round-tripped through UTF-8 (unencodable characters are
    replaced), a bytes key is decoded as UTF-8, any other key is returned as is.
    """
    # bool has to be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        type_name = 'bool'
    elif isinstance(value, (int, float)):
        type_name = 'float'
        if not isinstance(value, float):
            value = float(value)
    elif isinstance(value, str):
        type_name, value = 'string', value.encode('utf-8', errors='replace')
    elif isinstance(value, dict):
        type_name = 'object'
    else:
        type_name, value = 'string', str(value)

    # Normalise the key to str whenever it is text-like.
    raw_key = key.encode('utf-8', errors='replace') if isinstance(key, str) else key
    try:
        key = raw_key.decode('utf-8')
    except AttributeError:
        # No decode(): not a bytes-like key, hand it back untouched.
        key = raw_key

    return type_name, value, key


def nx2gt(nxG):
    """
    Converts a networkx graph to a graph-tool graph.

    Graph, node and edge attributes become internal graph-tool properties; the
    property type of every key is decided by `get_prop_type` from the first
    value seen for that key. The networkx node label is stored, as a string,
    in the vertex property 'id'.

    The input graph is only read: the node label is written straight into the
    'id' vertex property instead of being injected into the networkx node
    attribute dictionaries.

    Parameters
    ----------
    nxG : networkx graph
        Graph to convert.

    Returns
    -------
    graph_tool.Graph
        Graph with one vertex per node and one edge per edge of `nxG`.
    """
    # Phase 0: Create a directed or undirected graph-tool Graph
    gtG = Graph(directed=nxG.is_directed())

    # Add the Graph properties as "internal properties"
    for key, value in nxG.graph.items():
        # Convert the value and key into a type for graph-tool
        tname, value, key = get_prop_type(value, key)

        prop = gtG.new_graph_property(tname) # Create the PropertyMap

        gtG.graph_properties[key] = prop     # Set the PropertyMap
        gtG.graph_properties[key] = value    # Set the actual value

    # Phase 1: Add the vertex and edge property maps
    # Go through all nodes and edges and add seen properties
    # Add the node properties first
    nprops = set() # cache keys to only add properties once
    for node, data in nxG.nodes(data=True):

        # Go through all the properties if not seen and add them.
        for key, val in data.items():
            if key in nprops: continue # Skip properties already added

            # Convert the value and key into a type for graph-tool
            tname, _, key  = get_prop_type(val, key)

            prop = gtG.new_vertex_property(tname) # Create the PropertyMap
            gtG.vertex_properties[key] = prop     # Set the PropertyMap

            # Add the key to the already seen properties
            nprops.add(key)

    # Also add the node id: in NetworkX a node can be any hashable type, but
    # in graph-tool node are defined as indices. So we capture any strings
    # in a special PropertyMap called 'id' -- modify as needed!
    # (Created after the loop above, so it replaces any 'id' node attribute map.)
    gtG.vertex_properties['id'] = gtG.new_vertex_property('string')

    # Add the edge properties second
    eprops = set() # cache keys to only add properties once
    for src, dst, data in nxG.edges(data=True):

        # Go through all the edge properties if not seen and add them.
        for key, val in data.items():
            if key in eprops: continue # Skip properties already added

            # Convert the value and key into a type for graph-tool
            tname, _, key = get_prop_type(val, key)

            prop = gtG.new_edge_property(tname) # Create the PropertyMap
            gtG.edge_properties[key] = prop     # Set the PropertyMap

            # Add the key to the already seen properties
            eprops.add(key)

    # Phase 2: Actually add all the nodes and vertices with their properties
    # Add the nodes
    vertices = {} # vertex mapping for tracking edges later
    for node, data in nxG.nodes(data=True):

        # Create the vertex and annotate for our edges later
        v = gtG.add_vertex()
        vertices[node] = v

        # Set the vertex properties; a user attribute named 'id' is skipped
        # because the 'id' property always holds the node label.
        for key, value in data.items():
            if key == 'id': continue
            gtG.vp[key][v] = value # vp is short for vertex_properties

        # Store the label without touching the caller's attribute dict.
        gtG.vp['id'][v] = str(node)

    # Add the edges
    for src, dst, data in nxG.edges(data=True):

        # Look up the vertex structs from our vertices mapping and add edge.
        e = gtG.add_edge(vertices[src], vertices[dst])

        # Add the edge properties
        for key, value in data.items():
            gtG.ep[key][e] = value # ep is short for edge_properties

    # Done, finally!
    return gtG

#### Test functions

##### Load graph from csv

In [19]:
#read graph from csv file
# NOTE(review): day_csv is not referenced again in the visible part of the notebook;
# create_day_graph_from_csv reads the same file on its own.
day_csv = pd.read_csv( source+files[0] )
G, G_nodes_id, G_nodes_df_header = create_day_graph_from_csv( path_to_file=source+files[0] )
# convert to Graph-tool graph object
gtG = nx2gt(G)

In [20]:
# Summary of the converted graph-tool graph
gtG

<Graph object, undirected, with 98214 vertices and 460521 edges, 1 internal vertex property, 1 internal edge property, at 0x7f0cb30ffeb8>

In [21]:
# Inspect the internal edge property 'weight' (reached through the name-mangled
# private attribute of Graph).
gtG._Graph__edge_properties['weight']

<EdgePropertyMap object with value type 'double', for Graph 0x7f0cb30ffeb8, at 0x7f0cba9919b0>

In [22]:
# Count the entries of the 'weight' edge property.
{ 'num_edges': len(list(gtG._Graph__edge_properties['weight'])) }

{'num_edges': 460521}

In [23]:
# Inspect the internal vertex property 'id' filled in by nx2gt.
gtG._Graph__vertex_properties['id']

<VertexPropertyMap object with value type 'string', for Graph 0x7f0cb30ffeb8, at 0x7f0cf744b320>

In [24]:
# Count the entries of the 'id' vertex property.
{ 'num_vertices': len(list(gtG._Graph__vertex_properties['id'])) }

{'num_vertices': 98214}

##### Generate random graph based on the configuration model

In [25]:
# Null model: rewire a deep copy of the graph, allowing parallel edges and
# self-loops; gtG itself is not passed to random_rewire.
# NOTE(review): no random seed is set, so the rewired graph differs between runs.
gtG_rnd = copy.deepcopy(gtG)
random_rewire( gtG_rnd, parallel_edges=True, self_loops=True )

50

#### Metrics to be calculated:

In [26]:
def calculate_num_vertices(g):
    """
    Number of vertices of a graph-tool graph.

    Uses the public `Graph.num_vertices()` accessor instead of materialising the
    whole 'id' vertex property through a name-mangled private attribute, so it
    also works for graphs that carry no 'id' property.

    Returns
    -------
    dict
        {'value': <int>}
    """
    return { 'value': int( g.num_vertices() ) }

In [27]:
def calculate_num_edges(g):
    """
    Number of edges of a graph-tool graph.

    Uses the public `Graph.num_edges()` accessor instead of materialising the
    whole 'weight' edge property through a name-mangled private attribute, so it
    also works for graphs that carry no 'weight' property.

    Returns
    -------
    dict
        {'value': <int>}
    """
    return { 'value': int( g.num_edges() ) }

In [28]:
def calculate_assortativity(g):
    """
    Assortativity of `g` with respect to the total degree, using the internal
    edge property 'weight' as edge weight.

    Returns a dict holding the two numbers returned by graph-tool's
    `assortativity` under the keys 'value' and 'variance'.
    """
    edge_weights = g.ep['weight']
    coefficient, spread = assortativity( g, 'total', eweight=edge_weights )
    return { 'value': coefficient, 'variance': spread }

In [29]:
def calculate_scalar_assortativity(g):
    """
    Scalar assortativity of `g` with respect to the total degree, using the
    internal edge property 'weight' as edge weight.

    Returns a dict holding the two numbers returned by graph-tool's
    `scalar_assortativity` under the keys 'value' and 'variance'.
    """
    edge_weights = g.ep['weight']
    coefficient, spread = scalar_assortativity( g, 'total', eweight=edge_weights )
    return { 'value': coefficient, 'variance': spread }

In [30]:
def calculate_pseudo_diameter(g):
    """
    Pseudo-diameter of `g`, computed with the internal edge property 'weight'
    as edge weights.

    Only the distance is kept (the second element returned by graph-tool is
    discarded) and it is converted with int(), i.e. truncated.

    Returns {'value': <int>}.
    """
    distance, _endpoints = pseudo_diameter( g, weights=g.ep['weight'] )
    return { 'value': int(distance) }

In [31]:
def calculate_min_spanning_tree(g):
    """
    Sum of the values of the property map returned by graph-tool's
    `min_spanning_tree`, computed with the internal edge property 'weight'.

    Returns {'num_edges_involved': <sum>}.
    """
    tree_map = min_spanning_tree( g, weights=g.ep['weight'] )
    edges_in_tree = 0
    for flag in tree_map:
        edges_in_tree += flag
    return { 'num_edges_involved': edges_in_tree }

In [32]:
def calculate_vertex_percolation(g):
    """
    Vertex-percolation summary for a degree-ordered and a random vertex order.

    For both orders the size of the largest component is tracked with
    graph-tool's `vertex_percolation`. For a fixed set of thresholds the
    function reports which fraction of the vertices has been removed when the
    largest component first drops below `threshold * (its initial size)`.

    * 'directed': vertices sorted by ascending degree.
      NOTE(review): the graph-tool documentation describes the sequence as
      being in *reversed* order of removal, which would make this an attack on
      the highest-degree vertices first - confirm for the installed version.
    * 'random': the same vertices shuffled with the global, unseeded NumPy RNG.

    If the largest component never drops below a threshold, 1.0 is reported.
    Previously `np.argmax` on an all-False mask silently yielded 0.0, i.e.
    "nothing had to be removed".

    Returns
    -------
    dict
        {'directed': {str(threshold): fraction removed}, 'random': {...}}
    """
    vertices = sorted( g.vertices(), key=lambda v: v.out_degree() )
    fractions_remaining = np.array( [ 0.99, 0.9, 0.5, 0.3, 0.1, 0.01, 0.001 ] )

    def removed_fraction_per_threshold(sizes):
        # Largest-component size relative to the last entry, reversed so that
        # the scan starts from the complete graph.
        remaining = (sizes/sizes[-1])[::-1]
        fractions = {}
        for threshold in fractions_remaining:
            below = remaining < threshold
            if below.any():
                # index of the first position below the threshold
                fractions[str(threshold)] = float( np.argmax(below) / len(vertices) )
            else:
                # threshold never reached within the recorded sizes
                fractions[str(threshold)] = 1.0
        return fractions

    sizes_direct, _ = vertex_percolation(g, vertices)
    np.random.shuffle(vertices)
    sizes_random, _ = vertex_percolation(g, vertices)

    return { 'directed': removed_fraction_per_threshold(sizes_direct),
             'random': removed_fraction_per_threshold(sizes_random) }

In [33]:
def calculate_edge_percolation(g):
    """
    Edge-percolation summary for a degree-ordered and a random edge order.

    For both orders the size of the largest component is tracked with
    graph-tool's `edge_percolation`. For a fixed set of thresholds the function
    reports which fraction of the edges has been removed when the largest
    component first drops below `threshold * (its initial size)`.

    * 'directed': edges sorted by the ascending product of their end-point degrees.
      NOTE(review): the graph-tool documentation describes the sequence as
      being in *reversed* order of removal, which would remove the edges
      between high-degree vertices first - confirm for the installed version.
    * 'random': the same edges shuffled with the global, unseeded NumPy RNG.

    If the largest component never drops below a threshold, 1.0 is reported.
    Previously `np.argmax` on an all-False mask silently yielded 0.0, i.e.
    "nothing had to be removed".

    Returns
    -------
    dict
        {'directed': {str(threshold): fraction removed}, 'random': {...}}
    """
    edges = sorted( ( (e.source(), e.target()) for e in g.edges() ),
                    key=lambda e: e[0].out_degree() * e[1].out_degree() )
    fractions_remaining = np.array( [ 0.99, 0.9, 0.5, 0.3, 0.1, 0.01, 0.001 ] )

    def removed_fraction_per_threshold(sizes):
        # Largest-component size relative to the last entry, reversed so that
        # the scan starts from the complete graph.
        remaining = (sizes/sizes[-1])[::-1]
        fractions = {}
        for threshold in fractions_remaining:
            below = remaining < threshold
            if below.any():
                # index of the first position below the threshold
                fractions[str(threshold)] = float( np.argmax(below) / len(edges) )
            else:
                # threshold never reached within the recorded sizes
                fractions[str(threshold)] = 1.0
        return fractions

    sizes_direct, _ = edge_percolation(g, edges)
    np.random.shuffle(edges)
    sizes_random, _ = edge_percolation(g, edges)

    return { 'directed': removed_fraction_per_threshold(sizes_direct),
             'random': removed_fraction_per_threshold(sizes_random) }

In [34]:
def calculate_global_clustering(g):
    """
    Global clustering of `g`, computed with the internal edge property 'weight'
    and with the triangle/triple counts requested (ret_counts=True).

    Returns a dict with the first two entries of graph-tool's coefficient tuple
    ('value', 'std') and the two counts converted to int
    ('number_of_triangs', 'number_of_triples').
    """
    coefficient, triangles, triples = global_clustering( g, weight=g.ep['weight'], ret_counts=True )
    summary = { 'value': coefficient[0], 'std': coefficient[1] }
    summary['number_of_triangs'] = int(triangles)
    summary['number_of_triples'] = int(triples)
    return summary

#### New calculations, tests

In [35]:
# Observed graph vs. its rewired counterpart
calculate_assortativity(gtG), calculate_assortativity(gtG_rnd)

({'value': -0.0018567809410044496, 'variance': 0.00016557744031522688},
 {'value': 0.0005371268632893054, 'variance': 0.0002743402724337821})

In [36]:
# Observed graph vs. its rewired counterpart
calculate_scalar_assortativity(gtG), calculate_scalar_assortativity(gtG_rnd)

({'value': 0.3633888310742778, 'variance': 0.04259266080533669},
 {'value': -0.02527205164860106, 'variance': 0.015980774766955037})

In [37]:
# Observed graph vs. its rewired counterpart
calculate_pseudo_diameter(gtG), calculate_pseudo_diameter(gtG_rnd)

({'value': 422}, {'value': 2844})

In [38]:
# Observed graph vs. its rewired counterpart
calculate_min_spanning_tree(gtG), calculate_min_spanning_tree(gtG_rnd)

({'num_edges_involved': 98210}, {'num_edges_involved': 96130})

In [39]:
# Observed graph vs. its rewired counterpart (the 'random' part is not seeded)
calculate_vertex_percolation(gtG), calculate_vertex_percolation(gtG_rnd)

({'directed': {'0.99': 0.00012218217362086872,
   '0.9': 0.0006618201071130388,
   '0.5': 0.008970207913332112,
   '0.3': 0.017584051153603356,
   '0.1': 0.027847353737756327,
   '0.01': 0.050308509988392695,
   '0.001': 0.06689474005742563},
  'random': {'0.99': 0.0036349196652208445,
   '0.9': 0.058596534099008286,
   '0.5': 0.33893334962428984,
   '0.3': 0.510222575192946,
   '0.1': 0.730761398578614,
   '0.01': 0.9171401225894475,
   '0.001': 0.9711344614820697}},
 {'directed': {'0.99': 9.163663021565153e-05,
   '0.9': 0.001690186735088684,
   '0.5': 0.01871423625959639,
   '0.3': 0.03375282546276498,
   '0.1': 0.05184596900645529,
   '0.01': 0.059176899423707414,
   '0.001': 0.06644673875414911},
  'random': {'0.99': 0.006058199442034741,
   '0.9': 0.05976744659620828,
   '0.5': 0.3370598896287698,
   '0.3': 0.508114932697986,
   '0.1': 0.7452705316960923,
   '0.01': 0.9290936119086892,
   '0.001': 0.982365039607388}})

In [40]:
# Observed graph vs. its rewired counterpart (the 'random' part is not seeded)
calculate_edge_percolation(gtG), calculate_edge_percolation(gtG_rnd)

({'directed': {'0.99': 0.555800929816447,
   '0.9': 0.6825074209428017,
   '0.5': 0.8023521185787401,
   '0.3': 0.8332844756265186,
   '0.1': 0.8746615246644561,
   '0.01': 0.9304049109595437,
   '0.001': 0.9809780661468207},
  'random': {'0.99': 0.01651173344972325,
   '0.9': 0.1556541395506394,
   '0.5': 0.6506891108114505,
   '0.3': 0.8323160073047701,
   '0.1': 0.9571007619630809,
   '0.01': 0.9934508958331976,
   '0.001': 0.9974572277920007}},
 {'directed': {'0.99': 0.6810851188110857,
   '0.9': 0.7425546283448529,
   '0.5': 0.8371583489135132,
   '0.3': 0.8594070628700972,
   '0.1': 0.8771391532633691,
   '0.01': 0.8901722179878876,
   '0.001': 0.9133633428225857},
  'random': {'0.99': 0.015176289463455521,
   '0.9': 0.14984767252742004,
   '0.5': 0.6428089055656528,
   '0.3': 0.8284877345441359,
   '0.1': 0.9657301187133703,
   '0.01': 0.9942955912976824,
   '0.001': 0.9960414400211934}})

In [41]:
# Global clustering of the observed graph
calculate_global_clustering(gtG)

{'value': 0.7658621097396574,
 'std': 0.04466038167056891,
 'number_of_triangs': 2033937768,
 'number_of_triples': 7967247926}

In [42]:
# Global clustering of the rewired graph
calculate_global_clustering(gtG_rnd)

{'value': 0.06007740540677145,
 'std': 0.0023349111599151044,
 'number_of_triangs': 69953168,
 'number_of_triples': 3493151919}

##### NOTE: For undirected graphs, the “out-degree” is a synonym for the degree, and the in-degree of a vertex is always zero.

#### Select output folder

In [43]:
# Output folder of the per-day JSON files. The trailing slash matters: the file
# name is appended by plain string concatenation.
destination = '/mnt/graph_analitics_data/'

In [44]:
# (metric name, function) pairs. Every metric is evaluated on the observed graph
# first and on its rewired counterpart second, and stored under
# '<name>_graph' / '<name>_config' - the same order and keys as before.
global_metrics = [
    ( 'num_vertices', calculate_num_vertices ),
    ( 'num_edges', calculate_num_edges ),
    ( 'assortativity', calculate_assortativity ),
    ( 'scalar_assortativity', calculate_scalar_assortativity ),
    ( 'pseudo_diameter', calculate_pseudo_diameter ),
    ( 'min_spanning_tree', calculate_min_spanning_tree ),
    ( 'vertex_percolation', calculate_vertex_percolation ),
    ( 'edge_percolation', calculate_edge_percolation ),
    ( 'global_clustering', calculate_global_clustering ),
]

for file_name in tqdm( files ):
    # global metrics in json format!
    # The 8 characters before the first '.' are the YYYYMMDD date of the file.
    savename = destination+'graph_global_attributes_'+file_name.split('.')[0][-8:]+'.json'
    if os.path.exists( savename ):
        print('Already processed, skipping!')
        continue

    # read graph from csv file (the helper loads the CSV itself, so no separate
    # pd.read_csv of the same file is needed)
    G, G_nodes_id, G_nodes_df_header = create_day_graph_from_csv( path_to_file=source+file_name )
    # convert to Graph-tool graph object
    gtG = nx2gt(G)
    # get random graph with configuration model
    gtG_rnd = copy.deepcopy(gtG)
    random_rewire( gtG_rnd, parallel_edges=True, self_loops=True )

    dict_to_dump = {} # every attribute will be written to this
    for metric_name, metric_function in global_metrics:
        dict_to_dump[metric_name+'_graph'] = metric_function(gtG)
        dict_to_dump[metric_name+'_config'] = metric_function(gtG_rnd)

    # Serialise before opening the file: if a value is not JSON-serialisable the
    # error is raised here and no partial file is left behind, which the
    # os.path.exists check above would otherwise treat as "already processed".
    serialized = json.dumps( dict_to_dump, ensure_ascii=False, indent=4 )
    with open(savename, 'w', encoding='utf-8') as out_file:
        out_file.write( serialized )

    #print(dict_to_dump)

100%|██████████| 365/365 [00:59<00:00,  6.15it/s]

Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already processed, skipping!
Already proces


