In [1]:
%load_ext autoreload
%autoreload 2
from dateutil import rrule
from datetime import date, datetime, timedelta
import pandas as pd
import pickle as pkl
import random
import matplotlib.pyplot as plt
import fastplot
import seaborn as sns
import scipy
import numpy as np
import itertools
import networkx as nx
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import community
import random
import cdlib
import igraph as ig
import time
import collections


def characterize_network(df, Path_Communities, type_network, snap, s, parameter, network_name=None):
    """Compute summary statistics for one network and append them to the results CSV.

    Builds an undirected graph from the ``src``/``trg`` columns of ``df``,
    discards every connected component with fewer than three nodes, runs
    ``community.best_partition`` on what is left, pickles the resulting
    partition to ``Path_Communities`` and appends one row of statistics to
    ``"Networks Characterization.csv"`` (no header, no index).

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with at least the columns ``src`` and ``trg``.
    Path_Communities : str
        File the partition returned by ``community.best_partition`` is pickled to.
        An existing file is overwritten.
    type_network : str
        Label stored in the ``Type`` column of the appended row.
    snap : str
        Label stored in the ``Snapshot`` column of the appended row.
    s : Any
        Unused. Kept so that existing calls keep working.
    parameter : float
        Value stored in the ``Parameter`` column of the appended row.
    network_name : str, optional
        Value stored in the ``Network`` column. When omitted, the notebook-level
        variable ``network`` is used, which is what this function always did;
        pass it explicitly to avoid depending on that hidden state.

    Returns
    -------
    None
    """
    if network_name is None:
        # Legacy behaviour: rely on the global set by the calling cell.
        network_name = network

    G = nx.from_pandas_edgelist(df, 'src', 'trg')

    # Collect the nodes first so the graph is not modified while its
    # components are still being enumerated.
    tiny_component_nodes = [
        node
        for component in nx.connected_components(G)
        if len(component) < 3
        for node in component
    ]
    G.remove_nodes_from(tiny_component_nodes)

    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    print(n_nodes, n_edges)

    degrees = [degree for _, degree in G.degree]
    avg_degree = round(np.average(degrees), 2)
    density = round(nx.density(G), 4)
    n_cc = nx.number_connected_components(G)
    clustering = round(nx.average_clustering(G), 4)

    partition = community.best_partition(G, resolution=1)
    # Context manager so the handle is flushed and closed even if dump fails.
    with open(Path_Communities, "wb") as handle:
        pkl.dump(partition, handle, protocol=4)
    n_comm = len(set(partition.values()))
    modularity = community.modularity(partition, G)

    row = (network_name, type_network, snap, n_nodes, n_edges,
           avg_degree, density, clustering, n_cc, n_comm, modularity, parameter)
    df = pd.DataFrame([row], columns=['Network', 'Type',
                                      'Snapshot', '# Nodes', '# Edges', 'Avg. Degree',
                                      'Density', 'Avg. Clustering', '# Components',
                                      '# Communities', 'Modularity', "Parameter"])

    # Appended without a header: readers of this file supply the column names.
    df.to_csv("Networks Characterization.csv", mode='a', header=False, index=False)
    
def gini(x):
    """Return half the relative mean absolute difference of ``x`` (the Gini coefficient).

    The mean absolute difference is taken over every ordered pair of entries
    of ``x`` (including each entry paired with itself) and divided by the mean
    of ``x``. The pairwise table makes memory use quadratic in ``len(x)``.
    """
    pairwise_gaps = np.subtract.outer(x, x)
    mean_abs_gap = np.abs(pairwise_gaps).mean()
    relative_gap = mean_abs_gap / np.mean(x)
    return 0.5 * relative_gap

def sort_edges(path_network, edge_list):
    """Orient the rows of ``edge_list`` using the edge list stored at ``path_network``.

    ``path_network`` is read as a space-separated file whose columns are named
    ``u``, ``v`` and ``w``. A row of ``edge_list`` whose ``(u, v)`` pair appears
    in that file keeps its orientation; every other row has ``u`` and ``v``
    swapped. All three values are cast to ``int``.

    Returns a new DataFrame with columns ``u``, ``v``, ``w`` and a fresh index.
    """
    reference = pd.read_csv(path_network,
                            names=['u','v','w'], header=None,sep = ' ')
    known_pairs = set(zip(reference['u'], reference['v']))

    def _orient(row):
        # Keep the stored direction when the file lists it, otherwise flip it.
        if (row['u'], row['v']) in known_pairs:
            first, second = row['u'], row['v']
        else:
            first, second = row['v'], row['u']
        return (int(first), int(second), int(row['w']))

    oriented = edge_list.apply(_orient, axis=1)
    return pd.DataFrame(list(oriented), columns=['u', 'v', 'w'])

# Analysis of network and backbone properties



**Complete Networks**

In [24]:
dict_networks = {'WhatsApp': ['October']}

# Characterize the full (unfiltered) network of every snapshot.
# `network` must keep this name: characterize_network reads it as a global.
for network, snapshots in dict_networks.items():
    for snap in snapshots:
        print(snap)
        type_network = 'Original'
        Path_Networks = f'{network}/networks/{snap}.edgelist'
        Path_Communities = f'{network}/communities/{type_network}{snap}.pkl'
        df = pd.read_csv(
            Path_Networks,
            sep=' ',
            names=['src', 'trg', 'nij', 'src_n', 'trg_n'],
        )
        edge_columns = df[['src', 'trg', 'nij']]
        characterize_network(edge_columns, Path_Communities, type_network, snap, 1, 0)

October
4281 220972


**Disparity Filter (DF) Backbones**

In [None]:
dict_networks = {'WhatsApp': ['October']}

# Keep the edges whose score reaches each confidence level and characterize
# the resulting backbone.
for confidence in [0.999, 0.995, 0.99, 0.95, 0.9]:
    for network, snapshots in dict_networks.items():
        for snap in snapshots:
            type_network = 'DF'
            Path_Networks = f'{network}/backbones/df/{snap}.edgelist'
            Path_Communities = f'{network}/communities/{type_network}{snap}-{confidence}.pkl'
            df = pd.read_csv(
                Path_Networks,
                sep=',',
                names=['src', 'trg', 'nij', 'score', 'v'],
            )
            df = df[df['score'].ge(confidence)]
            characterize_network(df[['src', 'trg']], Path_Communities, type_network, snap, 1, confidence)

**Polya Urn**

In [26]:
dict_networks = {'WhatsApp': ['October']}

# Keep the edges whose p-value is strictly below each alpha and characterize
# the resulting backbone.
for alpha in [0.001, 0.005, 0.01, 0.05, 0.1]:
    for network, snapshots in dict_networks.items():
        for snap in snapshots:
            print(snap)
            type_network = 'polya'
            Path_Networks = f'{network}/backbones/polya/{snap}.edgelist'
            Path_Communities = f'{network}/communities/{type_network}{snap}-{alpha}.pkl'
            df = pd.read_csv(
                Path_Networks,
                sep=',',
                names=['src', 'trg', 'nij', 'p_value'],
            )
            df = df[df['p_value'].lt(alpha)]
            characterize_network(df[['src', 'trg']], Path_Communities, type_network, snap, 1, alpha)

October
326 2675
October
443 4632
October
506 5872
October
734 10527
October
829 13699


**HSS**

In [None]:
dict_networks = {'WhatsApp': ['October']}

# For each percentile, keep the edges whose score is at or above that
# percentile of all scores and characterize the resulting backbone.
for percentile in [0.995, 0.99, 0.95, 0.9, 0.8, 0.7, 0.6, 0.5]:
    for network, snapshots in dict_networks.items():
        for snap in snapshots:
            type_network = 'hss'
            Path_Networks = f'{network}/backbones/hss/{snap}.edgelist'
            Path_Communities = f'{network}/communities/{type_network}{snap}-{percentile}.pkl'
            df = pd.read_csv(
                Path_Networks,
                sep=',',
                names=['src', 'trg', 'nij', 'score'],
            )
            value = np.percentile(df['score'].tolist(), percentile*100)
            df = df[df['score'].ge(value)]
            characterize_network(df[['src', 'trg']], Path_Communities, type_network, snap, 1, percentile)

**Naive Threshold**

In [None]:
dict_networks = {'WhatsApp': ['October']}

# For each percentile, keep the edges whose weight (`nij`) is at or above
# that percentile of all weights and characterize the resulting backbone.
for percentile in [0.995, 0.99, 0.95, 0.9, 0.8]:
    for network, snapshots in dict_networks.items():
        for snap in snapshots:
            print(snap)
            type_network = 'threshold'
            Path_Networks = f'{network}/networks/{snap}.edgelist'
            Path_Communities = f'{network}/communities/{type_network}{snap}-{percentile}.pkl'
            df = pd.read_csv(
                Path_Networks,
                sep=' ',
                names=['src', 'trg', 'nij', 'src_n', 'trg_n'],
            )
            value = np.percentile(df['nij'].tolist(), percentile*100)
            df = df[df['nij'].ge(value)]
            characterize_network(df[['src', 'trg']], Path_Communities, type_network, snap, 1, percentile)

**RECAST**

In [5]:
PATH_Networks = 'WhatsApp/networks/'
PATH_Backbones = 'WhatsApp/backbones/recast/'
snap = 'October'
network = 'WhatsApp'
type_network = 'recast'

# One backbone file exists per alpha; each is characterized as-is.
for alpha in [0.001, 0.005, 0.01, 0.05, 0.1]:
    Path_Communities = f'{network}/communities/{type_network}{snap}-{alpha}.pkl'
    edge_list = pd.read_csv(
        f'{PATH_Backbones}{snap}-{alpha}.edgelist',
        sep=' ',
        names=['src', 'trg', 'w', 'src_n', 'trg_n'],
    )
    characterize_network(edge_list[['src', 'trg']], Path_Communities, type_network, snap, 1, alpha)

313 1875


# Summary

In [4]:
k = 'October'
network = 'WhatsApp'
df = pd.read_csv('Networks Characterization.csv', names=['Network', 'Strategy', 'Snapshot',
                                                         '# Nodes', '# Edges',
                                                         'Avg. Degree', 'Density',
                                                         'Avg. Clustering', '# Components',
                                                         '# Communities', 'Modularity', 'Parameter'])

# Parameter (alpha, or confidence = 1 - alpha) reported for each method.
threshold = 0.950
disparityfilter = 0.95
polya = 0.05
hss = 0.95
recast = 0.05
original = -1  # placeholder: the original network has no parameter

dict_map_parameter = {'Original': original, 'threshold': threshold, 'DF': disparityfilter,
                      'hss': hss, 'polya': polya, 'recast': recast}

dict_map = {"Original": "Original", "DF": 'DF', 'polya': 'Polya',
            'threshold': 'Threshold', 'hss': 'HSS', 'recast': 'RECAST'}

df['Strategy'] = df['Strategy'].map(dict_map)
df = df[df['Network'] == network]
df = df[df['Strategy'].isin(['DF', 'Polya', 'HSS', 'Original', 'Threshold', 'RECAST'])]
df = df[df['Snapshot'].isin([k])]

# Keep, for every strategy, only the row written with the parameter chosen above.
# The original network is matched on its name alone: the placeholder -1 is not
# the value the "Complete Networks" cell stores in the CSV, so filtering it on
# `Parameter` would drop the row. Matching per strategy also stops one method's
# parameter from selecting rows of another method.
strategy_parameter = {dict_map[name]: value for name, value in dict_map_parameter.items()}
is_selected = (df['Strategy'] == 'Original') | (df['Parameter'] == df['Strategy'].map(strategy_parameter))
df = df[is_selected].copy()  # explicit copy: the frame is modified below

# Read community structure and compute the Gini Index
dict_method_gini = {}
dict_method_comm = {}

for method in dict_map_parameter.keys():
    if method == 'Original':
        Path_Communities = network+'/communities/'+method+str(k)+'.pkl'
    else:
        Path_Communities = network+'/communities/'+method+str(k)+'-'+str(dict_map_parameter[method])+'.pkl'

    # These pickles are written by characterize_network; do not point this at
    # files from an untrusted source, unpickling can execute arbitrary code.
    with open(Path_Communities, "rb") as handle:
        comm = pkl.load(handle)
    list_values = list(comm.values())
    counter = collections.Counter(list_values)
    dict_method_comm[dict_map[method]] = len(counter)
    # NOTE(review): gini is applied to the per-node community labels, not to the
    # community sizes (`counter.values()`). Labels are arbitrary ids, so confirm
    # this is the intended quantity. Left unchanged so reported numbers do not move.
    dict_method_gini[dict_map[method]] = gini(list_values)


sorter = ['Original', 'DF', 'Polya', 'Threshold', 'RECAST', 'HSS']
sorterIndex = dict(zip(sorter, range(len(sorter))))
df['Tm_Rank'] = df['Strategy'].map(sorterIndex)
# Stable sort keeps rows of one strategy in file order (oldest run first).
# `drop(columns=...)` replaces the positional `axis` argument, which pandas 2.x rejects.
df = df.sort_values('Tm_Rank', kind='stable').drop(columns='Tm_Rank')
df['Gini Index'] = df['Strategy'].map(dict_method_gini)
df['Avg. Clustering'] = df['Avg. Clustering'].round(2)
df['Modularity'] = df['Modularity'].round(2)
df['Gini Index'] = df['Gini Index'].round(2)
# Re-running an experiment appends a new CSV row and overwrites its pickle, so the
# most recent row is the one that matches the community file loaded above.
df = df.drop_duplicates(subset=['Strategy', 'Snapshot'], keep='last').reset_index(drop=True)
df['# Communities'] = df['Strategy'].map(dict_method_comm)


df['% Nodes'] = round(df['# Nodes']/max(df['# Nodes'])*100, 2)
df['% Edges'] = round(df['# Edges']/max(df['# Edges'])*100, 2)
df['Avg. Degree'] = df['Avg. Degree'].astype(int)

df = df[['Strategy', '% Nodes', "# Nodes", '% Edges', "# Edges", 'Avg. Degree', 'Density', 'Avg. Clustering',
         '# Components', '# Communities', 'Gini Index', 'Modularity']]
df.head(10)
# print(df.to_latex(index=False))

Unnamed: 0,Strategy,% Nodes,# Nodes,% Edges,# Edges,Avg. Degree,Density,Avg. Clustering,# Components,# Communities,Gini Index,Modularity
0,Original,100.0,4281,100.0,220972,103,0.0241,0.62,4,15,0.39,0.25
1,DF,18.69,800,4.51,9962,24,0.0312,0.59,4,13,0.41,0.48
2,Polya,17.15,734,4.76,10527,28,0.0391,0.59,5,15,0.44,0.48
3,Threshold,11.56,495,4.29,9489,38,0.0776,0.73,3,8,0.38,0.45
4,RECAST,7.31,313,0.85,1875,11,0.0384,0.49,2,8,0.42,0.37
5,HSS,100.0,4281,4.98,10996,5,0.0012,0.14,4,29,0.35,0.44


In [4]:
# NOTE(review): the output saved with this cell (0.95) does not look like a
# DataFrame; presumably `df` was bound to something else when the cell last
# ran. Re-run it after the Summary cell and confirm.
df

0.95

**Edge Weight Distribution**

In [None]:
snap_ref_WhatspApp = 'October'
# Only the third column (position 2) of the edge list is loaded, under the name 'Weight'.
df_WhatsApp = pd.read_csv(
    f'WhatsApp/networks/{snap_ref_WhatspApp}.edgelist',
    sep=' ',
    usecols=[2],
    names=['Weight'],
)

In [None]:
%matplotlib inline

data = [ ('WhatsApp (Month)', df_WhatsApp['Weight']), 
        ('WhatsApp (Week)', df_WhatsApp_2['Weight'])]
fastplot.plot(data , None, mode='CDF_multi', # CDF_complementary=True, 
              xlabel = 'Edge Weight (x)', legend=True,
              cycler = fastplot.CYCLER_LINES, yscale='linear', xscale='linear', ylim=(0.75,1.005))

plt.savefig('plots/edge_weight_dist.pdf')
plt.show()

# Parameter sensitivity analysis

In [2]:
# NOTE(review): `k` is set but rows are not filtered by snapshot in this cell;
# confirm that is intended if the CSV ever holds more than one snapshot.
k = 'October'
network = 'WhatsApp'
df = pd.read_csv('Networks Characterization.csv', names=['Network', 'Strategy', 'Snapshot',
                                                         '# Nodes', '# Edges',
                                                         'Avg. Degree', 'Density',
                                                         'Avg. Clustering', '# Components',
                                                         '# Communities', 'Modularity', 'Parameter'])


dict_map = {"Original": "Original", "DF": 'DF', 'polya': 'Polya',
            'threshold': 'Threshold', 'hss': 'HSS', 'recast': 'RECAST'}


df['Strategy'] = df['Strategy'].map(dict_map)
df = df.sort_values(by=['Network', 'Strategy'])
df = df[df['Network'] == network]
# Explicit copy: the filtered frame gets new columns below, which would
# otherwise trigger pandas' SettingWithCopyWarning.
df = df[df['Strategy'].isin(['DF', 'Polya', 'HSS', 'Original', 'Threshold', 'RECAST'])].copy()

sorter = ['Original', 'DF', 'Polya', 'Threshold', 'HSS', 'RECAST']
# Create the dictionary that defines the order for sorting
sorterIndex = dict(zip(sorter, range(len(sorter))))
# Generate a rank column that will be used to sort
# the dataframe numerically
df['Tm_Rank'] = df['Strategy'].map(sorterIndex)
df = df.sort_values(['Tm_Rank'], ascending=[True])
# `drop(columns=...)` replaces the positional `axis` argument, which pandas 2.x rejects.
df = df.drop(columns='Tm_Rank')
df['Avg. Clustering'] = df['Avg. Clustering'].round(2)
df['Modularity'] = df['Modularity'].round(2)
# Percentages are relative to the largest row, so they must be computed
# before the 'Original' row is removed.
df['% Nodes'] = round(df['# Nodes']/max(df['# Nodes'])*100, 2)
df['% Edges'] = round(df['# Edges']/max(df['# Edges'])*100, 2)
df = df[df['Strategy'] != 'Original']
df = df[['Strategy', '% Nodes', '% Edges', '# Communities', 'Modularity', 'Parameter']]
df = df.sort_values(['Strategy', 'Parameter'], ascending=[True, True])
df

Unnamed: 0,Strategy,% Nodes,% Edges,# Communities,Modularity,Parameter
28,DF,26.26,7.23,13,0.45,0.9
27,DF,18.69,4.51,13,0.48,0.95
26,DF,10.77,1.51,13,0.52,0.99
25,DF,8.34,0.91,13,0.54,0.995
24,DF,4.3,0.33,10,0.55,0.999
23,HSS,100.0,33.72,22,0.42,0.5
22,HSS,100.0,33.72,19,0.41,0.6
21,HSS,100.0,28.86,16,0.38,0.7
20,HSS,100.0,19.98,18,0.33,0.8
19,HSS,100.0,9.93,21,0.31,0.9
