In [1]:
%load_ext autoreload
%autoreload 2
from dateutil import rrule
from datetime import date, datetime, timedelta
import pandas as pd
import pickle as pkl
import random
import matplotlib.pyplot as plt
import fastplot
import seaborn as sns
import scipy
import numpy as np
import itertools
import networkx as nx
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import trange, tqdm
import community
import random
import cdlib
import igraph as ig
import time
import collections


def characterize_network(df, Path_Communities, type_network, snap, s, parameter):
    """Characterize a network and append the result to the characterization CSV.

    A graph is built from the ``src``/``trg`` columns of ``df`` (a plain
    ``nx.Graph``, the default of ``nx.from_pandas_edgelist``). Connected
    components with fewer than 100 nodes are discarded, then the following are
    computed on what remains: node and edge counts, average degree, density,
    average clustering (on a random node sample, see ``s``), number of
    connected components, and the number of communities and modularity of the
    partition returned by ``community.best_partition``.

    Side effects:
        * prints the node and edge counts;
        * pickles the partition dict to ``Path_Communities``;
        * appends one header-less row to ``"Networks Characterization.csv"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list; must contain the columns ``src`` and ``trg``.
    Path_Communities : str
        Destination file for the pickled partition.
    type_network : str
        Label stored in the ``Type`` column of the output row.
    snap : str
        Label stored in the ``Snapshot`` column of the output row.
    s : float
        Fraction of nodes sampled (without replacement) to build the subgraph
        on which the average clustering is computed.
    parameter : object
        Value stored as-is in the ``Parameter`` column of the output row.

    Raises
    ------
    ValueError
        If no connected component with at least 100 nodes exists, since none
        of the statistics would be defined on the resulting empty graph.
    """
    G = nx.from_pandas_edgelist(df, 'src', 'trg')
    del df  # drop this function's reference to the edge list; only G is used below

    # Remove the smaller components. The nodes are collected first so the graph
    # is not modified while its components are being enumerated.
    small_component_nodes = [
        node
        for component in nx.connected_components(G)
        if len(component) < 100
        for node in component
    ]
    G.remove_nodes_from(small_component_nodes)

    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    print(n_nodes, n_edges)
    if n_nodes == 0:
        raise ValueError(
            "No connected component with at least 100 nodes: nothing to characterize "
            "for type_network=%r, parameter=%r" % (type_network, parameter)
        )

    avg_degree = round(np.average([degree for _, degree in G.degree()]), 2)

    # Compute clustering coefficient from a sample of the graph.
    # The network is very large and this takes a long time.
    k = int(n_nodes * s)
    G_sample = G.subgraph(random.sample(list(G.nodes()), k))
    clustering = round(nx.average_clustering(G_sample), 4)

    density = round(nx.density(G), 4)
    n_cc = nx.number_connected_components(G)

    partition = community.best_partition(G, resolution=1)
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(Path_Communities, "wb") as handle:
        pkl.dump(partition, handle, protocol=4)
    n_comm = len(set(partition.values()))
    modularity = community.modularity(partition, G)

    row = ('Instagram-BR', type_network, snap, n_nodes, n_edges,
           avg_degree, density, clustering, n_cc, n_comm, modularity, parameter)
    summary = pd.DataFrame([row], columns=['Network', 'Type',
                                           'Snapshot', '# Nodes', '# Edges', 'Avg. Degree',
                                           'Density', 'Avg. Clustering', '# Components',
                                           '# Communities', 'Modularity', "Parameter"])

    # Append mode without header: the CSV accumulates one row per call.
    summary.to_csv("Networks Characterization.csv", mode='a', header=False, index=None)
    
def gini(x):
    """Return the Gini coefficient of the values in ``x``.

    Computed as half of the relative mean absolute difference: the mean of
    ``|x_i - x_j|`` over all ordered pairs (including ``i == j``), divided by
    the mean of ``x``, times 0.5.
    """
    pairwise_gaps = np.abs(np.subtract.outer(x, x))
    relative_mean_gap = pairwise_gaps.mean() / np.mean(x)
    return 0.5 * relative_mean_gap

# Analyze network properties



**Original Network**

In [None]:
# Characterize the complete (unfiltered) network.
type_network = 'Original'
# Must be spelled `Path_Networks`: that is the name read by `pd.read_csv` below.
Path_Networks = 'Instagram-BR/networks/network.edgelist'
# Saved under Instagram-BR/communities/, which is where the Summary cell loads
# 'Original.pkl' from.
Path_Communities = 'Instagram-BR/communities/'+type_network+'.pkl'
df = pd.read_csv(Path_Networks, sep=' ', names=['src', 'trg', 'nij'])
# The original network has no backbone parameter; -1 is the placeholder value
# the Summary cell uses to select this row.
characterize_network(df, Path_Communities, type_network, 'Election', 0.2, -1)

**TriBE**

In [46]:
# Characterize the TriBE backbone stored for the chosen confidence level.
type_network = 'TriBE'
confidence = '0.95'
Path_Networks = 'Instagram-BR/backbones/tribe/'+confidence+'.edgelist'
# File name follows the '<type>-<parameter>.pkl' scheme used by the other
# backbone cells and expected by the Summary cell.
Path_Communities = 'Instagram-BR/communities/'+type_network+'-'+confidence+'.pkl'
df = pd.read_csv(Path_Networks, sep=' ', names=['src', 'trg', 'nij'])
characterize_network(df, Path_Communities, type_network, 'Election', 0.20, confidence)

40816 4152501


**Noise Corrected**

In [None]:
# Noise Corrected backbone: keep only the edges whose score is above the
# confidence threshold, then characterize the resulting network.
type_network = 'NC'
confidence = 0.999990
Path_Networks = 'Instagram-BR/backbones/nc/all_p_values.edgelist'
Path_Communities = 'Instagram-BR/communities/'+type_network+'-'+str(confidence)+'.pkl'
nc_table = pd.read_csv(Path_Networks, sep=',', names=['src', 'trg', 'nij','score'])
nc_table = nc_table.loc[nc_table['score'] > confidence]
characterize_network(
    nc_table.loc[:, ['src', 'trg']],
    Path_Communities,
    type_network,
    snap="Election",
    s=0.20,
    parameter=confidence,
)

In [4]:
# Reload the score table with no threshold applied, so `nc_table` holds every row of the file.
# NOTE(review): relies on `Path_Networks` left behind by a previously executed cell;
# several cells assign that name, so confirm it points at the NC file before running this.
nc_table = pd.read_csv(Path_Networks, sep=',', names=['src', 'trg', 'nij','score'])

In [5]:
# Display the loaded table (the rendered output is abridged to its first and last rows).
nc_table

Unnamed: 0,src,trg,nij,score
0,17.0,768.0,3.0,1.000000
1,31.0,768.0,4.0,1.000000
2,71.0,768.0,4.0,1.000000
3,101.0,768.0,1.0,0.999999
4,203.0,768.0,3.0,1.000000
...,...,...,...,...
189813500,74.0,111.0,1.0,1.000000
189813501,35.0,111.0,4.0,1.000000
189813502,29.0,111.0,1.0,1.000000
189813503,154.0,407.0,1.0,1.000000


**GloSS Filter**

In [16]:
# GloSS backbone: keep only the edges whose `p` value is below alpha,
# then characterize the resulting network.
type_network = 'GloSS'
alpha = 0.005
Path_Networks = 'Instagram-BR/backbones/gloss/all_p_values.edgelist'
Path_Communities = 'Instagram-BR/communities/'+type_network+"-"+str(alpha)+'.pkl'
df = pd.read_csv(Path_Networks, sep=' ', names=['src', 'trg','p','w'])
df = df.loc[df['p'] < alpha]
characterize_network(
    df.loc[:, ['src', 'trg']],
    Path_Communities,
    type_network,
    snap='Election',
    s=0.20,
    parameter=alpha,
)


23198 339252


**MLF**

In [None]:
import math

# MLF backbone: transform the stored significance, keep the edges below alpha,
# then characterize the resulting network.
type_network = 'MLF'
alpha=0.05
Path_Networks = 'Instagram-BR/backbones/mlf/all_p_values.edgelist'
Path_Communities = 'Instagram-BR/communities/'+type_network+"-"+str(alpha)+'.pkl'
df = pd.read_csv(Path_Networks, sep=',', names=['src', 'trg', 'w','significance'])
# Each stored value v is replaced by e**(-v) before thresholding.
# NOTE(review): presumably the file stores -ln(p); confirm against the code that wrote it.
df = df.assign(significance=math.e**(-df['significance']))
df = df.loc[df['significance'] < alpha]
characterize_network(
    df.loc[:, ['src', 'trg']],
    Path_Communities,
    type_network,
    snap='Election',
    s=0.20,
    parameter=alpha,
)

**SDSM**

In [None]:
# SDSM backbone: one pre-computed edge list per alpha; characterize each of them.
type_network = 'SDSM'
list_alphas = ['0.10']
for alpha in list_alphas:
    # alpha stays a string here: it is spliced verbatim into both file names.
    Path_Networks = "Instagram-BR/backbones/sdsm/backbone_edgelist_"+alpha+".csv"
    Path_Communities = 'Instagram-BR/communities/'+type_network+"-"+str(alpha)+'.pkl'
    df = pd.read_csv(Path_Networks, sep=' ', names=['src', 'trg','w'])
    characterize_network(
        df.loc[:, ['src', 'trg']],
        Path_Communities,
        type_network,
        snap='Election',
        s=0.20,
        parameter=alpha,
    )

41044 47589474


# Parameter sensitivity analysis

In [14]:
# Load every row appended by characterize_network; the CSV is written without a
# header, so the column names are supplied here.
df = pd.read_csv('Networks Characterization.csv', names=['Network', 'Strategy', 'Snapshot', 
                                                                  '# Nodes', '# Edges',
                                                                  'Avg. Degree', 'Density', 
                                                                  'Avg. Clustering', '# Components',
                                                                  '# Communities', 'Modularity', 'Parameter'])

df['Modularity'] = df['Modularity'].round(2)
# Sizes relative to the largest network in the file. Computed before the
# 'Original' rows are dropped, so those rows still take part in the maximum.
df['% Nodes'] = (df['# Nodes'] / df['# Nodes'].max() * 100).round(2)
df['% Edges'] = (df['# Edges'] / df['# Edges'].max() * 100).round(2)

df = df[df['Strategy'] != 'Original']
df = df[['Strategy', '% Nodes', '% Edges', '# Communities', 'Modularity', 'Parameter']]
# One row per (Strategy, Parameter), ordered by strategy and then by parameter.
# This final ordering makes any earlier sort of the frame unnecessary.
df = df.sort_values(['Strategy', 'Parameter'], ascending=[True, True])
df = df.drop_duplicates(['Strategy', 'Parameter']).reset_index(drop=True)
df


Unnamed: 0,Strategy,% Nodes,% Edges,# Communities,Modularity,Parameter
0,Complete,100.00,100.00,4,0.25,-1.0000
1,GloSS,51.72,0.11,9,0.73,0.0010
2,GloSS,56.44,0.18,7,0.70,0.0050
3,GloSS,58.59,0.27,7,0.58,0.0100
4,GloSS,65.45,0.73,6,0.39,0.0500
...,...,...,...,...,...,...
60,TriBE,99.31,2.19,11,0.56,0.9500
61,TriBE,92.97,0.48,19,0.72,0.9900
62,TriBE,28.32,0.03,44,0.74,0.9990
63,TriBE,21.64,0.02,38,0.77,0.9995


# Summary

In [23]:
pd.set_option("display.precision", 13)
# The CSV is written without a header, so the column names are supplied here.
df = pd.read_csv('Networks Characterization.csv', names=['Network', 'Strategy', 'Snapshot', 
                                                                  '# Nodes', '# Edges',
                                                                  'Avg. Degree', 'Density', 
                                                                  'Avg. Clustering', '# Components',
                                                                  '# Communities', 'Modularity', 'Parameter'])

#Defining the alpha (or confidence (1-alpha)) for each model
tribe = 1-0.05
sdsm= 0.001 
gloss = 0.1
mlf = 0.001
nc = 1-0.00001
original = -1 # The original network has no parameter; -1 is a placeholder

dict_map = {"Original":"Original", "GloSS":'GloSS', 'SDSM':'SDSM', 'TriBE':'TriBE', 'MLF':'MLF', 'NC':'NC'}
dict_map_parameter = {"Original":original, "GloSS":gloss, 'TriBE':tribe, 'MLF':mlf, 'NC':nc, 'SDSM':sdsm}


df['Strategy'] = df['Strategy'].map(dict_map)

# Keep only the rows whose Parameter equals the value chosen above for their
# strategy. Strategies missing from dict_map_parameter map to NaN and never match.
# .copy() so the column assignments below act on an independent frame.
df = df[df['Parameter'] == df['Strategy'].map(dict_map_parameter)].copy()

#Read community structure and compute the Gini Index 
dict_method_gini = {}
dict_method_comm = {}

for method in dict_map.keys():
    if method == 'Original':
        Path_Communities ='Instagram-BR/communities/'+method+'.pkl'
    else:
        Path_Communities ='Instagram-BR/communities/'+method+'-'+str(dict_map_parameter[method])+'.pkl'

    # Unpickling can execute arbitrary code: only load files written by this notebook.
    with open(Path_Communities, "rb") as handle:
        comm = pkl.load(handle)
    # Community sizes: number of entries of the partition assigned to each community id.
    counter = collections.Counter(comm.values())
    # Only communities with more than 100 members are counted and fed to the Gini index.
    list_values = [x for x in counter.values() if x > 100]
    dict_method_comm[dict_map[method]] = len(list_values)
    dict_method_gini[dict_map[method]] = gini(list_values)

# Order the rows by the fixed presentation order of the strategies.
sorter =['Original','TriBE', 'SDSM', 'GloSS','NC', 'MLF']
sorterIndex = dict(zip(sorter, range(len(sorter))))
df['Tm_Rank'] = df['Strategy'].map(sorterIndex)
# `columns=` keyword: the axis argument of DataFrame.drop is keyword-only in pandas >= 2.0.
df = df.sort_values('Tm_Rank').drop(columns='Tm_Rank')

df['Gini Index'] = df['Strategy'].map(dict_method_gini).round(2)
df['Avg. Clustering'] = df['Avg. Clustering'].round(2)
df['Modularity'] = df['Modularity'].round(2)
# Replaces the CSV community count with the count of communities above 100 members.
df['# Communities'] = df['Strategy'].map(dict_method_comm)

# Sizes relative to the largest network among the selected rows.
df['% Nodes'] = (df['# Nodes'] / df['# Nodes'].max() * 100).round(2)
df['% Edges'] = (df['# Edges'] / df['# Edges'].max() * 100).round(2)
df['Avg. Degree'] = df['Avg. Degree'].astype(int)

df = df[['Strategy', '# Nodes', '% Nodes', '# Edges', '% Edges', 'Avg. Degree', 'Density', 'Avg. Clustering', 
         '# Components', '# Communities', 'Gini Index', 'Modularity']]
df.head(10)
# print(df.to_latex(index=False))

Unnamed: 0,Strategy,# Nodes,% Nodes,# Edges,% Edges,Avg. Degree,Density,Avg. Clustering,# Components,# Communities,Gini Index,Modularity
0,Original,41099,100.0,189813505,100.0,9236,0.2248,0.68,1,4,0.1,0.25
64,TriBE,40816,99.31,4152501,2.19,203,0.005,0.3,1,9,0.23,0.56
54,SDSM,31811,77.4,2794969,1.47,175,0.0055,0.33,2,10,0.38,0.46
1,GloSS,28065,68.29,4891339,2.58,348,0.0124,0.54,1,7,0.18,0.32
45,NC,28459,69.24,4285139,2.26,301,0.0106,0.31,1,7,0.34,0.61
9,MLF,20461,49.78,2195999,1.16,214,0.0105,0.33,1,7,0.49,0.62
