# Encontro 13: Medidas de Centralidade

Importando a biblioteca:

In [1]:
import sys
sys.path.append('..')

from random import choice
from itertools import permutations

import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import socnet as sn

  return f(*args, **kwds)
  return f(*args, **kwds)


Configurando a biblioteca:

In [2]:
# Global drawing defaults set on the socnet module before any graph is shown.
# NOTE(review): the meaning/units of these values (node size, RGB tuples,
# label anchor) are assumed from their names — confirm against socnet.
sn.node_size = 10
sn.node_color = (255, 255, 255)

sn.edge_width = 1
sn.edge_color = (192, 192, 192)

sn.node_label_position = 'top center'

Carregando rede de casamentos entre famílias de Florença durante a Renascença.

J. F. Padgett e C. K. Ansell. *Robust action and the rise of the Medici, 1400–1434.* American Journal of
Sociology 98, págs. 1259-1319, 1993.

In [3]:
# Load the Renaissance marriage network from its GML file.
# NOTE(review): has_pos=True presumably makes socnet read stored node
# positions from the file — confirm against socnet.
g = sn.load_graph('Renaissance.gml', has_pos=True)

# Draw the graph; nlab=True presumably turns node labels on — confirm.
sn.show_graph(g, nlab=True)

Função que registra, em cada nó, seus sucessores em geodésicas de $s$ a $t$.

In [4]:
def set_geodesic_successors(g, s, t):
    """Record on every node its successors along the geodesics from s to t.

    The 'geodesic_successors' attribute of every node is first reset to an
    empty set. Then, for each shortest path from s to t, every node on the
    path (except the last) gets the node that follows it added to its set.
    """
    for node in g.nodes:
        g.nodes[node]['geodesic_successors'] = set()

    for path in nx.all_shortest_paths(g, s, t):
        # Pair each node of the path with the node that comes right after it.
        for current, following in zip(path, path[1:]):
            g.nodes[current]['geodesic_successors'].add(following)

Funções que representam uma escolha aleatória de sucessor para diferentes tipos de trajetórias.

In [5]:
# Think of the 'passages' attribute used below as the number of times
# a flow has already passed through a node or through an edge.

def random_geodesic_successor(g, n):
    """Pick at random one of the geodesic successors recorded on node n.

    Raises IndexError (from random.choice) when n has no recorded successor.
    """
    candidates = list(g.nodes[n]['geodesic_successors'])
    return choice(candidates)

def random_path_successor(g, n):
    """Pick at random a neighbor of n whose node 'passages' counter is 0.

    Raises IndexError (from random.choice) when no such neighbor exists.
    """
    unvisited = []
    for neighbor in g.neighbors(n):
        if g.nodes[neighbor]['passages'] == 0:
            unvisited.append(neighbor)
    return choice(unvisited)

def random_trail_successor(g, n):
    """Pick at random a neighbor of n reached by an edge whose 'passages' is 0.

    Raises IndexError (from random.choice) when every incident edge was used.
    """
    unused = []
    for neighbor in g.neighbors(n):
        if g.edges[n, neighbor]['passages'] == 0:
            unused.append(neighbor)
    return choice(unused)

def random_walk_successor(g, n):
    """Pick at random any neighbor of n, regardless of previous passages.

    Raises IndexError (from random.choice) when n has no neighbors.
    """
    neighbors = list(g.neighbors(n))
    return choice(neighbors)

Função que faz uma simulação de fluxo de $s$ a $t$, que pode ou não ser bem-sucedida.

In [6]:
def simulate_single_flow(g, s, t, func, dono_insumo):
    """Simulate one flow attempt from s to t, which may or may not succeed.

    The 'passages' attribute of every node and edge and the 'owner'
    attribute of every node are reset and then updated by the simulation.

    Parameters:
        g: graph whose node/edge attributes are overwritten.
        s: source node, the only initial owner of the resource.
        t: target node.
        func: trajectory type, an integer from 1 to 4:
            1 = geodesic, 2 = path, 3 = trail, 4 = walk.
            The geodesic strategy reads the 'geodesic_successors' node
            attribute, so it must have been set beforehand.
        dono_insumo: if true, a node keeps owning the resource after
            passing it on (duplication); otherwise it stops being an
            owner (transfer).

    Returns:
        The number of steps taken until t is reached, or -1 when a step
        reaches no node at all (failed attempt).

    Raises:
        ValueError: if func is not in the range 1..4.
    """
    functions = [
        random_geodesic_successor,
        random_path_successor,
        random_trail_successor,
        random_walk_successor,
    ]

    # Resolve the strategy once, outside the try block below. An invalid
    # func used to be swallowed by `except IndexError` (or silently mapped
    # to another strategy through negative indexing), which made every
    # attempt fail and the caller retry forever.
    if func not in range(1, len(functions) + 1):
        raise ValueError(
            'func must be an integer between 1 and {}, got {!r}'.format(len(functions), func)
        )
    choose_successor = functions[func - 1]

    # Initialize the 'passages' attribute of every node.
    for n in g.nodes:
        g.nodes[n]['passages'] = 0
    g.nodes[s]['passages'] = 1

    # Initialize the 'passages' attribute of every edge.
    for n, m in g.edges:
        g.edges[n, m]['passages'] = 0

    # Initialize s as the only owner of the resource.
    for n in g.nodes:
        g.nodes[n]['owner'] = False
    g.nodes[s]['owner'] = True

    # Simulate the flow, counting the total number of steps.
    steps = 0

    while True:
        # The set `reached` holds every node the flow
        # manages to reach in the current step.
        reached = set()

        # Check each of the current owners of the resource.
        owners = [n for n in g.nodes if g.nodes[n]['owner']]

        for n in owners:
            if not dono_insumo:
                # Transfer: the node stops being an owner.
                g.nodes[n]['owner'] = False

            # Randomly choose one of the successors. random.choice raises
            # IndexError on an empty candidate list, meaning this owner
            # has nowhere to send the flow.
            try:
                m = choose_successor(g, n)
            except IndexError:
                continue

            # Increment the 'passages' attribute of the node.
            g.nodes[m]['passages'] += 1

            # Increment the 'passages' attribute of the edge.
            g.edges[n, m]['passages'] += 1

            # Record that this node was reached.
            reached.add(m)

        # Every reached node becomes an owner of the resource.
        for n in reached:
            g.nodes[n]['owner'] = True

        # This concludes the current step of the simulation.
        steps += 1

        # If the step reached t, the simulation is over and it
        # succeeded: return the number of steps.
        if t in reached:
            return steps

        # If the step reached nobody, the simulation is over and
        # it did not succeed: return -1.
        if not reached:
            return -1

Função que faz simulações de fluxo de $s$ a $t$ até uma ser bem-sucedida.

In [7]:
def simulate_successful_flow(g, s, t,func,dono_insumo):
    """Repeat single-flow simulations from s to t until one succeeds.

    The geodesic successors for (s, t) are recorded once up front. Returns
    the step count of the first attempt that does not return -1.
    """
    set_geodesic_successors(g, s, t)

    # -1 is the failure marker returned by simulate_single_flow.
    steps = -1
    while steps == -1:
        steps = simulate_single_flow(g, s, t,func,dono_insumo)
    return steps

Função que faz simulações de fluxo para todo $s$ e $t$ possíveis, e tira disso um *closeness simulado* e um *betweenness simulado*.

In [8]:
def simulate_all_flows(g,func,dono_insumo):
    """Simulate a successful flow for every ordered pair (s, t) of distinct nodes.

    Stores two node attributes derived from the simulations:
        'closeness': (number of nodes - 1) divided by the total number of
            steps of the flows that started at the node.
        'betweenness': total 'passages' counted on the node over the flows
            in which it was neither source nor target, divided by
            (number of nodes - 1) * (number of nodes - 2).

    Graphs too small for these normalizations to be defined (fewer than
    three nodes for betweenness, fewer than two for closeness) get 0.0
    instead of raising ZeroDivisionError.

    Parameters:
        g: graph whose node attributes are overwritten.
        func: trajectory type forwarded to simulate_successful_flow.
        dono_insumo: diffusion flag forwarded to simulate_successful_flow.
    """
    for n in g.nodes:
        g.nodes[n]['closeness'] = 0
        g.nodes[n]['betweenness'] = 0

    for s, t in permutations(g.nodes, 2):
        steps = simulate_successful_flow(g, s, t,func,dono_insumo)

        g.nodes[s]['closeness'] += steps
        # 'passages' still holds the counters of the successful attempt.
        for n in g.nodes:
            if n != s and n != t:
                g.nodes[n]['betweenness'] += g.nodes[n]['passages']

    # Normalizations needed to compare with the analytic results.
    num_nodes = g.number_of_nodes()
    pair_count = (num_nodes - 1) * (num_nodes - 2)
    for n in g.nodes:
        total_steps = g.nodes[n]['closeness']
        # total_steps is 0 only when no flow started at n (single-node graph).
        g.nodes[n]['closeness'] = (num_nodes - 1) / total_steps if total_steps else 0.0
        if pair_count:
            g.nodes[n]['betweenness'] /= pair_count
        else:
            g.nodes[n]['betweenness'] = 0.0

Média de *closeness simulado* e *betweenness simulado* para muitas repetições da simulação acima.

In [9]:
def run(g,func,dono_insumo, times=100):
    """Average the simulated closeness and betweenness over repeated runs.

    Calls simulate_all_flows `times` times, averages the resulting
    'closeness' and 'betweenness' node attributes into 'mean_closeness'
    and 'mean_betweenness', and returns them next to the analytic values
    computed by NetworkX for the same graph.

    Parameters:
        g: graph to simulate on; its nodes must carry a 'label' attribute.
        func: trajectory type forwarded to simulate_all_flows.
        dono_insumo: diffusion flag forwarded to simulate_all_flows.
        times: number of repetitions to average over (default 100).

    Returns:
        A pandas DataFrame with one row per node and the columns 'família',
        'closeness simulado', 'closeness analítico', 'betweenness simulado'
        and 'betweenness analítico'.

    Raises:
        ValueError: if times is smaller than 1.
    """
    if times < 1:
        raise ValueError('times must be at least 1, got {!r}'.format(times))

    for n in g.nodes:
        g.nodes[n]['mean_closeness'] = 0
        g.nodes[n]['mean_betweenness'] = 0

    for _ in range(times):
        simulate_all_flows(g,func,dono_insumo)

        for n in g.nodes:
            g.nodes[n]['mean_closeness'] += g.nodes[n]['closeness']
            g.nodes[n]['mean_betweenness'] += g.nodes[n]['betweenness']

    for n in g.nodes:
        g.nodes[n]['mean_closeness'] /= times
        g.nodes[n]['mean_betweenness'] /= times

    # Compute the analytic references for the graph actually passed in,
    # instead of relying on notebook-level globals defined in another cell.
    analytic_closeness = nx.closeness_centrality(g)
    analytic_betweenness = nx.betweenness_centrality(g)

    return pd.DataFrame({
        'família': [g.nodes[n]['label'] for n in g.nodes],
        'closeness simulado': [g.nodes[n]['mean_closeness'] for n in g.nodes],
        'closeness analítico': [analytic_closeness[n] for n in g.nodes],
        'betweenness simulado': [g.nodes[n]['mean_betweenness'] for n in g.nodes],
        'betweenness analítico': [analytic_betweenness[n] for n in g.nodes],
    })

Cálculo de *closeness* e *betweenness* a partir das funções prontas da NetworkX, para comparação.

Construção de data frame só para comparar mais facilmente.

In [10]:
# Analytic closeness of every node of g, computed by NetworkX.
cc = nx.closeness_centrality(g)

# Analytic betweenness of every node of g, computed by NetworkX.
bc = nx.betweenness_centrality(g)

In [11]:
# Trajectory: geodesic (func=1). Diffusion: duplication (dono_insumo=True).
geo_duplic = run(g, func=1, dono_insumo=True)
geo_duplic.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
6,medici,0.56,0.56,1.387253,0.521978
2,albizzi,0.482759,0.482759,0.570385,0.212454
9,ridolfi,0.482759,0.482759,0.25044,0.086081
7,tornabuon,0.482759,0.482759,0.245,0.091575
3,guadagni,0.466667,0.466667,0.688571,0.260073
13,barbadori,0.4375,0.4375,0.317253,0.115385
11,strozzi,0.424242,0.424242,0.208626,0.075092
8,bischeri,0.4,0.4,0.315495,0.120879
5,salviati,0.388889,0.388889,0.406593,0.142857
14,castellan,0.388889,0.388889,0.229176,0.087912


In [12]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_geo_duplic = geo_duplic['closeness simulado'].values
bet_geo_duplic = geo_duplic['betweenness simulado'].values
geo_duplic.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.411834,0.411834,0.311568,0.115751
std,0.074293,0.074293,0.364017,0.136863
min,0.285714,0.285714,0.0,0.0
25%,0.359211,0.359211,0.027363,0.010989
50%,0.4,0.4,0.245,0.087912
75%,0.474713,0.474713,0.361923,0.131868
max,0.56,0.56,1.387253,0.521978


In [13]:
# Trajectory: geodesic (func=1). Diffusion: transfer (dono_insumo=False).
geo_transf = run(g, func=1, dono_insumo=False)
geo_transf.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
6,medici,0.56,0.56,0.521099,0.521978
2,albizzi,0.482759,0.482759,0.214066,0.212454
9,ridolfi,0.482759,0.482759,0.089011,0.086081
7,tornabuon,0.482759,0.482759,0.086868,0.091575
3,guadagni,0.466667,0.466667,0.257802,0.260073
13,barbadori,0.4375,0.4375,0.11533,0.115385
11,strozzi,0.424242,0.424242,0.078516,0.075092
8,bischeri,0.4,0.4,0.120769,0.120879
5,salviati,0.388889,0.388889,0.142857,0.142857
14,castellan,0.388889,0.388889,0.088242,0.087912


In [14]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_geo_transf = geo_transf['closeness simulado'].values
bet_geo_transf = geo_transf['betweenness simulado'].values
geo_transf.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.411834,0.411834,0.115751,0.115751
std,0.074293,0.074293,0.13655,0.136863
min,0.285714,0.285714,0.0,0.0
25%,0.359211,0.359211,0.010852,0.010989
50%,0.4,0.4,0.088242,0.087912
75%,0.474713,0.474713,0.131813,0.131868
max,0.56,0.56,0.521099,0.521978


In [15]:
# Trajectory: path (func=2). Diffusion: duplication (dono_insumo=True).
path_duplic = run(g, func=2, dono_insumo=True)
path_duplic.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
7,tornabuon,0.279168,0.482759,0.683736,0.091575
9,ridolfi,0.273536,0.482759,0.700769,0.086081
13,barbadori,0.267609,0.4375,0.58467,0.115385
11,strozzi,0.267123,0.424242,0.681209,0.075092
6,medici,0.265452,0.56,0.818956,0.521978
8,bischeri,0.264627,0.4,0.649505,0.120879
3,guadagni,0.26149,0.466667,0.737802,0.260073
14,castellan,0.258051,0.388889,0.619341,0.087912
2,albizzi,0.258036,0.482759,0.615165,0.212454
12,peruzzi,0.253468,0.35,0.525385,0.021978


In [16]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_path_duplic = path_duplic['closeness simulado'].values
bet_path_duplic = path_duplic['betweenness simulado'].values
path_duplic.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.252091,0.411834,0.565527,0.115751
std,0.020721,0.074293,0.159692,0.136863
min,0.206153,0.285714,0.258681,0.0
25%,0.235287,0.359211,0.417912,0.010989
50%,0.258051,0.4,0.615165,0.087912
75%,0.266288,0.474713,0.682473,0.131868
max,0.279168,0.56,0.818956,0.521978


In [17]:
# Trajectory: path (func=2). Diffusion: transfer (dono_insumo=False).
path_transf = run(g, func=2, dono_insumo=False)
path_transf.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
6,medici,0.302426,0.56,0.624396,0.521978
3,guadagni,0.267903,0.466667,0.403132,0.260073
7,tornabuon,0.260286,0.482759,0.331703,0.091575
9,ridolfi,0.258068,0.482759,0.36456,0.086081
14,castellan,0.252347,0.388889,0.382363,0.087912
2,albizzi,0.249923,0.482759,0.22978,0.212454
10,acciaiuol,0.246359,0.368421,0.0,0.0
5,salviati,0.246353,0.388889,0.142857,0.142857
8,bischeri,0.24519,0.4,0.363516,0.120879
11,strozzi,0.240364,0.424242,0.37033,0.075092


In [18]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_path_transf = path_transf['closeness simulado'].values
bet_path_transf = path_transf['betweenness simulado'].values
path_transf.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.243911,0.411834,0.246495,0.115751
std,0.025054,0.074293,0.18711,0.136863
min,0.201411,0.285714,0.0,0.0
25%,0.232003,0.359211,0.071429,0.010989
50%,0.246353,0.4,0.245165,0.087912
75%,0.255207,0.474713,0.367445,0.131868
max,0.302426,0.56,0.624396,0.521978


In [19]:
# Trajectory: trail (func=3). Diffusion: duplication (dono_insumo=True).
trail_duplic = run(g, func=3, dono_insumo=True)
trail_duplic.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
7,tornabuon,0.267149,0.482759,0.93544,0.091575
9,ridolfi,0.260425,0.482759,0.928736,0.086081
13,barbadori,0.258624,0.4375,0.603571,0.115385
6,medici,0.255772,0.56,1.256264,0.521978
11,strozzi,0.252356,0.424242,0.893681,0.075092
3,guadagni,0.251015,0.466667,0.897253,0.260073
8,bischeri,0.246552,0.4,0.784011,0.120879
2,albizzi,0.245531,0.482759,0.686648,0.212454
14,castellan,0.242183,0.388889,0.833571,0.087912
12,peruzzi,0.234962,0.35,0.59544,0.021978


In [20]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_trail_duplic = trail_duplic['closeness simulado'].values
bet_trail_duplic = trail_duplic['betweenness simulado'].values
trail_duplic.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.240509,0.411834,0.683264,0.115751
std,0.018805,0.074293,0.280829,0.136863
min,0.199173,0.285714,0.266429,0.0
25%,0.227321,0.359211,0.411566,0.010989
50%,0.245531,0.4,0.686648,0.087912
75%,0.254064,0.474713,0.895467,0.131868
max,0.267149,0.56,1.256264,0.521978


In [21]:
# Trajectory: trail (func=3). Diffusion: transfer (dono_insumo=False).
trail_transf = run(g, func=3, dono_insumo=False)
trail_transf.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
6,medici,0.290668,0.56,0.703956,0.521978
13,barbadori,0.266158,0.4375,0.273132,0.115385
2,albizzi,0.26371,0.482759,0.267033,0.212454
3,guadagni,0.263653,0.466667,0.403681,0.260073
7,tornabuon,0.252525,0.482759,0.313516,0.091575
12,peruzzi,0.245682,0.35,0.228626,0.021978
9,ridolfi,0.242556,0.482759,0.333846,0.086081
5,salviati,0.238348,0.388889,0.142857,0.142857
11,strozzi,0.228907,0.424242,0.326593,0.075092
14,castellan,0.227304,0.388889,0.376813,0.087912


In [22]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_trail_transf = trail_transf['closeness simulado'].values
bet_trail_transf = trail_transf['betweenness simulado'].values
trail_transf.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.239765,0.411834,0.247319,0.115751
std,0.024378,0.074293,0.195194,0.136863
min,0.201091,0.285714,0.0,0.0
25%,0.224241,0.359211,0.071429,0.010989
50%,0.238348,0.4,0.273132,0.087912
75%,0.258089,0.474713,0.336786,0.131868
max,0.290668,0.56,0.703956,0.521978


In [23]:
# Trajectory: walk (func=4). Diffusion: duplication (dono_insumo=True).
walk_duplic = run(g, func=4, dono_insumo=True)
walk_duplic.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
7,tornabuon,0.172738,0.482759,2.446868,0.091575
6,medici,0.16945,0.56,7.03544,0.521978
13,barbadori,0.16665,0.4375,1.501868,0.115385
9,ridolfi,0.162708,0.482759,2.550165,0.086081
11,strozzi,0.160872,0.424242,2.759176,0.075092
2,albizzi,0.15576,0.482759,3.129451,0.212454
8,bischeri,0.155603,0.4,2.821758,0.120879
3,guadagni,0.155124,0.466667,4.564505,0.260073
10,acciaiuol,0.153805,0.368421,0.589011,0.0
14,castellan,0.149839,0.388889,3.157308,0.087912


In [24]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_walk_duplic = walk_duplic['closeness simulado'].values
bet_walk_duplic = walk_duplic['betweenness simulado'].values
walk_duplic.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.153336,0.411834,2.426832,0.115751
std,0.01243,0.074293,1.70047,0.136863
min,0.127608,0.285714,0.589011,0.0
25%,0.14499,0.359211,1.156593,0.010989
50%,0.155124,0.4,2.446868,0.087912
75%,0.16179,0.474713,2.975604,0.131868
max,0.172738,0.56,7.03544,0.521978


In [25]:
# Trajectory: walk (func=4). Diffusion: transfer (dono_insumo=False).
walk_transf = run(g, func=4, dono_insumo=False)
walk_transf.sort_values(by=['closeness simulado', 'betweenness simulado'], ascending=False)

Unnamed: 0,família,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
4,pazzi,0.040933,0.285714,0.714231,0.0
0,ginori,0.039129,0.333333,0.744505,0.0
2,albizzi,0.037717,0.482759,2.459615,0.212454
5,salviati,0.037633,0.388889,1.552637,0.142857
10,acciaiuol,0.037281,0.368421,0.772967,0.0
1,lambertes,0.03715,0.325581,0.742747,0.0
3,guadagni,0.036636,0.466667,3.313846,0.260073
14,castellan,0.036439,0.388889,2.413626,0.087912
12,peruzzi,0.036182,0.35,1.573187,0.021978
7,tornabuon,0.036012,0.482759,2.453791,0.091575


In [26]:
# Select the simulated columns by label rather than by position, so the
# extraction does not depend on the column order of the frame.
close_walk_transf = walk_transf['closeness simulado'].values
bet_walk_transf = walk_transf['betweenness simulado'].values
walk_transf.describe()

Unnamed: 0,closeness simulado,closeness analítico,betweenness simulado,betweenness analítico
count,15.0,15.0,15.0,15.0
mean,0.036694,0.411834,2.043919,0.115751
std,0.00177,0.074293,1.157575,0.136863
min,0.033617,0.285714,0.714231,0.0
25%,0.035839,0.359211,1.162802,0.010989
50%,0.036439,0.4,2.413626,0.087912
75%,0.037457,0.474713,2.449945,0.131868
max,0.040933,0.56,5.007143,0.521978


E agora, vamos pensar um pouco...

* Onde você precisa mudar o código para usar uma *trajetória* que não seja a *geodésica*? (caminho, trilha, passeio)

* Onde você precisa mudar o código para usar uma *difusão* que não seja a *transferência*? (duplicação)

Considere então a seguinte **hipótese**:

>Quando consideramos outros tipos de trajetória e outros tipos de difusão, os nós com maior *closeness simulado* e *betweenness simulado* não são necessariamente os nós com maior *closeness* e *betweenness* segundo as fórmulas clássicas. (que correspondem ao uso de geodésica e transferência na simulação)

Queremos:

1. Operacionalização e teste dessa hipótese. (Objetivo 3)
2. Interpretação dos resultados na linguagem de Análise de Redes Sociais. (Objetivo 4)

Um *feedback* da atividade sobre *coreness no Jazz* será dado em breve, para vocês terem uma melhor referência do item 2.

In [27]:
# SciPy's stats module provides the paired t-test used in the next cell.
from scipy import stats
# Render floats in DataFrame output with 10 decimal places.
pd.options.display.float_format = '{:.10f}'.format

In [32]:
# Every simulated scenario is compared, node by node, against the
# geodesic + transfer baseline with a paired t-test (scipy.stats.ttest_rel).
# The three lists below are aligned by position.
types_array = [
    "geo_duplic",
    "path_duplic", "path_transf",
    "trail_duplic", "trail_transf",
    "walk_duplic", "walk_transf",
]

closeness_array = [
    close_geo_duplic,
    close_path_duplic, close_path_transf,
    close_trail_duplic, close_trail_transf,
    close_walk_duplic, close_walk_transf,
]

betweenness_array = [
    bet_geo_duplic,
    bet_path_duplic, bet_path_transf,
    bet_trail_duplic, bet_trail_transf,
    bet_walk_duplic, bet_walk_transf,
]

# Each result is indexed as (t statistic, p-value).
closeness_tests = [stats.ttest_rel(sample, close_geo_transf) for sample in closeness_array]
closeness_tstatistic = [result[0] for result in closeness_tests]
closeness_pvalue = [result[1] for result in closeness_tests]

betweenness_tests = [stats.ttest_rel(sample, bet_geo_transf) for sample in betweenness_array]
betweenness_tstatistic = [result[0] for result in betweenness_tests]
betweenness_pvalue = [result[1] for result in betweenness_tests]

t_tests_df = pd.DataFrame({
    'type': list(types_array),
    'closeness tstatistic': list(closeness_tstatistic),
    'closeness pvalue': list(closeness_pvalue),
    'betweenness tstatistic': list(betweenness_tstatistic),
    'betweenness pvalue': list(betweenness_pvalue),
})
t_tests_df

Unnamed: 0,type,closeness tstatistic,closeness pvalue,betweenness tstatistic,betweenness pvalue
0,geo_duplic,,,3.3331679399,0.0049262132
1,path_duplic,-10.4460791991,5.43e-08,15.1538022575,4e-10
2,path_transf,-12.363034041,6.4e-09,4.159592117,0.0009633944
3,trail_duplic,-11.1584539456,2.36e-08,10.792092301,3.6e-08
4,trail_transf,-12.3045162747,6.8e-09,4.6287886814,0.0003904952
5,walk_duplic,-15.6981588015,3e-10,5.6895793785,5.58738e-05
6,walk_transf,-19.302556981,0.0,7.214701294,4.4617e-06


In [33]:
# Rank the scenarios by closeness p-value, then betweenness p-value, largest first.
t_tests_df.sort_values(by=['closeness pvalue','betweenness pvalue'], ascending=False)

Unnamed: 0,type,closeness tstatistic,closeness pvalue,betweenness tstatistic,betweenness pvalue
1,path_duplic,-10.4460791991,5.43e-08,15.1538022575,4e-10
3,trail_duplic,-11.1584539456,2.36e-08,10.792092301,3.6e-08
4,trail_transf,-12.3045162747,6.8e-09,4.6287886814,0.0003904952
2,path_transf,-12.363034041,6.4e-09,4.159592117,0.0009633944
5,walk_duplic,-15.6981588015,3e-10,5.6895793785,5.58738e-05
6,walk_transf,-19.302556981,0.0,7.214701294,4.4617e-06
0,geo_duplic,,,3.3331679399,0.0049262132


In [34]:
# Summary statistics of the numeric columns (t statistics and p-values).
t_tests_df.describe()

Unnamed: 0,closeness tstatistic,closeness pvalue,betweenness tstatistic,betweenness pvalue
count,6.0,6.0,7.0,7.0
mean,-13.5454665405,1.52e-08,7.2816748528,0.0009057821
std,3.3472469138,2.1e-08,4.2663135235,0.0018077723
min,-19.302556981,0.0,3.3331679399,4e-10
25%,-14.8643776114,1.8e-09,4.3941903992,2.2489e-06
50%,-12.3337751579,6.6e-09,5.6895793785,5.58738e-05
75%,-11.4449695279,1.94e-08,9.0033967975,0.0006769448
max,-10.4460791991,5.43e-08,15.1538022575,0.0049262132
