In [None]:
import ast
import logging
import os
import warnings
from datetime import datetime

import altair as alt
import matplotlib
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
from tqdm.notebook import tqdm,trange
from vega_datasets import data
# Notebook-wide noise control: ignore every Python warning and disable the
# 'matplotlib.font_manager' logger.
warnings.filterwarnings("ignore") 
logging.getLogger('matplotlib.font_manager').disabled = True

# read in email

In [None]:
def nameToIndexDict(l_unique_names):
    """Build forward and reverse lookup tables for a sequence of names.

    Every name is paired with its position in ``l_unique_names``.

    Args:
        l_unique_names: iterable of hashable names.

    Returns:
        tuple: ``(name2id, id2name)`` where ``name2id`` maps a name to its
        position and ``id2name`` maps a position back to the name. If a name
        occurs more than once, ``name2id`` keeps its last position.
    """
    name2id = {name: position for position, name in enumerate(l_unique_names)}
    id2name = dict(enumerate(l_unique_names))
    return name2id, id2name

In [None]:
def standardize_triplet(l, to_type=tuple):
    """Convert every triplet held in ``l`` to ``to_type``, modifying ``l`` in place.

    Two layouts are supported:

    * flat   -- ``l`` is a list of triplets; each ``l[i]`` is replaced by
      ``to_type(l[i])``.
    * nested -- ``l`` is a list of lists of triplets; each ``l[i][j]`` is
      replaced by ``to_type(l[i][j])``. The inner lists themselves are mutated,
      so any other reference to them sees the converted triplets too.

    The layout is taken from the first *non-empty* entry of ``l`` (flat when
    its first element is a string). Looking only at ``l[0]`` would leave the
    whole list unconverted whenever the first entry happens to be empty.

    Args:
        l: list in one of the two layouts above.
        to_type: callable applied to each triplet (default ``tuple``).

    Returns:
        The same list object ``l``. Conversion is best effort: if an entry
        cannot be processed, conversion stops and ``l`` is returned as it is
        at that point.
    """
    try:
        probe = next((entry for entry in l if len(entry) > 0), None)
        if probe is None:
            # Nothing but empty entries (or an empty list): nothing to convert.
            return l
        if isinstance(probe[0], str):
            for i in range(len(l)):
                l[i] = to_type(l[i])
        else:
            for group in l:
                for j in range(len(group)):
                    group[j] = to_type(group[j])
    except Exception:
        # Deliberately tolerant (e.g. immutable inner containers or malformed
        # entries), but no longer traps KeyboardInterrupt/SystemExit.
        pass
    return l

In [None]:
# Load the e-mail table; the first CSV column is used as the index.
df_email = pd.read_csv('output7_new_sentiment.csv',index_col = 0)

# The 'From', 'To' and 'CC' cells are text that has to be turned back into
# Python objects. ast.literal_eval accepts literals only, so nothing stored in
# the CSV can be executed as code (which eval would do).
from_list = [ast.literal_eval(cell) for cell in df_email['From'].values.tolist()]
df_email['From'] = from_list

to_list = [ast.literal_eval(cell) for cell in df_email['To'].values.tolist()]
df_email['To'] = to_list

cc_list = [ast.literal_eval(cell) for cell in df_email['CC'].values.tolist()]
df_email['CC'] = cc_list

df_email


# Scott related emails

In [None]:
# Identity triplets used to select e-mails: the next cell keeps an e-mail when
# its sender, or any of its To/CC entries, is equal to one of these tuples.
total_list = [('scoi', 'maddox', 'scoi.maddox@talgov.com'),
              ('scott', 'maddox', 'maddox@maddoxhorne.com'),
              ('scott', 'mattox', 'scott@govinc.net'),
              ('scott', 'maddox', 'scottcharlesmaddox@gmail.com'),
              ('scott', 'maddox', 'scott@scottmaddox.com'),
              ('scott', 'maddox', 'shamaddox@embarqmail.com'),
              ('scott', 'maddox', 'scott.maddox@talgov.com'),
              ('scof', 'maddox', 'scof.maddox@talgov.com'),
              ('scos', 'maddox', 'scos.maddox@talgov.com'),
              ('sha', 'maddox', 'sha_maddox'),]

In [None]:
# find person related emails
# An e-mail is kept when its sender or any To/CC entry equals one of the
# triplets in total_list. Each e-mail index is recorded at most once, even if
# several identities or several fields match, so the selection built from
# keep_idx contains no duplicated rows.
keep_idx = []
for i in range(len(from_list)):
    participants = [from_list[i], *to_list[i], *cc_list[i]]
    if any(person in total_list for person in participants):
        keep_idx.append(i)
len(keep_idx)

In [None]:
# Take the selected rows and renumber them from zero.
df_related_emails = df_email.iloc[keep_idx].reset_index(drop=True)
df_related_emails

In [None]:
# Address columns of the selected e-mails as plain lists, with every triplet
# passed through standardize_triplet (which returns the list it was given).
from_list = standardize_triplet(df_related_emails['From'].values.tolist())
to_list = standardize_triplet(df_related_emails['To'].values.tolist())
cc_list = standardize_triplet(df_related_emails['CC'].values.tolist())

In [None]:
# Per-e-mail recipient counts: To entries, CC entries, and both combined.
num_TO = [len(to_list[i]) for i in range(len(to_list))]
num_CC = [len(cc_list[i]) for i in range(len(to_list))]
num_TO_CC = [n_to + n_cc for n_to, n_cc in zip(num_TO, num_CC)]
df_related_emails['num_TO'] = num_TO
df_related_emails['num_CC'] = num_CC
df_related_emails['num_TO_CC'] = num_TO_CC

In [None]:
# Every distinct sender, CC entry and To entry becomes one node of the network;
# the sorted list fixes the node numbering used by name2id / id2name.
unique_people = set(from_list)
for recipients in cc_list:
    unique_people.update(recipients)
for recipients in to_list:
    unique_people.update(recipients)
unique_people = sorted(unique_people)
name2id, id2name = nameToIndexDict(unique_people)

len(unique_people)


In [None]:
# Parse the 'Sent' column as timestamps and split out calendar month and year.
date_email = pd.to_datetime(df_related_emails['Sent'])
month, year = date_email.dt.month, date_email.dt.year

In [None]:
# Build one boolean row mask per calendar month (2012.1 .. 2017.12) together
# with its tick label and tick colour.
idxes = []
time_labels = []
tickcolors = ['green', 'blue', 'red', 'orange']
color = tickcolors[0]
time_labels_for_tickcolor = {}
# A label listed here is the last one drawn in the current colour; the months
# after it use the colour given as value.
color_after_label = {'2014.1': tickcolors[1],
                     '2014.5': tickcolors[2],
                     '2016.2': tickcolors[3]}
skip = True # leading empty months are dropped; empty months in the middle are kept
for y in range(2012,2018):
    for m in range(1,13):
        idx = (year == y) * (month == m)
        if skip and idx.sum() == 0:
            continue
        skip = False

        idxes.append(np.array(idx))
        label = str(y)+'.'+str(m)
        time_labels.append(label)
        time_labels_for_tickcolor[label] = color
        color = color_after_label.get(label, color)

# Remove the empty months at the end. The count is checked before slicing:
# with zero trailing empty months, `[:-0]` would throw away every month.
n_trailing_empty = 0
for idx in reversed(idxes):
    if idx.sum() != 0:
        break
    n_trailing_empty += 1
if n_trailing_empty:
    idxes = idxes[:-n_trailing_empty]
    time_labels = time_labels[:-n_trailing_empty]

In [None]:
# 
# Monthly network statistics for the selected e-mails. Row k of `centrality`
# describes month idxes[k] (rows of months without e-mails stay zero):
#   columns 0-3    in-degree centrality   (min, mean, max, std over all nodes)
#   columns 4-7    out-degree centrality
#   columns 8-11   degree centrality
#   columns 12-15  betweenness centrality
#   columns 16-19  closeness centrality
#   column 20      number of edges
#   column 21      sum of the edge weights
#   column 22      number of nodes with both incoming and outgoing weight
centrality = np.zeros((len(idxes),23))
email_adj_mat = None

# Position p in this list fills columns 4*p .. 4*p+3.
centrality_measures = [
    nx.centrality.in_degree_centrality,
    nx.centrality.out_degree_centrality,
    nx.centrality.degree_centrality,
    nx.centrality.betweenness_centrality,
    nx.centrality.closeness_centrality,
]

for k in trange(len(idxes)):

    # build adjmat
    df_temp = df_related_emails.iloc[idxes[k]]
    num_email = len(df_temp)
    if num_email == 0:
        continue
    email_adj_mat = np.zeros((len(unique_people),len(unique_people)))

    # Month-local names, so the notebook-level from_list / to_list / cc_list /
    # num_TO_CC are not overwritten by the last processed month.
    month_from = df_temp['From'].values.tolist()
    month_to = df_temp['To'].values.tolist()
    month_cc = df_temp['CC'].values.tolist()
    month_num_to_cc = df_temp['num_TO_CC'].values.tolist()

    for i in range(len(month_from)):
        s = name2id[month_from[i]] # id of the sender

        # To entries first, then CC entries; each one adds an equal share of the e-mail.
        for recipient in list(month_to[i]) + list(month_cc[i]):
            if recipient != ('', '', '') and recipient != ['', '', '']:
                r = name2id[recipient]  # id of the receiver
                email_adj_mat[s,r] += 1/month_num_to_cc[i]

    # normalize so that the heaviest edge has weight 1. A month whose recipients
    # are all empty placeholders has no edges; dividing by its zero maximum
    # would turn the whole matrix into NaN, so it is left as all zeros.
    peak = email_adj_mat.max()
    if peak > 0:
        email_adj_mat = email_adj_mat/peak

    # build graph from numpy array email_adj_mat
    G = nx.from_numpy_array(email_adj_mat, create_using=nx.DiGraph)

    for p, measure in enumerate(centrality_measures):
        values = np.array(list(measure(G).values()))
        centrality[k, 4*p:4*p+4] = [values.min(), values.mean(), values.max(), values.std()]

    # https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.size.html
    centrality[k,20] = G.size()  # without a weight this is the number of edges
    centrality[k,21] = G.size(weight = 'weight')

    # number of active nodes: non-zero column sum (received) and non-zero row sum (sent)
    centrality[k,22] = ((email_adj_mat.sum(axis=0) != 0) * (email_adj_mat.sum(axis=1) != 0)).sum()

    email_adj_mat = None # drop the dense matrix before the next iteration to limit memory use


In [None]:
# Persist the per-month statistics table (np.save adds the '.npy' extension).
np.save('scott_email_centrality', centrality)

## plot
### betweenness

In [None]:
# Betweenness centrality per month: mean as bars, standard deviation, edge
# count and active-node count as lines, each on its own y scale.
fig, ax1 = plt.subplots(figsize=(15, 4))
sides = ('right', 'top', 'left', 'bottom')
positions = np.arange(len(idxes))

# bars: monthly mean (column 13)
x = centrality[:, 13]
for side in sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('betweenness centrality and network size', fontsize=24)
ax1.bar(positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
ax1.set_xticks(positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax1.set_yticks(np.linspace(low, high, 7))
ax1.set_ylim(low - pad, high + pad)
# ax1 is still the current axes here, so this colours the month labels
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# dashed line: monthly standard deviation (column 15); its spine and label are
# shifted to the left of the plot area
x = centrality[:, 15]
ax2 = ax1.twinx()
for side in sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax2.set_yticks(np.linspace(low, high, 7))
ax2.set_ylim(low - pad, high + pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# grey line: number of edges (column 20)
x = centrality[:, 20]
ax3 = ax1.twinx()
for side in sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='#edge', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('#edge', color='gray', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax3.set_yticks(np.linspace(low, high, 7))
ax3.set_ylim(low - pad, high + pad)
ax3.legend(loc=1)

# black line: number of active nodes (column 22); spine moved outside on the right
x = centrality[:, 22]
ax4 = ax1.twinx()
for side in sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='#active nodes', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('#active nodes', color='black', fontsize=20, x=1.2, y=0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax4.set_yticks(np.linspace(low, high, 7))
ax4.set_ylim(low - pad, high + pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

### closeness centrality

In [None]:
# Closeness centrality per month: mean as bars, standard deviation, edge count
# and active-node count as lines, each on its own y scale.
fig, ax1 = plt.subplots(figsize=(15, 4))
sides = ('right', 'top', 'left', 'bottom')
positions = np.arange(len(idxes))

# bars: monthly mean (column 17)
x = centrality[:, 17]
for side in sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('closeness centrality and network size', fontsize=24)
ax1.bar(positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax1.set_yticks(np.linspace(low, high, 7))
ax1.set_xticks(positions)
ax1.set_ylim(low - pad, high + pad)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
# ax1 is still the current axes here, so this colours the month labels
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# dashed line: monthly standard deviation (column 19); its spine and label are
# shifted to the left of the plot area
x = centrality[:, 19]
ax2 = ax1.twinx()
for side in sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.18))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.22, y=0.5)
ax2.yaxis.set_label_coords(-0.2, 0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax2.set_yticks(np.linspace(low, high, 7))
ax2.set_ylim(low - pad, high + pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# grey line: number of edges (column 20)
x = centrality[:, 20]
ax3 = ax1.twinx()
for side in sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='#edge', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('#edge', color='gray', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax3.set_yticks(np.linspace(low, high, 7))
ax3.set_ylim(low - pad, high + pad)
ax3.legend(loc=1)

# black line: number of active nodes (column 22); spine moved outside on the right
x = centrality[:, 22]
ax4 = ax1.twinx()
for side in sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='#active nodes', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('#active nodes', color='black', fontsize=20, x=1.2, y=0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax4.set_yticks(np.linspace(low, high, 7))
ax4.set_ylim(low - pad, high + pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

### degree

In [None]:
# Degree centrality per month: mean as bars, standard deviation, edge count and
# active-node count as lines, each on its own y scale.
fig, ax1 = plt.subplots(figsize=(15, 4))
sides = ('right', 'top', 'left', 'bottom')
positions = np.arange(len(idxes))

# bars: monthly mean (column 9)
x = centrality[:, 9]
for side in sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('degree centrality and network size', fontsize=24)
ax1.bar(positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax1.set_yticks(np.linspace(low, high, 7))
ax1.set_ylim(low - pad, high + pad)
ax1.set_xticks(positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
# ax1 is still the current axes here, so this colours the month labels
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# dashed line: monthly standard deviation (column 11); its spine and label are
# shifted to the left of the plot area
x = centrality[:, 11]
ax2 = ax1.twinx()
for side in sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.16))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.16, y=0.5)
ax2.yaxis.set_label_coords(-0.2, 0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax2.set_yticks(np.linspace(low, high, 7))
ax2.set_ylim(low - pad, high + pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# grey line: number of edges (column 20)
x = centrality[:, 20]
ax3 = ax1.twinx()
for side in sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='#edge', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('#edge', color='gray', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax3.set_yticks(np.linspace(low, high, 7))
ax3.set_ylim(low - pad, high + pad)
ax3.legend(loc=1)

# black line: number of active nodes (column 22); spine moved outside on the right
x = centrality[:, 22]
ax4 = ax1.twinx()
for side in sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='#active nodes', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('#active nodes', color='black', fontsize=20, x=1.2, y=0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax4.set_yticks(np.linspace(low, high, 7))
ax4.set_ylim(low - pad, high + pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

# all emails

In [None]:
# Per-e-mail recipient counts for the full table.
# The counts are read from df_email's own columns: the notebook-level to_list /
# cc_list are reassigned inside the monthly loop of the previous section and no
# longer line up with the rows of df_email.
num_TO = [len(recipients) for recipients in df_email['To'].values.tolist()]
num_CC = [len(recipients) for recipients in df_email['CC'].values.tolist()]
num_TO_CC = [n_to + n_cc for n_to, n_cc in zip(num_TO, num_CC)]
df_email['num_TO'] = num_TO
df_email['num_CC'] = num_CC
df_email['num_TO_CC'] = num_TO_CC

In [None]:
# People index for the full table.
# The address lists are taken afresh from df_email (the notebook-level lists
# are reassigned inside the earlier monthly loop) and normalised the same way
# as for the selected e-mails, so that every triplet can be put in a set.
from_list = standardize_triplet(df_email['From'].values.tolist())
to_list = standardize_triplet(df_email['To'].values.tolist())
cc_list = standardize_triplet(df_email['CC'].values.tolist())

unique_people = set(from_list)
for recipients in cc_list:
    unique_people.update(recipients)
for recipients in to_list:
    unique_people.update(recipients)
# the sorted order fixes the node numbering used by name2id / id2name
unique_people = sorted(unique_people)
name2id, id2name = nameToIndexDict(unique_people)

len(unique_people)


## by month

In [None]:
# Parse the 'Sent' column as timestamps and split out calendar month and year.
date_email = pd.to_datetime(df_email['Sent'])
month, year = date_email.dt.month, date_email.dt.year

In [None]:
# Build one boolean row mask per calendar month (2012.1 .. 2017.12) together
# with its tick label and tick colour.
idxes = []
time_labels = []
tickcolors = ['green', 'blue', 'red', 'orange']
color = tickcolors[0]
time_labels_for_tickcolor = {}
# A label listed here is the last one drawn in the current colour; the months
# after it use the colour given as value.
color_after_label = {'2014.1': tickcolors[1],
                     '2014.5': tickcolors[2],
                     '2016.2': tickcolors[3]}
skip = True # leading empty months are dropped; empty months in the middle are kept
for y in range(2012,2018):
    for m in range(1,13):
        idx = (year == y) * (month == m)
        if skip and idx.sum() == 0:
            continue
        skip = False

        idxes.append(np.array(idx))
        label = str(y)+'.'+str(m)
        time_labels.append(label)
        time_labels_for_tickcolor[label] = color
        color = color_after_label.get(label, color)

# Remove the empty months at the end. The count is checked before slicing:
# with zero trailing empty months, `[:-0]` would throw away every month.
n_trailing_empty = 0
for idx in reversed(idxes):
    if idx.sum() != 0:
        break
    n_trailing_empty += 1
if n_trailing_empty:
    idxes = idxes[:-n_trailing_empty]
    time_labels = time_labels[:-n_trailing_empty]

In [None]:
# calculation part 1
# Monthly network statistics over all e-mails. Row k of `centrality` describes
# month idxes[k] (rows of months without e-mails stay zero):
#   columns 0-3    in-degree centrality   (min, mean, max, std over all nodes)
#   columns 4-7    out-degree centrality
#   columns 8-11   degree centrality
#   columns 12-15  betweenness centrality
#   columns 16-19  closeness centrality
#   column 20      number of edges
#   column 21      sum of the edge weights
#   column 22      number of nodes with both incoming and outgoing weight
#   column 23      nx.density of the graph
#   column 24      column 22 divided by column 20 (stays 0 when there are no edges)
# The normalised adjacency matrix and each per-node centrality vector are also
# written to disk, one .npz file per month.
centrality = np.zeros((len(idxes),25))
email_adj_mat = None

# (file-name stem, centrality function); position p fills columns 4*p .. 4*p+3
centrality_measures = [
    ('in_degree', nx.centrality.in_degree_centrality),
    ('out_degree', nx.centrality.out_degree_centrality),
    ('degree', nx.centrality.degree_centrality),
    ('betweenness_centrality', nx.centrality.betweenness_centrality),
    ('closeness_centrality', nx.centrality.closeness_centrality),
]

# Create the output directories up front so that a missing directory does not
# abort the run at the first save.
os.makedirs('./adjmat_by_month', exist_ok=True)
os.makedirs('./centrality_by_month', exist_ok=True)

for k in trange(len(idxes)):

    # build adjmat
    df_temp = df_email.iloc[idxes[k]]
    num_email = len(df_temp)
    if num_email == 0:
        continue
    email_adj_mat = np.zeros((len(unique_people),len(unique_people)))

    # Month-local names, so the notebook-level from_list / to_list / cc_list /
    # num_TO_CC are not overwritten by the last processed month.
    month_from = df_temp['From'].values.tolist()
    month_to = df_temp['To'].values.tolist()
    month_cc = df_temp['CC'].values.tolist()
    month_num_to_cc = df_temp['num_TO_CC'].values.tolist()

    for i in range(len(month_from)):
        s = name2id[month_from[i]] # id of the sender

        # To entries first, then CC entries; each one adds an equal share of the e-mail.
        for recipient in list(month_to[i]) + list(month_cc[i]):
            if recipient != ('', '', '') and recipient != ['', '', '']:
                r = name2id[recipient]  # id of the receiver
                email_adj_mat[s,r] += 1/month_num_to_cc[i]

    # normalize so that the heaviest edge has weight 1. A month whose recipients
    # are all empty placeholders has no edges; dividing by its zero maximum
    # would turn the whole matrix into NaN, so it is left as all zeros.
    peak = email_adj_mat.max()
    if peak > 0:
        email_adj_mat = email_adj_mat/peak
    email_adj_mat_tosave = scipy.sparse.csc_matrix(email_adj_mat)
    scipy.sparse.save_npz('./adjmat_by_month/adjmat_'+str(k)+'.npz', email_adj_mat_tosave)
    email_adj_mat_tosave = None

    # build graph from numpy array email_adj_mat
    G = nx.from_numpy_array(email_adj_mat, create_using=nx.DiGraph)

    # https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.size.html
    centrality[k,20] = G.size()  # without a weight this is the number of edges
    centrality[k,21] = G.size(weight = 'weight')
    # number of active nodes: non-zero column sum (received) and non-zero row sum (sent)
    centrality[k,22] = ((email_adj_mat.sum(axis=0) != 0) * (email_adj_mat.sum(axis=1) != 0)).sum()
    centrality[k,23] = nx.density(G) # the number of nodes is constant in our case, density of active nodes maybe better
    email_adj_mat = None # drop the dense matrix before the centrality computations to limit memory use
    if centrality[k,20] != 0:
        centrality[k,24] = centrality[k,22]/centrality[k,20]

    for p, (stem, measure) in enumerate(centrality_measures):
        values = np.array(list(measure(G).values()))
        centrality[k, 4*p:4*p+4] = [values.min(), values.mean(), values.max(), values.std()]
        centrality_tosave = scipy.sparse.csc_matrix(values)
        scipy.sparse.save_npz('./centrality_by_month/'+stem+'_'+str(k)+'.npz', centrality_tosave)
        centrality_tosave = None
        values = None


np.save('all_email_centrality', centrality)

In [None]:
# Reload the statistics table saved as 'all_email_centrality' by part 1 and
# display its dimensions.
centrality = np.load('all_email_centrality.npy')
centrality.shape

In [None]:
# calculation part 2
# Same per-month statistics as part 1, collected in `weighted_centrality`
# (same column layout: 0-19 centrality min/mean/max/std in the order in-degree,
# out-degree, degree, betweenness, closeness; 20 edges; 21 total edge weight;
# 22 active nodes; 23 density; 24 active nodes / edges).
# Every result is written to `weighted_centrality`; the `centrality` table
# loaded above is not modified by this cell.
# NOTE(review): the centrality functions are called without a weight argument,
# exactly as in part 1, and the per-month .npz files use the same paths as
# part 1 -- confirm whether edge weights and separate file names were intended.
weighted_centrality = np.zeros((len(idxes),25))
email_adj_mat = None

# (file-name stem, centrality function); position p fills columns 4*p .. 4*p+3
centrality_measures = [
    ('in_degree', nx.centrality.in_degree_centrality),
    ('out_degree', nx.centrality.out_degree_centrality),
    ('degree', nx.centrality.degree_centrality),
    ('betweenness_centrality', nx.centrality.betweenness_centrality),
    ('closeness_centrality', nx.centrality.closeness_centrality),
]

# Create the output directories up front so that a missing directory does not
# abort the run at the first save.
os.makedirs('./adjmat_by_month', exist_ok=True)
os.makedirs('./centrality_by_month', exist_ok=True)

for k in trange(len(idxes)):

    # build adjmat
    df_temp = df_email.iloc[idxes[k]]
    num_email = len(df_temp)
    if num_email == 0:
        continue
    email_adj_mat = np.zeros((len(unique_people),len(unique_people)))

    # Month-local names, so the notebook-level from_list / to_list / cc_list /
    # num_TO_CC are not overwritten by the last processed month.
    month_from = df_temp['From'].values.tolist()
    month_to = df_temp['To'].values.tolist()
    month_cc = df_temp['CC'].values.tolist()
    month_num_to_cc = df_temp['num_TO_CC'].values.tolist()

    for i in range(len(month_from)):
        s = name2id[month_from[i]] # id of the sender

        # To entries first, then CC entries; each one adds an equal share of the e-mail.
        for recipient in list(month_to[i]) + list(month_cc[i]):
            if recipient != ('', '', '') and recipient != ['', '', '']:
                r = name2id[recipient]  # id of the receiver
                email_adj_mat[s,r] += 1/month_num_to_cc[i]

    # normalize so that the heaviest edge has weight 1. A month whose recipients
    # are all empty placeholders has no edges; dividing by its zero maximum
    # would turn the whole matrix into NaN, so it is left as all zeros.
    peak = email_adj_mat.max()
    if peak > 0:
        email_adj_mat = email_adj_mat/peak
    email_adj_mat_tosave = scipy.sparse.csc_matrix(email_adj_mat)
    scipy.sparse.save_npz('./adjmat_by_month/adjmat_'+str(k)+'.npz', email_adj_mat_tosave)
    email_adj_mat_tosave = None

    # build graph from numpy array email_adj_mat
    G = nx.from_numpy_array(email_adj_mat, create_using=nx.DiGraph)

    # https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.size.html
    weighted_centrality[k,20] = G.size()  # without a weight this is the number of edges
    weighted_centrality[k,21] = G.size(weight = 'weight')
    # number of active nodes: non-zero column sum (received) and non-zero row sum (sent)
    weighted_centrality[k,22] = ((email_adj_mat.sum(axis=0) != 0) * (email_adj_mat.sum(axis=1) != 0)).sum()
    weighted_centrality[k,23] = nx.density(G) # the number of nodes is constant in our case, density of active nodes maybe better
    email_adj_mat = None # drop the dense matrix before the centrality computations to limit memory use
    if weighted_centrality[k,20] != 0:
        weighted_centrality[k,24] = weighted_centrality[k,22]/weighted_centrality[k,20]

    for p, (stem, measure) in enumerate(centrality_measures):
        values = np.array(list(measure(G).values()))
        weighted_centrality[k, 4*p:4*p+4] = [values.min(), values.mean(), values.max(), values.std()]
        # the sparse copy goes into its own variable; the result table must stay a dense array
        centrality_tosave = scipy.sparse.csc_matrix(values)
        scipy.sparse.save_npz('./centrality_by_month/'+stem+'_'+str(k)+'.npz', centrality_tosave)
        centrality_tosave = None
        values = None


np.save('all_email_weighted_centrality', weighted_centrality)

### plot
#### betweenness

In [None]:
# Betweenness centrality per month over all e-mails: mean as bars, standard
# deviation, density and modified density as lines, each on its own y scale.
fig, ax1 = plt.subplots(figsize=(15, 4))
sides = ('right', 'top', 'left', 'bottom')
positions = np.arange(len(idxes))

# bars: monthly mean (column 13)
x = centrality[:, 13]
for side in sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('betweenness centrality and network density', fontsize=24)
ax1.bar(positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
ax1.set_xticks(positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax1.set_yticks(np.linspace(low, high, 7))
ax1.set_ylim(low - pad, high + pad)
# ax1 is still the current axes here, so this colours the month labels
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# dashed line: monthly standard deviation (column 15); its spine and label are
# shifted to the left of the plot area
x = centrality[:, 15]
ax2 = ax1.twinx()
for side in sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax2.set_yticks(np.linspace(low, high, 7))
ax2.set_ylim(low - pad, high + pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# grey line: graph density (column 23)
x = centrality[:, 23]
ax3 = ax1.twinx()
for side in sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax3.set_yticks(np.linspace(low, high, 7))
ax3.set_ylim(low - pad, high + pad)
ax3.legend(loc=1)

# black line: modified density (column 24); spine moved outside on the right
x = centrality[:, 24]
ax4 = ax1.twinx()
for side in sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax4.set_yticks(np.linspace(low, high, 7))
ax4.set_ylim(low - pad, high + pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

#### closeness

In [None]:
# Closeness centrality per month over all e-mails: mean as bars, standard
# deviation, density and modified density as lines, each on its own y scale.
fig, ax1 = plt.subplots(figsize=(15, 4))
sides = ('right', 'top', 'left', 'bottom')
positions = np.arange(len(idxes))

# bars: monthly mean (column 17), y tick labels in scientific notation
x = centrality[:, 17]
for side in sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('closeness centrality and network density', fontsize=24)
ax1.bar(positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
ax1.set_xticks(positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax1.set_yticks(np.linspace(low, high, 7))
ax1.set_ylim(low - pad, high + pad)
# ax1 is still the current axes here, so this colours the month labels
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# dashed line: monthly standard deviation (column 19); its spine and label are
# shifted to the left of the plot area
x = centrality[:, 19]
ax2 = ax1.twinx()
for side in sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax2.set_yticks(np.linspace(low, high, 7))
ax2.set_ylim(low - pad, high + pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# grey line: graph density (column 23)
x = centrality[:, 23]
ax3 = ax1.twinx()
for side in sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax3.set_yticks(np.linspace(low, high, 7))
ax3.set_ylim(low - pad, high + pad)
ax3.legend(loc=1)

# black line: modified density (column 24); spine moved outside on the right
x = centrality[:, 24]
ax4 = ax1.twinx()
for side in sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
low, high = x.min(), x.max()
pad = 0.01 * (high - low)
ax4.set_yticks(np.linspace(low, high, 7))
ax4.set_ylim(low - pad, high + pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

#### degree

In [None]:
# Degree centrality per time window (bars) together with its spread and two
# density curves, each drawn on its own twin y-axis.
fig, ax1 = plt.subplots(figsize=(15, 4))
bar_positions = np.arange(len(idxes))

# Primary axis: column 9 of `centrality`, shown as the 'avg centrality' bars.
x = centrality[:, 9]
for spine in ax1.spines.values():
    spine.set_visible(False)
ax1.set_title('degree centrality and network density', fontsize=24)
ax1.bar(bar_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# A min/max or mean +/- std band was tried here, but its range is too large for this plot.
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
# Seven evenly spaced ticks over the data range, with 1% padding on either side.
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax1.set_yticks(np.linspace(y_lo, y_hi, 7))
ax1.set_ylim(y_lo - y_pad, y_hi + y_pad)
# Colour every x tick label via the lookup keyed by the label text.
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# Twin axis 2: column 11 ('std centrality'); its right spine is repositioned to
# x = -0.12 in axes coordinates.
x = centrality[:, 11]
ax2 = ax1.twinx()
for spine in ax2.spines.values():
    spine.set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax2.set_yticks(np.linspace(y_lo, y_hi, 7))
ax2.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# Twin axis 3: column 23 ('density') on the default right-hand position.
x = centrality[:, 23]
ax3 = ax1.twinx()
for spine in ax3.spines.values():
    spine.set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax3.set_yticks(np.linspace(y_lo, y_hi, 7))
ax3.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax3.legend(loc=1)

# Twin axis 4: column 24 ('modified density'); right spine repositioned to x = 1.08.
x = centrality[:, 24]
ax4 = ax1.twinx()
for spine in ax4.spines.values():
    spine.set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax4.set_yticks(np.linspace(y_lo, y_hi, 7))
ax4.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

## by month weighted

### betweenness

### closeness

In [None]:
# Summarise the weighted closeness centrality stored for every monthly network.
# Columns of `centrality`: 0 = min, 1 = mean, 2 = max, 3 = std, each taken over
# all entries of the array loaded for that month.
folder = "./centrality_by_month_weighted"
alpha = 1
num_month = 69
centrality = np.zeros((num_month, 4))
for k in range(num_month):
    npz_path = '{}/closeness_centrality_alpha_{}_month_{}.npz'.format(folder, alpha, k)
    temp_centrality = scipy.sparse.load_npz(npz_path).toarray()
    centrality[k, :] = (
        temp_centrality.min(),
        temp_centrality.mean(),
        temp_centrality.max(),
        temp_centrality.std(),
    )


In [None]:
# Summary table saved earlier as 'all_email_centrality.npy'; the plot in the next
# cell reads its columns 23 and 24 for the two density curves.
centrality_for_size = np.load('all_email_centrality.npy')
np.shape(centrality_for_size)

In [None]:
# Weighted closeness centrality per month (bars) together with its spread; the two
# density curves come from `centrality_for_size`. Each series has its own twin y-axis.
fig, ax1 = plt.subplots(figsize=(15, 4))
bar_positions = np.arange(len(idxes))

# Primary axis: column 1 of `centrality` (the per-month mean), shown as bars.
x = centrality[:, 1]
for spine in ax1.spines.values():
    spine.set_visible(False)
ax1.set_title('closeness centrality and network density', fontsize=24)
ax1.bar(bar_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# A min/max or mean +/- std band was tried here, but its range is too large for this plot.
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(time_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
# Seven evenly spaced ticks over the data range, with 1% padding on either side.
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax1.set_yticks(np.linspace(y_lo, y_hi, 7))
ax1.set_ylim(y_lo - y_pad, y_hi + y_pad)
# Colour every x tick label via the lookup keyed by the label text.
for ticklabel in ax1.get_xticklabels():
    ticklabel.set_color(time_labels_for_tickcolor[ticklabel.get_text()])
ax1.grid(axis='y')

# Twin axis 2: column 3 of `centrality` (the per-month std); its right spine is
# repositioned to x = -0.12 in axes coordinates.
x = centrality[:, 3]
ax2 = ax1.twinx()
for spine in ax2.spines.values():
    spine.set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax2.set_yticks(np.linspace(y_lo, y_hi, 7))
ax2.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# Twin axis 3: column 23 of `centrality_for_size` ('density').
x = centrality_for_size[:, 23]
ax3 = ax1.twinx()
for spine in ax3.spines.values():
    spine.set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax3.set_yticks(np.linspace(y_lo, y_hi, 7))
ax3.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax3.legend(loc=1)

# Twin axis 4: column 24 of `centrality_for_size` ('modified density'); right spine
# repositioned to x = 1.08.
x = centrality_for_size[:, 24]
ax4 = ax1.twinx()
for spine in ax4.spines.values():
    spine.set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax4.set_yticks(np.linspace(y_lo, y_hi, 7))
ax4.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

In [None]:
# Toy 3-node weighted directed graph (including a self-loop on node 0) used to check
# what the networkx centrality calls in this and the following cells return.
# Note: this rebinds the notebook-level name G.
aaa = np.array([
    [0.5, 2, 1.2],
    [1, 0, 4],
    [3, 3, 0],
])
G = nx.from_numpy_array(aaa, create_using=nx.DiGraph)
np.array([*nx.degree_centrality(G).values()])

In [None]:
# Closeness centrality of the toy graph, ignoring edge weights.
np.array([*nx.closeness_centrality(G).values()])

In [None]:
# Closeness centrality of the toy graph with the 'weight' attribute used as edge distance.
np.array([*nx.closeness_centrality(G, distance='weight').values()])

In [None]:
# Betweenness centrality of the toy graph, ignoring edge weights.
np.array([*nx.betweenness_centrality(G).values()])

In [None]:
# Betweenness centrality of the toy graph with shortest paths computed on the 'weight' attribute.
np.array([*nx.betweenness_centrality(G, weight='weight').values()])

## by stages

In [None]:
# Separate the emails into four consecutive stages using three cut-off dates on
# the 'Sent' column.
date_email = pd.to_datetime(df_email['Sent'])
idxt1 = date_email > datetime(2014, 1, 23)
idxt2 = date_email > datetime(2014, 5, 14)
idxt3 = date_email > datetime(2016, 2, 24)

# One boolean row mask per stage. The comparisons are strict, so an email sent
# exactly at a cut-off belongs to the earlier stage.
idxe1 = np.array(~idxt1)
idxe2 = np.array(idxt1 & ~idxt2)
idxe3 = np.array(idxt2 & ~idxt3)
idxe4 = np.array(idxt3)

# Note: this rebinds `idxes`, which the plotting cells use for the number of bars.
idxes = [idxe1, idxe2, idxe3, idxe4]
stage_labels = [
    'before 2014,1,23',
    '2014,1,23 - 2014,5,14',
    '2014,5,14 - 2016,2,24',
    'after 2016,2,24',
]

In [None]:
# directly copy from the notebook extracting CRA and Commissioner emails in stages
#
# Each entry is a (first, last, identifier) triplet, where the identifier is either an
# email address or a 'first_last' style placeholder. One person appears several times
# when several spellings or addresses are listed for them. Entries are compared
# against `unique_people` by plain equality further down in this cell.
cra_list = [('john', 'dailey', 'john_dailey'),
            
            ('bryan', 'desloge', 'bryan@deslogemedical.com'),
            ('commissionerbryan', 'desloge', 'commissionerbryan_desloge'),
            ('bryan', 'desloge', 'desloge.bryan@gmail.com'),
            ('bryan', 'desloge', 'deslogeb@leoncountyfl.gov'),
            
            ('jessica', 'miller', 'jessica.miller@talgov.com'),
            
            ('jane', 'sauls', 'jane_sauls'),
            
            ('nick', 'maddox', 'maddox.nicholas@gmail.com'),
            ('nick', 'maddox', 'maddoxn@leoncountyfl.gov'),
            
            ('mary ann lindley', '', 'lindleym@leoncountyfl.gov'),
            ('maryann', 'lindley', 'maryann_lindley'),
            ('mary', 'lindley', 'mary_lindley'),
            
            ('kristin', '', 'kristindozier@gmail.com'),
            ('kristen', 'dozier', 'kristen_dozier'),
            ('kristin', 'dozer', 'kristin_dozer'),
            ('kristin', 'dozier', 'dozierk@leoncountyfl.gov'),
            
            ('bill', 'proctor', 'proctorb@leoncountyfl.gov')
           ]

# Same triplet layout as cra_list. The 'f113xx'/'l113xx' entries look like placeholder
# names attached to a real address -- TODO confirm against the extracting notebook.
commissioner_list = [('scoi', 'maddox', 'scoi.maddox@talgov.com'),
                     ('scott', 'maddox', 'maddox@maddoxhorne.com'),
                     ('scott', 'mattox', 'scott@govinc.net'),
                     ('scott', 'maddox', 'scottcharlesmaddox@gmail.com'),
                     ('scott', 'maddox', 'scott@scottmaddox.com'),
                     ('scott', 'maddox', 'shamaddox@embarqmail.com'),
                     ('scott', 'maddox', 'scott.maddox@talgov.com'),
                     ('scof', 'maddox', 'scof.maddox@talgov.com'),
                     ('scos', 'maddox', 'scos.maddox@talgov.com'),
                     ('sha', 'maddox', 'sha_maddox'),
                     
                     ('andrew', 'gillum', 'ademetricg@gmail.com'),
                     ('andrew', 'gillum', 'agillum@pfaw.org'),
                     ('commissionerandrew', 'gillum', 'commissionerandrew_gillum'),
                     ('andrew', 'gilliam', 'andrew_gilliam'),
                     ('andrew', 'gillium', 'andrew_gillium'),
                     ('andrew', 'gillum', 'andrew@andrewgillum.com'),
                     ('andrew', 'gillum', 'gilluma@talgov.com'),
                     ('andrew', 'gillum', 'andrew.gillum@talgov.com'),
                     ('andrew', 'gillams', 'andrew_gillams'),
                     
                     ('mark', 'mustian', 'mmustian@ngnlaw.com'),
                     ('f11332', 'l11332', 'mark@markmustian.com'),
                     ('f11334', 'l11334', 'mark@markmustian.com'),
                     ('mark', 'mustian', 'mmustian@ngn-tally.com'),
                     ('f11333', 'l11333', 'mark@markmustian.com'),
                     
                     ('gil', 'ziffer', 'gil@ziffberry.com'),
                     ('gil', 'ziffer', 'gil.ziffer@talgov.com'),
                     
                     ('curtis', 'richardsom', 'curtis_richardsom'),
                     ('curus', 'richardson', 'curus.richardson@talgov.com'),
                     ('ultis', 'richardson', 'ultis_richardson'),
                     ('curtis', 'richardson', 'cabaide@aol.com'),
                     
                     ('nancy', 'miller', 'jacqueline.hightower@famu.edu'),
                     ('nancy', 'miller', 'nancy.miller@talgov.com'),
                     
                     ('john', 'marks', 'john.marksiii@talgov.com'),
                     ('john', 'marks', 'john.marks@talgov.com')
                    ]
# Rebinds total_list (an earlier cell of the notebook also assigns this name).
total_list = cra_list + commissioner_list

# Collect (first, last, email) triplets of people whose organisation fields mention
# the city or the county. Rows without a first name are skipped; a missing email is
# replaced by the placeholder 'first_last'.
df_org = pd.read_excel('emails-combine 06162020.xlsx', sheet_name=0, usecols=[1,2,3,15,19,23])
organization_columns = ('Organization1', 'Organization 2', 'Organization 3')
government_keywords = ('Tallahassee', 'Leon County')

gorvenment_people_list = []
for i in range(len(df_org)):
    row = df_org.iloc[i]
    if pd.isnull(row['First']):
        continue
    first = str(row['First']).lower().strip()
    # NOTE(review): a missing 'Last' becomes the string 'nan' here, whereas the alias
    # lists in this notebook use '' for a missing last name -- confirm which is intended.
    last = str(row['Last']).lower().strip()
    if pd.isnull(row['Email']):
        email = first + '_' + last
    else:
        email = str(row['Email']).lower().strip()

    # Only string cells are searched. The former `is not np.nan` identity test let
    # NaN values that are not the np.nan singleton through to the substring test,
    # which then raised TypeError.
    is_government = any(
        isinstance(row[column], str)
        and any(keyword in row[column] for keyword in government_keywords)
        for column in organization_columns
    )
    if is_government:
        gorvenment_people_list.append( (first, last, email) )


# Positions in unique_people of the entries that equal one of the CRA aliases,
# and likewise for the commissioner aliases.
cra_idx_list = [i for i, people in enumerate(unique_people) if people in cra_list]
commissioner_idx_list = [i for i, people in enumerate(unique_people) if people in commissioner_list]

# CRA positions first, then commissioner positions.
total_idx_list = cra_idx_list + commissioner_idx_list

# Positions in unique_people of government-affiliated people who are neither CRA
# members nor commissioners. A person qualifies when
#   * their identifier (third field) equals that of a listed government person, or
#   * their (first, last) pair equals that of a listed government person, or
#   * their identifier contains one of the domains below.
# The domain test used to sit inside the loop over gorvenment_people_list, so it
# never fired when that list was empty; the lookups are now independent and use
# sets instead of a nested scan.
government_emails = {person[2] for person in gorvenment_people_list}
government_names = {(person[0], person[1]) for person in gorvenment_people_list}
# NOTE(review): 'tallahassee.com' is kept from the original rule -- confirm it is
# meant to count as a government domain.
government_domains = ('talgov.com', 'leoncountyfl.gov', 'tallahassee.com')
excluded_idx = set(cra_idx_list) | set(commissioner_idx_list)

government_idx_list = []
for i, people in enumerate(unique_people):
    if i in excluded_idx:
        continue
    if (people[2] in government_emails
            or (people[0], people[1]) in government_names
            or any(domain in people[2] for domain in government_domains)):
        government_idx_list.append(i)

len(cra_idx_list), len(commissioner_idx_list), len(government_idx_list)


In [None]:
# 
# Per-stage network statistics: one row of `centrality` per stage in `idxes`.
#   cols 0-3    in-degree centrality    (min, mean, max, std over all nodes)
#   cols 4-7    out-degree centrality   (same four statistics)
#   cols 8-11   degree centrality
#   cols 12-15  betweenness centrality
#   cols 16-19  closeness centrality
#   col 20      number of edges
#   col 21      sum of edge weights
#   col 22      number of nodes with both non-zero incoming and non-zero outgoing weight
#   col 23      nx.density(G)
#   col 24      col 22 / col 20 (left at 0 when there are no edges)
# A stage without emails keeps an all-zero row and writes no files.
# Side effects: rebinds `centrality`, `from_list`, `to_list`, `cc_list` and `G`, and
# writes .npz files under ./adjmat_by_stage and ./centrality_by_stage (both
# directories must already exist).
centrality = np.zeros((len(idxes),25))
email_adj_mat = None

# (file-name stem, networkx centrality function, first of its four summary columns)
centrality_measures = [
    ('in_degree', nx.centrality.in_degree_centrality, 0),
    ('out_degree', nx.centrality.out_degree_centrality, 4),
    ('degree', nx.centrality.degree_centrality, 8),
    ('betweenness_centrality', nx.centrality.betweenness_centrality, 12),
    ('closeness_centrality', nx.centrality.closeness_centrality, 16),
]
# Placeholder triplets that mark "no recipient", in tuple and in list form.
empty_recipients = (('', '', ''), ['', '', ''])

for k in trange(len(idxes)):

    # build adjmat
    df_temp = df_email.iloc[idxes[k]]
    num_email = len(df_temp)
    if num_email == 0:
        continue
    email_adj_mat = np.zeros((len(unique_people),len(unique_people)))

    from_list = df_temp['From'].values.tolist()
    to_list = df_temp['To'].values.tolist()
    cc_list = df_temp['CC'].values.tolist()
    num_TO_CC = df_temp['num_TO_CC'].values.tolist()

    for i in range(len(from_list)):
        s = name2id[from_list[i]]  # id of the sender
        # To and CC recipients are treated alike: each adds 1/num_TO_CC to the edge.
        for recipients in (to_list[i], cc_list[i]):
            for receiver in recipients:
                if receiver not in empty_recipients:
                    r = name2id[receiver]  # id of the receiver
                    email_adj_mat[s,r] += 1/num_TO_CC[i]

    # Normalize so the heaviest edge has weight 1. Skipped when the stage has no
    # edge at all: 0/0 would fill the matrix with NaN, and NaN entries are non-zero,
    # so they would presumably all be turned into edges when the graph is built.
    max_weight = email_adj_mat.max()
    if max_weight > 0:
        email_adj_mat = email_adj_mat/max_weight
    scipy.sparse.save_npz('./adjmat_by_stage/adjmat_'+str(k)+'.npz',
                          scipy.sparse.csc_matrix(email_adj_mat))

    # build graph from numpy array email_adj_mat
    G = nx.from_numpy_array(email_adj_mat, create_using=nx.DiGraph)

    centrality[k,20] = G.size()  # without a weight argument this is the number of edges
    centrality[k,21] = G.size(weight = 'weight')  # https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.size.html
    centrality[k,22] = ((email_adj_mat.sum(axis=0) != 0) * (email_adj_mat.sum(axis=1) != 0)).sum()  # number of active nodes
    centrality[k,23] = nx.density(G)  # the number of nodes is constant here, so density of active nodes may be better
    email_adj_mat = None  # release the dense matrix before the expensive centrality calls
    if centrality[k,20] != 0:
        centrality[k,24] = centrality[k,22]/centrality[k,20]

    # Summarise and store every centrality measure for this stage.
    for file_stem, measure, first_col in centrality_measures:
        values = np.array(list(measure(G).values()))
        centrality[k, first_col:first_col + 4] = (
            values.min(), values.mean(), values.max(), values.std()
        )
        scipy.sparse.save_npz('./centrality_by_stage/'+file_stem+'_'+str(k)+'.npz',
                              scipy.sparse.csc_matrix(values))
        values = None


np.save('all_email_by_stage_centrality', centrality)

In [None]:
# X-axis labels of the four stages, in chronological order.
stage_labels = [
    'before 2014,1,23',
    '2014,1,23 - 2014,5,14',
    '2014,5,14 - 2016,2,24',
    'after 2016,2,24',
]

### plot
#### betweenness

In [None]:
# Betweenness centrality per stage (bars) together with its spread and two density
# curves, each drawn on its own twin y-axis.
fig, ax1 = plt.subplots(figsize=(15, 5))
bar_positions = np.arange(len(idxes))

# Primary axis: column 13 of `centrality`, shown as the 'avg centrality' bars.
x = centrality[:, 13]
for spine in ax1.spines.values():
    spine.set_visible(False)
ax1.set_title('betweenness centrality and network density', fontsize=24)
ax1.bar(bar_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# A min/max or mean +/- std band was tried here, but its range is too large for this plot.
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(stage_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
# Seven evenly spaced ticks over the data range, with 1% padding on either side.
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax1.set_yticks(np.linspace(y_lo, y_hi, 7))
ax1.set_ylim(y_lo - y_pad, y_hi + y_pad)
# (The tick-label colouring used for the monthly plots is not applied to stage labels.)
ax1.grid(axis='y')

# Twin axis 2: column 15 ('std centrality'); its right spine is repositioned to
# x = -0.12 in axes coordinates.
x = centrality[:, 15]
ax2 = ax1.twinx()
for spine in ax2.spines.values():
    spine.set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax2.set_yticks(np.linspace(y_lo, y_hi, 7))
ax2.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# Twin axis 3: column 23 ('density') on the default right-hand position.
x = centrality[:, 23]
ax3 = ax1.twinx()
for spine in ax3.spines.values():
    spine.set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax3.set_yticks(np.linspace(y_lo, y_hi, 7))
ax3.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax3.legend(loc=1)

# Twin axis 4: column 24 ('modified density'); right spine repositioned to x = 1.08.
x = centrality[:, 24]
ax4 = ax1.twinx()
for spine in ax4.spines.values():
    spine.set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax4.set_yticks(np.linspace(y_lo, y_hi, 7))
ax4.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

#### closeness

In [None]:
# Closeness centrality per stage (bars) together with its spread and two density
# curves, each drawn on its own twin y-axis.
fig, ax1 = plt.subplots(figsize=(15, 5))
bar_positions = np.arange(len(idxes))

# Primary axis: column 17 of `centrality`, shown as the 'avg centrality' bars.
x = centrality[:, 17]
for spine in ax1.spines.values():
    spine.set_visible(False)
ax1.set_title('closeness centrality and network density', fontsize=24)
ax1.bar(bar_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# A min/max or mean +/- std band was tried here, but its range is too large for this plot.
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(stage_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
# Seven evenly spaced ticks over the data range, with 1% padding on either side.
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax1.set_yticks(np.linspace(y_lo, y_hi, 7))
ax1.set_ylim(y_lo - y_pad, y_hi + y_pad)
# (The tick-label colouring used for the monthly plots is not applied to stage labels.)
ax1.grid(axis='y')

# Twin axis 2: column 19 ('std centrality'); its right spine is repositioned to
# x = -0.12 in axes coordinates.
x = centrality[:, 19]
ax2 = ax1.twinx()
for spine in ax2.spines.values():
    spine.set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax2.set_yticks(np.linspace(y_lo, y_hi, 7))
ax2.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# Twin axis 3: column 23 ('density') on the default right-hand position.
x = centrality[:, 23]
ax3 = ax1.twinx()
for spine in ax3.spines.values():
    spine.set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax3.set_yticks(np.linspace(y_lo, y_hi, 7))
ax3.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax3.legend(loc=1)

# Twin axis 4: column 24 ('modified density'); right spine repositioned to x = 1.08.
x = centrality[:, 24]
ax4 = ax1.twinx()
for spine in ax4.spines.values():
    spine.set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax4.set_yticks(np.linspace(y_lo, y_hi, 7))
ax4.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

#### degree

In [None]:
# Degree centrality per stage (bars) together with its spread and two density
# curves, each drawn on its own twin y-axis.
fig, ax1 = plt.subplots(figsize=(15, 5))
bar_positions = np.arange(len(idxes))

# Primary axis: column 9 of `centrality`, shown as the 'avg centrality' bars.
x = centrality[:, 9]
for spine in ax1.spines.values():
    spine.set_visible(False)
ax1.set_title('degree centrality and network density', fontsize=24)
ax1.bar(bar_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# A min/max or mean +/- std band was tried here, but its range is too large for this plot.
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(stage_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
# Seven evenly spaced ticks over the data range, with 1% padding on either side.
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax1.set_yticks(np.linspace(y_lo, y_hi, 7))
ax1.set_ylim(y_lo - y_pad, y_hi + y_pad)
# (The tick-label colouring used for the monthly plots is not applied to stage labels.)
ax1.grid(axis='y')

# Twin axis 2: column 11 ('std centrality'); its right spine is repositioned to
# x = -0.12 in axes coordinates.
x = centrality[:, 11]
ax2 = ax1.twinx()
for spine in ax2.spines.values():
    spine.set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax2.set_yticks(np.linspace(y_lo, y_hi, 7))
ax2.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# Twin axis 3: column 23 ('density') on the default right-hand position.
x = centrality[:, 23]
ax3 = ax1.twinx()
for spine in ax3.spines.values():
    spine.set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax3.set_yticks(np.linspace(y_lo, y_hi, 7))
ax3.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax3.legend(loc=1)

# Twin axis 4: column 24 ('modified density'); right spine repositioned to x = 1.08.
x = centrality[:, 24]
ax4 = ax1.twinx()
for spine in ax4.spines.values():
    spine.set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
y_lo, y_hi = x.min(), x.max()
y_pad = 0.01 * (y_hi - y_lo)
ax4.set_yticks(np.linspace(y_lo, y_hi, 7))
ax4.set_ylim(y_lo - y_pad, y_hi + y_pad)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

### top n people bar plot

In [None]:
# File-name stems of the per-stage centrality arrays that are loaded below.
centrality_names = [
    'degree',
    'betweenness_centrality',
    'closeness_centrality',
]

In [None]:
# Start the per-person centrality table with a single 'name' column holding the
# entries of unique_people; the per-stage centrality columns are added afterwards.
df_centrality = pd.DataFrame()
df_centrality['name'] = unique_people

In [None]:
# Add one column '<measure>_<stage>' per centrality measure and stage (0-3) to
# df_centrality, read from the sparse .npz files under ./centrality_by_stage.
for k in range(4):
    for name in centrality_names:
        npz_path = './centrality_by_stage/{}_{}.npz'.format(name, k)
        df_centrality[name + '_' + str(k)] = scipy.sparse.load_npz(npz_path).toarray().ravel()


In [None]:
# Display the assembled table: 'name' plus one column per centrality measure and stage.
df_centrality

#### cra and comm

In [None]:
# Rows of df_centrality for the CRA members and commissioners (positions taken
# from total_idx_list), renumbered from 0.
df_cra_comm = df_centrality.iloc[total_idx_list].reset_index(drop=True)
df_cra_comm

In [None]:
# Reshape df_cra_comm from wide ('<measure>_<stage>' columns) to long format with
# the columns name / centrality / stage / centrality_name, as used by the charts.
# The pieces are collected in a list and concatenated once: DataFrame.append is
# deprecated since pandas 1.4 and removed in pandas 2.0, and appending inside a loop
# copies the accumulated frame on every iteration. As before, every piece keeps the
# row index of df_cra_comm.
stage_frames = []
for col_name in list(df_cra_comm.columns)[1:]:
    # Split at the last underscore so a multi-digit stage number would also parse.
    centrality_name, stage = col_name.rsplit('_', 1)
    stage = int(stage)
    df_temp = df_cra_comm[['name', col_name]].rename({col_name: 'centrality'}, axis='columns')
    df_temp['stage'] = stage
    df_temp['centrality_name'] = centrality_name
    stage_frames.append(df_temp)
# pd.concat rejects an empty list; keep the old "empty frame" result in that case.
df_cra_comm_reordered = pd.concat(stage_frames) if stage_frames else pd.DataFrame()
df_cra_comm_reordered

In [None]:
# Interactive bar chart of per-person centrality for the CRA members and
# commissioners: a thin "base" strip for brushing an x-interval and a large
# "zoomed" view restricted to that interval. A dropdown picks the centrality
# measure and a slider picks the stage.
alt.data_transformers.disable_max_rows()

# The options are taken from the frame that is actually plotted. This cell used to
# read df_gov_reordered, which is only created in a later cell, so it failed with a
# NameError on a fresh top-to-bottom run.
centrality_options = df_cra_comm_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider = alt.binding_range(min=0, max=3, step=1)

# One selection drives both filters (measure and stage).
selection = alt.selection_single(
    fields=['centrality_name', "stage"],
    bind={'centrality_name': dropdown, 'stage': slider},
    name="Select",
    init={"centrality_name": "degree", "stage": 0}
)

# Horizontal interval selection made on the base view.
brush = alt.selection(type="interval", encodings=['x'])

base = alt.Chart(df_cra_comm_reordered, title="Base view").add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).mark_bar().encode(
    x=alt.X('name:O', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).properties(
    width=850,
    height=50
)

# Same bars, with the x domain bound to the brushed interval.
zoomed = alt.Chart(df_cra_comm_reordered).mark_bar().encode(
    x=alt.X('name:O', scale=alt.Scale(zero=False, domain=brush)),
    y='centrality:Q'
).transform_filter(
    selection
).properties(
    title={
      "text": "Top CRA and commisionner people with high centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_cra_comm))+" CRA and commisionner people. All are shown here.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

chart = zoomed & base

# chart.save('cra&comm_centrality_sorted_bar.html')
chart

In [None]:
# Sorted variant of the CRA / commissioner centrality chart: bars are ordered by
# descending centrality and limited to rank < 200 within the selected measure and
# stage. The chart is also written to an HTML file.
alt.data_transformers.disable_max_rows()

# The options are taken from the frame that is actually plotted. This cell used to
# read df_gov_reordered, which is only created in a later cell, so it failed with a
# NameError on a fresh top-to-bottom run.
centrality_options = df_cra_comm_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider = alt.binding_range(min=0, max=3, step=1)

# One selection drives both filters (measure and stage).
selection = alt.selection_single(
    fields=['centrality_name', "stage"],
    bind={'centrality_name': dropdown, 'stage': slider},
    name="Select",
    init={"centrality_name": "degree", "stage": 0}
)

# Horizontal interval selection made on the base view.
brush = alt.selection(type="interval", encodings=['x'])

# Defined but not passed to either chart below; kept because it is a notebook-level name.
color = alt.condition(selection,
                      alt.Color('centrality_name:N', legend=None),
                      alt.value('lightgray'))

base = alt.Chart(df_cra_comm_reordered, title="Base view").add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank < 200)
).mark_bar().encode(
    x=alt.X('name:N', sort='-y', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).properties(
    width=850,
    height=50
)

# Same filtered and ranked bars, with the x domain bound to the brushed interval.
zoomed = alt.Chart(df_cra_comm_reordered).transform_filter(
    selection
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank < 200)
).mark_bar().encode(
    x=alt.X('name:N', sort='-y', scale=alt.Scale(domain=brush)),
    y='centrality:Q'
).properties(
    title={
      "text": "Top CRA and commisionner people with high centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_cra_comm))+" CRA and commisionner people. All are shown here.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

chart = zoomed & base

chart.save('cra&comm_centrality_sorted_bar.html')
chart

#### gov

In [None]:
# Government subset: rows of df_centrality at the positions in
# government_idx_list, renumbered from 0.
df_gov = df_centrality.iloc[government_idx_list].reset_index(drop=True)
df_gov

In [None]:
# Reshape df_gov from wide (one column per centrality/stage) to long form with
# columns: name, centrality, stage, centrality_name.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame every iteration and was removed in pandas 2.0.
gov_long_frames = []
for col_name in list(df_gov.columns)[1:]:  # first column is 'name'
    # Column names are parsed as "<centrality_name>_<single-digit stage>".
    gov_long_frames.append(
        df_gov[['name', col_name]]
        .rename(columns={col_name: 'centrality'})
        .assign(stage=int(col_name[-1]), centrality_name=col_name[:-2])
    )
# pd.concat raises on an empty list, so fall back to an empty frame.
df_gov_reordered = pd.concat(gov_long_frames) if gov_long_frames else pd.DataFrame()
df_gov_reordered

In [None]:
# Interactive bar chart of centrality for the government group.
# A dropdown picks the centrality measure, a slider picks the stage, and an
# interval brush on the small "base" view sets the x-range of the large view.
alt.data_transformers.disable_max_rows()

# Number of top-ranked people to display. The filter below uses `<=` so that
# exactly the top `top_n` ranks are kept, matching the subtitle; the previous
# `rank < 200` silently dropped rank 200.
top_n = 200

centrality_options = df_gov_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider = alt.binding_range(min=0, max=3, step=1)


# One selection holding both widget values; charts filter on it so only the
# rows for the chosen (centrality_name, stage) pair are drawn.
selection = alt.selection_single(
    fields=['centrality_name', "stage"],
    bind={'centrality_name': dropdown, 'stage': slider},
    name="Select",
    init={"centrality_name": "degree", "stage": 0}
)

brush = alt.selection(type="interval", encodings=['x'])

# NOTE(review): `color` is not passed to either chart below.
color = alt.condition(selection,
                      alt.Color('centrality_name:N', legend=None),
                      alt.value('lightgray'))


# Overview strip: hosts both selections; rank is computed after the widget
# filter so it is relative to the selected measure and stage.
base = alt.Chart(df_gov_reordered, title="Base view").mark_bar().add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).encode(x=alt.X('name:N', sort='-y', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    width=850,
    height=50
)

# Detail view: same filtering as the overview, x-domain bound to the brush.
zoomed = alt.Chart(df_gov_reordered).mark_bar().transform_filter(
    selection
).encode(x=alt.X('name:N', sort='-y', scale=alt.Scale(domain=brush)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    title={
      "text": "Top gov people with high centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_gov))+" government people. Only top "+str(top_n)+" are shown here.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

# Detail view stacked above the overview.
chart = zoomed & base

chart.save('gov_centrality_sorted_bar.html')
chart

#### others

In [None]:
# "Others" subset: every row whose index is in neither total_idx_list nor
# government_idx_list, renumbered from 0.
in_named_groups = (
    df_centrality.index.isin(total_idx_list)
    | df_centrality.index.isin(government_idx_list)
)
df_others = df_centrality.iloc[~in_named_groups].reset_index(drop=True)
df_others

In [None]:
# Reshape df_others from wide (one column per centrality/stage) to long form
# with columns: name, centrality, stage, centrality_name.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame every iteration and was removed in pandas 2.0.
others_long_frames = []
for col_name in list(df_others.columns)[1:]:  # first column is 'name'
    # Column names are parsed as "<centrality_name>_<single-digit stage>".
    others_long_frames.append(
        df_others[['name', col_name]]
        .rename(columns={col_name: 'centrality'})
        .assign(stage=int(col_name[-1]), centrality_name=col_name[:-2])
    )
# pd.concat raises on an empty list, so fall back to an empty frame.
df_others_reordered = pd.concat(others_long_frames) if others_long_frames else pd.DataFrame()
df_others_reordered

In [None]:
# Interactive bar chart of centrality for everyone outside the CRA/commissioner
# and government groups. A dropdown picks the centrality measure, a slider picks
# the stage, and an interval brush on the small "base" view sets the x-range of
# the large view.
alt.data_transformers.disable_max_rows()

# Number of top-ranked people to display. The filter below uses `<=` so that
# exactly the top `top_n` ranks are kept, matching the subtitle; the previous
# `rank < 200` silently dropped rank 200.
top_n = 200

centrality_options = df_others_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider = alt.binding_range(min=0, max=3, step=1)


# One selection holding both widget values; charts filter on it so only the
# rows for the chosen (centrality_name, stage) pair are drawn.
selection = alt.selection_single(
    fields=['centrality_name', "stage"],
    bind={'centrality_name': dropdown, 'stage': slider},
    name="Select",
    init={"centrality_name": "degree", "stage": 0}
)

brush = alt.selection(type="interval", encodings=['x'])

# NOTE(review): `color` is not passed to either chart below.
color = alt.condition(selection,
                      alt.Color('centrality_name:N', legend=None),
                      alt.value('lightgray'))


# Overview strip: hosts both selections; rank is computed after the widget
# filter so it is relative to the selected measure and stage.
base = alt.Chart(df_others_reordered, title="Base view").mark_bar().add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).encode(x=alt.X('name:N', sort='-y', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    width=850,
    height=50
)

# Detail view: same filtering as the overview, x-domain bound to the brush.
zoomed = alt.Chart(df_others_reordered).mark_bar().transform_filter(
    selection
).encode(x=alt.X('name:N', sort='-y', scale=alt.Scale(domain=brush)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    title={
      "text": "Top others people with high centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_others))+" others people. Only top "+str(top_n)+" are shown here.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

# Detail view stacked above the overview.
chart = zoomed & base

chart.save('others_centrality_sorted_bar.html')
chart

## by stages weighted

### betweenness

In [None]:
# Summarise weighted betweenness centrality for every (stage, alpha) pair.
# centrality[k, i] holds (min, mean, max, std) over all entries of the matrix
# stored for stage k and Alpha[i].
folder = "./centrality_by_stage_weighted"
Alpha = [0, 0.5, 1, 1.5]
num_stage = 4
centrality = np.zeros((num_stage, len(Alpha), 4))
for k in range(num_stage):
    for i, alpha in enumerate(Alpha):
        npz_path = f"{folder}/betweenness_centrality_alpha_{alpha}_stage_{k}.npz"
        temp_centrality = scipy.sparse.load_npz(npz_path).toarray()
        centrality[k, i] = (
            temp_centrality.min(),
            temp_centrality.mean(),
            temp_centrality.max(),
            temp_centrality.std(),
        )


In [None]:
# Load the saved per-stage array; columns 23 and 24 are plotted below as
# 'density' and 'modified density'.
# NOTE(review): the meaning of the remaining columns is not visible here — confirm.
centrality_for_size = np.load('all_email_by_stage_centrality.npy')
centrality_for_size.shape

In [None]:
# Four overlaid y-axes sharing one x-axis (one tick per entry of idxes):
# mean betweenness (bars), std betweenness, density and modified density (lines).
# centrality[:, 2, s] selects Alpha[2] and statistic s (0=min, 1=mean, 2=max, 3=std).
fig, ax1 = plt.subplots(figsize=(15, 5))
spine_sides = ('right', 'top', 'left', 'bottom')
stage_positions = np.arange(len(idxes))

# --- ax1: mean centrality per stage as bars ---
x = centrality[:, 2, 1]
for side in spine_sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('weighted betweenness centrality and network density', fontsize=24)
ax1.bar(stage_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# The min/max and mean±std bands are deliberately not drawn: the range is too large.
ax1.set_xticks(stage_positions)
ax1.set_xticklabels(stage_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)  # 1% padding so the extreme ticks are not clipped
ax1.set_yticks(np.linspace(lo, hi, 7))
ax1.set_ylim(lo - margin, hi + margin)
ax1.grid(axis='y')

# --- ax2: std of centrality, axis pushed out to the left of the plot ---
x = centrality[:, 2, 3]
ax2 = ax1.twinx()
for side in spine_sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax2.set_yticks(np.linspace(lo, hi, 7))
ax2.set_ylim(lo - margin, hi + margin)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# --- ax3: density ---
x = centrality_for_size[:, 23]
ax3 = ax1.twinx()
for side in spine_sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax3.set_yticks(np.linspace(lo, hi, 7))
ax3.set_ylim(lo - margin, hi + margin)
ax3.legend(loc=1)

# --- ax4: modified density, axis pushed out to the right of the plot ---
x = centrality_for_size[:, 24]
ax4 = ax1.twinx()
for side in spine_sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax4.set_yticks(np.linspace(lo, hi, 7))
ax4.set_ylim(lo - margin, hi + margin)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

#### changing alpha

In [None]:
# Show the stage-0 summary table: one row per alpha value, one column per statistic.
Alpha = [0, 0.5, 1, 1.5]
num_stage = 4
centrality[0]

### closeness

In [None]:
# Summarise weighted closeness centrality for every (stage, alpha) pair.
# centrality[k, i] holds (min, mean, max, std) over all entries of the matrix
# stored for stage k and Alpha[i].
folder = "./centrality_by_stage_weighted"
Alpha = [0, 0.5, 1, 1.5]
num_stage = 4
centrality = np.zeros((num_stage, len(Alpha), 4))
for k in range(num_stage):
    for i, alpha in enumerate(Alpha):
        npz_path = f"{folder}/closeness_centrality_alpha_{alpha}_stage_{k}.npz"
        temp_centrality = scipy.sparse.load_npz(npz_path).toarray()
        centrality[k, i] = (
            temp_centrality.min(),
            temp_centrality.mean(),
            temp_centrality.max(),
            temp_centrality.std(),
        )


In [None]:
# Load the saved per-stage array; columns 23 and 24 are plotted below as
# 'density' and 'modified density'.
# NOTE(review): the meaning of the remaining columns is not visible here — confirm.
centrality_for_size = np.load('all_email_by_stage_centrality.npy')
centrality_for_size.shape

In [None]:
# Four overlaid y-axes sharing one x-axis (one tick per entry of idxes):
# mean closeness (bars), std closeness, density and modified density (lines).
# centrality[:, 2, s] selects Alpha[2] and statistic s (0=min, 1=mean, 2=max, 3=std).
fig, ax1 = plt.subplots(figsize=(15, 5))
spine_sides = ('right', 'top', 'left', 'bottom')
stage_positions = np.arange(len(idxes))

# --- ax1: mean centrality per stage as bars ---
x = centrality[:, 2, 1]
for side in spine_sides:
    ax1.spines[side].set_visible(False)
ax1.set_title('weighted closeness centrality and network density', fontsize=24)
ax1.bar(stage_positions, x, label='avg centrality', alpha=0.6, color='purple')
ax1.legend(loc=2)
# The min/max and mean±std bands are deliberately not drawn: the range is too large.
ax1.set_xticks(stage_positions)
ax1.set_xticklabels(stage_labels, rotation=90, fontsize=14)
ax1.set_xlabel('time', fontsize=16)
ax1.set_ylabel('avg centrality', color='purple', fontsize=20)
# Scientific notation on the y-axis tick labels.
ax1.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)  # 1% padding so the extreme ticks are not clipped
ax1.set_yticks(np.linspace(lo, hi, 7))
ax1.set_ylim(lo - margin, hi + margin)
ax1.grid(axis='y')

# --- ax2: std of centrality, axis pushed out to the left of the plot ---
x = centrality[:, 2, 3]
ax2 = ax1.twinx()
for side in spine_sides:
    ax2.spines[side].set_visible(False)
ax2.spines["right"].set_position(("axes", -0.12))
ax2.plot(x, label='std centrality', c='orange', linewidth=3, ls='dashed')
ax2.set_ylabel('std centrality', color='orange', fontsize=20, x=-1.2, y=0.5)
ax2.yaxis.set_label_coords(-0.14, 0.5)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax2.set_yticks(np.linspace(lo, hi, 7))
ax2.set_ylim(lo - margin, hi + margin)
ax2.legend(loc=2, bbox_to_anchor=(0.0, 0.9))

# --- ax3: density ---
x = centrality_for_size[:, 23]
ax3 = ax1.twinx()
for side in spine_sides:
    ax3.spines[side].set_visible(False)
ax3.plot(x, label='density', c='gray', linewidth=3, alpha=0.6)
ax3.set_ylabel('density', color='gray', fontsize=20)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax3.set_yticks(np.linspace(lo, hi, 7))
ax3.set_ylim(lo - margin, hi + margin)
ax3.legend(loc=1)

# --- ax4: modified density, axis pushed out to the right of the plot ---
x = centrality_for_size[:, 24]
ax4 = ax1.twinx()
for side in spine_sides:
    ax4.spines[side].set_visible(False)
ax4.spines["right"].set_position(("axes", 1.08))
ax4.plot(x, label='modified density', c='black', linewidth=2, alpha=0.6)
ax4.set_ylabel('modified density', color='black', fontsize=20, x=1.2, y=0.5)
lo, hi = x.min(), x.max()
margin = 0.01 * (hi - lo)
ax4.set_yticks(np.linspace(lo, hi, 7))
ax4.set_ylim(lo - margin, hi + margin)
ax4.legend(loc=1, bbox_to_anchor=(1.0, 0.9))

plt.tight_layout()
plt.show()

### top n bar plot (weighted)

In [None]:
# Centrality measures to load; each name is also the file-name prefix of the
# saved .npz matrices and the column-name prefix in df_centrality.
centrality_names = ['betweenness_centrality', 'closeness_centrality']

In [None]:
# Empty long-format scaffold for weighted centrality values.
# NOTE(review): the next cell rebinds df_centrality to a fresh wide-format
# frame, so this scaffold appears to be unused — confirm and consider removing.
df_centrality = pd.DataFrame(columns = ['name', 'alpha', 'stage', 'centrality_name', 'centrality_value'])
# df_centrality['name'] = unique_people

In [None]:
# Build the wide weighted-centrality table: one row per person in
# unique_people, one column per (measure, alpha, stage) combination.
# Column names follow "<measure>_alpha_<alpha>_stage_<stage>", which is also the
# stem of the .npz file each column is loaded from.
df_centrality = pd.DataFrame()
df_centrality['name'] = unique_people
for k in range(4):  # stages 0..3
    for name in centrality_names:
        for alpha in Alpha:
            column = name+'_alpha_'+str(alpha)+'_stage_'+str(k)
            sparse_matrix = scipy.sparse.load_npz('./centrality_by_stage_weighted/'+column+'.npz')
            # Flatten to 1-D so the values line up with the 'name' rows.
            df_centrality[column] = sparse_matrix.toarray().reshape(-1)

df_centrality

#### cra and comm

In [None]:
# CRA / commissioner subset: rows of df_centrality at the positions in
# total_idx_list, renumbered from 0.
df_cra_comm = df_centrality.iloc[total_idx_list].reset_index(drop=True)
df_cra_comm

In [None]:
# Reshape df_cra_comm from wide to long form with columns:
# name, centrality, stage, centrality_name, alpha.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame every iteration and was removed in pandas 2.0.
cra_comm_long_frames = []
for col_name in list(df_cra_comm.columns)[1:]:  # first column is 'name'
    # Column names look like "<word>_<word>_alpha_<alpha>_stage_<stage digit>".
    parts = col_name.split('_')
    cra_comm_long_frames.append(
        df_cra_comm[['name', col_name]]
        .rename(columns={col_name: 'centrality'})
        .assign(
            stage=int(col_name[-1]),
            centrality_name='_'.join(parts[:2]),
            alpha=float(parts[-3]),
        )
    )
# pd.concat raises on an empty list, so fall back to an empty frame.
df_cra_comm_reordered = pd.concat(cra_comm_long_frames) if cra_comm_long_frames else pd.DataFrame()
df_cra_comm_reordered

In [None]:
# Interactive bar chart of weighted centrality for the CRA / commissioner group.
# Widgets: dropdown (measure), slider (stage), slider (alpha); the interval
# brush on the overview sets the x-domain of the detail view.
alt.data_transformers.disable_max_rows()

centrality_options = df_cra_comm_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider1 = alt.binding_range(min=0, max=3, step=1)        # stage
slider2 = alt.binding_range(min=0.5, max=1.5, step=0.5)  # alpha

selection = alt.selection_single(
    name="Select",
    fields=['centrality_name', 'stage', 'alpha'],
    bind={'centrality_name': dropdown, 'stage': slider1, 'alpha': slider2},
    init={"centrality_name": "betweenness_centrality", "stage": 0, 'alpha': 1},
)

brush = alt.selection(type="interval", encodings=['x'])

# Overview strip: carries both selections; bars sorted by descending value.
base = (
    alt.Chart(df_cra_comm_reordered, title="Base view")
    .mark_bar()
    .encode(x=alt.X('name:O', sort='-y'), y='centrality:Q')
    .add_selection(selection, brush)
    .transform_filter(selection)
    .transform_window(
        rank='rank(centrality)',
        sort=[alt.SortField('centrality', order='descending')],
    )
    .transform_filter(alt.datum.rank < 200)
    .properties(width=850, height=50)
)

# Detail view: filtered by the widgets only, x-domain bound to the brush.
zoomed_title = {
    "text": "Top CRA and commisionner people with highest weighted centrality",
    "subtitle": [
        "Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list",
        f"There are {len(df_cra_comm)} CRA and commisionner people. All are shown here.",
        "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value",
    ],
    "color": "black",
    "subtitleColor": "green",
}
zoomed = (
    alt.Chart(df_cra_comm_reordered)
    .mark_bar()
    .transform_filter(selection)
    .encode(
        x=alt.X('name:O', sort='-y', scale=alt.Scale(zero=False, domain=brush)),
        y='centrality:Q',
    )
    .properties(title=zoomed_title, width=850, height=400)
)

# Detail view stacked above the overview.
chart = zoomed & base

chart.save('weighted_cra_comm_centrality_sorted_bar.html')
chart

#### gov

In [None]:
# Government subset: rows of df_centrality at the positions in
# government_idx_list, renumbered from 0.
df_gov = df_centrality.iloc[government_idx_list].reset_index(drop=True)
df_gov

In [None]:
# Reshape df_gov from wide to long form with columns:
# name, centrality, stage, centrality_name, alpha.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame every iteration and was removed in pandas 2.0.
gov_long_frames = []
for col_name in list(df_gov.columns)[1:]:  # first column is 'name'
    # Column names look like "<word>_<word>_alpha_<alpha>_stage_<stage digit>".
    parts = col_name.split('_')
    gov_long_frames.append(
        df_gov[['name', col_name]]
        .rename(columns={col_name: 'centrality'})
        .assign(
            stage=int(col_name[-1]),
            centrality_name='_'.join(parts[:2]),
            alpha=float(parts[-3]),
        )
    )
# pd.concat raises on an empty list, so fall back to an empty frame.
df_gov_reordered = pd.concat(gov_long_frames) if gov_long_frames else pd.DataFrame()
df_gov_reordered

In [None]:
# Interactive bar chart of weighted centrality for the government group.
# Widgets: dropdown (measure), slider (stage), slider (alpha). The interval
# brush on the overview filters which bars appear in the detail view.
alt.data_transformers.disable_max_rows()

# Number of top-ranked people to display. The filter below uses `<=` so that
# exactly the top `top_n` ranks are kept, matching the subtitle; the previous
# `rank < 100` silently dropped rank 100.
top_n = 100

centrality_options = df_gov_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider1 = alt.binding_range(min=0, max=3, step=1)
# NOTE(review): the alpha slider starts at 0.5, so rows with alpha == 0 cannot
# be selected from the widget — presumably intentional; confirm.
slider2 = alt.binding_range(min=0.5, max=1.5, step=0.5)


selection = alt.selection_single(
    fields=['centrality_name', "stage", "alpha"],
    bind={'centrality_name': dropdown, 'stage': slider1, 'alpha':slider2},
    name="Select",
    init={"centrality_name": "betweenness_centrality", "stage": 0, "alpha": 1}
)

brush = alt.selection(type="interval", encodings=['x'])

# color = alt.condition(selection,
#                       alt.Color('centrality_name:O', legend=None),
#                       alt.value('lightgray'))


# Overview strip: hosts both selections; rank is computed after the widget
# filter so it is relative to the selected measure, stage and alpha.
base = alt.Chart(df_gov_reordered, title="Base view").add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).mark_bar().encode(x=alt.X('name:O', sort='-y', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    width=850,
    height=50
)

# Detail view: same top-N filtering, then restricted to the brushed names.
zoomed = alt.Chart(df_gov_reordered).transform_filter(
    selection
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).encode(x=alt.X('name:O', sort='-y'),
    y='centrality:Q'
).transform_filter(
    brush
).mark_bar().properties(
    title={
      "text": "Top Gov people with highest weighted centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_gov))+" gov people. Only top "+str(top_n)+" are shown.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

# Detail view stacked above the overview.
chart = zoomed & base

# chart.save('weighted_gov_centrality_sorted_bar.html')
chart

#### others

In [None]:
# "Others" subset: every row whose index is in neither total_idx_list nor
# government_idx_list, renumbered from 0.
in_named_groups = (
    df_centrality.index.isin(total_idx_list)
    | df_centrality.index.isin(government_idx_list)
)
df_others = df_centrality.iloc[~in_named_groups].reset_index(drop=True)
df_others

In [None]:
# Reshape df_others from wide to long form with columns:
# name, centrality, stage, centrality_name, alpha.
# Frames are collected in a list and concatenated once: DataFrame.append in a
# loop copies the accumulated frame every iteration and was removed in pandas 2.0.
others_long_frames = []
for col_name in list(df_others.columns)[1:]:  # first column is 'name'
    # Column names look like "<word>_<word>_alpha_<alpha>_stage_<stage digit>".
    parts = col_name.split('_')
    others_long_frames.append(
        df_others[['name', col_name]]
        .rename(columns={col_name: 'centrality'})
        .assign(
            stage=int(col_name[-1]),
            centrality_name='_'.join(parts[:2]),
            alpha=float(parts[-3]),
        )
    )
# pd.concat raises on an empty list, so fall back to an empty frame.
df_others_reordered = pd.concat(others_long_frames) if others_long_frames else pd.DataFrame()
df_others_reordered

In [None]:
# Persist the long-format "others" table (index included) for use outside the notebook.
df_others_reordered.to_csv('weighted_df_others.csv')

In [None]:
# Interactive bar chart of weighted centrality for everyone outside the
# CRA/commissioner and government groups.
# Widgets: dropdown (measure), slider (stage), slider (alpha). The interval
# brush on the overview sets the x-domain of the detail view.
alt.data_transformers.disable_max_rows()

# Number of top-ranked people to display. The filter below uses `<=` so that
# exactly the top `top_n` ranks are kept, matching the subtitle; the previous
# `rank < 100` silently dropped rank 100.
top_n = 100

centrality_options = df_others_reordered["centrality_name"].unique().tolist()
dropdown = alt.binding_select(options=centrality_options)
slider1 = alt.binding_range(min=0, max=3, step=1)
# NOTE(review): the alpha slider starts at 0.5, so rows with alpha == 0 cannot
# be selected from the widget — presumably intentional; confirm.
slider2 = alt.binding_range(min=0.5, max=1.5, step=0.5)


selection = alt.selection_single(
    fields=['centrality_name', "stage", "alpha"],
    bind={'centrality_name': dropdown, 'stage': slider1, 'alpha':slider2},
    name="Select",
    init={"centrality_name": "betweenness_centrality", "stage": 0, "alpha": 1}
)

brush = alt.selection(type="interval", encodings=['x'])

# color = alt.condition(selection,
#                       alt.Color('centrality_name:O', legend=None),
#                       alt.value('lightgray'))


# Overview strip: hosts both selections; rank is computed after the widget
# filter so it is relative to the selected measure, stage and alpha.
base = alt.Chart(df_others_reordered, title="Base view").add_selection(
    selection
).add_selection(
    brush
).transform_filter(
    selection
).mark_bar().encode(x=alt.X('name:O', sort='-y', axis=alt.Axis(labels=False)),
    y='centrality:Q'
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).properties(
    width=850,
    height=50
)

# Detail view: same top-N filtering, x-domain bound to the brush.
zoomed = alt.Chart(df_others_reordered).transform_filter(
    selection
).transform_window(
    rank='rank(centrality)',
    sort=[alt.SortField('centrality', order='descending')]
).transform_filter(
    (alt.datum.rank <= top_n)
).encode(x=alt.X('name:O', sort='-y', scale=alt.Scale(zero=False, domain=brush)),
    y='centrality:Q'
).mark_bar().properties(
    title={
      "text": "Top other people with highest weighted centrality", 
      "subtitle": ["Select an interval in the base view, slide to choose a stage, and select one centrality from the dropdown list", 
                   "There are "+str(len(df_others))+" other people. Only top "+str(top_n)+" are shown.", 
                   "A known issue/feature is, the highest bar is always shown, even when it's not in the range. A tiny stair next to it shows the true corresponding value"], 
      "color": "black",
      "subtitleColor": "green"
    },
    width=850,
    height=400
)

# Detail view stacked above the overview.
chart = zoomed & base

chart.save('weighted_others_centrality_sorted_bar.html')
chart

# emails in selected month (if necessary)