**Requires `output_attr_stats.csv` as input**   
**Generates two folders: adj_folder and centrality_folder, with all calculation results inside**

# Calculation flow inside this notebook

| Steps | Based on         | Calculate        | Speed | Result saved |
|-------|------------------|------------------|-------|--------------|
| 1     | emails           | adjacency matrix | slow  | yes          |
| 2     | adjacency matrix | graph            | fast  | no           |
| 3     | graph            | centrality       | slow  | yes          |


Although the adjacency matrices are named "undirected", directed adjacency matrices can easily be obtained by doing matrix operations.

In [27]:
import numpy as np
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
# import matplotlib.colors as mcolors
import pandas as pd
from datetime import datetime
from tqdm.auto import tqdm,trange
import warnings
# import logging
import scipy.sparse
# import altair as alt
# from vega_datasets import data
from pathlib import Path
import os
import json

warnings.filterwarnings("ignore") 
# logging.getLogger('matplotlib.font_manager').disabled = True

In [2]:
def nameToIndexDict(l_unique_names):
    '''
    Build the two lookup tables that translate between a name and its position in the list

    l_unique_names: a list of sorted unique names
    return: name2id: dict mapping each name to its position in l_unique_names
    return: id2name: dict mapping each position back to the name stored there
    '''
    id2name = dict(enumerate(l_unique_names))
    # Walking id2name in insertion order means a repeated name ends up with its last position.
    name2id = {name: idx for idx, name in id2name.items()}
    return name2id, id2name

In [3]:
def get_people_list(from_list, to_list, cc_list):
    '''
    Gather every distinct person that appears as sender, recipient or cc recipient

    from_list: list of senders
    to_list: list of recipients, ';' seperated
    cc_list: list of cc recipients, ';' seperated
    return: unique_people: a people list in alphabetical order
    return: invalid_people_idx: positions in unique_people whose name contains 'invalid'
    '''
    # Senders are single names; null entries are ignored.
    people = {sender for sender in from_list if not pd.isnull(sender)}

    # To and CC cells hold several names joined by ';'; null cells are ignored.
    for recipient_field in (to_list, cc_list):
        for cell in recipient_field:
            if pd.isnull(cell):
                continue
            people.update(cell.split(';'))

    unique_people = sorted(people)
    invalid_people_idx = [
        pos for pos, person in enumerate(unique_people) if 'invalid' in person
    ]
    return unique_people, invalid_people_idx

In [4]:
def seperate_emails(df_email, by, date_col = 'date_sent'):
    '''
    seperate emails by stage or month

    df_email: dataframe with email records 
    by: how to seperate, either by month or by stage
    date_col: name of the date column in df_email
    return: idxes: list of boolean arrays (one per period); True marks the rows of df_email in that period
    return: idx_labels: list of labels, corresponds to each splitting period
    return: label_colors: dict of label:color, corresponds to each splitting period

    by == 'stage': four periods split at 2014-01-23, 2014-05-14 and 2016-02-24
                   (a date equal to a split point belongs to the earlier period).
    by == 'month': one period per calendar month, from the first month that has emails to the
                   last month that has emails; empty months in between are kept.
                   label_colors may additionally hold entries for the trailing empty months
                   of the final year, which are trimmed from idxes and idx_labels.
    '''
    assert by in ['month', 'stage'], 'not implemented'
    date_email = pd.to_datetime(df_email[date_col])
    colors = ['green', 'blue', 'red', 'orange']

    if by == 'stage':
        after_t1 = np.array(date_email > datetime(2014, 1, 23))
        after_t2 = np.array(date_email > datetime(2014, 5, 14))
        after_t3 = np.array(date_email > datetime(2016, 2, 24))

        idxes = [
            ~after_t1,
            after_t1 & ~after_t2,
            after_t2 & ~after_t3,
            after_t3,
        ]
        idx_labels = ['before 2014,1,23', '2014,1,23 - 2014,5,14', '2014,5,14 - 2016,2,24', 'after 2016,2,24']
        label_colors = dict(zip(idx_labels, colors))
    elif by == 'month':
        month = date_email.dt.month
        year = date_email.dt.year
        idxes = []
        idx_labels = []

        color = colors[0]
        label_colors = {}
        started = False  # stays False while we are still in the leading empty months
        for y in range(int(year.min()), int(year.max()) + 1):
            for m in range(1, 13):
                idx = np.array((year == y) & (month == m))
                if not started and not idx.any():
                    continue
                started = True
                idxes.append(idx)
                label = str(y) + '.' + str(m)
                idx_labels.append(label)
                label_colors[label] = color
                # The colour changes for the months AFTER each of these labels.
                if label == '2014.1':
                    color = colors[1]
                elif label == '2014.5':
                    color = colors[2]
                elif label == '2016.2':
                    color = colors[3]

        # Drop the empty months at the end. Count how many periods to keep instead of
        # slicing with [:-i]: when the last month is non-empty, i is 0 and [:-0] is empty.
        n_keep = len(idxes)
        while n_keep > 0 and not idxes[n_keep - 1].any():
            n_keep -= 1
        idxes = idxes[:n_keep]
        idx_labels = idx_labels[:n_keep]
    return idxes, idx_labels, label_colors

In [5]:
def prepare_auxiliary_cols(df_email):
    '''
    Create cols 'num_TO', 'num_CC', and 'num_TO_CC'. During building adjacency matrix, these columns may be used to scale edges

    The columns are added to df_email in place; nothing is returned.
    A missing (null) 'To' or 'CC' cell counts as 0 recipients.

    df_email: dataframe with email records; must have ';' seperated 'To' and 'CC' columns
    '''
    def count_recipients(cell):
        # Null cells have no .split(); treat them as "nobody" instead of crashing.
        if pd.isnull(cell):
            return 0
        return len(cell.split(';'))

    num_TO = [count_recipients(cell) for cell in df_email['To'].values.tolist()]
    num_CC = [count_recipients(cell) for cell in df_email['CC'].values.tolist()]

    df_email['num_TO'] = num_TO
    df_email['num_CC'] = num_CC
    df_email['num_TO_CC'] = [n_to + n_cc for n_to, n_cc in zip(num_TO, num_CC)]

In [29]:
def build_adj_mat(df_email, by, name2id, adj_folder, scale_edges=False, save=True, force_recalc=False):
    '''
    build adjacency matrix and save

    For every period returned by seperate_emails, entry [s, r] of the matrix accumulates
    one contribution per email sent by person s with person r in 'To' or 'CC'.
    Matrices are written to adj_folder as 'adjmat_<by>_<k>.npz' (scipy sparse format).
    The period index arrays and the label colors are written to the current working
    directory as 'sparse_idxes_<by>.npz' and 'label_colors_<by>.json'.

    df_email: dataframe with email records 
    by: how to seperate, either by month or by stage
    name2id: dictionary, name to index mapping
    adj_folder: where do you want to save adj matrix
    scale_edges: if True, instead of add 1 to the adjmat for each edge, add 1/num_TO_CC
                 (num_TO_CC = number of 'To' plus 'CC' recipients of that email)
    save: whether to save the adj matrix (if False the matrices are computed but neither
          written nor returned)
    force_recalc: if True, recalculate the adj matrix even if the output file exist
    return: idxes: list of indices, corresponds to each splitting period
    return: idx_labels: list of labels, corresponds to each splitting period
    return: label_colors: dict of label:color, corresponds to each splitting period
    '''
    idxes, idx_labels, label_colors = seperate_emails(df_email, by)

    num_people = len(name2id)

    for k in trange(len(idxes)):
        outfile = adj_folder+'/adjmat_'+by+'_'+str(k)+'.npz'
        # Reuse an existing file only when the caller did not ask for a recalculation.
        if not force_recalc and os.path.exists(outfile):
            print(f'outfile {outfile} found! Skipped!')
            continue
        df_temp = df_email.iloc[idxes[k]]
        if len(df_temp) == 0:
            print(f'no email found in period {k}! Skipped!')
            continue

        email_adj_mat = np.zeros((num_people, num_people))

        email_rows = zip(
            df_temp['From'].values.tolist(),
            df_temp['To'].values.tolist(),
            df_temp['CC'].values.tolist(),
        )
        for sender, to_cell, cc_cell in email_rows:
            if pd.isnull(sender):
                continue

            # 'To' recipients first, then 'CC'; a null cell contributes nobody.
            recipients = []
            if not pd.isnull(to_cell):
                recipients.extend(to_cell.split(';'))
            if not pd.isnull(cc_cell):
                recipients.extend(cc_cell.split(';'))
            if not recipients:
                continue

            # Counting recipients here keeps scaling null-safe and avoids
            # adding helper columns to a slice of df_email.
            edge_weight = 1/len(recipients) if scale_edges else 1
            s = name2id[sender]  # id of the sender
            for name in recipients:
                email_adj_mat[s, name2id[name]] += edge_weight

        if save:
            email_adj_mat_tosave = scipy.sparse.csc_matrix(email_adj_mat)
            scipy.sparse.save_npz(outfile, email_adj_mat_tosave)
    if save: 
        idxes_outfile = 'sparse_idxes_'+by+'.npz'
        sparse_idxes = scipy.sparse.csc_matrix(np.array(idxes))
        scipy.sparse.save_npz(idxes_outfile, sparse_idxes)
        with open('label_colors_'+by+'.json', 'w') as f:
            json.dump(label_colors, f)
    return idxes, idx_labels, label_colors

In [7]:
def build_graph(adj_mat_path, directed, alpha, invalid_people_idx = None):
    '''
    build networkx graph and return it for further calculation or plotting

    Steps:
      1. load the sparse adjacency matrix and raise every non-zero entry to the power alpha
      2. directed=False: add the matrix to its transpose and keep the strict upper triangle,
         so each pair of people gets one edge carrying the emails of both directions
         directed=True: keep the matrix as loaded (row = sender, column = receiver)
      3. drop edges from a person to itself (in both modes)
      4. turn every remaining weight w into the distance 1/w
      5. zero the rows and columns of the people listed in invalid_people_idx

    NOTE(review): the previous version applied step 2's symmetrise/upper-triangle to the
    directed graph and left the undirected graph unsymmetrised; that looked inverted and
    is swapped here. Centralities computed with the old version will differ.

    adj_mat_path: path to the adjacency matrix (file existence should be checked before passing into this function)
    directed: build a directed graph or an undirected graph
    alpha: alpha factor to scale adjacency matrix
    invalid_people_idx: a list of index or None. if None, not removing nodes. 
                        if not None, set corresponding columns and rows to be 0.

    return: G: a networkx graph whose edge attribute 'weight' holds the distance
    '''
    assert directed in [True, False], 'not implemented'

    email_adj_mat = np.asarray(scipy.sparse.load_npz(adj_mat_path).todense(), dtype=float)

    # Apply alpha only where an edge exists. Powering the zeros as well would turn them
    # into 1 (alpha == 0) or inf (alpha < 0), and inf * False is NaN.
    has_edge = email_adj_mat != 0
    scaled = np.zeros_like(email_adj_mat)
    scaled[has_edge] = email_adj_mat[has_edge] ** alpha
    email_adj_mat = scaled

    if directed:
        np.fill_diagonal(email_adj_mat, 0.0)  # remove edge to itself
    else:
        email_adj_mat = email_adj_mat + email_adj_mat.T
        email_adj_mat = np.triu(email_adj_mat, 1) # this also remove edge to itself

    # more emails means closer relationship. Therefore, we use the inverse of email numbers as the distance
    # (entries without an edge stay 0, which networkx reads as "no edge")
    has_edge = email_adj_mat != 0
    distance = np.zeros_like(email_adj_mat)
    distance[has_edge] = 1 / email_adj_mat[has_edge]
    email_adj_mat = distance

    if invalid_people_idx is not None:
        for i in invalid_people_idx:
            email_adj_mat[i,:] = 0.0
            email_adj_mat[:,i] = 0.0

    graph_type = nx.DiGraph if directed else nx.Graph
    G = nx.from_numpy_array(email_adj_mat, create_using=graph_type)

    return G

In [8]:
def calc_centrality(adj_folder, centrality_folder, centrality, directed, weighted, invalid_people_idx, ALPHA=[1], force_recalc=False):
    '''
    calculate centralities

    Every '<prefix>_<by>_<k>.npz' file in adj_folder is turned into a graph with build_graph
    and the requested centrality is saved to
    centrality_folder/<by>_<k>_alpha_<alpha>_<centrality>.npz as a sparse row vector
    (one value per node, in node order).

    NOTE(review): the output file name does not contain 'directed' or 'weighted', so runs
    with different settings share the same file; use another centrality_folder or
    force_recalc=True when changing them.

    adj_folder: where do you saved adj matrix
    centrality_folder: where do you want to save centrality data
    centrality: which centrality you want to calculate ('betweenness', 'closeness' or 'degree')
    directed: directed graph or undirected graph
    weighted: weighted edges or edges are equally contributed (ignored for 'degree')
    invalid_people_idx: list of node indices to disconnect, or None (passed to build_graph)
    ALPHA: a list of alpha factor to scale the adcacency matrix
    force_recalc: if True, recalculate even if the output file exist
    '''
    assert centrality in ['betweenness', 'closeness', 'degree'], 'not implemented'

    # The graph stores distances in the 'weight' attribute; None means "count hops".
    edge_attr = 'weight' if weighted else None

    for file in tqdm(os.listdir(adj_folder)):
        if not file.endswith(".npz"):
            continue
        name_parts = file[:-4].split('_')
        if len(name_parts) != 3:
            # Not named like 'adjmat_<by>_<k>.npz'; skip it rather than fail on unpacking.
            print(f'unexpected file name {file}! Skipped!')
            continue
        _, by, k = name_parts
        infile = adj_folder+'/'+file
        for alpha in ALPHA:
            outfile = centrality_folder+'/'+by+'_'+k+'_alpha_'+str(alpha)+'_'+centrality+'.npz'
            if not force_recalc and os.path.exists(outfile):
                print(f'outfile {outfile} found! Skipped!')
                continue

            G = build_graph(infile, directed, alpha, invalid_people_idx)
            if centrality == 'betweenness':
                scores = nx.centrality.betweenness_centrality(G, weight=edge_attr)
            elif centrality == 'closeness':
                scores = nx.centrality.closeness_centrality(G, distance=edge_attr)
            else:
                scores = nx.centrality.degree_centrality(G)

            centrality_arr = np.array(list(scores.values()))
            scipy.sparse.save_npz(outfile, scipy.sparse.csc_matrix(centrality_arr))
            # Release the graph before the next one is built to keep peak memory down.
            G = None
            scores = None
            centrality_arr = None


# Scripts start here

In [9]:
# Load the email records and pull the three address columns out as plain lists.
df_email = pd.read_csv('output_attr_stats.csv')

from_list, to_list, cc_list = (
    df_email[column].values.tolist() for column in ('From', 'To', 'CC')
)

# Alphabetical roster of everyone seen, plus the two lookup tables built from it.
unique_people, invalid_people_idx = get_people_list(from_list, to_list, cc_list)
name2id, id2name = nameToIndexDict(unique_people)
num_people = len(unique_people)

In [10]:
# Output locations; both folders are created if they do not exist yet.
adj_folder = "./undirected_adj_03212022"
centrality_folder = "./centrality_weighted_05072022"
os.makedirs(adj_folder, exist_ok=True)
os.makedirs(centrality_folder, exist_ok=True)

In [12]:
# Unscaled adjacency matrices, one per calendar month.
by = 'month'
idxes, idx_labels, label_colors = build_adj_mat(
    df_email=df_email, by=by, name2id=name2id, adj_folder=adj_folder
)

  0%|          | 0/110 [00:00<?, ?it/s]

outfile ./undirected_adj_03212022/adjmat_month_0.npz found! Skipped!
no email found in period 1! Skipped!
no email found in period 2! Skipped!
no email found in period 3! Skipped!
no email found in period 4! Skipped!
no email found in period 5! Skipped!
no email found in period 6! Skipped!
no email found in period 7! Skipped!
no email found in period 8! Skipped!
no email found in period 9! Skipped!
no email found in period 10! Skipped!
no email found in period 11! Skipped!
no email found in period 12! Skipped!
no email found in period 13! Skipped!
no email found in period 14! Skipped!
no email found in period 15! Skipped!
no email found in period 16! Skipped!
no email found in period 17! Skipped!
no email found in period 18! Skipped!
no email found in period 19! Skipped!
outfile ./undirected_adj_03212022/adjmat_month_20.npz found! Skipped!
no email found in period 21! Skipped!
no email found in period 22! Skipped!
no email found in period 23! Skipped!
no email found in period 24! Skipp

In [11]:
# Unscaled adjacency matrices, one per stage.
by = 'stage'
idxes, idx_labels, label_colors = build_adj_mat(
    df_email=df_email, by=by, name2id=name2id, adj_folder=adj_folder
)

  0%|          | 0/4 [00:00<?, ?it/s]

outfile ./undirected_adj_03212022/adjmat_stage_0.npz found! Skipped!
outfile ./undirected_adj_03212022/adjmat_stage_1.npz found! Skipped!
outfile ./undirected_adj_03212022/adjmat_stage_2.npz found! Skipped!
outfile ./undirected_adj_03212022/adjmat_stage_3.npz found! Skipped!


In [14]:
# Closeness centrality for every adjacency matrix in adj_folder.
centrality, directed, weighted = 'closeness', True, True
calc_centrality(
    adj_folder=adj_folder,
    centrality_folder=centrality_folder,
    centrality=centrality,
    directed=directed,
    weighted=weighted,
    invalid_people_idx=invalid_people_idx,
    ALPHA=[1],
    force_recalc=False,
)

  0%|          | 0/77 [00:00<?, ?it/s]

In [15]:
# Betweenness centrality for every adjacency matrix in adj_folder.
centrality, directed, weighted = 'betweenness', True, True
calc_centrality(
    adj_folder=adj_folder,
    centrality_folder=centrality_folder,
    centrality=centrality,
    directed=directed,
    weighted=weighted,
    invalid_people_idx=invalid_people_idx,
    ALPHA=[1],
    force_recalc=False,
)

  0%|          | 0/77 [00:00<?, ?it/s]

In [16]:
# Degree centrality for every adjacency matrix in adj_folder.
centrality, directed, weighted = 'degree', True, True
calc_centrality(
    adj_folder=adj_folder,
    centrality_folder=centrality_folder,
    centrality=centrality,
    directed=directed,
    weighted=weighted,
    invalid_people_idx=invalid_people_idx,
    ALPHA=[1],
    force_recalc=False,
)

  0%|          | 0/77 [00:00<?, ?it/s]

# adj mats for concentric network plots

In [17]:
# Scaled (1/num_TO_CC per edge) monthly adjacency matrices go to their own folder.
adj_folder = "./undirected_adj_scaled_05072022"
os.makedirs(adj_folder, exist_ok=True)
os.makedirs(centrality_folder, exist_ok=True)
by = 'month'
build_adj_mat(
    df_email=df_email,
    by=by,
    name2id=name2id,
    adj_folder=adj_folder,
    scale_edges=True,
    save=True,
    force_recalc=False,
)

  0%|          | 0/110 [00:00<?, ?it/s]

no email found in period 1! Skipped!
no email found in period 2! Skipped!
no email found in period 3! Skipped!
no email found in period 4! Skipped!
no email found in period 5! Skipped!
no email found in period 6! Skipped!
no email found in period 7! Skipped!
no email found in period 8! Skipped!
no email found in period 9! Skipped!
no email found in period 10! Skipped!
no email found in period 11! Skipped!
no email found in period 12! Skipped!
no email found in period 13! Skipped!
no email found in period 14! Skipped!
no email found in period 15! Skipped!
no email found in period 16! Skipped!
no email found in period 17! Skipped!
no email found in period 18! Skipped!
no email found in period 19! Skipped!
no email found in period 21! Skipped!
no email found in period 22! Skipped!
no email found in period 23! Skipped!
no email found in period 24! Skipped!
no email found in period 25! Skipped!
no email found in period 26! Skipped!
no email found in period 27! Skipped!
no email found in per

([array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False, False, False, ..., False, False, False]),
  array([False

In [30]:
# Scaled (1/num_TO_CC per edge) per-stage adjacency matrices, same folder as the monthly ones.
adj_folder = "./undirected_adj_scaled_05072022"
os.makedirs(adj_folder, exist_ok=True)
os.makedirs(centrality_folder, exist_ok=True)
by = 'stage'
build_adj_mat(
    df_email=df_email,
    by=by,
    name2id=name2id,
    adj_folder=adj_folder,
    scale_edges=True,
    save=True,
    force_recalc=False,
)

  0%|          | 0/4 [00:00<?, ?it/s]

outfile ./undirected_adj_scaled_05072022/adjmat_stage_0.npz found! Skipped!
outfile ./undirected_adj_scaled_05072022/adjmat_stage_1.npz found! Skipped!
outfile ./undirected_adj_scaled_05072022/adjmat_stage_2.npz found! Skipped!
outfile ./undirected_adj_scaled_05072022/adjmat_stage_3.npz found! Skipped!


([array([False, False,  True, ..., False, False,  True]),
  array([False, False, False, ..., False, False, False]),
  array([ True,  True, False, ..., False, False, False]),
  array([False, False, False, ...,  True,  True, False])],
 ['before 2014,1,23',
  '2014,1,23 - 2014,5,14',
  '2014,5,14 - 2016,2,24',
  'after 2016,2,24'],
 {'before 2014,1,23': 'green',
  '2014,1,23 - 2014,5,14': 'blue',
  '2014,5,14 - 2016,2,24': 'red',
  'after 2016,2,24': 'orange'})