In [1]:
# Notebook metadata: authorship, credits, license, version and contact.
__author__ = "Lisette Espin"
__copyright__ = "Copyright 2019, GESIS"
__credits__ = ["Lisette Espin-Noboa", 
               "Fariba Karimi",                
               "Claudia Wagner",
               "Markus Strohmaier"]
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Lisette Espin"
__email__ = "lisette.espin@gesis.org"
__status__ = "Development"

<div style="text-align:center;"><h1>Anonymizing Real-World Networks</h1></div>

<h2>Dependencies</h2>

In [2]:
import os
import sys
import networkx as nx
import multiprocessing
from joblib import delayed
from joblib import Parallel
from collections import Counter

In [3]:
%load_ext autoreload
%autoreload 2

# Make the project sources under ../../code importable; the import below
# relies on this path entry being added first.
sys.path.append('../../code')
from org.gesis.libs.utils import printf

<h2>Constants and Functions</h2>

In [4]:
# Names of the datasets to process ('twitter' is currently left out).
DATASETS = [
    'aps',
    'github',
    'pokec',
    'wikipedia',
]

# Base directory of the data, relative to this notebook.
ROOT = '../../data/'

In [5]:
def anon(dataset, root):
    """Write an anonymized copy of a dataset's attributed network.

    Reads ``<root>/<dataset>/<dataset>_attributed_network.gpickle``, relabels
    its nodes with consecutive integers starting at 1 and builds a fresh
    ``nx.DiGraph`` that keeps only the edges, a few graph-level metadata
    entries and a binary ``'minority'`` node attribute. The result is written
    next to the input file with ``_anon`` inserted before the extension.

    Parameters
    ----------
    dataset : str
        Dataset name; used as sub-directory and as file-name prefix.
    root : str
        Directory that contains one sub-directory per dataset.

    Returns
    -------
    None
        The function only writes the new file and prints a short summary.

    Notes
    -----
    * The input graph must define ``graph['name']`` and ``graph['class']``;
      ``graph['class']`` names the node attribute holding the class label.
    * The two most frequent class values become ``graph['labels']`` as
      ``[majority, minority]``. An ``IndexError`` is raised when fewer than
      two distinct values exist.
    * Only nodes that take part in at least one edge end up in the output.
    """
    printf(dataset)
    # Use the `root` argument (not the notebook-level constant) so callers can
    # point the function at any data directory.
    fn = os.path.join(root, dataset, '{}_attributed_network.gpickle'.format(dataset))

    # converting node names to seq. numbers
    # NOTE(review): gpickle files are unpickled on load; only read trusted files.
    G = nx.read_gpickle(fn)
    G = nx.convert_node_labels_to_integers(G, first_label=1, ordering='default')
    class_attr = G.graph['class']

    # copy
    H = nx.DiGraph()
    H.add_edges_from(G.edges())  # singletons (degree 0) are ignored.

    # setting graph metadata (inferring minority label)
    counts = Counter(G.nodes[n][class_attr] for n in G.nodes())
    print(dataset, counts)
    ranked = counts.most_common()  # most frequent class value first
    H.graph['name'] = G.graph['name']
    H.graph['class'] = class_attr
    H.graph['labels'] = [ranked[0][0], ranked[1][0]]
    H.graph['groups'] = ['M', 'm']
    H.graph['label'] = 'minority'

    # setting node attribute: m (is minority?)
    minority_label = H.graph['labels'][1]
    attrs = {n: {'minority': int(G.nodes[n][class_attr] == minority_label)}
             for n in G.nodes()}
    nx.set_node_attributes(H, attrs)

    # writing new version: insert '_anon' right before the file extension only
    stem, ext = os.path.splitext(fn)
    nx.write_gpickle(H, stem + '_anon' + ext)
    print(H.graph)

    # sanity check: show one node with its flag and its original class value
    n = list(H.nodes())[0]
    print('{}: {}, {}'.format(n, H.nodes[n]['minority'], G.nodes[n][class_attr]))
    print()

<h2>Main</h2>

In [6]:
# Anonymize every dataset listed in DATASETS, one after another.
for dataset in DATASETS:
    anon(dataset, ROOT)    

2020-02-27 13:28:19	aps
aps Counter({'05.30.-d': 1157, '05.20.-y': 696})
{'label': 'minority', 'labels': ['05.30.-d', '05.20.-y'], 'groups': ['M', 'm'], 'name': 'APS', 'class': 'pacs'}
1: 0, 05.30.-d

2020-02-27 13:28:19	github
github Counter({'male': 293268, 'female': 18487})
{'label': 'minority', 'labels': ['male', 'female'], 'groups': ['M', 'm'], 'name': 'Github', 'class': 'gender'}
1: 0, male

2020-02-27 13:28:52	pokec
pokec Counter({0: 828304, 1: 804336})
{'label': 'minority', 'labels': [0, 1], 'groups': ['M', 'm'], 'name': 'Pokec', 'class': 'gender'}
1: 1, 1

2020-02-27 13:39:04	wikipedia
wikipedia Counter({'male': 2678, 'female': 481})
{'label': 'minority', 'labels': ['male', 'female'], 'groups': ['M', 'm'], 'name': 'Wikipedia', 'class': 'gender'}
1: 0, male

