In [3]:
import numpy as np
import pandas as pd
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt

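# Download data: https://www.kaggle.com/manoelribeiro/hateful-users-on-twitter
# NOTE(review): the link above is the Kaggle "hateful users on Twitter" dataset, but the
# cells below read Pubmed-Diabetes files from pubmed_data/ -- confirm the correct download source.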

# Record the networkx version this notebook was run with.
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('nx: {}'.format(nx.__version__))

nx: 2.0


In [3]:
# Input files of the Pubmed-Diabetes data set; both paths are relative to the
# directory the notebook is run from.
node_file = 'pubmed_data/Pubmed-Diabetes.NODE.paper.tab'  # parsed by load_node_data
edge_file = 'pubmed_data/Pubmed-Diabetes.DIRECTED.cites.tab'  # parsed by load_edge_data

In [21]:
def load_node_data(node_file):
    """Parse a tab-separated node file into ids, labels and a dense feature matrix.

    Layout expected by the parser:
      * line 1 is skipped;
      * line 2 is a header; its first and last tab-separated fields are
        ignored, and for every other field the second ':'-separated piece is
        used as the feature name, whose position becomes the feature's column;
      * each remaining line is ``<int id> <x>=<label> <feature>=<value> ...``
        (tab-separated); the last field of the line is ignored and features
        that are not listed keep the value 0.0. Blank lines are skipped.

    Parameters
    ----------
    node_file : str
        Path of the node file.

    Returns
    -------
    tuple
        ``(nodes, labels, features)`` where ``nodes`` is an array of the
        integer ids, ``labels`` an array of the label strings and ``features``
        a float array of shape ``(number of nodes, number of features)``.

    Raises
    ------
    KeyError
        If a data line names a feature that is missing from the header.
    ValueError
        If an id or a feature value cannot be converted to a number.
    """
    with open(node_file) as f:
        f.readline()  # first line carries nothing that is used here
        header_fields = f.readline().split('\t')[1:-1]
        feature_names = [field.split(':')[1] for field in header_fields]
        column_of = dict(zip(feature_names, range(len(feature_names))))
        dim = len(column_of)

        nodes = []
        labels = []
        rows = []

        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file

            tokens = line.split('\t')
            nodes.append(int(tokens[0]))
            labels.append(tokens[1].split('=')[1])

            row = np.zeros(dim, dtype=float)
            for feat_val in tokens[2:-1]:
                feat, val = feat_val.split('=')
                row[column_of[feat]] = float(val)
            rows.append(row)

    # Stack once at the end: np.append inside the loop would copy the whole
    # matrix for every line read.
    if rows:
        features = np.vstack(rows)
    else:
        features = np.empty((0, dim), dtype=float)

    return (np.array(nodes), np.array(labels), features)

In [22]:
# Parse the node file into node ids, class labels and the dense feature matrix.
nodes, labels, features = load_node_data(node_file)

In [23]:
# Sanity check: (number of nodes, number of features).
features.shape

(19717, 500)

In [24]:
# All three arrays must agree on the number of nodes (first dimension).
# Formatting into one string keeps the output identical on Python 2 and 3.
print('{} {} {}'.format(nodes.shape, labels.shape, features.shape))

(19717,) (19717,) (19717, 500)


In [27]:
def load_edge_data(edge_file):
    """Parse a tab-separated edge file into an ``(n_edges, 2)`` integer array.

    The first two lines of the file are skipped. On every other line the
    second and the fourth tab-separated fields are expected to look like
    ``<prefix>:<int id>``; the ids become the source (column 0) and the
    target (column 1) of one edge. Blank lines are skipped.

    Parameters
    ----------
    edge_file : str
        Path of the edge file.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_edges, 2)``, one ``[source, target]`` row
        per edge in file order; shape ``(0, 2)`` when the file has no edges.

    Raises
    ------
    IndexError
        If a line has fewer than four fields or a field has no ':'.
    ValueError
        If an id is not an integer.
    """
    pairs = []

    with open(edge_file) as f:
        f.readline()  # the two leading lines carry no edges
        f.readline()

        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file

            tokens = line.split('\t')
            source = int(tokens[1].split(':')[1])
            target = int(tokens[3].split(':')[1])
            pairs.append((source, target))

    # Build the array once; reshape keeps the (0, 2) shape for an empty file.
    return np.array(pairs, dtype=int).reshape(-1, 2)

In [28]:
# Parse the citation file into an (n_edges, 2) array of [source, target] ids.
edges = load_edge_data(edge_file)
print(edges.shape)

(44338, 2)


In [29]:
# One row per node: its id and its class label (labels are kept as strings).
# `columns` fixes the column order explicitly instead of relying on dict ordering.
node_label = pd.DataFrame({'node': nodes, 'label': labels}, columns=['node', 'label'])

print(node_label.shape)

(19717, 2)


In [30]:
# Dense feature table: one row per node, columns feature0 .. feature<dim-1>.
index = np.arange(features.shape[0])
col_names = ['feature' + str(i) for i in range(features.shape[1])]
columns = np.array(col_names)

node_features = pd.DataFrame(data=features, index=index, columns=columns)
# Put the node id in front of the feature columns.
node_features.insert(0, 'node', nodes)

# The frame has one more column than `features` (the node id).
print('{} {}'.format(node_features.shape, nodes.shape))

(19717, 501) (19717,)


In [31]:
# Preview the first rows of the feature table.
node_features.head(10)

Unnamed: 0,node,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature490,feature491,feature492,feature493,feature494,feature495,feature496,feature497,feature498,feature499
0,12187484,0.093935,0.028698,0.01176,0.019375,0.063161,0.170891,0.067702,0.017555,0.098402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2344352,0.023618,0.0,0.014784,0.0,0.0,0.0,0.0,0.0,0.030926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14654069,0.102263,0.0,0.010669,0.0,0.0,0.0,0.0,0.0,0.044636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16443886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2684155,0.030616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.080179,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,15032912,0.116897,0.0,0.0,0.0,0.0,0.0,0.0,0.005201,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,17988185,0.0,0.0,0.007445,0.0,0.0,0.0,0.0,0.011114,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9834350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,16230722,0.0,0.010479,0.004294,0.0,0.0,0.0,0.0,0.004273,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,3542527,0.0,0.0,0.02797,0.0,0.0,0.0,0.0,0.013917,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Wrap the (n_edges, 2) edge array in a DataFrame; the columns keep the
# default integer names 0 (source) and 1 (target).
edge_list = pd.DataFrame(edges)
edge_list.head()

Unnamed: 0,0,1
0,19127292,17363749
1,19668377,17293876
2,1313726,3002783
3,19110882,14578298
4,18606979,10333910


In [33]:
# Build the graph from the edge table (column 0 = source, column 1 = target).
# NOTE(review): no create_using is passed and the summary below reports
# "Type: Graph", i.e. the graph is undirected although the edge file is named
# DIRECTED -- confirm that dropping the edge direction is intended.
G = nx.from_pandas_edgelist(edge_list, source=0, target=1)
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 19717
Number of edges: 44327
Average degree:   4.4963


In [70]:
def random_seed(g, n_seeds=3):
    """Draw `n_seeds` distinct nodes of `g` uniformly at random.

    Parameters
    ----------
    g : graph
        Object whose ``nodes()`` method yields the candidate nodes.
    n_seeds : int, optional
        Number of nodes to draw (default 3).

    Returns
    -------
    numpy.ndarray
        Array of `n_seeds` nodes, sampled without replacement.

    Raises
    ------
    ValueError
        If `n_seeds` is larger than the number of nodes in `g`.

    Notes
    -----
    Uses the global NumPy random state; call ``np.random.seed`` beforehand
    for a reproducible draw.
    """
    return np.random.choice(list(g.nodes()), n_seeds, replace=False)

def snowball_sampling(g, seed, maxsize=50):
    """Collect up to `maxsize` nodes of `g` by breadth-first (snowball) expansion.

    The expansion starts from the seed nodes and repeatedly adds the
    neighbours of already collected nodes, in first-in-first-out order, until
    `maxsize` nodes are collected or nothing new can be reached.

    Parameters
    ----------
    g : graph
        Object providing ``number_of_nodes()`` and ``neighbors(node)``.
    seed : iterable
        Start nodes; all of them are part of the result.
    maxsize : int, optional
        Target number of nodes (default 50).

    Returns
    -------
    set
        The sampled nodes. Empty when `g` has fewer than `maxsize` nodes.
        Smaller than `maxsize` when fewer nodes are reachable from `seed`.
    """
    if g.number_of_nodes() < maxsize:
        return set()
    print('seed: {}'.format(seed))

    queue = list(seed)
    subgraph = set(queue)
    head = 0  # index of the next node to expand; avoids O(n) removals from the list front

    while head < len(queue):
        top = queue[head]
        head += 1
        for node in g.neighbors(top):
            if len(subgraph) >= maxsize:
                return subgraph

            # Enqueue each node only once. Re-enqueueing visited nodes would
            # loop forever when the reachable component is smaller than maxsize.
            if node not in subgraph:
                subgraph.add(node)
                queue.append(node)

    return subgraph

In [112]:
# Draw random seed nodes and grow a 4000-node snowball sample around them.
# NOTE(review): the draw uses the global NumPy random state without a fixed
# seed, so every run produces a different sample.
rs = random_seed(G)
# NOTE(review): this rebinds `nodes`, which earlier held the node-id array
# returned by load_node_data -- cells re-run after this one see the sample instead.
nodes = snowball_sampling(G, rs, maxsize=4000)
GS = G.subgraph(nodes)

# Summary statistics of the sampled subgraph.
print(nx.info(GS))
print('denisty: {}'.format(nx.density(GS)))
print('components: {}'.format(nx.number_connected_components(GS)))
print('clustering coeff: {}'.format(nx.average_clustering(GS)))

seed: [ 8986914 16949517 12410189]
Name: 
Type: SubGraph
Number of nodes: 4000
Number of edges: 8654
Average degree:   4.3270
denisty: 0.00108202050513
components: 1
clustering coeff: 0.0994155480539


In [34]:
# Export the edge list as a DataFrame with 'source' and 'target' columns.
# NOTE(review): this is built from the full graph G, not from the sampled
# subgraph GS, although the result is named gs_edges -- confirm which graph
# is meant to be exported.
gs_edges = nx.to_pandas_edgelist(G)
gs_edges.head(10)

Unnamed: 0,source,target
0,8617984,10403912
1,8617984,14678267
2,8617984,17286757
3,8617984,15498046
4,8617984,17956579
5,16385830,18178393
6,11272194,18483609
7,11272194,16710474
8,11272194,16537919
9,7438339,11790818


In [35]:
# Keep only the nodes that appear as an endpoint of at least one exported edge.
in_source = node_features['node'].isin(gs_edges['source'])
in_target = node_features['node'].isin(gs_edges['target'])
in_graph = in_source | in_target  # computed once, applied to both tables

gs_features = node_features[in_graph]
gs_nodes = node_label[in_graph]
print('{} {}'.format(gs_features.shape, gs_nodes.shape))

(19717, 501) (19717, 2)


In [36]:
# Number of exported nodes per class label (labels are stored as strings).
for class_label in ('1', '2', '3'):
    print(gs_nodes[gs_nodes['label'] == class_label].shape)

(4103, 2)
(7875, 2)
(7739, 2)


In [37]:
# Write nodes, edges and features as CSV files without header row and without index column.
# NOTE(review): to_csv does not create missing directories -- presumably
# ../data/pubmed/ must already exist; confirm before running on a fresh checkout.
gs_nodes.to_csv('../data/pubmed/nodes.csv', header=None, index=None)
gs_edges.to_csv('../data/pubmed/edges.csv', header=None, index=None)
gs_features.to_csv('../data/pubmed/features.csv', header=None, index=None)