In [1]:
import hetmatpy.hetmat
import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.metrics
import tqdm
import tqdm.auto

import xswap

In [2]:
# Open the HetMat stored under the task1 data directory.
hetmat = hetmatpy.hetmat.HetMat('../../data/task1/hetionet-v1.0.hetmat/')

# NOTE(review): the positional arguments (1, True) presumably mean
# "max length 1" and "exclude inverted metapaths" -- confirm against the
# metagraph API.
metapaths = hetmat.metagraph.extract_all_metapaths(1, True)

# Keep only the first element of every metapath; the loop below treats each
# of these as a metaedge.
metaedges = [metapath[0] for metapath in metapaths]

In [3]:
def mean_degree_variance(matrix):
    '''
    Geometric mean of the variance of the column sums and the variance of
    the row sums of `matrix`.

    For an adjacency matrix these sums are the degrees along each axis.
    '''
    column_sums = matrix.sum(axis=0)
    row_sums = matrix.sum(axis=1)
    variance_product = np.var(column_sums) * np.var(row_sums)
    return np.sqrt(variance_product)

def heterogeneity(G):
    '''
    Normalized degree-heterogeneity score of graph `G`, following
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5319345/

    The degree histogram of `G` is converted to relative frequencies, the
    squared complements of the non-zero frequencies are averaged over the
    number of histogram bins, and the square root of that average is divided
    by the reference value sqrt(1 - 3 / bins).

    NOTE(review): both normalizations use the number of histogram bins
    (len of the degree histogram), not the number of nodes -- confirm that
    this matches the definition in the referenced paper. The reference value
    is zero or undefined when the histogram has three or fewer bins.
    '''
    degree_counts = np.array(nx.degree_histogram(G))
    n_bins = len(degree_counts)

    # Relative frequency of each degree value.
    degree_freq = degree_counts / degree_counts.sum()
    observed = degree_freq[degree_freq != 0]

    raw_score = np.sqrt(np.sum((1 - observed) ** 2) / n_bins)
    reference_score = np.sqrt(1 - 3 / n_bins)
    return raw_score / reference_score

def density(matrix):
    '''
    Fraction of possible edges that are present in an adjacency matrix.

    Parameters
    ----------
    matrix : numpy.ndarray or scipy.sparse matrix
        Two-dimensional (bi-)adjacency matrix. Sparse input is processed
        without being converted to a dense array.

    Returns
    -------
    float
        Sum of edge entries divided by the number of possible edges, or NaN
        when the matrix admits no possible edges (e.g. an empty matrix or a
        1x1 matrix without a self loop).

    Notes
    -----
    A non-square matrix is treated as a bi-adjacency matrix with m * n
    possible edges. A square matrix is treated as a regular adjacency
    matrix: if it equals its transpose, only the upper triangle (diagonal
    included) is counted, and self loops are considered possible only when
    the diagonal has a positive sum.
    '''
    is_sparse = scipy.sparse.issparse(matrix)
    m, n = matrix.shape

    if m != n:
        # dealing with bi-adjacency matrix
        n_edges = matrix.sum()
        n_possible = m * n
    else:
        if is_sparse:
            # Symmetric iff no entry differs from its mirrored entry; this
            # keeps the comparison sparse instead of allocating n * n cells.
            symmetric = (matrix != matrix.T).nnz == 0
        else:
            symmetric = np.array_equal(matrix, matrix.T)

        if not symmetric:
            n_edges = matrix.sum()
        elif is_sparse:
            n_edges = scipy.sparse.triu(matrix).sum()
        else:
            n_edges = np.triu(matrix).sum()

        allow_self_loop = matrix.diagonal().sum() > 0

        if symmetric and allow_self_loop:
            n_possible = n * (n + 1) / 2
        elif symmetric and not allow_self_loop:
            n_possible = n * (n - 1) / 2
        elif not symmetric and allow_self_loop:
            n_possible = n ** 2
        else:
            n_possible = n ** 2 - n

    if n_possible == 0:
        # Density is undefined when no edge can exist; return NaN explicitly
        # rather than relying on a 0 / 0 division warning.
        return np.nan

    return n_edges / n_possible

def gini(array):
    """
    Calculate the Gini coefficient of a numpy array.
    https://github.com/oliviaguest/gini/blob/master/gini.py

    Parameters
    ----------
    array : array-like of real numbers
        Values of any shape and any real dtype (integers included). The
        input is copied and flattened, so the caller's data is never
        modified.

    Returns
    -------
    float
        Gini coefficient of the values.

    Raises
    ------
    ValueError
        If `array` contains no values.
    """
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # from:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d. Casting to float
    # first keeps the in-place shifts below valid for integer input.
    values = np.asarray(array, dtype=float).flatten()
    if values.size == 0:
        raise ValueError('gini() requires at least one value')
    lowest = np.amin(values)
    if lowest < 0:
        # Values cannot be negative:
        values -= lowest
    # Values cannot be 0:
    values += 0.0000001
    # Values must be sorted:
    values = np.sort(values)
    # Number of array elements:
    n = values.shape[0]
    # Index per array element:
    index = np.arange(1, n + 1)
    # Gini coefficient:
    return np.sum((2 * index - n - 1) * values) / (n * np.sum(values))

In [4]:
# Collect one dict of degree/density features per metaedge.
records = list()
# tqdm.auto selects the notebook widget or the console bar as appropriate;
# it replaces the deprecated tqdm.tqdm_notebook entry point.
for metaedge in tqdm.auto.tqdm(metaedges):
    # NOTE(review): dense_threshold=1 presumably keeps `mat` sparse unless it
    # is fully dense, which from_biadjacency_matrix needs -- confirm against
    # the hetmatpy API.
    source, target, mat = hetmat.metaedge_to_adjacency_matrix(metaedge, dtype=int, dense_threshold=1)
    # Build a bipartite graph from the matrix, read as a biadjacency matrix.
    G = nx.bipartite.from_biadjacency_matrix(mat)

    # Degrees of every node of G, collected into a single array.
    degree_sequence = np.array(list(dict(nx.degree(G)).values()))

    features = {
        'metaedge': metaedge.abbrev,
        'num_source': len(source),
        'num_target': len(target),
        'var': np.var(degree_sequence),
        'mean_var': mean_degree_variance(mat),
        'heterogeneity': heterogeneity(G),
        'density': density(mat),
        'gini': gini(degree_sequence.astype(float))
    }
    records.append(features)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




In [5]:
# Build one table row per collected feature record.
df = pd.DataFrame.from_records(records)

# Save the feature table without the index column.
df.to_csv(
    '../../data/task1/hetionet_features.csv',
    index=False,
)

# Preview the first rows.
df.head()

Unnamed: 0,density,gini,heterogeneity,mean_var,metaedge,num_source,num_target,var
0,0.065403,0.437884,0.666925,105.400995,AlD,402,137,178.424857
1,0.012143,0.784978,0.103437,4746.423796,AdG,402,20945,16213.712277
2,0.062519,0.728028,0.113018,63757.05258,AeG,402,20945,213200.302894
3,0.011621,0.747376,0.107764,3429.167911,AuG,402,20945,13951.699862
4,0.002347,0.739579,0.789888,5430.031771,BPpG,11381,20945,6274.709148
