In [1]:
import sys

import numpy as np
import pandas as pd
import scipy.sparse
import tqdm
import xswap

sys.path.insert(0, '../../')

import analysis

In [2]:
# Number of successive XSwap permutations applied to each network's edge list;
# also the divisor used to average the permutation-derived features.
n_perms = 1000
# Options forwarded unchanged to xswap.permute_edge_list.
allow_antiparallel = True
allow_self_loops = True
# Forwarded to analysis.edges_to_matrix when building adjacency matrices.
directed = True

# 2. TFTG directed network

In [3]:
def _matrix_to_long_df(array, feature, network):
    """Reshape a 2-D feature matrix into a long-format DataFrame.

    Parameters
    ----------
    array : dense array-like or scipy.sparse matrix
        Feature values indexed by (row position, column position).
    feature : str
        Name of the value column in the returned frame.
    network : str
        Constant label written to the `network` column.

    Returns
    -------
    pandas.DataFrame
        One row per matrix cell with columns `id_a` (row position),
        `id_b` (column position), `<feature>` and `network`.
    """
    # pandas cannot build a DataFrame from a sparse matrix directly
    if scipy.sparse.issparse(array):
        array = array.toarray()

    return (
        pd.DataFrame(array)
        .reset_index()
        .melt(id_vars=['index'])
        .rename(columns={'index': 'id_a', 'variable': 'id_b', 'value': feature})
        .assign(
            id_a=lambda df: df['id_a'].astype(int),
            id_b=lambda df: df['id_b'].astype(int),
            network=network,
        )
    )


tftg_df = pd.read_csv('../../../data/task3/3.all_nodes/tftg.tsv.xz', sep='\t', compression='xz')

# Features accumulated as sums over permutations; divided by n_perms afterwards
permutation_features = ('edge_prior', 'mean_inf', 'p_inf')

# One long-format feature table per network, concatenated once after the loop
# (avoids seeding with an empty DataFrame and re-copying on every iteration)
network_feature_dfs = []

for network in ['train', 'test_recon', 'test_new']:
    # Edges of this network as (id_a, id_b) tuples
    edges = list(map(tuple,
        tftg_df
        .query(f'{network} == 1')
        .loc[:, ['id_a', 'id_b']]
        .values
    ))
    mat = analysis.edges_to_matrix(edges, directed=directed).tocsc()
    print(network, mat.shape)

    out_degree, in_degree = analysis.compute_directed_degrees(mat)
    # Only rows up to the largest source id (id_a) present in the edges are kept
    n_source = max(edge[0] for edge in edges) + 1

    feature_dict = {
        # Running sum of permuted adjacency matrices (first n_source rows)
        'edge_prior': scipy.sparse.csc_matrix((n_source, mat.shape[1])),

        # Inference score on the unpermuted network
        'inf': analysis.directed_inference(mat, out_degree, in_degree, n_source),
        # Running sum of inference scores on permuted networks
        'mean_inf': np.zeros((n_source, mat.shape[1]), dtype=float),
        # Running count of permutations whose score is >= the unpermuted score
        'p_inf': np.zeros((n_source, mat.shape[1]), dtype=float),
    }

    # Each permutation starts from the previous one; seed=i keeps the chain reproducible
    perm_edges = edges.copy()
    # NOTE(review): tqdm.tnrange is deprecated in recent tqdm releases in favour
    # of tqdm.notebook.trange -- consider switching when the environment allows.
    for i in tqdm.tnrange(n_perms):
        # Second return value of permute_edge_list is not needed here
        perm_edges, _ = xswap.permute_edge_list(
            perm_edges,
            allow_self_loops=allow_self_loops,
            allow_antiparallel=allow_antiparallel,
            seed=i
        )
        perm_mat = analysis.edges_to_matrix(perm_edges, directed=directed).tocsc()

        feature_dict['edge_prior'] += perm_mat[:n_source]

        perm_inf = analysis.directed_inference(perm_mat, out_degree, in_degree, n_source)
        feature_dict['mean_inf'] += perm_inf
        feature_dict['p_inf'] += (perm_inf >= feature_dict['inf'])

    # Normalize the accumulated features by the number of permutations.
    # Done out-of-place into a fresh dict so the result never depends on
    # in-place aliasing, and the dict is not modified while being iterated.
    feature_dict = {
        feature: (array / n_perms if feature in permutation_features else array)
        for feature, array in feature_dict.items()
    }

    # Reshape every feature to long format and join them on the pair keys
    network_features_df = None
    for feature, array in feature_dict.items():
        feature_df = _matrix_to_long_df(array, feature, network)
        if network_features_df is None:
            network_features_df = feature_df
        else:
            network_features_df = network_features_df.merge(feature_df, on=['id_a', 'id_b', 'network'])

    network_feature_dfs.append(network_features_df)

full_features_df = pd.concat(network_feature_dfs)

train (1441, 1441)


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


test_recon (1441, 1441)


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))


test_new (1441, 1441)


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [4]:
# Long-format edge labels: one row per (id_a, id_b, network) with the 0/1-style
# indicator from the corresponding network column stored in `edge`.
network_columns = ['train', 'test_recon', 'test_new']
edge_labels_df = pd.melt(
    tftg_df,
    id_vars=['id_a', 'id_b'],
    value_vars=network_columns,
    var_name='network',
    value_name='edge',
)

# Attach the computed features; the left join keeps every labelled pair even
# when no feature row exists for it.
tftg_features_df = pd.merge(
    edge_labels_df,
    full_features_df,
    how='left',
    on=['id_a', 'id_b', 'network'],
)

tftg_features_df.to_csv(
    '../../../data/task3/4.data/tftg.tsv.xz',
    sep='\t',
    compression='xz',
    index=False,
)