In [1]:
import sys

import networkx as nx
import numpy as np
import pandas as pd
from plotnine import *
import scipy.sparse
import tqdm

sys.path.insert(0, '../')
import analysis

In [2]:
# Load the PPI edge table. The code below relies on three of its columns:
# `id_a` / `id_b` (the two endpoints of an edge) and `train` (summed per node
# and filtered on `== 1` to select training edges).
edges_df = pd.read_csv('../../data/2.edges/ppi.tsv.xz', sep='\t')

# Per-node degree in the training network, keyed by node id.
# `train` is summed separately over the rows where a node is the `id_a`
# endpoint and over the rows where it is the `id_b` endpoint. The two Series
# are indexed by node id, so `.add` aligns them on the id itself, and
# `fill_value=0` covers nodes that only ever appear on one side of an edge.
# Keying by node id (instead of by the row position of a merged frame) keeps
# the lookup correct even when ids are not exactly 0..N-1.
train_degree_as_a = edges_df.groupby('id_a')['train'].sum()
train_degree_as_b = edges_df.groupby('id_b')['train'].sum()
id_to_degree = (
    train_degree_as_a
    .add(train_degree_as_b, fill_value=0)
    .astype(int)  # degrees are counts; alignment with fill_value yields floats
    .to_dict()
)

# Training edges as plain (id_a, id_b) tuples. The two endpoint columns are
# selected by name so that any other column in the table can never leak into
# the edge tuples.
train_edges = list(
    edges_df
    .query('train == 1')
    .loc[:, ['id_a', 'id_b']]
    .itertuples(index=False, name=None)
)

# Undirected graph of the training edges only.
G = nx.from_edgelist(train_edges)

In [3]:
# Names of the networkx link-prediction scorers to compute. A tuple (rather
# than a set) keeps the feature order identical on every run.
link_prediction_names = (
    'resource_allocation_index',
    'adamic_adar_index',
    'preferential_attachment',
)

# Resolve each scorer through the public `networkx` namespace instead of
# reaching into a submodule's `__dict__`.
name_to_link_prediction = {
    name: getattr(nx, name) for name in link_prediction_names
}

# One generator per scorer, created by calling it with the graph only.
# Per the networkx docs, omitting `ebunch` scores every non-existent edge.
# NOTE: generators are single-use; once iterated they yield nothing more.
name_to_generator = {
    name: f(G) for name, f in name_to_link_prediction.items()
}

In [4]:
# Display the mapping: the values are generator objects, so no scores have
# been computed at this point.
name_to_generator

{'preferential_attachment': <generator object _apply_prediction.<locals>.<genexpr> at 0x7fae128a0b88>,
 'adamic_adar_index': <generator object _apply_prediction.<locals>.<genexpr> at 0x7fae128a0c78>,
 'resource_allocation_index': <generator object _apply_prediction.<locals>.<genexpr> at 0x7fae1273d1b0>}

In [5]:
# `tqdm.auto` selects the notebook widget when one is available. The alias
# avoids rebinding the module-level name `tqdm`.
from tqdm.auto import tqdm as progress_bar

# Number of candidate pairs per scorer, used only to size the progress bar.
# For an undirected graph this is every unordered pair of distinct nodes that
# is not already an edge. Self-loops are removed from the edge count because
# they are not pairs of distinct nodes.
n_nodes = G.number_of_nodes()
n_non_edges = (
    n_nodes * (n_nodes - 1) // 2
    - (G.number_of_edges() - nx.number_of_selfloops(G))
)

# Collect one (id_a, id_b, value, feature) tuple per scored pair and scorer.
rows = list()
for name, link_prediction in name_to_link_prediction.items():
    # Create a fresh generator on each run: generators are single-use, so
    # iterating a previously consumed one would silently add no rows.
    for row in progress_bar(link_prediction(G), total=n_non_edges, desc=name):
        rows.append(row + (name,))

HBox(children=(IntProgress(value=0, max=8077881), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8077881), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8077881), HTML(value='')))




In [6]:
# Long format: one record per (id_a, id_b, feature) as collected in `rows`.
df = pd.DataFrame(rows, columns=['id_a', 'id_b', 'value', 'feature'])

# Wide format: one row per node pair, one column per feature.
df = df.pivot_table(values='value', index=['id_a', 'id_b'], columns='feature')

# Move the pair ids back into ordinary columns and drop the leftover
# 'feature' label from the column axis.
df = df.reset_index().rename_axis(None, axis=1)

# Attach the degree of each endpoint, looked up by node id.
df = df.assign(
    source=df['id_a'].map(id_to_degree).astype(int),
    target=df['id_b'].map(id_to_degree).astype(int),
)

# Persist the feature table without the row index.
df.to_csv('prediction_features.csv.gz', index=False, compression='gzip')

# Preview the first two rows.
df.head(2)

Unnamed: 0,id_a,id_b,adamic_adar_index,preferential_attachment,resource_allocation_index,source,target
0,0,1,0.0,165.0,0.0,55,3
1,0,2,0.228205,3520.0,0.0125,55,64
