In [1]:
import sys

import networkx as nx
import pandas as pd
import tqdm

sys.path.insert(0, '../')

import analysis

In [2]:
# Load the tab-separated PPI edge list; pandas infers the xz compression
# from the file extension.
edges_df = pd.read_table('../../data/task3/2.edges/ppi.tsv.xz')

# Preview the first two rows.
edges_df.head(2)

Unnamed: 0,name_a,name_b,id_a,id_b,train,test_recon,test_new
0,A0A087WT00,O00154,0,48,1,1,0
1,A0A087WT00,O43736,0,237,0,1,0


In [3]:
# Name of the edge-indicator column in `edges_df` that selects the network.
network = 'train'

# Map each node id to its degree in the selected network.
#
# A node can appear as either endpoint of an edge, so its degree is the number
# of selected edges where it is `id_a` PLUS the number where it is `id_b`.
# The two per-endpoint counts are stacked and summed per node id, so the
# resulting dict is keyed by node id and nodes that only ever appear on one
# side are kept with their correct id.
# Assumes the `network` column is a 0/1 flag (as in the preview of `edges_df`),
# so summing it counts edges; nodes with no selected edge get degree 0.
id_to_degree = (
    pd.concat([
        edges_df.groupby('id_a')[network].sum(),
        edges_df.groupby('id_b')[network].sum(),
    ])
    .groupby(level=0)  # level 0 is the node id coming from either endpoint
    .sum()
    .to_dict()
)

# Keep only the rows flagged as belonging to the selected network.
network_edges_df = edges_df.query(f'{network} == 1')

# The label slice 'id_a':'id_b' is inclusive on both ends; each row of the
# resulting array becomes one edge tuple.
endpoint_pairs = network_edges_df.loc[:, 'id_a':'id_b'].values
edges = [tuple(pair) for pair in endpoint_pairs]

# Build the graph on which the link-prediction features are computed.
G = nx.from_edgelist(edges)

# Each networkx link-prediction function, called without an explicit `ebunch`,
# lazily yields a (u, v, score) triple for every pair of distinct nodes in G
# that is not already connected by an edge.
feature_to_generator = {
    'resource_allocation_index': nx.resource_allocation_index(G),
    'adamic_adar_index': nx.adamic_adar_index(G),
    'preferential_attachment': nx.preferential_attachment(G),
}

# Number of candidate pairs each generator will yield: all unordered pairs of
# distinct nodes minus the existing edges between distinct nodes (self-loops
# are edges but never candidate pairs, so they are excluded from the
# subtraction). Derived from G so the progress bar stays correct when
# `network` selects a different graph.
n_nodes = G.number_of_nodes()
n_candidate_pairs = (
    n_nodes * (n_nodes - 1) // 2
    - (G.number_of_edges() - nx.number_of_selfloops(G))
)

# Collect long-format records: (id_a, id_b, value, feature).
rows = list()
for feature_name, generator in feature_to_generator.items():
    progress = tqdm.tqdm_notebook(
        generator, total=n_candidate_pairs, desc=feature_name
    )
    for row in progress:
        rows.append(row + (feature_name,))

HBox(children=(IntProgress(value=0, max=7968660), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Long format: one record per (node pair, feature).
df = pd.DataFrame(rows, columns=['id_a', 'id_b', 'value', 'feature'])

# Wide format: one row per node pair, one column per feature.
df = df.pivot_table(values='value', index=['id_a', 'id_b'], columns='feature')

# Restore the pair ids as ordinary columns and drop the leftover 'feature'
# label on the column axis.
df = df.reset_index().rename_axis(None, axis=1)

# Record which network the features were computed on.
df['network'] = network

df.to_csv('prediction_features.csv.xz', compression='xz', index=False)

df.head(2)