In [1]:
%cd /scratch/bruingjde/SNAM2021-code/

import os
import typing

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
from tqdm.auto import tqdm

import tlp

# LIKELY TOO SLOW!

/scratch/bruingjde/SNAM2021-code


In [16]:
# Check
ok = True
for entry in os.scandir('data'):
  if not os.path.isfile(os.path.join(entry, 'features', 'PPR.pkl')):
    ok = False
    print(entry)
if ok: print('Ok!')

<DirEntry '25'>
<DirEntry '23'>
<DirEntry '07'>
<DirEntry '04'>
<DirEntry '10'>
<DirEntry '24'>
<DirEntry '08'>
<DirEntry '22'>
<DirEntry '15'>
<DirEntry '30'>
<DirEntry '21'>
<DirEntry '11'>
<DirEntry '28'>
<DirEntry '29'>
<DirEntry '01'>
<DirEntry '17'>
<DirEntry '18'>
<DirEntry '20'>
<DirEntry '03'>
<DirEntry '26'>
<DirEntry '05'>
<DirEntry '14'>
<DirEntry '06'>
<DirEntry '27'>
<DirEntry '02'>
<DirEntry '09'>
<DirEntry '16'>


In [23]:
def ppr(path: str, verbose: bool = False, feature_name='PPR'):
  # Check if file exists
  feature_path = os.path.join(path, 'features')
  file = os.path.join(feature_path, f'{feature_name}.pkl')
  if os.path.isfile(file): return 
  
  os.makedirs(feature_path, exist_ok=True)

  # Read in
  edgelist_mature_file = os.path.join(path, 'edgelist_mature.pkl')
  if verbose: print(f'Read {edgelist_mature_file}')
  edgelist_mature = pd.read_pickle(edgelist_mature_file)
  
  instances_file = os.path.join(path, 'instances_sampled.npy')
  if verbose: print(f'Read {instances_file}')  
  instances_sampled = np.load(instances_file)

  G = nx.from_pandas_edgelist(edgelist_mature, create_using=nx.MultiGraph)
  
  # Get all unique nodes in instances
  nodes_sampled = {node for instance in instances_sampled for node in instance}
  
  # Calculate personalized pagerank for each node
  # Source dealing with tol: https://github.com/Aghasemian/OptimalLinkPrediction/blob/master/Code/OLP.py
  tol = 1e-6
  while True:
    try:
      value_per_node = {node: nx.pagerank_scipy(G, personalization=dict(node=1), max_iter=10000, tol=tol) for node in tqdm(nodes_sampled, disable=not verbose)}
      break # Not run when exception in line above.
    except nx.PowerIterationFailedConvergence:
      tol*=10
  
  value_per_instance = [(value_per_node[u][v], value_per_node[v][u]) for u, v in tqdm(instances_sampled, disable=not verbose)]
  
  def diff(x): return abs(x[1]-x[0]) 
  
  results = {
    tlp.Experiment(feature=feature_name, time_aware=False, nodepair_strategy=strategy_str): (
      np.array([strategy_func(instance) for instance in value_per_instance])) 
    for strategy_str, strategy_func in {'sum': sum, 'diff': diff, 'max': max, 'min': min}.items()
  }
  
  joblib.dump(results, os.path.join(feature_path, f'{feature_name}.pkl'))

In [20]:
entries = sorted(os.scandir('data'), key=lambda x: x.name)
for entry in tqdm(entries):
  ppr(entry, verbose=True)

  0%|          | 0/30 [00:00<?, ?it/s]

Read data/01/edgelist_mature.pkl
Read data/01/instances_sampled.npy


  0%|          | 0/30236 [00:00<?, ?it/s]

  p = p / p.sum()


PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 10000 iterations')

In [None]:
entries = sorted(os.scandir('data'), key=lambda x: x.name)
tlp.ProgressParallel(n_jobs=len(entries), total=len(entries))(
  joblib.delayed(ppr)(entry.path) for entry in entries
)

  0%|          | 0/30 [00:00<?, ?it/s]