In [37]:
import collections
import copy
import datetime
import functools
import itertools
import math
from typing import Dict, List, Set, Tuple

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV, train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

# A pair of node identifiers (u, v).
NodePair = Tuple[int, int]
# A list of (u, v, attributes) triples. The attribute dict is keyed by a string
# (presumably 'date') and holds a datetime. The previous spelling
# Dict['date', ...] created a forward reference to a *type* called `date`
# rather than describing a str-keyed dict.
Edge = List[Tuple[int, int, Dict[str, datetime.datetime]]]

# Notebook-wide store of final scores, filled by the cells below as
# results[dataset][split] = (average precision, ROC AUC).
results = dict()

In [63]:
def filter_edgelist(edges: pd.DataFrame, start=0, stop=1, verbose=True) -> pd.DataFrame:
  """
  Keep only the rows of `edges` whose 'date' lies inside a time window.

  Each bound can be given in one of three forms:
  - float strictly between 0 and 1: a fraction of the number of rows, turned into a row position;
  - int: a row position in `edges`; the 'date' of that row becomes the bound;
  - anything else (e.g. a datetime): used as the bound as-is.
  The defaults (start=0, stop=1) mean "earliest date" and "latest date".

  An explicitly given start is pushed forward by one second before filtering;
  the stop bound is inclusive. With verbose=True the resolved bounds and the
  number (and fraction) of selected rows are printed.
  """
  edge_count = len(edges)

  # Resolve the lower bound.
  if start == 0:
    start = edges['date'].min()
  else:
    if type(start) is float:
      assert 0 < start < 1
      start = int(start * edge_count)
    if type(start) is int:
      start = edges.iloc[start]['date']
    start = start + datetime.timedelta(seconds=1)
  if verbose:
    print(start)

  # Resolve the upper bound.
  if stop == 1:
    stop = edges['date'].max()
  else:
    if type(stop) is float:
      assert 0 < stop < 1
      stop = math.floor(stop * edge_count) - 1
    if type(stop) is int:
      stop = edges.iloc[stop]['date']
  if verbose:
    print(stop)

  not_too_early = edges['date'] >= start
  not_too_late = edges['date'] <= stop
  mask = not_too_early & not_too_late
  if verbose:
    no_selected_edges = sum(mask)
    print(f'{no_selected_edges=}, ({no_selected_edges/len(edges):.1e})')

  return edges.loc[mask]
def convert_to_set(edges: pd.DataFrame) -> Set[NodePair]:
  """
  Return the distinct (source, target) pairs of `edges` as a set of tuples.

  Only the 'source' and 'target' columns are read. The pairs keep their
  orientation, so (u, v) and (v, u) are different members.
  The return annotation used to claim a list although a set is returned.
  """
  return set(edges.loc[:, ['source', 'target']].itertuples(index=False, name=None))
def get_graph(edgelist: pd.DataFrame) -> nx.Graph:
  """
  Build an undirected graph from a three-column edge list.

  The first two columns of each row are the endpoints; the third is ignored.
  Every edge carries a 'weight' attribute equal to the number of rows that
  connect its two endpoints.
  """
  graph = nx.Graph()

  for source, target, _ in edgelist.itertuples(index=False, name=None):
    if graph.has_edge(source, target):
      # Repeated interaction: bump the multiplicity.
      graph[source][target]["weight"] += 1
    else:
      graph.add_edge(source, target, weight=1)

  return graph
def giant_component(graph: nx.Graph) -> nx.Graph:
  """Return a copy of the subgraph induced by the largest connected component of `graph`."""
  largest_node_set = max(nx.connected_components(graph), key=len)
  return graph.subgraph(largest_node_set).copy()
def report(graph:nx.Graph, probes: List[NodePair]):
  """
  Print how the probed node pairs relate to `graph`.

  Three counts are printed, each with its share of all probes:
  - "already edge": the pair is connected in `graph`;
  - "both nodes in graph": both endpoints exist, are distinct, and are not connected;
  - "not in graph": at least one endpoint is missing from `graph`.

  The non-edge count is computed per probe instead of by membership in
  set(nx.non_edges(graph)). That set holds one arbitrary orientation of each
  unordered pair, so a probe given as (v, u) was missed, and building it needs
  memory quadratic in the number of nodes.
  Nothing beyond the probe count is printed for an empty `probes`.
  """
  n = len(probes)
  print(f"Number of probes: {n}")
  if n == 0:
    # Avoid dividing by zero in the percentages below.
    return

  already_edge = 0
  non_edge = 0
  not_in_graph = 0
  for u, v in probes:
    if not (graph.has_node(u) and graph.has_node(v)):
      not_in_graph += 1
    elif graph.has_edge(u, v):
      already_edge += 1
    elif u != v:
      # A (u, u) probe is not a non-edge; nx.non_edges never yields those either.
      non_edge += 1

  print(f"- already edge: {already_edge} ({already_edge/n:.0%})")
  print(f"- both nodes in graph: {non_edge} ({non_edge/n:.0%})")
  print(f"- not in graph: {not_in_graph} ({not_in_graph/n:.0%})")
def get_distances(graph: nx.Graph, cutoff: int = None) -> (List[NodePair], List[int]):
  """
  Collect every node pair that is more than one hop apart, with its hop distance.

  Distances are unweighted (weight=None) and come from nx.all_pairs_dijkstra.
  When `cutoff` is given, only pairs at distance <= cutoff are kept.
  Every node is used as a source, so pairs are produced once per source node.

  Returns a zip iterator that unpacks into two tuples:
  - nodepairs: the (u, v) pairs
  - distances: the matching distances
  The iterator is empty when no pair qualifies.
  """
  found = []
  sources = tqdm(nx.all_pairs_dijkstra(graph, cutoff, weight=None), total=len(graph), desc="get_distances")
  for u, (distance_from_u, _) in sources:
    for v, distance in distance_from_u.items():
      if distance > 1 and (cutoff is None or distance <= cutoff):
        found.append(((u, v), distance))
  return zip(*found)
def read_edges(file: str, sep=' ') -> pd.DataFrame:
  """
  Read an edge list file and return it sorted by date.

  The first line of the file is skipped. Each remaining line must hold four
  `sep`-separated fields: source, target, weight, date (a Unix timestamp).
  The weight column is dropped from the result.

  Returns a DataFrame with columns ['source', 'target', 'date'], ordered by
  'date'. Timestamps are converted with datetime.datetime.fromtimestamp, i.e.
  to naive datetimes in the local timezone of the machine running this.
  """
  # `sep` must be passed by keyword: pandas 2 rejects extra positional
  # arguments to read_csv.
  d = pd.read_csv(file, sep=sep, skiprows=1, names=['source', 'target', 'weight', 'date'])
  d['date'] = d['date'].apply(datetime.datetime.fromtimestamp)
  return d.sort_values(by='date').loc[:, ['source', 'target', 'date']]
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=[1, 2]) -> pd.DataFrame:
  """
  Grid-search an XGBoost classifier on `df` and summarise the cross-validation scores.

  Parameters:
  - df: feature table; the 'target' column holds the binary label, every other column is a feature.
  - random_state: seed for the train/test split, the CV shuffling, the classifier and the label shuffle.
  - also_random: additionally run the same search on shuffled labels as a chance baseline.
  - max_depth: tree depths to try.

  The grid is max_depth x scale_pos_weight in {#negatives / #positives, 1}.
  Two thirds of the data are used for 5-fold stratified CV scored by average
  precision; the remaining third is left untouched here.

  Returns a DataFrame indexed by (max_depth, balanced), sorted by mean_test
  descending, with train/test mean and std, per-fold test scores,
  `diff_train_test` (|test - train| / test), `rstd_test` (std / mean) and,
  if also_random, the baseline scores plus `test_over_random`.

  Raises ValueError when `df` contains no positive example.
  """
  X = df.drop(columns='target').values
  y = df['target'].values

  # Count classes by comparison rather than with `~y`: bitwise NOT only works
  # on boolean labels and gives negative "counts" for 0/1 integers.
  n_negative = np.count_nonzero(y == 0)
  n_positive = np.count_nonzero(y == 1)
  if n_positive == 0:
    raise ValueError("gridsearch needs at least one positive example in df['target']")
  param_grid = dict(max_depth=list(max_depth), scale_pos_weight=[n_negative / n_positive, 1])

  # Only the train/validation part is used below; the held-out third is discarded.
  X_trainval, _, y_trainval, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)
  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  cv = StratifiedKFold(shuffle=True, random_state=random_state)
  search = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring='average_precision',
    n_jobs=30,
    cv=cv,
    return_train_score=True
  )

  if also_random:
    # Same search, fitted on permuted labels.
    search_random = copy.deepcopy(search)
    np.random.seed(random_state)
    y_random = copy.deepcopy(y_trainval)
    np.random.shuffle(y_random)

  search.fit(X_trainval, y_trainval)
  cv_results = search.cv_results_
  columns = dict(
      mean_train=cv_results['mean_train_score'],
      std_train=cv_results['std_train_score'],
      mean_test=cv_results['mean_test_score'],
      std_test=cv_results['std_test_score'],
  )
  # One column per CV fold, following the splitter instead of assuming five.
  for fold in range(cv.get_n_splits()):
    columns[f'test_fold{fold}'] = cv_results[f'split{fold}_test_score']

  if also_random:
    search_random.fit(X_trainval, y_random)
    random_results = search_random.cv_results_
    columns['mean_train_random'] = random_results['mean_train_score']
    columns['std_train_random'] = random_results['std_train_score']
    columns['mean_test_random'] = random_results['mean_test_score']
    columns['std_test_random'] = random_results['std_test_score']

  # A setting counts as "balanced" when it uses the class-ratio weight (> 1).
  index = pd.Index(
    [(params['max_depth'], params['scale_pos_weight'] > 1) for params in cv_results['params']],
    name=('max_depth', 'balanced')
  )
  summary = pd.DataFrame(columns, index=index)
  summary['diff_train_test'] = (summary['mean_test'] - summary['mean_train']).abs() / summary['mean_test']
  summary['rstd_test'] = summary['std_test'] / summary['mean_test']
  if also_random:
    summary['test_over_random'] = summary['mean_test'] - summary['mean_test_random']
  return summary.sort_values('mean_test', ascending=False)
def get_x_y(df: pd.DataFrame):
  """Split `df` into (features, labels): all columns except 'target' as an array, and the 'target' values."""
  features = df.drop(columns='target')
  labels = df['target']
  return features.values, labels.values
def report_performance(df_train: pd.DataFrame, df_test=None, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """
  Fit one XGBoost classifier and score it on held-out data.

  Parameters:
  - df_train: feature table with a 'target' column, used for fitting.
  - df_test: optional table of the same layout used for scoring. When omitted,
    one third of `df_train` is split off (seeded by random_state) and used instead.
  - max_depth, tree_method, random_state: passed to XGBClassifier.
  - balanced: weight positives by #negatives / #positives of the training labels.
  - n_jobs: number of threads for XGBClassifier (previously accepted but ignored).

  Returns (average precision, ROC AUC) of the predicted positive-class
  probabilities on the held-out data.

  Raises ValueError when balanced=True and the training labels hold no positive.
  """
  X, y = get_x_y(df_train)
  if df_test is None:
    X, X_test, y, y_test = train_test_split(X, y, test_size=1/3, random_state=random_state)
  else:
    X_test, y_test = get_x_y(df_test)

  if balanced:
    # Compare instead of using `~y`, which is only a logical NOT for boolean labels.
    n_negative = np.count_nonzero(y == 0)
    n_positive = np.count_nonzero(y == 1)
    if n_positive == 0:
      raise ValueError("balanced=True needs at least one positive training example")
    scale_pos_weight = n_negative / n_positive
  else:
    scale_pos_weight = 1

  clf = XGBClassifier(max_depth=max_depth, n_jobs=n_jobs, tree_method=tree_method, scale_pos_weight=scale_pos_weight, random_state=random_state)
  clf.fit(X, y)
  y_pred = clf.predict_proba(X_test)[:,1]
  return average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)
def flatten(l):
  """Concatenate the items of every sub-sequence of `l` into one flat numpy array."""
  flat = []
  for sublist in l:
    flat.extend(sublist)
  return np.array(flat)
def get_katz(graph, nodepairs, beta=.005, cutoff=5):
  """
  Truncated Katz-style score for each node pair, based on simple paths.

  For every pair, all simple paths found by nx.all_simple_paths up to `cutoff`
  are enumerated and grouped by len(path); the score is the sum over groups of
  beta ** len(path) * (number of such paths). A pair without any path scores 0.

  `cutoff` is now forwarded to nx.all_simple_paths; it used to be accepted but
  replaced by a hard-coded 5 (same as the default, so default calls are unchanged).

  NOTE(review): len(path) counts the nodes on the path, i.e. edges + 1, so the
  exponent is one higher than the edge length — confirm this is intended.

  Returns a list of scores in the order of `nodepairs`.
  """
  scores = []
  for nodepair in nodepairs:
    paths_per_length = collections.Counter(
      len(path) for path in nx.all_simple_paths(graph, *nodepair, cutoff=cutoff)
    )
    scores.append(sum([beta**length * count for length, count in paths_per_length.items()]))
  return scores
def get_propflow(graph, limit=5):
    """
    Propagate a unit score outward from every node of `graph`.

    For each start node the score 1.0 is pushed along edges for limit+1 rounds.
    In every round each frontier node hands its current score to all of its
    neighbours, split in proportion to edge 'weight' over the node's weighted
    degree. Neighbours that were already visited still receive the flow but
    are not expanded again.

    Returns a dict mapping each start node to a dict {reached node: score};
    the inner dict includes the start node itself.
    """
    score = dict()
    for node in tqdm(graph):
        # Accumulated score per reached node; the start node begins with 1.0.
        scores = {node: 1.0}
        # Nodes that have been on a frontier already.
        found = set()
        newSearch = [node]

        for _ in range(0, limit+1):
            # Freeze the current frontier and start collecting the next one.
            oldSearch = list(newSearch)
            found.update(newSearch)
            newSearch = set()
            while len(oldSearch) != 0:
                n2 = oldSearch.pop()
                # Read at pop time: the value may already include flow received
                # from frontier nodes processed earlier in this same round.
                nodeInput = scores[n2]
                degree = graph.degree(n2, 'weight')
                # Initial value is never read; `flow` is reassigned for each neighbour.
                flow = 0.0
                for n3 in graph[n2]:
                    wij = graph[n2][n3]['weight']
                    flow = nodeInput * (wij*1.0/degree)
                    scores[n3] = scores.get(n3, 0) + flow
                    if n3 not in found: newSearch.add(n3)
        score[node] = scores
    return score  
def print_status(desc: str):
  """Print `desc` prefixed with the current local time as HH:MM:SS."""
  timestamp = datetime.datetime.now().strftime("%H:%M:%S")
  print(f'{timestamp}: {desc}')
class ProgressParallel(joblib.Parallel):
    """
    joblib.Parallel that reports task completion on a tqdm progress bar.

    Extra constructor arguments (all remaining ones go to joblib.Parallel):
    - use_tqdm: show the bar (True) or disable it (False)
    - total: number of tasks; when None the bar total follows the dispatched count
    - desc: label shown next to the bar
    """
    def __init__(self, use_tqdm=True, total=None, desc=None, *args, **kwargs):
        self._use_tqdm, self._total, self._desc = use_tqdm, total, desc
        super().__init__(*args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Run the tasks with a progress bar open for the duration of the call."""
        bar_options = dict(disable=not self._use_tqdm, total=self._total, desc=self._desc)
        with tqdm(**bar_options) as self._pbar:
            return super().__call__(*args, **kwargs)

    def print_progress(self):
        """Sync the bar with the dispatched/completed task counters of this Parallel object."""
        bar = self._pbar
        if self._total is None:
            bar.total = self.n_dispatched_tasks
        bar.n = self.n_completed_tasks
        bar.refresh()
def print_function(f):
  """Decorator that prints a timestamped line with the function's name before each call to `f`."""
  def announce_then_call(*args, **kwargs):
    print_status(f.__name__)
    return f(*args, **kwargs)
  # Copy name, docstring, etc. of `f` onto the wrapper.
  return functools.wraps(f)(announce_then_call)
@print_function
def get_d(graph: nx.Graph, nodepairs: List[NodePair], verbose=True) -> (np.array, np.array):
  """
  Degree feature: for every node pair, the smaller and the larger endpoint degree.

  Returns two arrays aligned with `nodepairs`: (lower degree, higher degree).
  The progress bar is hidden when verbose is False.
  """
  rows = []
  for nodepair in tqdm(nodepairs, desc="Degree", disable=not verbose):
    rows.append([node_degree for _, node_degree in graph.degree(nodepair)])
  degree = np.array(rows)
  # Order each row so column 0 is the minimum and column 1 the maximum.
  degree.sort(axis=1)
  lowest, highest = degree[:,0], degree[:,1]
  return lowest, highest
@print_function
def get_v(graph: nx.Graph, nodepairs: List[NodePair]) -> (np.array, np.array):
  """
  Volume feature: for every node pair, the smaller and the larger weighted degree
  (sum of the 'weight' attribute over incident edges) of its endpoints.

  Returns two arrays aligned with `nodepairs`: (lower volume, higher volume).
  """
  rows = []
  for nodepair in tqdm(nodepairs, desc="Volume"):
    rows.append([node_volume for _, node_volume in graph.degree(nodepair, weight='weight')])
  volume = np.array(rows)
  # Order each row so column 0 is the minimum and column 1 the maximum.
  volume.sort(axis=1)
  lowest, highest = volume[:,0], volume[:,1]
  return lowest, highest
@print_function
def get_cn(graph: nx.Graph, nodepairs: List[NodePair]) -> np.array:
  """Common-neighbours feature: the number of nodes adjacent to both endpoints, for every node pair."""
  counts = []
  for nodepair in tqdm(nodepairs, desc='Common Neighbors'):
    shared = list(nx.common_neighbors(graph, *nodepair))
    counts.append(len(shared))
  return np.array(counts)
@print_function
def get_pf(graph: nx.Graph, nodepairs: List[NodePair]) -> np.array:
  """
  PropFlow feature: for every node pair (u, v), the mean of the score
  propagated from u to v and from v to u, as computed by get_propflow.

  A direction contributes 0 when the target was not reached or when the
  source node has no entry at all. The latter used to raise AttributeError
  because the fallback for a missing source was the integer 0 rather than
  an empty mapping.

  Returns a float array aligned with `nodepairs`.
  """
  score = get_propflow(graph)

  def flow(source, target):
    # Empty-dict fallback keeps the chained lookup valid for unknown sources.
    return score.get(source, {}).get(target, 0)

  return np.fromiter(((flow(u, v) + flow(v, u))/2 for u, v in tqdm(nodepairs, desc='propflow')), dtype=float)
@print_function
def get_sp(graph: nx.Graph, nodepairs: List[NodePair], cutoff=5) -> np.array:
  """
  For every node pair (u, v), the number of predecessors nx.predecessor
  records for v when searching from u, limited to `cutoff`.

  NOTE(review): this is a predecessor count, not a path length — confirm that
  this is the intended "sp" feature.

  A table is built for every node of the graph before the pairs are looked up.
  Raises KeyError for a pair whose v is absent from the table of u.
  Returns an int array aligned with `nodepairs`.
  """
  sp_dict = {}
  for node in tqdm(graph, desc='calculating shortest paths'):
    predecessors = nx.predecessor(graph, node, cutoff=cutoff)
    sp_dict[node] = {target: len(preds) for target, preds in predecessors.items()}

  lookups = (sp_dict[u][v] for u, v in tqdm(nodepairs, desc='Retrieve sp'))
  return np.fromiter(lookups, dtype=int)
def get_mf(graph: nx.Graph, nodepairs: List[NodePair], cutoff=5, flow_func = nx.algorithms.flow.edmonds_karp, chunk_size=1000, n_jobs=128) -> np.array:
  """
  Maximum-flow feature: the max-flow value between the endpoints of every
  node pair, using the edge attribute 'weight' as capacity.

  Parameters:
  - cutoff: forwarded to `flow_func` through nx.maximum_flow_value.
  - flow_func: networkx flow algorithm to use.
  - chunk_size: approximate number of pairs handed to a worker at once.
  - n_jobs: number of joblib workers (was hard-coded to 128).

  The pairs are split into chunks that are processed in parallel; the results
  are concatenated in the original order.

  Inputs shorter than `chunk_size` used to give zero chunks, which makes
  np.array_split raise; at least one chunk is now always used, and an empty
  `nodepairs` returns an empty array without starting any work.
  """
  def mf(graph, nodepairs, **kwargs):
    return [nx.maximum_flow_value(graph, *nodepair, capacity='weight', **kwargs) for nodepair in nodepairs]

  if len(nodepairs) == 0:
    return np.array([])

  print_status('residual network')
  residual = nx.algorithms.flow.utils.build_residual_network(graph, capacity='weight')
  # Floor division as before, but never fewer than one chunk.
  no_chunks = max(1, len(nodepairs) // chunk_size)
  chunks = np.array_split(nodepairs, no_chunks)
  print_status('get_mf')
  return flatten(ProgressParallel(n_jobs=n_jobs, total=no_chunks, desc='Maxflow (parallel)')(joblib.delayed(mf)(graph, chunk, residual=residual, flow_func=flow_func, cutoff=cutoff) for chunk in chunks))

## Condmat

### Random

In [33]:
# Grid-search tree depth and class weighting on the features stored under datasets/condmat/random/2.
condmat_random = gridsearch(pd.read_pickle('datasets/condmat/random/2/features.pkl'))

In [34]:
# Show the CV score, the relative train/test gap, the relative spread over folds,
# the margin over the shuffled-label baseline, and the per-fold test scores.
condmat_random[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,False,0.011711,1.439408,0.30133,0.008951,0.01768,0.006682,0.012222,0.010868,0.011102
1,False,0.011336,0.122145,0.430077,0.008822,0.019543,0.006487,0.014112,0.007421,0.009115
2,True,0.010128,0.985273,0.375302,0.007355,0.016543,0.006186,0.011873,0.006675,0.009365
1,True,0.009457,0.146815,0.277128,0.006947,0.012661,0.006343,0.012234,0.006894,0.009152


In [23]:
# Start the Condmat entry of the results store (re-running this cell discards
# anything already stored under 'condmat').
results['condmat'] = dict()
# No df_test is given, so one third of the table is held out for scoring.
# Stores (average precision, ROC AUC) for max_depth=1 without class weighting.
results['condmat']['random'] = report_performance(pd.read_pickle('datasets/condmat/random/2/features.pkl'), max_depth=1, balanced=False)
print(results['condmat']['random'])

(0.01161405871585572, 0.7167960845889996)


### Temporal

In [37]:
# Grid-search on the training features stored under datasets/condmat/temporal/train/2.
condmat_temporal = gridsearch(pd.read_pickle('datasets/condmat/temporal/train/2/features.pkl'))

In [38]:
# Same summary columns as for the random split above.
condmat_temporal[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,False,0.005152,0.206178,0.622107,0.003848,0.006089,0.002433,0.002779,0.003432,0.011026
2,False,0.004359,3.991703,0.299936,0.003044,0.006444,0.003775,0.002997,0.003291,0.005288
2,True,0.003938,2.986969,0.283034,0.002622,0.003795,0.00403,0.00288,0.003,0.005983
1,True,0.003246,0.492874,0.19428,0.001938,0.003892,0.002504,0.002665,0.003107,0.004061


In [22]:
# Fit on the temporal training table and score on the separate temporal test table.
# Requires results['condmat'] to exist already (created in the random-split cell).
results['condmat']['temporal'] = report_performance(
  df_train=pd.read_pickle('datasets/condmat/temporal/train/2/features.pkl'), 
  df_test=pd.read_pickle('datasets/condmat/temporal/test/2/features.pkl'),
  max_depth=1, 
  balanced=False
)
print(results['condmat']['temporal'])

(0.004799299901218471, 0.7319341272805222)


## Enron

In [39]:
# Dataset name substituted into the f-string paths of the cells in this section.
dataset = 'enron'

### Random

In [40]:
# Grid-search on the features stored under datasets/<dataset>/random/2.
# NOTE(review): the name `random` would shadow the stdlib module if it were ever
# imported, and is reused for every dataset — consider a dataset-specific name.
random = gridsearch(pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'))

In [41]:
# Show the CV score, train/test gap, spread, margin over shuffled labels and per-fold scores.
random[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,False,0.035862,0.241268,0.195357,0.034583,0.037135,0.027194,0.048338,0.033124,0.033518
2,True,0.022814,0.14197,0.068906,0.021526,0.02488,0.020727,0.024251,0.022684,0.021529
1,False,0.017305,0.030621,0.118209,0.016025,0.018222,0.014126,0.020375,0.017233,0.016568
1,True,0.013299,0.002559,0.106276,0.012012,0.013753,0.011673,0.01566,0.01205,0.01336


In [43]:
# Start the entry for the current dataset (re-running discards what was stored under it).
results[dataset] = dict()
# No df_test is given, so one third of the table is held out for scoring.
results[dataset]['random'] = report_performance(pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'), max_depth=1, balanced=False)
print(results[dataset]['random'])

(0.015961239641945298, 0.8706734068173684)


### Temporal

In [42]:
# Grid-search on the temporal training features of the current dataset, then show the summary columns.
temporal = gridsearch(pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'))
temporal[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,False,0.026599,0.516816,0.376079,0.026016,0.025339,0.044877,0.027025,0.015393,0.02036
2,True,0.012708,0.263628,0.082945,0.012138,0.011507,0.013899,0.012009,0.014051,0.012073
1,False,0.010403,0.072873,0.196721,0.009853,0.010352,0.011283,0.013665,0.007599,0.009115
1,True,0.007225,0.000276,0.100433,0.006679,0.008107,0.007296,0.007711,0.007045,0.005967


In [44]:
# Fit on the temporal training table and score on the separate temporal test table.
# Requires results[dataset] to exist already (created in the random-split cell).
results[dataset]['temporal'] = report_performance(
  df_train=pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'), 
  df_test=pd.read_pickle(f'datasets/{dataset}/temporal/test/2/features.pkl'),
  max_depth=1, 
  balanced=False
)
print(results[dataset]['temporal'])

(0.011743071145453058, 0.8742894406283527)


## Askubuntu

In [68]:
# Dataset name substituted into the f-string paths of the cells in this section.
dataset = 'askubuntu'

### Random

In [50]:
# Grid-search on the random-split features, here also trying max_depth=3, then show the summary columns.
random = gridsearch(pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'), max_depth=[1,2,3])
random[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,False,0.0502,0.82577,0.159691,0.049308,0.057978,0.055663,0.050619,0.035078,0.051662
2,False,0.041473,0.346244,0.120017,0.04056,0.046468,0.043193,0.046345,0.034082,0.037276
3,True,0.03639,0.318876,0.092383,0.035518,0.035754,0.032753,0.035488,0.035192,0.042764
2,True,0.029424,0.038861,0.047644,0.028534,0.029342,0.028542,0.032157,0.028488,0.02859
1,False,0.019488,0.050299,0.094515,0.018554,0.018621,0.022406,0.019953,0.016761,0.019701
1,True,0.016752,0.025251,0.107087,0.015825,0.017591,0.017891,0.016554,0.013365,0.018357


In [69]:
# Start the entry for the current dataset (re-running discards what was stored under it).
results[dataset] = dict()
# This dataset is scored with max_depth=2 and class weighting enabled;
# one third of the table is held out because no df_test is given.
results[dataset]['random'] = report_performance(pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'), max_depth=2, balanced=True)
print(results[dataset]['random'])

(0.023293264618648724, 0.8946992586126661)


### Temporal

In [51]:
# Grid-search on the temporal training features, also trying max_depth=3, then show the summary columns.
temporal = gridsearch(pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'), max_depth=[1,2,3])
temporal[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,False,0.03986,1.150436,0.232144,0.039328,0.02578,0.04228,0.033903,0.044582,0.052753
2,False,0.031763,0.40481,0.15487,0.031233,0.026208,0.030563,0.027827,0.034271,0.039943
3,True,0.028169,0.315953,0.255624,0.027639,0.018628,0.031371,0.020469,0.03543,0.034949
2,True,0.020791,0.038073,0.263838,0.020254,0.014498,0.020978,0.015061,0.028702,0.024717
1,False,0.013962,0.130814,0.219161,0.013397,0.010621,0.012798,0.011182,0.017749,0.017459
1,True,0.01213,0.055315,0.170149,0.011586,0.010161,0.011052,0.011742,0.016107,0.011589


In [70]:
# Fit on the temporal training table and score on the separate temporal test table,
# with max_depth=2 and class weighting enabled.
# Requires results[dataset] to exist already (created in the random-split cell).
results[dataset]['temporal'] = report_performance(
  df_train=pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'), 
  df_test=pd.read_pickle(f'datasets/{dataset}/temporal/test/2/features.pkl'),
  max_depth=2, 
  balanced=True
)
print(results[dataset]['temporal'])

(0.004643793569510117, 0.8513835399254048)


## Bibsonomy

In [3]:
# Dataset name for the Bibsonomy section.
dataset = 'bibsonomy'

### Random