# Link Prediction in Condmat

In [2]:
import copy
from datetime import datetime
import itertools
import math
from typing import List, Any, Dict, Tuple

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV, train_test_split
import seaborn as sns
from tqdm import tqdm
from xgboost import XGBClassifier

# Typing
# A node pair is a (u, v) tuple of node identifiers.
NodePair = Tuple[int, int]
# A single edge is (u, v, attributes); the code below reads attributes['date'].
# Edge lists are annotated as List[Edge], so Edge itself must be one tuple.
Edge = Tuple[int, int, Dict[str, datetime]]

# Folder below which the graphs, probes and node pair data are stored.
folder = '/local/bruingjde/complexnetworks2020-experiment/temp/en-b1'

In [3]:
def _filter_edgelist(edges: List[Edge], start, stop) -> List[Edge]: 
  """Filter edgelist.  If start/ stop is float, start/stop from the fraction of total edges. If datetime, this is used.""" 
  no_edges = len(edges)
  if start is None: start=0
  if stop is None: stop=1
  if type(start) is float or start == 0:
    start_index = int(start*no_edges)
    start = edges[start_index][2]['date']
  if type(stop) is float or stop == 1:
    stop_index = math.floor(stop*no_edges)-1
    stop = edges[stop_index][2]['date']
  return [edge for edge in edges if edge[2]['date'] >= start and edge[2]['date'] <= stop]
def get_edgelist(file='src/enron.pkl', start=None, stop=None) -> List[Edge]:
  """Load the edge list stored in `file` with joblib and restrict it to the [start, stop] window."""
  all_edges = joblib.load(file)
  return _filter_edgelist(all_edges, start, stop)
def giant_component(graph: nx.Graph) -> nx.Graph:
  """Return an independent copy of the connected component of `graph` with the most nodes."""
  largest_node_set = max(nx.connected_components(graph), key=len)
  component_view = graph.subgraph(largest_node_set)
  return component_view.copy()
def get_graph(edgelist: List[Edge]) -> nx.Graph:
  """Build an undirected graph whose edge attribute 'weight' counts how often the edge occurs in `edgelist`."""
  graph = nx.Graph()
  for source, target, _ in edgelist:
    if graph.has_edge(source, target):
      # Repeated occurrence: bump the counter on the existing edge.
      graph[source][target]["weight"] += 1
    else:
      graph.add_edge(source, target, weight=1)
  return graph
def report(graph: nx.Graph, probes):
  """Print how the probe node pairs relate to `graph`.

  Three counts are printed, each with its share of all probes:
  - pairs that are already connected by an edge,
  - pairs of two distinct nodes that are both in the graph but not connected,
  - pairs of which at least one node is missing from the graph.

  The counts are taken in a single pass with direct graph lookups, so the result
  does not depend on the orientation in which a pair is stored and the set of all
  non-edges is never materialised.

  Args:
    graph: graph the probes are checked against.
    probes: sized collection of (u, v) node pairs.
  """
  n = len(probes)
  print(f"Number of probes: {n}")

  already_edge = non_edge = not_in_graph = 0
  for u, v in tqdm(probes):
    if not (graph.has_node(u) and graph.has_node(v)):
      not_in_graph += 1
    elif graph.has_edge(u, v):
      already_edge += 1
    elif u != v:
      # A node paired with itself is not a non-edge.
      non_edge += 1

  def share(count):
    # Guard against an empty probe collection.
    return count / n if n else 0.0

  a, ne, ng = already_edge, non_edge, not_in_graph
  print(f"- already edge: {a} ({share(a):.0%})")
  print(f"- both nodes in graph: {ne} ({share(ne):.0%})")
  print(f"- not in graph: {ng} ({share(ng):.0%})")
def get_distances(graph: nx.Graph, cutoff: int = None) -> Tuple[Tuple[NodePair, ...], Tuple[int, ...]]:
  """
  Collect all node pairs that are more than one hop apart, together with their distance.

  Distances come from nx.all_pairs_dijkstra with weight=None, i.e. the edge attribute
  'weight' is not used. Every source node is visited, so a pair is reported once per
  orientation: both (u, v) and (v, u) are part of the result.

  Args:
  - graph: graph to search in
  - cutoff: when provided, only node pairs with at most this distance are considered

  Returns:
  - nodepairs: tuple containing all nodepairs
  - distances: tuple containing all distances, aligned with nodepairs
  Both tuples are empty when no node pair qualifies, so the result can always be
  unpacked into two names.
  """
  nodepairs = []
  distances = []
  sources = tqdm(nx.all_pairs_dijkstra(graph, cutoff, weight=None), total=len(graph), desc="get_distances")
  for u, (distances_u, _) in sources:
    for v, distance in distances_u.items():
      # distance 0 is the node itself, distance 1 is an existing edge.
      if distance > 1 and (cutoff is None or distance <= cutoff):
        nodepairs.append((u, v))
        distances.append(distance)
  return tuple(nodepairs), tuple(distances)

## Set-up
Choose here the parameters that define the learning and assessment phases.

In [4]:
# Learning phase: the graph is built from the edge list up to the 0.7 mark,
# the probes are the node pairs of the edges between the 0.7 and 0.85 marks.
g_train_matured = giant_component(
  get_graph(
    get_edgelist(stop=.7)
  )
)
uv_train_probe = set((u, v) for u, v, _ in tqdm(get_edgelist(start=.7, stop=.85)))

100%|██████████| 172378/172378 [00:00<00:00, 1380696.25it/s]


In [6]:
# Persist the training graph and the training probes below `folder`.
joblib.dump(g_train_matured, f'{folder}/train/graph.pkl')
joblib.dump(uv_train_probe, f'{folder}/train/probes.pkl')

['/local/bruingjde/complexnetworks2020-experiment/temp/en-b1/train/probes.pkl']

In [None]:
# Summarise how the training probes relate to the training graph.
report(g_train_matured, uv_train_probe)

In [7]:
# Assessment phase: the graph is built from the edge list up to the 0.85 mark,
# the probes are the node pairs of all edges from the 0.85 mark onwards.
g_test_matured = giant_component(
  get_graph(
    get_edgelist(stop=.85)
  )
)
uv_test_probe = set((u, v) for u, v, _ in get_edgelist(start=.85))

In [None]:
# Summarise how the test probes relate to the test graph.
report(g_test_matured, uv_test_probe)

In [8]:
# Persist the test graph and the test probes below `folder`.
joblib.dump(g_test_matured, f'{folder}/test/graph.pkl')
joblib.dump(uv_test_probe, f'{folder}/test/probes.pkl')

['/local/bruingjde/complexnetworks2020-experiment/temp/en-b1/test/probes.pkl']

## Export

### Train

In [4]:
# Reload the training graph and probes that were stored below `folder`.
g_train_matured = joblib.load(f'{folder}/train/graph.pkl')
uv_train_probe = joblib.load(f'{folder}/train/probes.pkl')

In [5]:
# Candidate node pairs at distance two; a pair is a positive when it occurs among the probes.
nodepairs_train, distances_train = get_distances(graph=g_train_matured, cutoff=2)
targets_train = [(pair in uv_train_probe) for pair in tqdm(nodepairs_train)]

get_distances: 100%|██████████| 60313/60313 [11:02<00:00, 91.04it/s]  
100%|██████████| 36686150/36686150 [00:10<00:00, 3532190.45it/s]


In [None]:
%%time
# Convert the training data to numpy arrays. The unsuffixed names used here before
# are never defined in this notebook; the `_train` variables are the ones that the
# neighbouring cells create and dump.
nodepairs_train = np.array(nodepairs_train)
distances_train = np.array(distances_train)
targets_train = np.array(targets_train)

In [None]:
# Store node pairs, distances, targets and the graph together in the train/2 folder.
joblib.dump(nodepairs_train, f'{folder}/train/2/nodepairs.pkl')
joblib.dump(distances_train, f'{folder}/train/2/distances.pkl')
joblib.dump(targets_train, f'{folder}/train/2/targets.pkl')
joblib.dump(g_train_matured, f'{folder}/train/2/graph.pkl')

In [None]:
def export(nodepairs, distances, targets, graph, path, select_distances=(2, 3, 4)):
  """Write node pairs, distances, targets and the graph to one subfolder per distance plus 'all'.

  For every distance d in `select_distances` the rows with `distances == d` are
  written to `{path}{d}/`; afterwards the unfiltered arrays go to `{path}all/`.
  Each subfolder receives nodepairs.pkl, distances.pkl, targets.pkl (via
  `ndarray.dump`) and graph.pkl (via joblib). The name of each subfolder is
  printed when its export starts.

  Args:
    nodepairs, distances, targets: numpy arrays with one row/entry per node pair.
    graph: graph object stored alongside the arrays.
    path: prefix the subfolder name is appended to directly, so it has to end
      with a path separator. Folders are not created here.
    select_distances: distances that get their own subfolder.
  """
  arrays = (('nodepairs', nodepairs), ('distances', distances), ('targets', targets))

  def _dump(subfolder, mask=None):
    # Write one subfolder; `mask` selects the rows, None keeps everything.
    print(subfolder)
    for filename, obj in arrays:
      selected = obj if mask is None else obj[mask]
      selected.dump(f'{path}{subfolder}/{filename}.pkl')
    joblib.dump(graph, f'{path}{subfolder}/graph.pkl')

  for select_distance in select_distances:
    _dump(select_distance, distances == select_distance)
  _dump('all')

### Test

In [None]:
# Candidate node pairs of the test graph (no distance cutoff); a pair is a positive
# when it occurs among the test probes.
nodepairs_test, distances_test = get_distances(graph=g_test_matured)
targets_test = [(pair in uv_test_probe) for pair in tqdm(nodepairs_test)]

In [None]:
%%time
# Convert the test node pairs and targets to numpy arrays.
nodepairs_test = np.array(nodepairs_test)
targets_test = np.array(targets_test)

In [None]:
# Fraction of positive targets among all test node pairs. np.sum adds the values
# inside numpy instead of iterating over the array element by element in Python.
print(f'{np.sum(targets_test) / len(nodepairs_test):e}')

## Hyperparameter selection

### XGBoost

$n=2$

In [None]:
def get_x_y(df: pd.DataFrame):
  """Split `df` into the feature matrix (every column but 'target') and the 'target' vector."""
  features = df.drop(columns='target')
  labels = df['target']
  return features.values, labels.values
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=(1, 2)) -> pd.DataFrame:
  """Grid-search XGBoost hyperparameters with stratified cross-validation.

  One third of `df` is split off and not used here; the search runs on the
  remaining two thirds. The grid covers `max_depth` and `scale_pos_weight`
  (the negative/positive ratio of the training part, or 1) and is scored by
  average precision. With `also_random` the same search is repeated on shuffled
  labels to obtain a baseline.

  Args:
    df: features plus a 'target' column.
    random_state: seed for the split, the folds, the classifier and the label shuffle.
    also_random: whether to add the shuffled-label baseline columns.
    max_depth: tree depths to try.

  Returns:
    One row per parameter combination, indexed by (max_depth, balanced) where
    `balanced` is True when scale_pos_weight > 1, sorted by descending 'mean_val'.
    Columns: mean/std of train and validation scores, 'val_fold<i>' per fold,
    'diff_train_val', 'rstd_test' and, with `also_random`, the '*_random'
    columns and 'val_over_random'.
  """
  X, y = get_x_y(df)

  # Only the training part takes part in the search; the split-off third is unused here.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)

  # Count classes through a boolean view so that 0/1 integer targets work as well
  # (`~` on integers is a bitwise not and would not count the negatives).
  no_positives = y_train.astype(bool).sum()
  no_negatives = len(y_train) - no_positives

  cv = StratifiedKFold(shuffle=True, random_state=random_state)
  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  search = GridSearchCV(
    clf,
    param_grid=dict(max_depth=list(max_depth), scale_pos_weight=[no_negatives/no_positives, 1]),
    scoring='average_precision',
    n_jobs=30,
    cv=cv,
    return_train_score=True
  )

  if also_random:
    # Copy the unfitted search before the real fit and shuffle a copy of the labels.
    search_random = copy.deepcopy(search)
    np.random.seed(random_state)
    y_random = copy.deepcopy(y_train)
    np.random.shuffle(y_random)

  search.fit(X_train, y_train)
  results = search.cv_results_
  df_dict = dict(
    mean_train=results['mean_train_score'],
    std_train=results['std_train_score'],
    mean_val=results['mean_test_score'],
    std_val=results['std_test_score'],
  )
  for fold in range(cv.get_n_splits()):
    df_dict[f'val_fold{fold}'] = results[f'split{fold}_test_score']

  if also_random:
    # Same features, shuffled labels.
    search_random.fit(X_train, y_random)
    results_random = search_random.cv_results_
    df_dict['mean_train_random'] = results_random['mean_train_score']
    df_dict['std_train_random'] = results_random['std_train_score']
    df_dict['mean_val_random'] = results_random['mean_test_score']
    df_dict['std_val_random'] = results_random['std_test_score']

  index = pd.Index(
    [(d['max_depth'], d['scale_pos_weight'] > 1) for d in results['params']],
    name=('max_depth', 'balanced')
  )
  scores = pd.DataFrame(df_dict, index=index)
  scores['diff_train_val'] = scores['mean_val'] - scores['mean_train']
  scores['rstd_test'] = scores['std_val'] / scores['mean_val']
  if also_random:
    scores['val_over_random'] = scores['mean_val'] - scores['mean_val_random']
  return scores.sort_values('mean_val', ascending=False)
    
def report_performance(df_train: pd.DataFrame, df_test: pd.DataFrame, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """Fit an XGBoost classifier on `df_train` and score it on `df_test`.

  Args:
    df_train: training features plus a 'target' column.
    df_test: test features plus a 'target' column.
    random_state: seed passed to the classifier.
    max_depth: tree depth passed to the classifier.
    tree_method: tree construction method passed to the classifier.
    balanced: when True, scale_pos_weight is the negative/positive ratio of the
      training targets, otherwise 1.
    n_jobs: number of threads passed to the classifier.

  Returns:
    Tuple (average precision, ROC AUC) of the predicted positive-class
    probabilities on the test data.
  """
  X, y = get_x_y(df_train)
  if balanced:
    # Count classes through a boolean view so that 0/1 integer targets work as well.
    no_positives = y.astype(bool).sum()
    scale_pos_weight = (len(y) - no_positives) / no_positives
  else:
    scale_pos_weight = 1
  clf = XGBClassifier(
    max_depth=max_depth,
    n_jobs=n_jobs,
    tree_method=tree_method,
    scale_pos_weight=scale_pos_weight,
    random_state=random_state
  )
  clf.fit(X, y)
  X_test, y_test = get_x_y(df_test)
  y_pred = clf.predict_proba(X_test)[:,1]
  return average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)

In [None]:
# Hyperparameter search on the distance-2 training features.
hps2 = gridsearch(df=pd.read_pickle('temp/b1/train/2/features.pkl'))

In [None]:
# Column names as produced by `gridsearch` (validation scores are named '*val*').
hps2[['mean_val', 'diff_train_val', 'rstd_test', 'val_over_random', 'val_fold0', 'val_fold1', 'val_fold2', 'val_fold3', 'val_fold4']]

In [None]:
# Hyperparameter search on the distance-3 training features.
hps3 = gridsearch(df=pd.read_pickle('temp/b1/train/3/features.pkl'))

In [None]:
# Column names as produced by `gridsearch` (validation scores are named '*val*').
hps3[['mean_val', 'diff_train_val', 'rstd_test', 'val_over_random', 'val_fold0', 'val_fold1', 'val_fold2', 'val_fold3', 'val_fold4']]

In [None]:
# Hyperparameter search on the distance-4 training features, with depth 3 added to the grid.
hps4 = gridsearch(
  df=pd.read_pickle('temp/b1/train/4/features.pkl'),
  max_depth=[1, 2, 3],
)

In [None]:
# Column names as produced by `gridsearch` (validation scores are named '*val*').
hps4[['mean_val', 'diff_train_val', 'rstd_test', 'val_over_random', 'val_fold0', 'val_fold1', 'val_fold2', 'val_fold3', 'val_fold4']]

In [None]:
# Test performance for distance 2, without class balancing.
report_performance(
  df_train=pd.read_pickle('temp/b1/train/2/features.pkl'),
  df_test=pd.read_pickle('temp/b1/test/2/features.pkl'),
  balanced=False,
)

In [None]:
# Test performance for distance 3, with class balancing. The training frame is
# restricted to (and ordered like) the columns of the test frame.
report_performance(
  df_train=pd.read_pickle('temp/b1/train/3/features.pkl').loc[
    :, pd.read_pickle('temp/b1/test/3/features.pkl').columns
  ],
  df_test=pd.read_pickle('temp/b1/test/3/features.pkl'),
  balanced=True,
)

In [None]:
# Test performance for distance 4, with depth-2 trees and class balancing.
report_performance(
  df_train=pd.read_pickle('temp/b1/train/4/features.pkl'),
  df_test=pd.read_pickle('temp/b1/test/4/features.pkl'),
  max_depth=2,
  balanced=True,
)