# Link Prediction in Condmat

In [12]:
import copy
from datetime import datetime
import itertools
import math
from typing import List, Any, Dict, Tuple

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV, train_test_split
import seaborn as sns
from tqdm import tqdm
from xgboost import XGBClassifier

# Typing
Author = int
Papers = List[Tuple[List[Author], datetime]]
NodePair = Tuple[Author, Author]
Edge = List[Tuple[Author, Author, Dict[str, datetime]]]

folder = '/local/bruingjde/complexnetworks2020-experiment/temp/a4n/'

In [2]:
def _get_papers(filepath: str = "src/cond-mat.hg2") -> Papers:
  """Read collaboration data in filepath and return all papers."""
  
  papers = list()
  # Get number of rows to read for the vertices.
  with open(filepath) as file:
    no_rows = int(file.readline().split(' ')[1])
 
  with open(filepath) as file:
    for paper in file.readlines()[no_rows+2:]:
      # Each line has the following format: epoch no_authors [ u v (w ...) ]
      epoch = datetime.fromtimestamp(int(paper.split(' ')[0]))
          
      no_authors = int(paper.split(' ')[1])
      index1 = paper.find('[')+2
      index2 = paper.find(']')-1

      authors = [int(auth) for auth in paper[index1:index2].split(' ')]
      assert no_authors == len(authors)
      
      papers.append((authors, epoch))
  return papers
def _filter_edgelist(edges: List[Edge], start, stop) -> List[Edge]: 
  """Filter edgelist.  If start/ stop is float, start/stop from the fraction of total edges. If datetime, this is used.""" 
  no_edges = len(edges)
  if start is None: start=0
  if stop is None: stop=1
  if type(start) is float or start == 0:
    start_index = int(start*no_edges)
    start = edges[start_index][2]['date']
  if type(stop) is float or stop == 1:
    stop_index = math.floor(stop*no_edges)-1
    stop = edges[stop_index][2]['date']
  return [edge for edge in edges if edge[2]['date'] >= start and edge[2]['date'] <= stop]
def get_edgelist(*, start=None, stop=None) -> List[Edge]:
  """Return E_[t_1, t_2]."""
  papers = _get_papers()
  edges = [
    (u, v, dict(date=date)) if u<v else (v, u, dict(date=date))
    for authors, date in papers
    for u, v in itertools.combinations(authors, 2)
  ]
  return _filter_edgelist(edges, start, stop)
def giant_component(graph: nx.Graph) -> nx.Graph: return graph.subgraph(max(nx.connected_components(graph), key=len)).copy()
def get_graph(edgelist: List[Edge]) -> nx.Graph:
  """Add edge to graph. Contains edge attribute weight."""
  g = nx.Graph()
  
  for u, v, _ in edgelist:
    weight = g[u][v]["weight"]+1 if g.has_edge(u,v) else 1
    g.add_edge(u, v, weight=weight)
  
  return g
def report(graph:nx.Graph, probes: Tuple[Author, Author]):
  n = len(probes)
  print(f"Number of probes: {n}")
  a = sum([graph.has_edge(u, v) for u, v in probes])
  print(f"- already edge: {a} ({a/n:.0%})")
  non_edges = set(nx.non_edges(graph))
  ne = sum([np in non_edges for np in probes])
  print(f"- both nodes in graph: {ne} ({ne/n:.0%})")
  ng = sum([not (graph.has_node(u) and graph.has_node(v)) for u, v in probes])
  print(f"- not in graph: {ng} ({ng/n:.0%})")
def get_distances(graph: nx.Graph, cutoff: int = None) -> (List[NodePair], List[int]):
  """
  Get all non-edges using BFS. When cutoff provided, consider only node pairs with at most this distance.
  Returns:
  - nodepairs: tuple containing all nodepairs
  - distances: tuple containing all distances
  """
  nodepairs, distances = zip(
    *[
      [(u, v), distance]
      for u, (nbs_u, _) in tqdm(nx.all_pairs_dijkstra(graph, cutoff, weight=None), total=len(graph), desc="get_distances")
      for v, distance in nbs_u.items() if distance > 1 and (cutoff is None or distance <= cutoff) 
    ]
  )
  return nodepairs, np.array(distances)
def print_status(desc: str): print(f'{datetime.datetime.strftime("%H:%M:%S", datetime.datetime.localtime())}: {desc}')

## Set-up
Choose here the parameters on how you want to define the learn and assessing phase.

In [12]:
g_train_matured = giant_component(get_graph(get_edgelist(stop=datetime(1999, 12, 31))))
uv_train_probe = {(u, v) for u, v, _ in get_edgelist(start=datetime(2000, 1, 1), stop=datetime(2000, 6, 30))}

In [13]:
report(graph=g_train_matured, probes=uv_train_probe)

Number of probes: 10255
- already edge: 3173 (31%)
- both nodes in graph: 1900 (19%)
- not in graph: 5182 (51%)


In [14]:
g_test_matured = giant_component(get_graph(get_edgelist(stop=datetime(2000, 6, 30))))
uv_test_probe = {(u, v) for u, v, _ in get_edgelist(start=datetime(2000, 7, 1))}

In [6]:
report(graph=g_test_matured, probes=uv_test_probe)

Number of probes: 11823
- already edge: 3589 (30%)
- both nodes in graph: 2518 (21%)
- not in graph: 5715 (48%)


## Export

### Train

In [15]:
nodepairs_train, distances_train = get_distances(g_train_matured)
targets_train = [nodepair in uv_train_probe for nodepair in tqdm(nodepairs_train)]

get_distances: 100%|██████████| 11723/11723 [17:00<00:00, 11.49it/s]  
100%|██████████| 137341838/137341838 [00:41<00:00, 3339143.24it/s]


In [68]:
def export(nodepairs, distances, targets, graph, path):
  for select_distance in [2, 3, 4]:
    print(select_distance)
    filter_indices = (distances == select_distance)
    for obj, filename in [(nodepairs, 'nodepairs'), (distances, 'distances'), (targets, 'targets')]:
      obj[filter_indices].dump(f'{path}{select_distance}/{filename}.pkl')
    joblib.dump(graph, f'{path}{select_distance}/graph.pkl')                         
  print('all')
  for obj, filename in [(nodepairs, 'nodepairs'), (distances, 'distances'), (targets, 'targets')]:
    obj.dump(f'{path}all/{filename}.pkl')
  joblib.dump(graph, f'{path}all/graph.pkl')

### Test

In [16]:
nodepairs_test, distances_test = get_distances(g_test_matured)
targets_test = [nodepair in uv_test_probe for nodepair in tqdm(nodepairs_test)]

get_distances: 100%|██████████| 13375/13375 [28:01<00:00,  7.96it/s]   
100%|██████████| 178788264/178788264 [00:55<00:00, 3237878.40it/s]


In [59]:
%%time
nodepairs_test = np.array(nodepairs_test)
targets_test = np.array(targets_test)

CPU times: user 3min 47s, sys: 1.72 s, total: 3min 49s
Wall time: 3min 47s


In [12]:
print(f'{sum(targets_test) / len(nodepairs_test):e}')

1.418230e-03


## Hyperparameter selection

### XGBoost

$n=2$

In [35]:
def get_x_y(df: pd.DataFrame): return df.drop(columns='target').values, df['target'].values
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=[1, 2]) -> pd.DataFrame:
  X, y = get_x_y(df)
  
  
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1/3, random_state=random_state)
  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  gridsearch = GridSearchCV(
    clf, 
    param_grid=dict(max_depth=max_depth, scale_pos_weight=[sum(~y_train)/sum(y_train), 1]), 
    scoring='average_precision', 
    n_jobs=30,
    cv=StratifiedKFold(shuffle=True, random_state=random_state),
    return_train_score=True
  )
  
  if also_random: 
    gridsearch_random = copy.deepcopy(gridsearch)
    np.random.seed(random_state)
    y_random = copy.deepcopy(y_train)
    np.random.shuffle(y_random)
  
  gridsearch.fit(X_train, y_train)
  df_dict = dict(
      mean_train=gridsearch.cv_results_['mean_train_score'],
      std_train=gridsearch.cv_results_['std_train_score'],
      mean_val=gridsearch.cv_results_['mean_test_score'],
      std_val=gridsearch.cv_results_['std_test_score'],
      val_fold0=gridsearch.cv_results_[f'split0_test_score'],
      val_fold1=gridsearch.cv_results_[f'split1_test_score'],
      val_fold2=gridsearch.cv_results_[f'split2_test_score'],
      val_fold3=gridsearch.cv_results_[f'split3_test_score'],
      val_fold4=gridsearch.cv_results_[f'split4_test_score']
  )
  
  if also_random: 
    gridsearch_random.fit(X_trainval, y_random)
    df_dict['mean_train_random']=gridsearch_random.cv_results_['mean_train_score']
    df_dict['std_train_random']=gridsearch_random.cv_results_['std_train_score']
    df_dict['mean_val_random']=gridsearch_random.cv_results_['mean_test_score']
    df_dict['std_val_random']=gridsearch_random.cv_results_['std_test_score']
  df = pd.DataFrame(df_dict, index=pd.Index([(d['max_depth'], d['scale_pos_weight'] > 1) for d in gridsearch.cv_results_['params']], name=('max_depth', 'balanced')))
  df['diff_train_val'] = df['mean_val'] - df['mean_train']
  df['rstd_test'] = df['std_val'] / df['mean_val']
  if also_random: df['val_over_random'] = df['mean_val'] - df['mean_val_random']
  return df.sort_values('mean_val', ascending=False)
    
def report_performance(df_train: pd.DataFrame, df_test: pd.DataFrame, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  X, y = get_x_y(df_train)
  clf = XGBClassifier(max_depth=max_depth, n_jobs=128, tree_method=tree_method, scale_pos_weight=sum(~y)/sum(y) if balanced else 1 , random_state=random_state)
  clf.fit(X, y)
  X_test, y_test = get_x_y(df_test)
  y_pred = clf.predict_proba(X_test)[:,1]
  return average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)

In [23]:
hps2 = gridsearch(pd.read_pickle(f'temp/b1/train/2/features.pkl'))

In [27]:
hps2[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,False,0.005152,0.001062,0.622107,0.003848,0.006089,0.002433,0.002779,0.003432,0.011026
2,False,0.004359,0.017399,0.299936,0.003044,0.006444,0.003775,0.002997,0.003291,0.005288
2,True,0.003938,0.011761,0.283034,0.002622,0.003795,0.00403,0.00288,0.003,0.005983
1,True,0.003246,0.0016,0.19428,0.001938,0.003892,0.002504,0.002665,0.003107,0.004061


In [25]:
hps3 = gridsearch(pd.read_pickle(f'temp/b1/train/3/features.pkl'))

In [28]:
hps3[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,False,0.004468,0.022433,1.614013,0.004276,0.000731,0.000547,0.000921,0.018883,0.001258
1,False,0.001733,0.00666,1.070482,0.001604,0.000574,0.000562,0.000948,0.005414,0.001165
2,True,0.001295,0.003029,0.688489,0.001118,0.000679,0.00048,0.001036,0.002988,0.001293
1,True,0.001099,8.6e-05,0.756664,0.000968,0.000474,0.000499,0.000549,0.002631,0.001343


In [32]:
hps4 = gridsearch(pd.read_pickle(f'temp/b1/train/4/features.pkl'), max_depth=[1, 2, 3])

In [33]:
hps4[['mean_test', 'diff_train_test', 'rstd_test', 'test_over_random', 'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4']]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_test,diff_train_test,rstd_test,test_over_random,test_fold0,test_fold1,test_fold2,test_fold3,test_fold4
max_depth,balanced,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,False,0.00261,0.003831,1.867772,0.002558,0.000139,0.00029,0.000109,0.000152,0.01236
2,True,0.001336,4.6e-05,1.748799,0.001288,0.000148,0.000307,0.000119,0.0001,0.006006
3,True,0.000701,0.004129,1.545884,0.000642,0.000153,0.000234,0.000118,0.000134,0.002869
3,False,0.000494,0.029945,1.194948,0.000442,0.000192,0.000223,0.000159,0.000221,0.001673
1,False,0.000289,0.000281,0.743702,0.000213,0.000143,0.000254,0.000211,0.000127,0.000708
1,True,0.00026,2e-06,0.576801,8.5e-05,0.00011,0.000241,0.000507,0.000109,0.000333


In [38]:
report_performance(df_train=pd.read_pickle(f'temp/b1/train/2/features.pkl'), df_test=pd.read_pickle(f'temp/b1/test/2/features.pkl'), balanced=False)

(0.004799299901218471, 0.7319341272805222)

In [41]:
pd.read_pickle(f'temp/b1/train/3/features.pkl')

Unnamed: 0,d_min,d_max,v_min,v_max,cn,pf,aa,jc,pa,target,mf,sp
0,3,12,3,21,0,0.004380,0.0,0.0,36,False,3,2
1,3,12,3,21,0,0.004490,0.0,0.0,36,False,3,2
2,11,12,21,23,0,0.001144,0.0,0.0,132,False,3,2
3,8,12,15,21,0,0.001019,0.0,0.0,96,False,3,2
4,7,12,15,21,0,0.001335,0.0,0.0,84,False,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
2087141,4,26,4,50,0,0.003183,0.0,0.0,104,False,4,1
2087142,4,7,4,10,0,0.001574,0.0,0.0,28,False,4,1
2087143,4,4,4,4,0,0.001157,0.0,0.0,16,False,4,1
2087144,4,5,4,5,0,0.001071,0.0,0.0,20,False,4,1


In [45]:
report_performance(
  df_train=pd.read_pickle(f'temp/b1/train/3/features.pkl')[pd.read_pickle(f'temp/b1/test/3/features.pkl').columns], 
  df_test=pd.read_pickle(f'temp/b1/test/3/features.pkl'), 
  balanced=True
)

(0.00042199913168793345, 0.6997098375605814)

In [46]:
report_performance(df_train=pd.read_pickle(f'temp/b1/train/4/features.pkl'), df_test=pd.read_pickle(f'temp/b1/test/4/features.pkl'), max_depth=2, balanced=True)

FileNotFoundError: [Errno 2] No such file or directory: 'temp/b1/test/4/features.pkl'