# Link Prediction in the Digg Reply Network

In [22]:
import copy

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

from linkprediction import convert_to_set, filter_edgelist, get_distances, get_graph, giant_component, read_edges, report

# Load the complete edge list from the 'out.munmun_digg_reply' file; every
# train/test slice below is cut from this object.
edges = read_edges('out.munmun_digg_reply')

In [3]:
# Training "mature" slice: everything selected with stop=50000.
# filter_edgelist prints the covered time span and the number of selected edges.
edges_train_mature = filter_edgelist(edges, stop=50000)

2008-10-28 22:32:09
2008-11-07 09:58:33
no_selected_edges=50001 (5.7e-01)


In [4]:
# Training "probe" slice: the start=50000..stop=60000 window, i.e. the edges
# that arrive right after the training mature slice (see the printed time span).
edges_train_probe = filter_edgelist(edges, start=50000, stop=60000)

2008-11-07 09:58:34
2008-11-09 02:28:53
no_selected_edges=10000 (1.1e-01)


In [5]:
# Test "mature" slice: everything selected with stop=60000, so it spans both
# the training mature and the training probe windows.
edges_test_mature = filter_edgelist(edges, stop=60000)

2008-10-28 22:32:09
2008-11-09 02:28:53
no_selected_edges=60001 (6.8e-01)


In [6]:
# Test "probe" slice: the start=60000..stop=70000 window that follows the
# test mature slice.
edges_test_probe = filter_edgelist(edges, start=60000, stop=70000)

2008-11-09 02:28:54
2008-11-10 15:57:57
no_selected_edges=10000 (1.1e-01)


## Set-up
Choose the parameters that define the learning (training) phase and the assessment (test) phase.

In [7]:
# Build the graph of the training mature edges and keep only its giant component.
g_train_matured = giant_component(get_graph(edges_train_mature))
# Probe edges as a set, used further down for `nodepair in uv_train_probe` lookups.
uv_train_probe = convert_to_set(edges_train_probe)

In [8]:
# Summarise how the training probes relate to the matured training graph
# (already an edge / both nodes present / not in the graph).
report(graph=g_train_matured, probes=uv_train_probe)

Number of probes: 9908
- already edge: 60 (1%)
- both nodes in graph: 3126 (32%)
- not in graph: 3896 (39%)


In [9]:
# Build the graph of the test mature edges and keep only its giant component.
g_test_matured = giant_component(get_graph(edges_test_mature))
# Probe edges as a set, used further down for `nodepair in uv_test_probe` lookups.
uv_test_probe = convert_to_set(edges_test_probe)

In [10]:
# Summarise how the test probes relate to the matured test graph
# (already an edge / both nodes present / not in the graph).
report(graph=g_test_matured, probes=uv_test_probe)

Number of probes: 9918
- already edge: 80 (1%)
- both nodes in graph: 3260 (33%)
- not in graph: 3726 (38%)


## Export

In [15]:
!mkdir -p temporal/{train,test}/2

### Train

In [23]:
%%time
nodepairs_train, _ = get_distances(g_train_matured, cutoff=2)
targets_train = [nodepair in uv_train_probe for nodepair in tqdm(nodepairs_train)]

joblib.dump(nodepairs_train, 'temporal/train/2/nodepairs.pkl', protocol=5)
joblib.dump(targets_train, 'temporal/train/2/target.pkl', protocol=5)
joblib.dump(g_train_matured, 'temporal/train/2/graph.pkl', protocol=5)

print(f'{sum(targets_train) / len(nodepairs_train):e}')

get_distances: 100%|██████████| 20981/20981 [00:11<00:00, 1858.00it/s]
100%|██████████| 1715286/1715286 [00:00<00:00, 3127370.30it/s]


1.848088e-04
CPU times: user 27.3 s, sys: 170 ms, total: 27.5 s
Wall time: 27.4 s


### Test

In [24]:
%%time
nodepairs_test, _ = get_distances(g_test_matured, cutoff=2)
targets_test = [nodepair in uv_test_probe for nodepair in tqdm(nodepairs_test)]

joblib.dump(nodepairs_test, 'temporal/test/2/nodepairs.pkl', protocol=5)
joblib.dump(targets_test, 'temporal/test/2/target.pkl', protocol=5)
joblib.dump(g_test_matured, 'temporal/test/2/graph.pkl', protocol=5)

print(f'{sum(targets_test) / len(nodepairs_test):e}')

get_distances: 100%|██████████| 23451/23451 [00:19<00:00, 1192.70it/s]
100%|██████████| 2321450/2321450 [00:00<00:00, 3232187.22it/s]


1.813522e-04
CPU times: user 42.6 s, sys: 265 ms, total: 42.9 s
Wall time: 42.8 s


## Hyperparameter selection

### XGBoost

$n=2$

In [None]:
def get_x_y(df: pd.DataFrame):
  """Split a feature frame into a feature matrix and a label vector.

  Args:
    df: frame holding one 'target' column plus the feature columns.

  Returns:
    A tuple ``(X, y)``: the values of every column except 'target', and the
    values of the 'target' column.
  """
  features = df.drop(columns='target')
  labels = df['target']
  return features.values, labels.values
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=[1, 2], n_splits=5) -> pd.DataFrame:
  """Grid-search XGBoost over ``max_depth`` and class weighting with stratified CV.

  Two thirds of ``df`` are used for the cross-validated search; the remaining
  third is split off and left untouched. Scoring is average precision.

  Args:
    df: frame with feature columns and a 'target' column (see ``get_x_y``).
    random_state: seed for the train split, the CV shuffling, the classifier
      and the label permutation of the random baseline.
    also_random: when True, repeat the search on shuffled labels and add the
      ``*_random`` columns plus ``val_over_random`` as a chance-level baseline.
    max_depth: candidate values for the ``max_depth`` hyperparameter.
    n_splits: number of stratified CV folds (one ``val_fold<i>`` column each).

  Returns:
    One row per hyperparameter combination, indexed by
    ``(max_depth, balanced)`` and sorted by ``mean_val`` (best first).
    ``rstd_test`` is the relative standard deviation of the validation score;
    the name is kept for backward compatibility.

  Raises:
    ValueError: if the training split contains no positive target.
  """
  X, y = get_x_y(df)

  # Only the training part takes part in the search; the held-out third is discarded here.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)

  # Count classes without `~y`, which is only a logical NOT for boolean arrays
  # (on 0/1 integers it yields -1/-2 and a negative weight).
  n_positive = int((y_train != 0).sum())
  if n_positive == 0:
    raise ValueError('gridsearch needs at least one positive target in the training split')
  balanced_weight = (len(y_train) - n_positive) / n_positive

  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  search = GridSearchCV(
    clf,
    param_grid=dict(max_depth=max_depth, scale_pos_weight=[balanced_weight, 1]),
    scoring='average_precision',
    n_jobs=30,
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state),
    return_train_score=True
  )

  if also_random:
    # Copy the still-unfitted search so the baseline uses the identical set-up.
    search_random = copy.deepcopy(search)
    # A local generator gives the same permutation as seeding the global one,
    # without resetting NumPy's global random state for the rest of the notebook.
    y_random = y_train.copy()
    np.random.RandomState(random_state).shuffle(y_random)

  search.fit(X_train, y_train)
  results = search.cv_results_
  df_dict = dict(
      mean_train=results['mean_train_score'],
      std_train=results['std_train_score'],
      mean_val=results['mean_test_score'],
      std_val=results['std_test_score'],
  )
  for fold in range(n_splits):
    df_dict[f'val_fold{fold}'] = results[f'split{fold}_test_score']

  if also_random:
    search_random.fit(X_train, y_random)
    results_random = search_random.cv_results_
    df_dict['mean_train_random'] = results_random['mean_train_score']
    df_dict['std_train_random'] = results_random['std_train_score']
    df_dict['mean_val_random'] = results_random['mean_test_score']
    df_dict['std_val_random'] = results_random['std_test_score']

  # A combination is "balanced" when it uses the class-ratio weight rather than the neutral weight 1.
  index = pd.Index(
    [(params['max_depth'], params['scale_pos_weight'] != 1) for params in results['params']],
    name=('max_depth', 'balanced')
  )
  df = pd.DataFrame(df_dict, index=index)
  df['diff_train_val'] = df['mean_val'] - df['mean_train']
  df['rstd_test'] = df['std_val'] / df['mean_val']
  if also_random:
    df['val_over_random'] = df['mean_val'] - df['mean_val_random']
  return df.sort_values('mean_val', ascending=False)
    
def report_performance(df_train: pd.DataFrame, df_test: pd.DataFrame, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """Fit XGBoost on the training features and score it on the test features.

  Args:
    df_train: training frame with feature columns and a 'target' column.
    df_test: test frame with the same layout.
    random_state: seed passed to the classifier.
    max_depth: tree depth passed to the classifier.
    tree_method: XGBoost tree construction method.
    balanced: when True, weight the positive class by the
      negative/positive ratio of the training targets; otherwise use weight 1.
    n_jobs: number of threads the classifier may use.

  Returns:
    A tuple ``(average_precision, roc_auc)`` computed on the test frame.

  Raises:
    ValueError: if ``balanced`` is True and the training targets hold no positive.
  """
  X, y = get_x_y(df_train)

  if balanced:
    # Count classes without `~y`, which is only a logical NOT for boolean arrays.
    n_positive = int((y != 0).sum())
    if n_positive == 0:
      raise ValueError('report_performance needs at least one positive target when balanced=True')
    scale_pos_weight = (len(y) - n_positive) / n_positive
  else:
    scale_pos_weight = 1

  # Honour the caller's n_jobs (it used to be overridden by a hard-coded 128).
  clf = XGBClassifier(max_depth=max_depth, n_jobs=n_jobs, tree_method=tree_method, scale_pos_weight=scale_pos_weight, random_state=random_state)
  clf.fit(X, y)

  X_test, y_test = get_x_y(df_test)
  # Probability of the positive class is the ranking score for both metrics.
  y_pred = clf.predict_proba(X_test)[:,1]
  return average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)

In [None]:
# Grid search on the training features. features.pkl is not written by any
# cell of this notebook, so it must already exist in temporal/train/2.
hps2 = gridsearch(pd.read_pickle('temporal/train/2/features.pkl'))

In [None]:
# Key columns of the grid-search summary, using the column names that gridsearch() actually produces.
hps2[['mean_val', 'diff_train_val', 'rstd_test', 'val_over_random', 'val_fold0', 'val_fold1', 'val_fold2', 'val_fold3', 'val_fold4']]

In [None]:
# Final train-on-train / score-on-test evaluation, reading from the same
# temporal/{train,test}/2 directories used by the rest of this notebook.
report_performance(df_train=pd.read_pickle('temporal/train/2/features.pkl'), df_test=pd.read_pickle('temporal/test/2/features.pkl'), balanced=False)