# Link Prediction in Stack Overflow

In [1]:
import copy

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

from linkprediction import convert_to_set, filter_edgelist, get_distances, get_graph, giant_component, read_edges, report

# Load the full edge list from the tab-separated file 'out.sx-stackoverflow'.
edges = read_edges('out.sx-stackoverflow', sep='\t')

In [2]:
# Maturing edges for the training phase: everything up to stop=50000
# (the recorded output below reports 50001 selected edges).
# NOTE(review): exact bound semantics live in filter_edgelist — confirm inclusivity there.
edges_train_mature = filter_edgelist(edges, stop=50000)

2008-08-01 07:17:57
2008-09-18 22:01:53
no_selected_edges=50001 (7.9e-04)


In [3]:
# Probe edges for the training phase: the window start=50000..stop=60000 that
# follows the training maturing window (recorded output: 10000 selected edges).
edges_train_probe = filter_edgelist(edges, start=50000, stop=60000)

2008-09-18 22:01:54
2008-09-20 08:13:54
no_selected_edges=10000 (1.6e-04)


In [4]:
# Maturing edges for the test phase: everything up to stop=60000, i.e. the same
# upper bound as the training probe window (recorded output: 60001 selected edges).
edges_test_mature = filter_edgelist(edges, stop=60000)

2008-08-01 07:17:57
2008-09-20 08:13:54
no_selected_edges=60001 (9.4e-04)


In [5]:
# Probe edges for the test phase: the window start=60000..stop=70000 that
# follows the test maturing window (recorded output: 10000 selected edges).
edges_test_probe = filter_edgelist(edges, start=60000, stop=70000)

2008-09-20 08:13:55
2008-09-23 12:20:14
no_selected_edges=10000 (1.6e-04)


## Set-up
The parameters chosen here define the learning (training) and assessment (test) phases.

In [6]:
# Training phase: build a graph from the maturing edges and keep what
# giant_component returns for it.
g_train_matured = giant_component(get_graph(edges_train_mature))
# Probe edges converted by convert_to_set; used later for `in` membership tests.
uv_train_probe = convert_to_set(edges_train_probe)

In [7]:
# Print how the training probes relate to the training maturing graph
# (counts of probes that are already edges, have both nodes in the graph, etc.).
report(graph=g_train_matured, probes=uv_train_probe)

Number of probes: 9200
- already edge: 554 (6%)
- both nodes in graph: 2340 (25%)
- not in graph: 3409 (37%)


In [8]:
# Test phase: build a graph from the maturing edges and keep what
# giant_component returns for it.
g_test_matured = giant_component(get_graph(edges_test_mature))
# Probe edges converted by convert_to_set; used later for `in` membership tests.
uv_test_probe = convert_to_set(edges_test_probe)

In [9]:
# Print how the test probes relate to the test maturing graph
# (counts of probes that are already edges, have both nodes in the graph, etc.).
report(graph=g_test_matured, probes=uv_test_probe)

Number of probes: 9136
- already edge: 627 (7%)
- both nodes in graph: 2708 (30%)
- not in graph: 2683 (29%)


## Export

In [10]:
!mkdir -p temporal/{train,test}/2

### Train

In [11]:
%%time
# Candidate node pairs from get_distances with cutoff=2; its second return value is discarded.
nodepairs_train, _ = get_distances(g_train_matured, cutoff=2)
# A candidate pair is a positive when it is also present in the training probe set.
targets_train = [nodepair in uv_train_probe for nodepair in tqdm(nodepairs_train)]

# Persist the candidate pairs, their labels and the maturing graph under temporal/train/2/.
joblib.dump(nodepairs_train, 'temporal/train/2/nodepairs.pkl', protocol=5)
joblib.dump(targets_train, 'temporal/train/2/target.pkl', protocol=5)
joblib.dump(g_train_matured, 'temporal/train/2/graph.pkl', protocol=5)

# Fraction of candidate pairs labelled positive, printed in scientific notation.
print(f'{sum(targets_train) / len(nodepairs_train):e}')

get_distances: 100%|██████████| 6926/6926 [00:31<00:00, 218.01it/s]
100%|██████████| 2579380/2579380 [00:00<00:00, 3380747.76it/s]


4.629019e-04
CPU times: user 54.6 s, sys: 267 ms, total: 54.9 s
Wall time: 54.8 s


### Test

In [12]:
%%time
# Candidate node pairs from get_distances with cutoff=2; its second return value is discarded.
nodepairs_test, _ = get_distances(g_test_matured, cutoff=2)
# A candidate pair is a positive when it is also present in the test probe set.
targets_test = [nodepair in uv_test_probe for nodepair in tqdm(nodepairs_test)]

# Persist the candidate pairs, their labels and the maturing graph under temporal/test/2/.
joblib.dump(nodepairs_test, 'temporal/test/2/nodepairs.pkl', protocol=5)
joblib.dump(targets_test, 'temporal/test/2/target.pkl', protocol=5)
joblib.dump(g_test_matured, 'temporal/test/2/graph.pkl', protocol=5)

# Fraction of candidate pairs labelled positive, printed in scientific notation.
print(f'{sum(targets_test) / len(nodepairs_test):e}')

get_distances: 100%|██████████| 7763/7763 [00:43<00:00, 177.14it/s]
100%|██████████| 3201068/3201068 [00:00<00:00, 3657658.81it/s]


4.642201e-04
CPU times: user 1min 13s, sys: 538 ms, total: 1min 13s
Wall time: 1min 13s


## Hyperparameter selection

### XGBoost

$n=2$

In [None]:
def get_x_y(df: pd.DataFrame):
  """Split a feature frame into a feature matrix and a label vector.

  Returns a tuple ``(X, y)``: ``X`` holds the values of every column except
  ``'target'`` and ``y`` holds the values of the ``'target'`` column.
  """
  features = df.drop(columns='target')
  labels = df['target']
  return features.values, labels.values
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=[1, 2]) -> pd.DataFrame:
  """Grid-search XGBoost over ``max_depth`` and class weighting, scored by average precision.

  The frame is split into features and labels with ``get_x_y``; two thirds of
  the samples are used for a cross-validated grid search, the remaining third
  is left untouched by this function.

  Args:
    df: Feature frame containing a ``'target'`` column.
    random_state: Seed for the train/validation split, the CV shuffling, the
      classifier and the label permutation.
    also_random: When true, the same search is repeated on shuffled labels to
      provide a chance-level baseline.
    max_depth: Candidate tree depths (not modified).

  Returns:
    One row per parameter combination, indexed by ``(max_depth, balanced)`` and
    sorted by ``mean_val`` in descending order. Columns: ``mean_train``,
    ``std_train``, ``mean_val``, ``std_val``, one ``val_fold<i>`` per CV fold,
    ``diff_train_val``, ``rstd_test`` (``std_val / mean_val``) and, when
    ``also_random`` is set, the ``*_random`` columns plus ``val_over_random``.

  Raises:
    ValueError: If the training split contains no positive sample.
  """
  X, y = get_x_y(df)

  # Only the training part is used; the held-out third is intentionally discarded here.
  X_train, _, y_train, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)

  # Count the classes explicitly. `sum(~y)` is only correct for boolean labels:
  # on 0/1 integers `~` is a bitwise NOT and yields a negative "count".
  n_pos = int(y_train.astype(bool).sum())
  n_neg = len(y_train) - n_pos
  if n_pos == 0:
    raise ValueError('gridsearch needs at least one positive sample in the training split')
  pos_weight = n_neg / n_pos

  cv = StratifiedKFold(shuffle=True, random_state=random_state)
  search = GridSearchCV(
    XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6),
    param_grid=dict(max_depth=list(max_depth), scale_pos_weight=[pos_weight, 1]),
    scoring='average_precision',
    n_jobs=30,
    cv=cv,
    return_train_score=True
  )

  if also_random:
    search_random = copy.deepcopy(search)
    y_random = copy.deepcopy(y_train)
    # A private RandomState gives the same permutation as seeding the global
    # generator, without clobbering NumPy's global random state.
    np.random.RandomState(random_state).shuffle(y_random)

  search.fit(X_train, y_train)
  cv_results = search.cv_results_
  columns = dict(
    mean_train=cv_results['mean_train_score'],
    std_train=cv_results['std_train_score'],
    mean_val=cv_results['mean_test_score'],
    std_val=cv_results['std_test_score']
  )
  # One column per fold, derived from the splitter rather than hard-coded to five.
  for fold in range(cv.get_n_splits()):
    columns[f'val_fold{fold}'] = cv_results[f'split{fold}_test_score']

  if also_random:
    search_random.fit(X_train, y_random)
    random_results = search_random.cv_results_
    columns['mean_train_random'] = random_results['mean_train_score']
    columns['std_train_random'] = random_results['std_train_score']
    columns['mean_val_random'] = random_results['mean_test_score']
    columns['std_val_random'] = random_results['std_test_score']

  # A parameter set counts as "balanced" when it uses the class-ratio weight (> 1).
  index = pd.MultiIndex.from_tuples(
    [(params['max_depth'], params['scale_pos_weight'] > 1) for params in cv_results['params']],
    names=['max_depth', 'balanced']
  )
  results = pd.DataFrame(columns, index=index)
  results['diff_train_val'] = results['mean_val'] - results['mean_train']
  # Column name kept as 'rstd_test' so existing selections on it keep working.
  results['rstd_test'] = results['std_val'] / results['mean_val']
  if also_random:
    results['val_over_random'] = results['mean_val'] - results['mean_val_random']
  return results.sort_values('mean_val', ascending=False)
    
def report_performance(df_train: pd.DataFrame, df_test: pd.DataFrame, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """Fit an XGBoost classifier on ``df_train`` and score it on ``df_test``.

  Args:
    df_train: Training feature frame containing a ``'target'`` column.
    df_test: Test feature frame containing a ``'target'`` column.
    random_state: Seed passed to the classifier.
    max_depth: Tree depth passed to the classifier.
    tree_method: XGBoost tree construction method.
    balanced: When true, positives are weighted by the negative/positive ratio
      of the training labels; otherwise the weight is 1.
    n_jobs: Number of threads passed to the classifier.

  Returns:
    Tuple ``(average_precision, roc_auc)`` computed from the predicted
    positive-class probabilities on ``df_test``.

  Raises:
    ValueError: If ``balanced`` is set and ``df_train`` has no positive sample.
  """
  X, y = get_x_y(df_train)

  if balanced:
    # Count the classes explicitly: `sum(~y)` is only correct for boolean
    # labels, on 0/1 integers `~` is a bitwise NOT.
    n_pos = int(y.astype(bool).sum())
    if n_pos == 0:
      raise ValueError('report_performance needs at least one positive training sample when balanced=True')
    scale_pos_weight = (len(y) - n_pos) / n_pos
  else:
    scale_pos_weight = 1

  # Honour the caller's n_jobs instead of a hard-coded thread count.
  clf = XGBClassifier(max_depth=max_depth, n_jobs=n_jobs, tree_method=tree_method, scale_pos_weight=scale_pos_weight, random_state=random_state)
  clf.fit(X, y)

  X_test, y_test = get_x_y(df_test)
  y_pred = clf.predict_proba(X_test)[:,1]
  return average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred)

In [None]:
# Hyperparameter search on the training features for cutoff 2.
# NOTE(review): features.pkl is not written by any cell visible in this notebook —
# presumably produced by a separate feature-extraction step; confirm it exists before running.
hps2 = gridsearch(pd.read_pickle(f'temporal/train/2/features.pkl'))

In [None]:
# Headline columns of the grid-search result. The names match what gridsearch
# actually produces (`*_val` / `val_fold*`); the previous `*_test` names raised a KeyError.
hps2[['mean_val', 'diff_train_val', 'rstd_test', 'val_over_random', 'val_fold0', 'val_fold1', 'val_fold2', 'val_fold3', 'val_fold4']]

In [None]:
# Final evaluation on the temporal/{train,test}/2 features, i.e. the same export
# directories used by the rest of this notebook (the grid search reads temporal/train/2).
# NOTE(review): this previously read from 'temp/b1/...', which no cell here creates — confirm.
report_performance(df_train=pd.read_pickle('temporal/train/2/features.pkl'), df_test=pd.read_pickle('temporal/test/2/features.pkl'), balanced=False)