# Link Prediction in the Stack Overflow network

In [16]:
import joblib
from tqdm import tqdm

from linkprediction import convert_to_set, filter_edgelist, get_distances, get_graph, giant_component, read_edges, report

# Read the tab-separated edge list `out.sx-stackoverflow` with the project helper `read_edges`.
edges = read_edges('out.sx-stackoverflow', sep='\t')

In [6]:
# Report the size of the full edge list in scientific notation.
print(f'Number of edges: {len(edges):.1e}')

Number of edges: 6.3e+07


In [7]:
# "Mature" edges: the part of the edge list up to stop=50000; the learning graph is built from these.
# NOTE(review): the recorded output reports 50001 selected edges, so `stop` looks inclusive — confirm in `filter_edgelist`.
edges_mature = filter_edgelist(edges, stop=50000)

2008-08-01 07:17:57
2008-09-18 22:01:53
no_selected_edges=50001 (7.9e-04)


In [8]:
# "Probe" edges: the slice selected with start=50000, stop=70000; used later as the assessment set.
# NOTE(review): the recorded output shows 20000 edges starting one second after the last mature edge;
# the exact inclusive/exclusive semantics of `start`/`stop` are not visible here — confirm in `filter_edgelist`.
edges_probe = filter_edgelist(edges, start=50000, stop=70000)

2008-09-18 22:01:54
2008-09-23 12:20:14
no_selected_edges=20000 (3.1e-04)


## Set-up
Choose the parameters that define the learning and assessing phases here.

In [10]:
# Create the output directories `random/` and `random/2/` (no error if they already exist).
!mkdir -p random/2

In [14]:
%%time
g_learn = giant_component(get_graph(edges_mature))
uv_assessing = convert_to_set(edges_probe)

joblib.dump(g_learn, 'random/graph.pkl', protocol=5)
joblib.dump(uv_assessing, f'random/probes.pkl', protocol=5)

report(graph=g_learn, probes=uv_assessing)

Number of probes: 18098
- already edge: 861 (5%)
- both nodes in graph: 4358 (24%)
- not in graph: 7536 (42%)
CPU times: user 12.3 s, sys: 1.56 s, total: 13.8 s
Wall time: 13.8 s


In [15]:
# Copy the pickled learning graph into `random/2/`, next to the node pairs and targets written below.
!cp random/graph.pkl random/2/graph.pkl

## Export

In [17]:
%%time
nodepairs, _ = get_distances(g_learn, cutoff=2)
targets = [nodepair in uv_assessing for nodepair in tqdm(nodepairs)]
joblib.dump(nodepairs, 'random/2/nodepairs.pkl', protocol=5)
joblib.dump(targets, 'random/2/target.pkl', protocol=5)

print(f'{sum(targets) / len(nodepairs):e}')

get_distances: 100%|██████████| 6926/6926 [00:39<00:00, 175.31it/s]
100%|██████████| 2579380/2579380 [00:00<00:00, 3594286.01it/s]


8.424505e-04
CPU times: user 1min, sys: 1.11 s, total: 1min 1s
Wall time: 1min 1s


## Distance analysis

In [None]:
# Per distance: number of node pairs (`size`) and number of positive targets (`sum`).
df = (
  pd.DataFrame({'distances': distances, 'targets': targets})
  .groupby('distances')['targets']
  .agg(['size', 'sum'])
)

In [None]:
# Node pairs and positives per distance as filled curves on a log-scaled y-axis.
fig = go.Figure(
  data=[
    go.Scatter(x=df.index, y=df['size'], fill='tozeroy', name='# Nodepairs'),
    go.Scatter(x=df.index, y=df['sum'], fill='tozeroy', name='# Positives'),
  ]
)
fig.update_layout(
  xaxis={'tickmode': 'linear', 'tick0': 2, 'dtick': 1},
  yaxis_type="log",
)
fig.show()

In [None]:
# Cumulative node pairs and positives up to each distance, on a log-scaled y-axis.
fig = go.Figure(
  data=[
    go.Scatter(x=df.index, y=df['size'].cumsum(), fill='tozeroy', name='# Nodepairs'),
    go.Scatter(x=df.index, y=df['sum'].cumsum(), fill='tozeroy', name='# Positives'),
  ]
)
# Last expression of the cell, so the returned figure is displayed.
fig.update_layout(
  xaxis={'tickmode': 'linear', 'tick0': 2, 'dtick': 1},
  yaxis_type="log",
)

## Feature inspection

In [None]:
%%time
# Load the pickled feature table from `random/2/`.
# NOTE(review): this rebinds `df`, which held the per-distance counts in the "Distance analysis" section.
df = joblib.load("random/2/features.pkl")

In [None]:
# Correlation heat map of the columns of `df`, with the x-axis labels placed on top.
fig = px.imshow(df.corr(), x=df.columns, y=df.columns).update_xaxes(side="top")
fig

In [None]:
def pairplot(df, random_state=None):
  """Draw a seaborn pair plot on a class-balanced, min-max scaled sample of `df`.

  All rows with a true `target` are kept and an equally large random sample of
  the remaining rows is added. Every column (including `target`) is then passed
  through `minmax_scale` before plotting.

  Args:
    df: DataFrame with a boolean `target` column.
    random_state: Optional seed forwarded to `DataFrame.sample`, so the
      negative sample (and therefore the figure) can be reproduced. The
      default `None` keeps the sampling unseeded.

  Returns:
    The object returned by `sns.pairplot`.
  """
  is_positive = df['target']
  positives = df[is_positive]
  negatives = df[~is_positive]
  # `sample` raises when asked for more rows than exist, so cap the request.
  n_negatives = min(len(positives), len(negatives))
  # `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
  balanced = pd.concat([positives, negatives.sample(n_negatives, random_state=random_state)])
  return sns.pairplot(
    balanced.apply(minmax_scale),
    hue='target',
    hue_order=[True, False],
    palette={True: 'green', False: 'red'},
    kind='reg',
    # NOTE(review): recent seaborn releases deprecate `bw` in favour of `bw_method`/`bw_adjust` — confirm against the installed version.
    diag_kws=dict(bw=.02),
    plot_kws=dict(scatter_kws=dict(alpha=.1))
  )

In [None]:
# Pair plot of the loaded feature table.
pairplot(df)

## Hyperparameter selection

See `parameter_optimalization.ipynb`. We choose the following parameters:
- `max_depth = 1`
- `tree_method = 'hist'`
- no feature scaling

### XGBoost

$n=2$

In [None]:
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=(1, 2), n_splits=5) -> pd.DataFrame:
  """Cross-validated XGBoost grid search over `max_depth` and class weighting.

  One third of `df` is split off as a test set and is not used here. The grid
  search runs stratified, shuffled `n_splits`-fold cross-validation on the
  remaining two thirds, scored with average precision.

  Args:
    df: Feature table with a boolean `target` column; every other column is
      used as a feature.
    random_state: Seed for the train/test split, the CV shuffling, the
      classifier and the label permutation of the random baseline.
    also_random: If true, repeat the same search on shuffled labels and add
      the `*_random` columns and `test_over_random` to the result.
    max_depth: Candidate values for the `max_depth` hyperparameter.
    n_splits: Number of CV folds; one `test_fold<i>` column is reported per fold.

  Returns:
    DataFrame with one row per parameter combination, indexed by
    (`max_depth`, `balanced`) and sorted by `mean_test`, best first.
    `balanced` is True when the combination's `scale_pos_weight` exceeds 1.
  """
  X = df.drop(columns='target').values
  y = df['target'].values

  # Negative/positive ratio over all rows of `df`; the alternative weight 1 is the unbalanced setting.
  pos_weight = (~y).sum() / y.sum()
  param_grid = dict(max_depth=list(max_depth), scale_pos_weight=[pos_weight, 1])

  # The held-out third is deliberately ignored in this function.
  X_trainval, _, y_trainval, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)
  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  search = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring='average_precision',
    n_jobs=30,
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state),
    return_train_score=True
  )

  if also_random:
    # Identical, still unfitted search for the shuffled-label baseline.
    search_random = copy.deepcopy(search)
    # A private RandomState yields the same permutation as seeding the global
    # NumPy RNG, without altering global state as a side effect.
    y_random = copy.deepcopy(y_trainval)
    np.random.RandomState(random_state).shuffle(y_random)

  search.fit(X_trainval, y_trainval)
  cv_results = search.cv_results_
  scores = dict(
    mean_train=cv_results['mean_train_score'],
    std_train=cv_results['std_train_score'],
    mean_test=cv_results['mean_test_score'],
    std_test=cv_results['std_test_score'],
  )
  # One column per fold, following the actual number of splits.
  for fold in range(n_splits):
    scores[f'test_fold{fold}'] = cv_results[f'split{fold}_test_score']

  if also_random:
    search_random.fit(X_trainval, y_random)
    cv_results_random = search_random.cv_results_
    scores['mean_train_random'] = cv_results_random['mean_train_score']
    scores['std_train_random'] = cv_results_random['std_train_score']
    scores['mean_test_random'] = cv_results_random['mean_test_score']
    scores['std_test_random'] = cv_results_random['std_test_score']

  index = pd.MultiIndex.from_tuples(
    [(params['max_depth'], params['scale_pos_weight'] > 1) for params in cv_results['params']],
    names=['max_depth', 'balanced']
  )
  results = pd.DataFrame(scores, index=index)
  # Relative train/test gap and relative spread of the test score across folds.
  results['diff_train_test'] = (results['mean_test'] - results['mean_train']).abs() / results['mean_test']
  results['rstd_test'] = results['std_test'] / results['mean_test']
  if also_random:
    results['test_over_random'] = results['mean_test'] - results['mean_test_random']
  return results.sort_values('mean_test', ascending=False)
    
def report_performance(df: pd.DataFrame, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """Train one XGBoost classifier and score it on a held-out third of `df`.

  Args:
    df: Feature table with a boolean `target` column; every other column is
      used as a feature.
    random_state: Seed for the train/test split and the classifier.
    max_depth: `max_depth` passed to `XGBClassifier`.
    tree_method: `tree_method` passed to `XGBClassifier`.
    balanced: If true, `scale_pos_weight` is the negative/positive ratio over
      all rows of `df`; otherwise it is 1.
    n_jobs: Number of threads passed to `XGBClassifier`.

  Returns:
    Tuple `(average_precision, roc_auc)` computed on the test split.
  """
  X = df.drop(columns='target').values
  y = df['target'].values
  X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=1/3, random_state=random_state)
  scale_pos_weight = (~y).sum() / y.sum() if balanced else 1
  clf = XGBClassifier(
    max_depth=max_depth,
    n_jobs=n_jobs,  # honour the argument; it used to be hard-coded to 128
    tree_method=tree_method,
    scale_pos_weight=scale_pos_weight,
    random_state=random_state
  )
  clf.fit(X_trainval, y_trainval)
  # Probability of the positive class.
  y_score = clf.predict_proba(X_test)[:, 1]
  return average_precision_score(y_test, y_score), roc_auc_score(y_test, y_score)

In [None]:
# Manual grid search on the `temp/a1/2` features: the same split, classifier and grid
# that the `gridsearch` helper uses by default.
df = pd.read_pickle('temp/a1/2/features.pkl')
X = df.drop(columns='target').values
y = df['target'].values

# Candidate class weights: negative/positive ratio ("balanced") versus 1.
param_grid = dict(max_depth=[1, 2], scale_pos_weight=[sum(~y)/sum(y), 1])

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=1/3, random_state=1)
clf = XGBClassifier(random_state=1, tree_method='hist', n_jobs=6)
# Bound to its own name: calling this object `gridsearch` would shadow the helper
# function of that name, and the cells below that call `gridsearch(...)` would raise TypeError.
search_a1_2 = GridSearchCV(
  clf,
  param_grid=param_grid,
  scoring='average_precision',
  n_jobs=30,
  cv=StratifiedKFold(shuffle=True, random_state=1),
  return_train_score=True
)
search_a1_2.fit(X_trainval, y_trainval)

In [None]:
# Grid search on the `random/2` features (plain string literal: the f-prefix had no placeholders).
hps2 = gridsearch(pd.read_pickle('random/2/features.pkl'))

In [None]:
# Headline grid-search metrics for the `random/2` features.
hps2.loc[:, [
  'mean_test', 'mean_train', 'diff_train_test', 'rstd_test', 'test_over_random',
  'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4',
]]

In [None]:
# Grid search on the `temp/a1/3` features (plain string literal: the f-prefix had no placeholders).
hps3 = gridsearch(pd.read_pickle('temp/a1/3/features.pkl'))

In [None]:
# Headline grid-search metrics for the `temp/a1/3` features.
hps3.loc[:, [
  'mean_test', 'diff_train_test', 'rstd_test', 'test_over_random',
  'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4',
]]

In [None]:
# Grid search on the `temp/a1/4` features (plain string literal: the f-prefix had no placeholders).
hps4 = gridsearch(pd.read_pickle('temp/a1/4/features.pkl'))

In [None]:
# Headline grid-search metrics for the `temp/a1/4` features.
hps4.loc[:, [
  'mean_test', 'diff_train_test', 'rstd_test', 'test_over_random',
  'test_fold0', 'test_fold1', 'test_fold2', 'test_fold3', 'test_fold4',
]]

In [None]:
# Held-out (average precision, ROC AUC) for the `random/2` features, depth 1, without class re-weighting.
# Plain string literal: the f-prefix had no placeholders.
report_performance(pd.read_pickle('random/2/features.pkl'), max_depth=1, balanced=False)

In [None]:
# Held-out (average precision, ROC AUC) for the `temp/a1/3` features with class re-weighting.
# Plain string literal: the f-prefix had no placeholders.
report_performance(pd.read_pickle('temp/a1/3/features.pkl'), balanced=True)

In [None]:
# Held-out (average precision, ROC AUC) for the `temp/a1/4` features with class re-weighting.
# Plain string literal: the f-prefix had no placeholders.
report_performance(pd.read_pickle('temp/a1/4/features.pkl'), balanced=True)