In [18]:
import collections
import copy
import datetime
import functools
import itertools
import math
from typing import List, Tuple, Dict

import joblib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV, train_test_split
import seaborn as sns
from tqdm import tqdm
from xgboost import XGBClassifier

# A pair of integer node identifiers.
NodePair = Tuple[int, int]
# A list of (int, int, attribute dict) triples. The attribute dict is keyed by
# strings (the original annotation named the 'date' key) and holds datetimes.
# A string literal inside Dict[...] would be read by `typing` as a forward
# reference to a *type* called `date`, so the key type is spelled `str` here.
Edge = List[Tuple[int, int, Dict[str, datetime.datetime]]]

# Collected scores, filled in by the cells below as results[dataset][split].
results = dict()
# The two metrics reported for every fitted model.
Performance = collections.namedtuple('Performance', ['average_precision', 'roc_auc'])

In [19]:
def gridsearch(df: pd.DataFrame, random_state=1, also_random=True, max_depth=(1, 2)) -> pd.DataFrame:
  """Cross-validated grid search of an XGBoost classifier on a feature table.

  The grid spans the given tree depths and two class weightings: the
  negative/positive ratio of the training labels ("balanced") and 1.
  Candidates are scored by average precision using a shuffled stratified
  k-fold on two thirds of the rows; the remaining third is split off and is
  not used by this function.

  Args:
    df: Feature table with a 'target' column; every other column is used as a
      feature. The target is expected to be binary (boolean or 0/1).
    random_state: Seed used for the train/test split, the folds, the
      classifier and the label shuffle of the random baseline.
    also_random: When true, the same search is repeated on shuffled training
      labels and its scores are reported in the `*_random` columns.
    max_depth: Candidate values for the classifier's `max_depth`.

  Returns:
    A DataFrame with one row per candidate, indexed by (max_depth, balanced)
    and sorted by descending `mean_test`. Columns: mean/std of the train and
    validation scores, one `test_fold<i>` column per fold, the `*_random`
    columns (only with `also_random`), `diff_train_test` (relative gap
    between train and validation mean), `rstd_test` (relative std of the
    validation score) and `test_over_random` (only with `also_random`).

  Raises:
    ValueError: If the training labels contain no positive example, because
      the balanced class weight is then undefined.
  """
  X = df.drop(columns='target').values
  y = df['target'].values

  # The held-out third is deliberately discarded: this function only tunes.
  X_trainval, _, y_trainval, _ = train_test_split(X, y, test_size=1/3, random_state=random_state)

  # Count classes explicitly instead of `sum(~y)/sum(y)`: on 0/1 integer labels
  # `~y` is a bitwise NOT (giving -1/-2), which yields a negative weight.
  # The ratio is taken on the training labels only, as report_performance does.
  n_positive = int(np.count_nonzero(y_trainval))
  if n_positive == 0:
    raise ValueError('gridsearch requires at least one positive target in the training split')
  n_negative = len(y_trainval) - n_positive
  param_grid = dict(max_depth=list(max_depth), scale_pos_weight=[n_negative / n_positive, 1])

  cv = StratifiedKFold(shuffle=True, random_state=random_state)
  clf = XGBClassifier(random_state=random_state, tree_method='hist', n_jobs=6)
  search = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring='average_precision',
    n_jobs=30,
    cv=cv,
    return_train_score=True
  )

  if also_random:
    search_random = copy.deepcopy(search)
    # A private RandomState avoids reseeding NumPy's global generator.
    y_random = y_trainval.copy()
    np.random.RandomState(random_state).shuffle(y_random)

  search.fit(X_trainval, y_trainval)
  cv_results = search.cv_results_
  df_dict = dict(
    mean_train=cv_results['mean_train_score'],
    std_train=cv_results['std_train_score'],
    mean_test=cv_results['mean_test_score'],
    std_test=cv_results['std_test_score'],
  )
  # One column per fold, however many folds the splitter is configured for.
  for fold in range(cv.get_n_splits()):
    df_dict[f'test_fold{fold}'] = cv_results[f'split{fold}_test_score']

  if also_random:
    search_random.fit(X_trainval, y_random)
    random_results = search_random.cv_results_
    df_dict['mean_train_random'] = random_results['mean_train_score']
    df_dict['std_train_random'] = random_results['std_train_score']
    df_dict['mean_test_random'] = random_results['mean_test_score']
    df_dict['std_test_random'] = random_results['std_test_score']

  # A candidate is labelled "balanced" when its class weight exceeds 1, i.e.
  # this labelling assumes negatives outnumber positives.
  index = pd.MultiIndex.from_tuples(
    [(params['max_depth'], params['scale_pos_weight'] > 1) for params in cv_results['params']],
    names=['max_depth', 'balanced']
  )
  scores = pd.DataFrame(df_dict, index=index)
  scores['diff_train_test'] = (scores['mean_test'] - scores['mean_train']).abs() / scores['mean_test']
  scores['rstd_test'] = scores['std_test'] / scores['mean_test']
  if also_random:
    scores['test_over_random'] = scores['mean_test'] - scores['mean_test_random']
  return scores.sort_values('mean_test', ascending=False)
def get_x_y(df: pd.DataFrame):
  """Split a feature table into a (features, labels) pair of arrays.

  The features are the values of every column except 'target'; the labels are
  the values of the 'target' column. The input frame is left untouched.
  """
  features = df.drop(columns='target').values
  labels = df['target'].values
  return features, labels
def report_performance(df_train: pd.DataFrame, df_test=None, random_state=1, max_depth=1, tree_method='hist', balanced=True, n_jobs=128):
  """Fit an XGBoost classifier and score it on held-out data.

  Args:
    df_train: Feature table with a 'target' column, used for training.
    df_test: Optional feature table of the same layout used for evaluation.
      When omitted, one third of `df_train` is split off for evaluation.
    random_state: Seed for the split (if any) and for the classifier.
    max_depth: Tree depth passed to the classifier.
    tree_method: Tree construction method passed to the classifier.
    balanced: When true, positives are weighted by the negative/positive
      ratio of the training labels; otherwise the weight is 1.
    n_jobs: Number of threads passed to the classifier.

  Returns:
    A `Performance` tuple with the average precision and ROC AUC of the
    predicted positive-class probabilities on the evaluation data.

  Raises:
    ValueError: If `balanced` is true and the training labels contain no
      positive example.
  """
  X, y = get_x_y(df_train)
  if df_test is None:
    X, X_test, y, y_test = train_test_split(X, y, test_size=1/3, random_state=random_state)
  else:
    X_test, y_test = get_x_y(df_test)

  if balanced:
    # Count classes explicitly instead of `sum(~y)/sum(y)`: on 0/1 integer
    # labels `~y` is a bitwise NOT, which yields a negative weight.
    n_positive = int(np.count_nonzero(y))
    if n_positive == 0:
      raise ValueError('report_performance requires at least one positive target when balanced=True')
    scale_pos_weight = (len(y) - n_positive) / n_positive
  else:
    scale_pos_weight = 1

  # n_jobs is forwarded; it used to be hard-coded to 128 regardless of the argument.
  clf = XGBClassifier(
    max_depth=max_depth,
    n_jobs=n_jobs,
    tree_method=tree_method,
    scale_pos_weight=scale_pos_weight,
    random_state=random_state
  )
  clf.fit(X, y)
  y_pred = clf.predict_proba(X_test)[:, 1]
  return Performance(average_precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred))

## Condmat

### Random

In [20]:
# Condmat, random split: depth-1 trees, positives not re-weighted.
results['condmat'] = {}
results['condmat']['random'] = report_performance(
  pd.read_pickle('datasets/condmat/random/2/features.pkl'),
  max_depth=1,
  balanced=False,
)
print(results['condmat']['random'])

Performance(average_precision=0.01161405871585572, roc_auc=0.7167960845889996)


### Temporal

In [21]:
# Condmat, temporal split: train and test come from separate feature files.
results['condmat']['temporal'] = report_performance(
  df_train=pd.read_pickle('datasets/condmat/temporal/train/2/features.pkl'),
  df_test=pd.read_pickle('datasets/condmat/temporal/test/2/features.pkl'),
  max_depth=1, balanced=False,
)
print(results['condmat']['temporal'])

Performance(average_precision=0.004799299901218471, roc_auc=0.7319341272805222)


## Enron

In [22]:
# Dataset name used in the feature paths and result keys of the cells below.
dataset = "enron"

### Random

In [23]:
# Random split for the current dataset: depth-1 trees, positives not re-weighted.
results[dataset] = {}
results[dataset]['random'] = report_performance(
  pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'),
  max_depth=1,
  balanced=False,
)
print(results[dataset]['random'])

Performance(average_precision=0.015961239641945298, roc_auc=0.8706734068173684)


### Temporal

In [24]:
# Temporal split for the current dataset: train and test come from separate feature files.
results[dataset]['temporal'] = report_performance(
  df_train=pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'),
  df_test=pd.read_pickle(f'datasets/{dataset}/temporal/test/2/features.pkl'),
  max_depth=1, balanced=False,
)
print(results[dataset]['temporal'])

Performance(average_precision=0.011743071145453058, roc_auc=0.8742894406283527)


## Askubuntu

In [25]:
# Dataset name used in the feature paths and result keys of the cells below.
dataset = "askubuntu"

### Random

In [26]:
# Random split for the current dataset: depth-2 trees with class-balanced weighting.
results[dataset] = {}
results[dataset]['random'] = report_performance(
  pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'),
  max_depth=2,
  balanced=True,
)
print(results[dataset]['random'])

Performance(average_precision=0.023293264618648724, roc_auc=0.8946992586126661)


### Temporal

In [27]:
# Temporal split for the current dataset: train and test come from separate feature files.
results[dataset]['temporal'] = report_performance(
  df_train=pd.read_pickle(f'datasets/{dataset}/temporal/train/2/features.pkl'),
  df_test=pd.read_pickle(f'datasets/{dataset}/temporal/test/2/features.pkl'),
  max_depth=2, balanced=True,
)
print(results[dataset]['temporal'])

Performance(average_precision=0.004643793569510117, roc_auc=0.8513835399254048)


## Bibsonomy

In [28]:
# Dataset name used in the feature paths and result keys of the cells below.
dataset = "bibsonomy"

### Random

In [None]:
# Random split for the current dataset: depth-2 trees with class-balanced weighting.
results[dataset] = {}
results[dataset]['random'] = report_performance(
  pd.read_pickle(f'datasets/{dataset}/random/2/features.pkl'),
  max_depth=2,
  balanced=True,
)
print(results[dataset]['random'])

# Results

In [36]:
# Average precision: one row per dataset, one column per split type.
pd.DataFrame({
  name: {split: performance.average_precision for split, performance in per_split.items()}
  for name, per_split in results.items()
}).T

Unnamed: 0,random,temporal
condmat,0.011614,0.004799
enron,0.015961,0.011743
askubuntu,0.023293,0.004644


In [37]:
# ROC AUC: one row per dataset, one column per split type.
pd.DataFrame({
  name: {split: performance.roc_auc for split, performance in per_split.items()}
  for name, per_split in results.items()
}).T

Unnamed: 0,random,temporal
condmat,0.716796,0.731934
enron,0.870673,0.874289
askubuntu,0.894699,0.851384
