# Setup

In [1]:
%cd ..

import importlib
import os

import joblib
import pandas as pd

import tlp

url = 'http://konect.cc/files/download.tsv.wikipedia-growth.tar.bz2'
dataset_id = '27'
adjusted_intervals = dict()

/scratch/bruingjde/temporalLinkprediction


# Download and extract data

In [2]:
filepath = f'data/{dataset_id}/'
if not os.path.isfile(filepath):
  edgelist = tlp.get_edgelist(url, filepath, verbose=True)
  joblib.dump(edgelist, os.path.join(filepath, 'edgelist.pkl'))

IsADirectoryError: [Errno 21] Is a directory: 'data/27/'

# Splitting

In [9]:
%%time
filepath_edgelist = f'data/{dataset_id}/edgelist.pkl'
filepath_mature = f'data/{dataset_id}/edgelist_mature.pkl'
filepath_probe = f'data/{dataset_id}/edgelist_probe.pkl'

if not os.path.isfile(filepath_mature) or not os.path.isfile(filepath_probe):
  edgelist_mature, edgelist_probe = tlp.split_in_intervals(
    edgelist=joblib.load(filepath_edgelist), **adjusted_intervals)
  joblib.dump(edgelist_mature, filepath_mature)
  joblib.dump(edgelist_probe, filepath_probe)

CPU times: user 3.04 s, sys: 3 s, total: 6.04 s
Wall time: 6.06 s


# Instances

In [10]:
%%time
filepath_mature = f'data/{dataset_id}/edgelist_mature.pkl'
filepath_instances = f'data/{dataset_id}/instances.pkl'

if not os.path.isfile(filepath_instances):
  instances = tlp.get_instances(edgelist_mature=joblib.load(filepath_mature),
    verbose=True)
  joblib.dump(instances, filepath_instances)

Collecting unconnected pairs of nodes:   0%|          | 0/1573757 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Targets

In [None]:
%%time
filepath_instances = f'data/{dataset_id}/instances.pkl'
filepath_probe = f'data/{dataset_id}/edgelist_probe.pkl'
filepath_targets = f'data/{dataset_id}/targets.pkl'

if not os.path.isfile(filepath_targets):
  targets = tlp.get_targets(
    instances=joblib.load(filepath_instances),
    edgelist_probe=joblib.load(filepath_probe),
    verbose=True)
  joblib.dump(targets, filepath_targets)

# Sampling

In [None]:
%%time
filepath_instances = f'data/{dataset_id}/instances.pkl'
filepath_targets = f'data/{dataset_id}/edgelist_targets.pkl'
filepath_instances_sampled = f'data/{dataset_id}/instances_sampled.pkl'
filepath_targets_sampled = f'data/{dataset_id}/targets_sampled.pkl'

if not (os.path.isfile(filepath_instances_sampled) and 
        os.path.isfile(filepath_targets_sampled)):
  instances_sampled, targets_sampled = tlp.balanced_sample(instances, targets)
  joblib.dump(instances_sampled, filepath_instances_sampled)
  joblib.dump(targets_sampled, filepath_targets_sampled)

# Analysis

## Figure: Edge distribution with maturing and probing intervals indicated

In [None]:
tlp.analysis.plot_datetime(
  joblib.load(f'{dataset_id}/edgelist.pkl')['datetime'], **adjusted_intervals)

## Metric: Class imbalance

In [None]:
tlp.analysis.class_imbalance(targets=joblib.load(f'{dataset_id}/targets.pkl'))

## Figure: Datetime strategies

In [None]:
edgelist = joblib.load(f'{dataset_id}/edgelist_mature.pkl')
t = tlp.analysis.plot_datetime_strategies(edgelist['datetime'])

## Figure: Datetime distribution

In [None]:
edgelist = joblib.load(f'{dataset_id}/edgelist_mature.pkl')
tlp.analysis.plot_datetime_distribution(edgelist['datetime'])

## Figure: Scores

In [None]:
feature_dict = dict()
for file in os.scandir(f'{dataset_id}/features'):
  if file.is_file():
    feature_dict.update(joblib.load(file.path))
targets=joblib.load('01/targets_sampled.pkl')

tlp.analysis.plot_score(feature_dict, targets)

## Figure: ROC

In [None]:
feature_dict = dict()
for file in os.scandir(f'{dataset_id}/features'):
  if file.is_file():
    feature_dict.update(joblib.load(file.path))
targets=joblib.load('01/targets_sampled.pkl')

tlp.analysis.plot_roc_auc(feature_dict, targets)

## AUC

In [None]:
feature_dict = dict()
for file in os.scandir(f'{dataset_id}/features'):
  for file in os.scandir(f'{dataset_id}/features'):
    if file.is_file():
      feature_dict.update(joblib.load(file.path))
targets=joblib.load('01/targets_sampled.pkl')

tlp.analysis.get_auc(feature_dict, targets)