# Data Preparation

In [1]:
from data_processor import DataProcess
# %load -r 323- Train/data_processor.py
# Dataset preparation for this notebook, driven by the project's DataProcess helper.
# The three stages run strictly in the order input -> calculate -> output.
DATASET = 'pubmed'
SPLIT_SEED = 0
INPUT_KEYS = ['adjtxt', 'attr_matrix', 'labels']
CALCULATED_KEYS = ['deg', 'idx_train', 'attr_matrix_norm']
OUTPUT_KEYS = ['deg', 'query', 'attr_matrix_norm']

processor = DataProcess(DATASET, seed=SPLIT_SEED)
processor.input(INPUT_KEYS)
processor.calculate(CALCULATED_KEYS)
# NOTE(review): presumably this is what writes the files the precompute cells
# read from ../data/pubmed (e.g. query.txt, feats_norm.npy) -- confirm in data_processor.py.
processor.output(OUTPUT_KEYS)
# Emits the one-line summary of the processed dataset shown in the cell output.
print(processor)

n=19717, m=88648, F=500, C=3 | feat: (19717, 500), label: (19717,) | 60/600/19057=0.00/0.03/0.97


In [2]:
%%script bash
# Run the precompute binary in `clean_graph` mode on the PubMed edge list.
# Per its log it reads ${DATADIR}/adj.txt and finishes by writing a binary
# graph into the output folder (${DATADIR}).
#
# Fail fast: abort on the first failing command, treat a misspelled/unset
# variable as an error instead of expanding it to an empty path, and quote
# every expansion so paths are never word-split.
set -euo pipefail
DATASTR=pubmed
DATADIR="../data/${DATASTR}"
../Precompute/build/featpush -algo clean_graph -is_undirected no \
        -graph "${DATADIR}/adj.txt" -output_folder "${DATADIR}"

--------------------------------------------------------------------------------
Configs:
	Algorithm Parameter: clean_graph
	Is Undirected: 0
	Input Graph File: ../data/pubmed/adj.txt
	Output Folder: ../data/pubmed
--------------------------------------------------------------------------------
88648 Lines Read.
88648-th Non-Self Loop Edges.
Finish Reading.
--------------------------------------------------------------------------------
Maximum ID: 19716
Minimum ID: 0
The number of dead end vertices: 0
The number of Isolated Points: 0
The maximum out degree is: 171
Writing Binary Finished.
--------------------------------------------------------------------------------


# Feat-Push

In [3]:
%%script bash
# Feat-Push precomputation on PubMed.
# Reads attribute.txt, graph.bin, query.txt and feats_norm.npy from DATADIR
# and stores the estimation under SAVEDIR (the run log reports
# score_0.1_2.npy for alpha=0.1, epsilon=2).
#
# Fail fast: without errexit a failed `mkdir -p` would still launch the
# binary; nounset catches misspelled variable names before they turn into
# empty path components. All expansions are quoted to avoid word-splitting.
set -euo pipefail
DATASTR=pubmed
ALGOSTR=featpush
SEED=0
DATADIR="../data/${DATASTR}"
SAVEDIR="../save/${DATASTR}/${ALGOSTR}/${SEED}"
mkdir -p "${SAVEDIR}"
../Precompute/build/featpush -algo "${ALGOSTR}" \
        -meta "${DATADIR}/attribute.txt" -graph_binary "${DATADIR}/graph.bin" \
        -query "${DATADIR}/query.txt" -feature_file "${DATADIR}/feats_norm.npy" \
        -estimation_folder "${SAVEDIR}" -split_num 1 -seed "${SEED}" \
        -alpha 0.1 -epsilon 2

--------------------------------------------------------------------------------
Configs:
	Algorithm Parameter: featpush
	Input Meta File: ../data/pubmed/attribute.txt
	Input Graph Binary File: ../data/pubmed/graph.bin
	Input Query File: ../data/pubmed/query.txt
	Feature File: ../data/pubmed/feats_norm.npy
	Estimation Folder: ../save/pubmed/featpush/0
	Number of Split: 1
	Random Seed: 0
	Alpha: 0.100000000000
	Epsilon: 2.000000000000
--------------------------------------------------------------------------------
The Number of Vertices: 19717
The Number of Edges: 88648
Returned Value of fread: 88648
The number of dead end vertices:0
edges_processed:88648
--------------------------------------------------------------------------------
Query size: 19717
Input shape: 19717 500
Feature size: 19717 500
num_of_walks:10000000
Result size: 9858500 
Feature saved: ../save/pubmed/featpush/0/score_0.1_2.npy
Mem: 569 MB
Total Time: 1.781927, Average: 0.003563853993
--------------------------------

# Feat-Reuse

In [4]:
%%script bash
# Feat-Reuse precomputation on PubMed.
# Same binary and inputs as the Feat-Push run, but with `-algo featreuse`;
# the estimation is stored under SAVEDIR (the run log reports
# score_0.1_2.npy for alpha=0.1, epsilon=2).
#
# Fail fast: without errexit a failed `mkdir -p` would still launch the
# binary; nounset catches misspelled variable names before they turn into
# empty path components. All expansions are quoted to avoid word-splitting.
set -euo pipefail
DATASTR=pubmed
ALGOSTR=featreuse
SEED=0
DATADIR="../data/${DATASTR}"
SAVEDIR="../save/${DATASTR}/${ALGOSTR}/${SEED}"
mkdir -p "${SAVEDIR}"
../Precompute/build/featpush -algo "${ALGOSTR}" \
        -meta "${DATADIR}/attribute.txt" -graph_binary "${DATADIR}/graph.bin" \
        -query "${DATADIR}/query.txt" -feature_file "${DATADIR}/feats_norm.npy" \
        -estimation_folder "${SAVEDIR}" -split_num 1 -seed "${SEED}" \
        -alpha 0.1 -epsilon 2

--------------------------------------------------------------------------------
Configs:
	Algorithm Parameter: featreuse
	Input Meta File: ../data/pubmed/attribute.txt
	Input Graph Binary File: ../data/pubmed/graph.bin
	Input Query File: ../data/pubmed/query.txt
	Feature File: ../data/pubmed/feats_norm.npy
	Estimation Folder: ../save/pubmed/featreuse/0
	Number of Split: 1
	Random Seed: 0
	Alpha: 0.100000000000
	Epsilon: 2.000000000000
--------------------------------------------------------------------------------
The Number of Vertices: 19717
The Number of Edges: 88648
Returned Value of fread: 88648
The number of dead end vertices:0
edges_processed:88648
--------------------------------------------------------------------------------
Query size: 19717
Input shape: 19717 500
Feature size: 19717 500
num_of_walks:10000000
Result size: 9858500 
base_size:20
Time Used on Base 0.138076
Feature saved: ../save/pubmed/featreuse/0/score_0.1_2.npy
avg_tht:0.0141102
avg_res:0.995987
re_feat_num:

# Train

In [5]:
# Launch the training script in a subprocess via the IPython shell escape.
# `-u` keeps Python's stdout/stderr unbuffered so the epoch log streams into the cell as it is produced.
# `-c` points at the JSON config (echoed as 'config' in the logged option dict).
# NOTE(review): -f 0 and -v 0 look like they map to 'seed' and 'dev' in that dict, but both are 0 -- confirm against run.py's argument parser.
!python -u run.py -f 0 -c config/pubmed.json -v 0

--------------------
Option saved.
Config path: ../save/pubmed/featreuse/0/config.json
Option dict: {'seed': 0, 'config': 'config/pubmed.json', 'dev': 0, 'data': 'pubmed', 'path': '../data/', 'algo': 'featreuse', 'epochs': 1000, 'patience': 100, 'batch': 64, 'lr': 0.005, 'weight_decay': 0.0001, 'layer': 2, 'hidden': 128, 'dropout': 0.5, 'bias': 'none', 'alpha': 0.1, 'eps': 2, 'rrz': 0.5, 'spt': 1}

--------------------
n=19717, m=88648, F_t=torch.Size([60, 500])
n_train=torch.Size([60]), n_val=torch.Size([600]), n_test=torch.Size([19057])
--------------------
Epoch:0009 | train loss:0.3550, val acc:0.7867, cost:0.0168
Epoch:0019 | train loss:0.1621, val acc:0.7783, cost:0.0286
Epoch:0029 | train loss:0.0634, val acc:0.7783, cost:0.0403
Epoch:0039 | train loss:0.1016, val acc:0.7850, cost:0.0520
Epoch:0049 | train loss:0.0944, val acc:0.7733, cost:0.0638
Epoch:0059 | train loss:0.0149, val acc:0.7850, cost:0.0755
Epoch:0069 | train loss:0.0194, val acc:0.7933, cost:0.0873
Epoch:0079 | t