In [1]:
import itertools
import os
import time

import pandas as pd
    
# Base directory that later cells join run paths onto when checking progress.
path = os.path.join(os.pardir, "output2", "runs")

In [3]:
from notebook_utils import *

In [4]:
# Experiment grid: the datasets, models and acquisition strategies that the
# cells below combine into run configurations.
expected_datasets = [
    "cora_ml",
    "citeseer",
    "amazon_photos",
    "pubmed",
]
expected_models = [
    "gcn",
    "sgc",
    "seal",
]

# Strategies that are crossed with both the "gcn" and the "sgc" model.
expected_strategies_common = [
    "aleatoric_propagated",
    "entropy",
    "age",
    "anrmab",
]

# Additional strategies listed for the "sgc" model.
expected_strategies_sgc = [
    "geem",
    "approximate_uncertainty_esp",
    "approximate_uncertainty_mp",
]

In [5]:
# Print the "random" baseline command for every dataset with the gcn and sgc
# models (datasets vary slowest, matching the itertools.product ordering).
for dataset_name in expected_datasets:
    for model_name in ("gcn", "sgc"):
        print(generate_prompt(dataset_name, model_name, "random"))

nohup python main.py model=gcn data=cora_ml acquisition_strategy=random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True acquisition_strategy.adaptation_enabled=False acquisition_strategy.tta_enabled=False > logs_new/cora_ml_gcn_random.log &
nohup python main.py model=sgc data=cora_ml acquisition_strategy=random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True acquisition_strategy.adaptation_enabled=False acquisition_strategy.tta_enabled=False > logs_new/cora_ml_sgc_random.log &
nohup python main.py model=gcn data=citeseer acquisition_strategy=random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True acquisition_strategy.adaptation_enabled=False acquisition_strategy.tta_enabled=False > logs_new/citeseer_gcn_random.log &
nohup python main.py model=sgc data=citeseer acquisition_strategy=random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True acquisition_strategy.adaptation_enabled=False acquisit

In [7]:

# Build the (dataset, model, strategy) grid of runs that the following cells
# turn into commands and progress tables.
cartesian_product_gcn = list(itertools.product(expected_datasets, ["gcn"], expected_strategies_common))

# The SGC-specific strategies get their own name. Previously this product was
# assigned to `cartesian_product_sgc` and overwritten on the very next line, so
# it was silently discarded.
# NOTE(review): it is still NOT part of `cartesian_product` (same grid as
# before); append `cartesian_product_sgc_specific` below if those runs are wanted.
cartesian_product_sgc_specific = list(itertools.product(expected_datasets, ["sgc"], expected_strategies_sgc))
cartesian_product_sgc = list(itertools.product(expected_datasets, ["sgc"], expected_strategies_common))
cartesian_product_seal = list(itertools.product(expected_datasets, ["seal"], ["seal"]))

# Combinations that are left out of the grid.
excluded_runs = {
    ("cora_ml", "gcn", "aleatoric_propagated"),
    ("cora_ml", "sgc", "aleatoric_propagated"),
}

# Tuples of strings already sort by (dataset, model, strategy), so no key is needed.
cartesian_product = sorted(
    run
    for run in cartesian_product_gcn + cartesian_product_sgc + cartesian_product_seal
    if run not in excluded_runs
)



In [8]:
# Display the sorted (dataset, model, strategy) grid built in the previous cell.
cartesian_product

[('amazon_photos', 'gcn', 'age'),
 ('amazon_photos', 'gcn', 'aleatoric_propagated'),
 ('amazon_photos', 'gcn', 'anrmab'),
 ('amazon_photos', 'gcn', 'entropy'),
 ('amazon_photos', 'seal', 'seal'),
 ('amazon_photos', 'sgc', 'age'),
 ('amazon_photos', 'sgc', 'aleatoric_propagated'),
 ('amazon_photos', 'sgc', 'anrmab'),
 ('amazon_photos', 'sgc', 'entropy'),
 ('citeseer', 'gcn', 'age'),
 ('citeseer', 'gcn', 'aleatoric_propagated'),
 ('citeseer', 'gcn', 'anrmab'),
 ('citeseer', 'gcn', 'entropy'),
 ('citeseer', 'seal', 'seal'),
 ('citeseer', 'sgc', 'age'),
 ('citeseer', 'sgc', 'aleatoric_propagated'),
 ('citeseer', 'sgc', 'anrmab'),
 ('citeseer', 'sgc', 'entropy'),
 ('cora_ml', 'gcn', 'age'),
 ('cora_ml', 'gcn', 'anrmab'),
 ('cora_ml', 'gcn', 'entropy'),
 ('cora_ml', 'seal', 'seal'),
 ('cora_ml', 'sgc', 'age'),
 ('cora_ml', 'sgc', 'anrmab'),
 ('cora_ml', 'sgc', 'entropy'),
 ('pubmed', 'gcn', 'age'),
 ('pubmed', 'gcn', 'aleatoric_propagated'),
 ('pubmed', 'gcn', 'anrmab'),
 ('pubmed', 'gcn', '

In [9]:
# One TTA command per (dataset, model, strategy) entry of the grid, all with
# the same fixed TTA settings.
# NOTE(review): the positional arguments after the strategy are presumably
# node strategy, edge strategy, num, filter, probs, scale, seed — confirm
# against generate_prompt_tta in notebook_utils.
tta_base_prompts = [
    generate_prompt_tta(
        ds_name, model_name, strategy_name,
        "noise", "mask", 200, True, True, 1, 42,
        p_node=0.4, p_edge=0.4,
    )
    for ds_name, model_name, strategy_name in cartesian_product
]

In [10]:
# Persist the base TTA commands, one per line.
with open("../scripts/tta_base_prompts.txt", "w") as out_file:
    out_file.writelines(str(base_prompt) + "\n" for base_prompt in tta_base_prompts)



In [None]:
# Progress table: one row per (dataset, model, strategy) run, indexed by that
# triple, with all bookkeeping columns initialised to "nothing done yet".
df = (
    pd.DataFrame(cartesian_product, columns=["dataset", "model", "strategy"])
    .set_index(["dataset", "model", "strategy"])
    .assign(
        done=False,
        progress=False,
        progress_percentage=0.0,
        progress_count=0,
    )
    .sort_index()
)

In [None]:
# Refresh the progress table from the run directory; the index is sorted
# before it is handed to update_progress.
df = update_progress(df.sort_index(), path)

In [None]:
# Runs that have started but are not finished yet.
started_mask = df["progress"].eq(True)
unfinished_mask = df["done"].eq(False)
df.loc[started_mask & unfinished_mask]

In [None]:
# Rows whose "strategy" index level is "geem".
strategy_level = df.index.get_level_values("strategy")
df.loc[strategy_level == "geem"]

In [None]:
# Unfinished runs on cora_ml, citeseer and amazon_photos.
dataset_level = df.index.get_level_values("dataset")
in_selected_datasets = dataset_level.isin(["cora_ml", "citeseer", "amazon_photos"])
df.loc[df["done"].eq(False) & in_selected_datasets]

In [5]:
from graph_al.acquisition.enum import *
# Hyper-parameter sweep for the adaptation runs of one fixed
# (dataset, model, strategy, seed) combination.
dataset, model, strategy, seed = "cora_ml", "gcn", "aleatoric_propagated", 42

# Candidate values for the sweep.
lr_feat = [0.01, 0.001, 0.0005, 0.0001]
lr_adj = [0.1, 0.05, 0.01]
epochs = [5, 10, 20, 30, 50]
loop_adj, loop_feat = 1, 4
mode = [AdaptationMode.FEATURE, AdaptationMode.STRUCTURE, AdaptationMode.BOTH]
# Only TRAIN_RECURSIVE is swept here; AdaptationIntegration also offers
# QUERY, FINE_TUNE and TRAIN.
integration = AdaptationIntegration.TRAIN_RECURSIVE
scale = 1

cartesian_product_feat = list(itertools.product(lr_feat, epochs))
cartesian_product_adj = list(itertools.product(lr_adj, epochs))
cartesian_product_both = list(itertools.product(lr_adj, lr_feat, epochs))

# Feature-only configurations: every (feature learning rate, epochs) pair.
feature_only_configs = [
    AdaptationConfig(
        lr_feat=feat_rate,
        epochs=n_epochs,
        mode=AdaptationMode.FEATURE,
        integration=integration,
        seed=seed,
    )
    for feat_rate, n_epochs in cartesian_product_feat
]

# The structure-only sweep (cartesian_product_adj with AdaptationMode.STRUCTURE)
# is currently disabled and therefore contributes no configurations.

# Joint configurations: every (adjacency lr, feature lr, epochs) triple.
joint_configs = [
    AdaptationConfig(
        lr_feat=feat_rate,
        lr_adj=adj_rate,
        epochs=n_epochs,
        mode=AdaptationMode.BOTH,
        integration=integration,
        seed=seed,
    )
    for adj_rate, feat_rate, n_epochs in cartesian_product_both
]

# Feature-only configurations come first, followed by the joint ones.
adaptation_configs = feature_only_configs + joint_configs

# generate_prompt_adaptation returns an indexable result; element 0 is used as
# the command line here.
adaptation_prompts = [
    generate_prompt_adaptation(dataset, model, strategy, config, scale, seed)[0]
    for config in adaptation_configs
]


In [7]:
# Persist the adaptation commands, one per line.
with open("../scripts/adaptation_prompts_recursive.txt", "w") as out_file:
    out_file.writelines(str(command) + "\n" for command in adaptation_prompts)

In [None]:
# Status table for the adaptation sweep: one row per AdaptationConfig with its
# hyper-parameters plus what is currently found on disk for that run.
adaptation_df = pd.DataFrame([
    {
        # A learning rate is only reported when the mode actually uses it.
        "lr_feat": config.lr_feat if config.mode in (AdaptationMode.FEATURE, AdaptationMode.BOTH) else None,
        "lr_adj": config.lr_adj if config.mode in (AdaptationMode.STRUCTURE, AdaptationMode.BOTH) else None,
        "epochs": config.epochs,
        "integration": config.integration,
        "mode": config.mode,
    }
    for config in adaptation_configs
])

# Element 1 of generate_prompt_adaptation's result is used as the run's path
# relative to `path`.
adaptation_paths = [
    generate_prompt_adaptation(dataset, model, strategy, adaptation_config, scale, seed)[1]
    for adaptation_config in adaptation_configs
]
# Resolve every run directory once instead of re-joining it in each column.
adaptation_run_dirs = [os.path.join(path, adaptation_path) for adaptation_path in adaptation_paths]


def _run_subdirs(run_dir):
    """Return the sub-directories of ``run_dir`` (empty if it is missing).

    Plain files inside ``run_dir`` are skipped so that callers can safely
    ``os.listdir`` every returned entry; previously a stray file raised
    ``NotADirectoryError`` while counting metric files.
    """
    if not os.path.isdir(run_dir):
        return []
    candidates = (os.path.join(run_dir, entry) for entry in os.listdir(run_dir))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]


# True when any sub-directory of the run contains the final metrics file.
adaptation_df["metrics_exists"] = [
    any(
        os.path.exists(os.path.join(subdir, "acquisition_curve_metrics.pt"))
        for subdir in _run_subdirs(run_dir)
    )
    for run_dir in adaptation_run_dirs
]

# Number of files whose name starts with "acquisition_metrics" across all
# sub-directories of the run (0 when the run directory does not exist).
adaptation_df["progress_count"] = [
    sum(
        file_name.startswith("acquisition_metrics")
        for subdir in _run_subdirs(run_dir)
        for file_name in os.listdir(subdir)
    )
    for run_dir in adaptation_run_dirs
]

# Minutes since the run directory's ctime. Both operands are epoch seconds, so
# the result does not depend on the local timezone; the previous version mixed
# a naive local `Timestamp.now()` with a UTC-based timestamp and compensated
# with a hard-coded 60-minute offset that is only right at UTC+1.
# Note: on Unix, ctime is the last metadata change, not the creation time.
now_epoch = time.time()
adaptation_df["elapsed_time"] = [
    (now_epoch - os.path.getctime(run_dir)) / 60 if os.path.exists(run_dir) else None
    for run_dir in adaptation_run_dirs
]



In [None]:
# Adaptation runs for which no final metrics file was found.
adaptation_df.loc[~adaptation_df["metrics_exists"]]

In [8]:
# Hyper-parameter sweep for TTA on one fixed (dataset, model, strategy).
# NOTE(review): seed is the string "42" here while other cells pass the int 42
# — confirm generate_prompt_tta treats both the same.
dataset, model, strategy, seed = "pubmed", "gcn", "age", "42"

node_strats = ["mask", "noise", "none"]
edge_strats = ["mask", "none"]
filters = [True, False]
num, probs, scale = 100, True, 1

# Probability grids: node-only sweep, edge-only sweep, and the joint sweep.
p = [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
p_edges = [0.05, 0.1, 0.2, 0.6, 0.7, 0.9]
p_both = [0.1, 0.2, 0.3, 0.4, 0.5]

# (node strategy, probability) pairs that are left out of the node-only sweep.
to_remove = [
    ("noise", 0.15),
    ("noise", 0.1),
    ("noise", 0.6),
    ("noise", 0.9),
    ("mask", 0.15),
]

# Node-only sweep: every ("mask" | "noise", p) pair except the excluded ones,
# in itertools.product order.
cartesian_product_tta_node = [
    pair
    for pair in itertools.product(["mask", "noise"], p)
    if pair not in to_remove
]

# Joint sweep: "mask" node strategy with every (p_node, p_edge) pair of p_both.
cartesian_product_tta_both = list(itertools.product(["mask"], p_both, p_both))

# Node augmentation only (edge strategy "none").
tta_node_prompts = [
    generate_prompt_tta(
        dataset, model, strategy, node_strategy, "none",
        num, True, probs, scale, seed,
        p_node=node_prob,
    )
    for node_strategy, node_prob in cartesian_product_tta_node
]

# Edge masking only (node strategy "none").
tta_edge_prompts = [
    generate_prompt_tta(
        dataset, model, strategy, "none", "mask",
        num, True, probs, scale, seed,
        p_edge=edge_prob,
    )
    for edge_prob in p_edges
]

# Node augmentation combined with edge masking.
tta_both_prompts = [
    generate_prompt_tta(
        dataset, model, strategy, node_strategy, "mask",
        num, True, probs, scale, seed,
        p_node=node_prob, p_edge=edge_prob,
    )
    for node_strategy, node_prob, edge_prob in cartesian_product_tta_both
]

# The joint prompts are kept separate and are not part of this list.
tta_hyperparam = tta_node_prompts + tta_edge_prompts


In [6]:
# Size of the joint sweep: one node strategy x 5 p_node values x 5 p_edge values.
len(cartesian_product_tta_both)

25

In [10]:
# Persist the joint node+edge TTA commands, one per line.
with open("../scripts/gcn_age_pubmed_tta_both_filter_prompts.txt", "w") as out_file:
    out_file.writelines(both_prompt + "\n" for both_prompt in tta_both_prompts)

In [9]:
# Sweep over the TTA `num` argument with all other settings fixed. The loop
# variable has its own name so it does not read like the global `num`.
tta_num_prompts = [
    generate_prompt_tta(
        "pubmed", "gcn", "aleatoric_propagated",
        "mask", "mask", tta_num, True, probs, scale, seed,
        p_node=0.1, p_edge=0.2,
    )
    for tta_num in (200, 500, 1000, 2000)
]

In [10]:
# Show the generated commands, one per line.
print(*tta_num_prompts, sep="\n")

nohup python main.py model=gcn data=pubmed acquisition_strategy=aleatoric_propagated data.num_splits=5 model.num_inits=5 print_summary=True model.cached=False seed=42 acquisition_strategy.adaptation_enabled=False acquisition_strategy.tta_enabled=True acquisition_strategy.scale=1 acquisition_strategy.tta.strat_node=mask acquisition_strategy.tta.strat_edge=mask acquisition_strategy.tta.num=200 acquisition_strategy.tta.p_node=0.1 acquisition_strategy.tta.p_edge=0.2 acquisition_strategy.tta.filter=True acquisition_strategy.tta.probs=True wandb.name=fmask_emask_200_filter_probs_0.1_0.2 > logs_new/pubmed_gcn_aleatoric_propagated_tta_fmask_emask_200_filter_probs_0.1_0.2.log &
nohup python main.py model=gcn data=pubmed acquisition_strategy=aleatoric_propagated data.num_splits=5 model.num_inits=5 print_summary=True model.cached=False seed=42 acquisition_strategy.adaptation_enabled=False acquisition_strategy.tta_enabled=True acquisition_strategy.scale=1 acquisition_strategy.tta.strat_node=mask a

In [6]:
# Generate one command per seed listed in the dataset's seed file.
ds = "citeseer"
strategy = "approximate_uncertainty_esp"

# The seed file holds one integer per line.
with open(f"../scripts/geem_seeds_{ds}.txt", "r") as seed_file:
    seeds = [int(line.strip()) for line in seed_file]

# Disabled alternative: the same per-seed loop, but emitting TTA commands.
# with open(f"../scripts/{strategy}_tta_command_{ds}.txt", "w") as file:
#     for seed in seeds:
#         prompt = generate_prompt_tta(ds, "sgc", strategy, "mask", "mask", 200, True, True, 1, seed, p_node=0.5, p_edge=0.4, n_splits=1, n_inits=1)
#         file.write(prompt + "\n")

with open(f"../scripts/{strategy}_command_{ds}.txt", "w") as command_file:
    for seed in seeds:
        command_file.write(generate_prompt_geem(ds, "sgc", strategy, seed) + "\n")

In [4]:
# Percentage grid for the educated-random runs.
top_percents = [10.0 * step for step in range(10)]  # 0.0, 10.0, ..., 90.0
low_percents = [0, 0.5, 1, 5, 10, 20]

# Keep only the pairs whose two percentages sum to less than 100
# (top_percents varies slowest).
cartesian_product_educated_random_percentages = [
    (top_percent, low_percent)
    for top_percent in top_percents
    for low_percent in low_percents
    if top_percent + low_percent < 100
]

In [5]:
# Number of (top_percent, low_percent) pairs that passed the "< 100" filter.
len(cartesian_product_educated_random_percentages)

57

In [6]:
# Generate, echo and persist one educated-random command per percentage pair.
with open("../scripts/educated_random_pubmed_alea.txt", "w") as file:
    for top_percent, low_percent in cartesian_product_educated_random_percentages:
        prompt = generate_prompt_educated_random_by_pred_attribute_notta(
            "pubmed", "gcn", "aleatoric_propagated", top_percent, low_percent, 42
        )
        print(prompt)
        file.write(prompt + "\n")

nohup python main.py model=gcn data=pubmed acquisition_strategy=educated_random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True seed=42 acquisition_strategy.adaptation_enabled=False acquisition_strategy.scale=1 acquisition_strategy.tta_enabled=False acquisition_strategy.top_percent=0.0 acquisition_strategy.low_percent=0 +acquisition_strategy.embedded_strategy=acquire_by_prediction_attribute +acquisition_strategy.embedded_strategy.higher_is_better=false +acquisition_strategy.embedded_strategy.attribute=MAX_SCORE +acquisition_strategy.embedded_strategy.propagated=True wandb.name=aleatoric_propagated_0.0_0 > logs_new/pubmed_gcn_educated_random_aleatoric_propagated_0.0_0.log &
nohup python main.py model=gcn data=pubmed acquisition_strategy=educated_random data.num_splits=5 model.num_inits=5 print_summary=True model.cached=True seed=42 acquisition_strategy.adaptation_enabled=False acquisition_strategy.scale=1 acquisition_strategy.tta_enabled=False acquisition_strate