# PLOT FOR CL-DGN WITH NEW API

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import os

## INPUT SECTION

In [None]:
# ---- user settings: fill this ----
RUN_NAME = 'final_run'        # prefix used to count the run folders inside a results directory
RUN_TEMPLATE = 'final_run{}'  # run folder name; {} receives the run index (counted from 1)
TRAINING_RESULTS_NAME = 'training_results.csv'
INTERMEDIATE_RESULTS_NAME = 'intermediate_results.csv'
RESULT_BASE_PATH = '/data/cossu/CL-DGN'      # base path containing the RESULTS_* folders
SAVE_IMG_PATH = '/data/cossu/CL-DGN/PLOTS/'  # folder where the figures are saved
REG_SUFFIX = "_REG"

# Short, readable plot labels keyed by the (long) model names used in the folder names.
models_to_label = {
    'SuperpixelsBaseline': 'MLP',
    'GraphSAGESuperpixels': 'DGN',
    'GraphSAGESuperpixels' + REG_SUFFIX: 'DGN' + REG_SUFFIX,
    'GraphSAGEOGBGPPA': 'DGN-OGBG',
    'GraphSAGEOGBGPPA' + REG_SUFFIX: 'DGN-OGBG' + REG_SUFFIX,
    'OGBGBaseline': 'MLP-OGBG',
}

# Global matplotlib style shared by every figure below.
plt.rcParams['font.size'] = 12
# One single-character marker code per entry (same order as before).
markers = list("ov^<>8spP*.hH+xXDd")
linestyles = ['--', '-', '-.', ':']

## UTILITY FUNCTIONS

In [None]:
def preprocess_csv(fpath, write_new_file=False):
    """Collapse every ``{...}`` group of a text file to its leading number.

    The file is scanned character by character:

    * outside braces every character is copied unchanged;
    * the braces themselves are always dropped;
    * inside braces only digits and ``.`` that occur before the first comma
      are kept (this includes digits that are part of a key, if any);
      everything else up to the closing brace is discarded.

    Presumably each group is a dict-like CSV cell whose first value is the
    metric of interest -- confirm against the files being cleaned.

    Args:
        fpath: path of the file to clean.
        write_new_file: when true, ``fpath`` is overwritten with the result.

    Returns:
        The cleaned text.
    """
    with open(fpath, 'r') as src:
        raw = src.read()

    numeric_chars = set('0123456789.')
    kept = []
    inside_braces = False
    before_first_comma = True

    for ch in raw:
        if ch == '{':
            inside_braces = True
            continue
        if ch == '}':
            # Closing a group re-arms the "first number" state for the next one.
            inside_braces = False
            before_first_comma = True
            continue
        if not inside_braces:
            kept.append(ch)
            continue
        if ch == ',':
            # Everything after the first comma of a group is ignored.
            before_first_comma = False
        elif before_first_comma and ch in numeric_chars:
            kept.append(ch)

    cleaned = ''.join(kept)

    if write_new_file:
        with open(fpath, 'w') as dst:
            dst.write(cleaned)

    return cleaned

In [None]:
def compute_training_mean_std(root):
    """Average the training-results CSVs of all runs found under ``root``.

    The number of runs is the number of entries of ``root`` whose name starts
    with ``RUN_NAME``; run ``i`` (counted from 1) is read from
    ``root/RUN_TEMPLATE.format(i)/TRAINING_RESULTS_NAME``.

    The columns read are ``train_loss``, ``main_score``, ``val_loss`` and
    ``main_score.1``. ``main_score.1`` is the name pandas gives to a second
    column called ``main_score``; the first one is taken to be the training
    score and the second one the validation score, following the column order.

    If a CSV still contains raw ``{...}`` cells, clean it first with
    ``preprocess_csv(cur_file, write_new_file=True)``.

    Args:
        root: directory containing the run folders.

    Returns:
        ``((train_loss_mean, train_acc_mean, val_loss_mean, val_acc_mean),
        (train_loss_std, train_acc_std, val_loss_std, val_acc_std))``.
        Each element is a 1-D array with one value per CSV row; statistics are
        taken across runs (``np.std`` default, i.e. population std).

    Raises:
        ValueError: if no run folder is found or a CSV lacks a required column.
    """
    selected_columns = ['train_loss', 'main_score', 'val_loss', 'main_score.1']

    num_runs = len([el for el in os.listdir(root) if el.startswith(RUN_NAME)])
    if num_runs == 0:
        raise ValueError(f"no run folder starting with '{RUN_NAME}' found in {root}")

    per_run = []
    for i in range(1, num_runs + 1):
        cur_file = os.path.join(root, RUN_TEMPLATE.format(i), TRAINING_RESULTS_NAME)
        data = pd.read_csv(cur_file)
        try:
            per_run.append(data[selected_columns].values)
        except KeyError as err:
            raise ValueError(
                f"run {i}: {cur_file} lacks some of {selected_columns}; "
                f"available columns: {list(data.columns)}"
            ) from err

    # Shape (runs, rows, 4); every run must have the same number of rows.
    data_gathered = np.stack(per_run, axis=0)

    averages = np.average(data_gathered, axis=0)
    stds = np.std(data_gathered, axis=0)

    # Column indices follow ``selected_columns``:
    # 0 = train loss, 1 = train score, 2 = val loss, 3 = val score.
    train_loss_mean, train_acc_mean = averages[:, 0], averages[:, 1]
    val_loss_mean, val_acc_mean = averages[:, 2], averages[:, 3]
    train_loss_std, train_acc_std = stds[:, 0], stds[:, 1]
    val_loss_std, val_acc_std = stds[:, 2], stds[:, 3]

    return (train_loss_mean, train_acc_mean, val_loss_mean, val_acc_mean), \
           (train_loss_std, train_acc_std, val_loss_std, val_acc_std)
  


In [None]:
def compute_intermediate_mean_std(root):
    """Average the last-task rows of the intermediate-results CSVs under ``root``.

    The number of runs is the number of entries of ``root`` whose name starts
    with ``RUN_NAME``; run ``i`` (counted from 1) is read from
    ``root/RUN_TEMPLATE.format(i)/INTERMEDIATE_RESULTS_NAME``. From each CSV
    only the rows whose ``task_id`` equals the largest ``task_id`` of that
    file are kept, and their ``loss`` and ``main_score`` columns are used.

    If a CSV still contains raw ``{...}`` cells, clean it first with
    ``preprocess_csv(cur_file, write_new_file=True)``.

    Returns:
        ``((loss_mean, acc_mean), (loss_std, acc_std))``: 1-D arrays with one
        value per retained row, computed across runs.
    """
    wanted = ['loss', 'main_score']

    run_count = sum(1 for entry in os.listdir(root) if entry.startswith(RUN_NAME))

    per_run = []
    for run_idx in range(1, run_count + 1):
        csv_path = os.path.join(root, RUN_TEMPLATE.format(run_idx), INTERMEDIATE_RESULTS_NAME)
        frame = pd.read_csv(csv_path)

        # Keep only the rows recorded for the last task.
        last_task = frame['task_id'].max()
        last_rows = frame[frame['task_id'] == last_task]

        # Leading axis of length 1 so runs can be joined along axis 0.
        per_run.append(last_rows[wanted].values[np.newaxis, ...])

    stacked = np.concatenate(per_run, axis=0) if per_run else None

    averages = np.average(stacked, axis=0)
    stds = np.std(stacked, axis=0)

    return (averages[:, 0], averages[:, 1]), (stds[:, 0], stds[:, 1])

In [None]:
def get_cmap(n, name='hsv'):
    '''Return the matplotlib colormap ``name`` resampled to ``n`` entries.

    The result is callable: an integer index in 0, 1, ..., n-1 picks an entry
    of the resampled lookup table and yields an RGBA colour. The keyword
    argument name must be a standard mpl colormap name. Colours are only
    guaranteed to differ when the underlying colormap has at least ``n``
    different colours (a small qualitative map such as 'Set1' repeats entries
    once resampled to a larger ``n``).

    ``plt.get_cmap`` is used instead of ``plt.cm.get_cmap`` because the latter
    was deprecated in Matplotlib 3.7 and removed in 3.9.'''
    return plt.get_cmap(name, n)

## NUMERICAL RESULTS (NO PLOTS)

In [None]:
# Select which assessment folder to summarise.
model = 'GraphSAGEOGBGPPA'
experiment = 'EWC'
dataset = 'ogbg_ppa'
# The results folder is derived from `experiment`, so changing it above is enough
# to point at another RESULTS_* directory.
ROOT = os.path.join(RESULT_BASE_PATH,
                    f"RESULTS_{experiment}/{model}_{dataset}_assessment/MODEL_ASSESSMENT/OUTER_FOLD_1")

# Training curves: (train_loss, train_acc, val_loss, val_acc) means, then stds.
avg, std = compute_training_mean_std(ROOT)
print(avg)
print()
print(std)
print()

# Last-task results: (loss, acc) means, then stds, across runs.
avg, std = compute_intermediate_mean_std(ROOT)
print(avg)
print()
print(std)
print()

# mean and std over tasks (and over runs)
print(np.mean(avg[1]))
print()
print(np.mean(std[1]))

## PLOT AT INCREASING MEMORY SIZE

In [None]:
# fill this
DATASET = 'ogbg_ppa'
mem_sizes = [50, 100, 200, 500, 1000, 2000]  # memory sizes compared on the x axis
#models = ['SuperpixelsBaseline', 'GraphSAGESuperpixels', 'GraphSAGEOGBGPPA']
models = ['GraphSAGEOGBGPPA']      # read from the plain result folders
reg_models = ['GraphSAGEOGBGPPA']  # read from the REG_SUFFIX result folders

# Path template; the placeholders are, in order: memory size, folder suffix
# ('' or REG_SUFFIX) and model name.
baseline_folder = os.path.join(
    RESULT_BASE_PATH,
    "RESULTS_REHEARSAL_{}{}/{}" + f"_{DATASET}_assessment/MODEL_ASSESSMENT/OUTER_FOLD_1",
)

In [None]:
# For every model (plain and REG variant) collect one mean score and one mean
# std per memory size, averaged over the rows returned for the last task.
avgs = {}
stds = {}
# Plain models are processed before the REG ones, so the dictionary key order
# (and therefore the plotting order) is unchanged.
for suffix, model_group in (('', models), (REG_SUFFIX, reg_models)):
    for model in model_group:
        key = model + suffix
        avgs[key] = []
        stds[key] = []
        for mem_size in mem_sizes:
            (_, avg_run), (_, std_run) = compute_intermediate_mean_std(
                baseline_folder.format(mem_size, suffix, model))
            avgs[key].append(np.mean(avg_run))
            stds[key].append(np.mean(std_run))

In [None]:
# Accuracy (mean +/- std) of every model as a function of the memory size.
cmap = get_cmap(100, 'Set1')
# Evenly spaced x positions; the real memory sizes are shown as tick labels.
xcoords = list(range(2, len(mem_sizes)*2+1, 2))
plt.figure()
for i, model in enumerate(avgs):
    # Cycle through the available styles so that plotting more curves than
    # there are line styles (or markers) does not raise an IndexError.
    fmt = markers[i % len(markers)] + linestyles[i % len(linestyles)]
    plt.errorbar(xcoords, avgs[model], yerr=stds[model], fmt=fmt,
                 label=models_to_label[model], c=cmap(i*20))
plt.xticks(xcoords, [str(el) for el in mem_sizes])
plt.legend(loc='best')
plt.ylabel('Accuracy')
plt.xlabel('Memory size')
plt.ylim(0, 100)
plt.grid(True)
plt.title(DATASET)
# Make sure the output folder exists before saving the figure.
os.makedirs(SAVE_IMG_PATH, exist_ok=True)
plt.savefig(os.path.join(SAVE_IMG_PATH, f'{DATASET}_rehearsal_memory.png'))
plt.show()

## PAIRED PLOTS

In [None]:
# fill this
DATASET = 'ogbg_ppa'
exp_category = 'RESULTS_REHEARSAL_1000'
#models = ['SuperpixelsBaseline', 'GraphSAGESuperpixels', 'GraphSAGEOGBGPPA'] 
models = ['GraphSAGEOGBGPPA']      # read from the plain result folders
reg_models = ['GraphSAGEOGBGPPA']  # read from the REG_SUFFIX result folders

# Path template; the placeholders are, in order: folder suffix ('' or
# REG_SUFFIX) and model name.
baseline_folder = os.path.join(
    RESULT_BASE_PATH,
    exp_category + "{}/{}" + f"_{DATASET}_assessment/MODEL_ASSESSMENT/OUTER_FOLD_1",
)

In [None]:
# Mean validation score (from the training results) and mean test score (from
# the intermediate results) for every model, plain and REG variant.
val_avgs = {}
test_avgs = {}
# Plain models first, REG variants second: same key order as before.
for suffix, model_group in (('', models), (REG_SUFFIX, reg_models)):
    for model in model_group:
        folder = baseline_folder.format(suffix, model)
        (_, _, _, val_acc), _ = compute_training_mean_std(folder)
        (_, test_acc), _ = compute_intermediate_mean_std(folder)
        val_avgs[model + suffix] = val_acc
        test_avgs[model + suffix] = test_acc

In [None]:
# Paired plot: for every model, each task is drawn as a segment joining its
# validation score (left point) to its test score (right point).
cmap = get_cmap(val_avgs[models[0]].shape[0], 'Set1')
DIST = 20        # horizontal distance between two models
INTER_DIST = 5   # half-width of a validation/test pair
xcoords = list(range(8, (len(models)+len(reg_models))*DIST+1, DIST))
plt.figure()
# Horizontal reference line drawn at 10 and labelled 'random'.
# NOTE(review): 10 looks like a chance level for 10 classes -- confirm for DATASET.
plt.hlines(10, -3, xcoords[-1]+DIST, colors='gray', linewidth=2, linestyle=':', zorder=1, label='random')

# plot pairs of points
for i, model in enumerate(val_avgs):
    for task_id in range(val_avgs[model].shape[0]):
        x_pair = [xcoords[i]-INTER_DIST, xcoords[i]+INTER_DIST]
        y_pair = [val_avgs[model][task_id], test_avgs[model][task_id]]
        plt.plot(x_pair, y_pair, c='k', linewidth=0.5, zorder=1)
        plt.scatter(x_pair, y_pair, marker=markers[task_id], s=40, c=[cmap(task_id)],
                    label='T'+str(task_id+1), zorder=2)

# plot mean vector
for i, model in enumerate(val_avgs):
    x_pair = [xcoords[i]-INTER_DIST, xcoords[i]+INTER_DIST]
    y_pair = [np.mean(val_avgs[model]), np.mean(test_avgs[model])]
    plt.plot(x_pair, y_pair, c='red', ls='--', linewidth=0.5, zorder=1)
    plt.scatter(x_pair, y_pair, marker='*', s=40, c='darkred', label='mean', zorder=2)


# remove duplicated legend entries (the same task label is added once per model)
handles, labels = plt.gca().get_legend_handles_labels()
by_label = {label: handle for label, handle in zip(labels, handles)}
plt.legend(by_label.values(), by_label.keys(), loc='best')

plt.ylabel('Accuracy')
plt.ylim(-3,105)
plt.xlim(0,xcoords[-1]+DIST)
plt.xticks(xcoords, [models_to_label[name] for name in val_avgs])
plt.grid(True)
plt.title(DATASET)
plt.savefig(os.path.join(SAVE_IMG_PATH, f'{exp_category}_{DATASET}_paired_plot.png'))
plt.show()