### Notebook Purpose

analyze results for feature map regularization

- csv result files
    - `run_20201120-092851.csv`, `run_20201120-092903.csv`: search over all hparams on three file_ids (1000273, 1000325, 1000464), for 2000 iterations
        - via `search_hyperparam.py` / `train.py`
        - resulting best 20 trial_ids saved for future work in `trials_best_20201120.json`
    - check statistical significance...
        - `run_20201121-160658.csv`: used 20 best trial_ids over a larger set of file_ids for 2000 iterations
            - are three samples sufficient for statistically significant results?
            - via `search_hyperparam_larger_set.py` / `train_larger_set.py`
            - `run_20201123-065717.csv`: baseline (alpha_fm=0) for larger set of file_ids with 2000 iterations
        - `run_20201121-160718.csv`: used 20 best trial_ids over same set of three file_ids for 10000 iterations
            - are 2000 iterations sufficient for statistically significant results?
            - via `search_hyperparam_orig_three.py` / `train_orig_three.py`
            - `run_20201123-065450.csv`: baseline (alpha_fm=0) for three file_ids with 10000 iterations

# CURRENTLY 

running over 10000 iterations across n>3 for best trials per `run_20201121-160718.csv` i.e. `df_10k`

via `search_hyperparam_larger_set.py` and baseline `search_hyperparam_iter10k_alphafm0.npy`

In [109]:
import os, sys
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import json

### load original csvs

### aggregate rows sharing a trial_id into a single row, i.e. average ssim/psnr across file_ids

In [193]:
# cap how many rows pandas renders for a displayed frame (len(df) would show all of them)
display_rows = 10#len(df)
pd.set_option('display.max_rows', display_rows)

def agg_sort_df(df_in):
    ''' collapse the rows of df_in into one row per trial_id, ranked by mean ssim_dc

        metric columns (ssim/psnr, dc and est) are averaged over each trial_id's rows;
        hyperparameter columns keep the group's 'first' value.
        returns a new frame sorted by mean ssim_dc, highest first '''

    metric_cols = ['ssim_dc', 'psnr_dc', 'ssim_est', 'psnr_est']
    hparam_cols = ['alpha_fm', 'num_iter', 'iter_start_fm_loss',
                   'weight_method', 'downsamp_method']

    # metrics first, then hyperparameters: this fixes the output column order
    how = {col: 'mean' for col in metric_cols}
    how.update({col: 'first' for col in hparam_cols})

    per_trial = df_in.groupby(df_in['trial_id']).aggregate(how)
    per_trial = per_trial.reset_index()

    return per_trial.sort_values(by='ssim_dc', ascending=False)

In [194]:
# directory holding the run_*.csv result files
path_results = '/home/vanveen/ConvDecoder/expmt/results/'

# to pick up every file in the directory instead of the two below:
# csv_files = [f for f in os.listdir(path_results) if isfile(join(path_results, f))]
csv_files = ['run_20201120-092851.csv', 'run_20201120-092903.csv']
csv_paths = ['{}{}'.format(path_results, name) for name in csv_files]

# stack both search csvs into one frame (row order follows csv_files)
df_in = pd.concat(list(map(pd.read_csv, csv_paths)))

df = agg_sort_df(df_in)
df

Unnamed: 0,trial_id,ssim_dc,psnr_dc,ssim_est,psnr_est,alpha_fm,num_iter,iter_start_fm_loss,weight_method,downsamp_method
66,f1g4rdh9,0.728433,28.932900,0.502467,24.481000,0.00010,2000,0,late,nearest
77,i4csenaj,0.727133,28.824767,0.501000,24.403667,0.00010,2000,0,late,bicubic
61,dyn9rycz,0.726833,28.816833,0.496467,24.212033,0.00100,2000,1000,late,bilinear
139,w1v8v9ni,0.726333,28.807200,0.500367,24.465900,0.00001,2000,0,all,bilinear
12,1p0kn9i7,0.725833,28.754067,0.492600,24.087367,0.00010,2000,1600,early,nearest
...,...,...,...,...,...,...,...,...,...,...
128,ui2gux4g,0.649633,26.584367,0.441233,22.162600,0.00100,2000,0,early,bilinear
93,kg9zpja1,0.642900,26.635567,0.413400,20.919067,0.00100,2000,0,all,bicubic
1,0b8qud3y,0.639467,26.834233,0.424267,21.567500,0.00100,2000,0,all,bilinear
30,7jlwlalt,0.634600,25.516700,0.406167,18.853367,0.01000,2000,1000,early,nearest


### confirmed: variance is lowest for `alpha_fm=0`, since that negates the effect of all other variables

- TODO: in future, no need to run `alpha_fm=0` over all values of the other variables, since each such run is the same experiment

### get best value for each variable

finding: given the std() within each value of each individual variable, it is too difficult to say with confidence whether one value performs better than another. also, many other variables are at play within each slice

In [157]:
def get_1d_results(df, variable=None):
    '''print mean / std of ssim_dc and psnr_dc for every value of one variable at a time

    df: frame holding ssim_dc, psnr_dc and the variable column(s) to slice on
    variable: column name to slice on; when falsy, the four hyperparameter
              columns are reported one after another
    '''

    default_vars = ['alpha_fm', 'iter_start_fm_loss', 'weight_method', 'downsamp_method']
    variable_list = [variable] if variable else default_vars

    template = '{}: ssim ~ N({}, {}), psnr ~ N({}, {}), n={}'

    for var in variable_list:

        print(var)

        # distinct values of this column, in ascending order
        for value in sorted(set(df[var].tolist())):

            subset = df.loc[df[var] == value]
            ssim, psnr = subset['ssim_dc'], subset['psnr_dc']
            print(template.format(value,
                                  np.round(ssim.mean(), 4), np.round(ssim.std(), 4),
                                  np.round(psnr.mean(), 4), np.round(psnr.std(), 4),
                                  len(subset)))

        print('')
        
# per-variable summary over the aggregated search results (one row per trial_id)
get_1d_results(df)

alpha_fm
0.0: ssim ~ N(0.722, 0.0009), psnr ~ N(28.6451, 0.0342), n=27
1e-05: ssim ~ N(0.7224, 0.0017), psnr ~ N(28.635, 0.0679), n=27
0.0001: ssim ~ N(0.7226, 0.0028), psnr ~ N(28.6512, 0.1112), n=27
0.001: ssim ~ N(0.7036, 0.0305), psnr ~ N(28.1225, 0.8239), n=27
0.01: ssim ~ N(0.7051, 0.0257), psnr ~ N(28.164, 0.7131), n=27
0.1: ssim ~ N(0.7026, 0.0277), psnr ~ N(28.0624, 0.7551), n=27

iter_start_fm_loss
0: ssim ~ N(0.705, 0.0267), psnr ~ N(28.2254, 0.6605), n=54
1000: ssim ~ N(0.7138, 0.0237), psnr ~ N(28.3561, 0.7333), n=54
1600: ssim ~ N(0.7202, 0.0061), psnr ~ N(28.5586, 0.2344), n=54

weight_method
all: ssim ~ N(0.7122, 0.021), psnr ~ N(28.3588, 0.5693), n=54
early: ssim ~ N(0.7065, 0.029), psnr ~ N(28.2107, 0.7921), n=54
late: ssim ~ N(0.7204, 0.0067), psnr ~ N(28.5706, 0.2613), n=54

downsamp_method
bicubic: ssim ~ N(0.7125, 0.0207), psnr ~ N(28.3436, 0.6229), n=54
bilinear: ssim ~ N(0.7122, 0.0257), psnr ~ N(28.3794, 0.6331), n=54
nearest: ssim ~ N(0.7144, 0.0182), psnr ~ N

### save dictionary for best 20 trial_ids 

then more thoroughly analyze these trial_ids, e.g. across >3 samples or >2000 iterations

In [128]:
def save_df_to_json(json_path, df):
    ''' write df to json_path as a dict keyed by row index, and return that dict

        json_path: destination file; opened in 'w' mode, so existing content is replaced
        df: dataframe to serialize; each index label maps to a {column: value} dict '''

    rows_by_index = df.to_dict(orient='index')

    with open(json_path, 'w') as out_file:
        json.dump(rows_by_index, out_file)

    return rows_by_index

json_path = path_results + 'trials_best_20201120.json'

# disabled; note that save_df_to_json takes (json_path, df)
# dict_best_results = save_df_to_json(json_path, df[:20])

### get baseline i.e. `alpha_fm=0` for iter 2000 and 10000

In [154]:
# baseline runs (alpha_fm=0): 2000 iterations over the larger set of file_ids,
# and 10000 iterations over the original three file_ids
df_alpha0_2k = pd.read_csv(path_results + 'run_20201123-065717.csv')
df_alpha0_10k = pd.read_csv(path_results + 'run_20201123-065450.csv')

def print_avg_std(df):
    ''' print mean and std of df's ssim_dc and psnr_dc columns, rounded to 4 decimals '''
    stats = []
    for col in ('ssim_dc', 'psnr_dc'):
        stats.append(np.round(df[col].mean(), 4))
        stats.append(np.round(df[col].std(), 4))

    print('ssim ~ N({}, {}), psnr ~ N({}, {})'.format(*stats))

# report the alpha_fm=0 baseline statistics for both settings
print('2000 iterations across all samples')
print_avg_std(df_alpha0_2k)
print('')
print('10000 iterations across three samples')
print_avg_std(df_alpha0_10k)

2000 iterations across all samples
ssim ~ N(0.6895, 0.081), psnr ~ N(27.5732, 2.0144)

10000 iterations across three samples
ssim ~ N(0.7528, 0.0624), psnr ~ N(30.2344, 1.1311)


### iter 2000 across all samples

In [195]:
df_2k_ = pd.read_csv(path_results + 'run_20201121-160658.csv')

# the rows for these trial_ids on the original three samples live in df_in
# (presumably this csv does not contain them), so pull them in before averaging
top_trials = list(set(df_2k_['trial_id'].tolist()))
df_2k_3 = df_in.loc[df_in['trial_id'].isin(top_trials)]

# one row per trial_id, averaged over every sample, ranked by mean ssim_dc
df_2k_all = agg_sort_df(pd.concat([df_2k_, df_2k_3]))
df_2k_all

Unnamed: 0,trial_id,ssim_dc,psnr_dc,ssim_est,psnr_est,alpha_fm,num_iter,iter_start_fm_loss,weight_method,downsamp_method
6,f1g4rdh9,0.693673,27.742191,0.443164,23.114291,0.00010,2000,0,late,nearest
19,w1v8v9ni,0.693491,27.701418,0.445564,23.136845,0.00001,2000,0,all,bilinear
9,jlpxw8ir,0.693291,27.672355,0.437873,22.877909,0.00010,2000,0,all,bilinear
13,r71wfg6k,0.692518,27.619409,0.438836,22.863400,0.00001,2000,0,all,nearest
7,i4csenaj,0.692382,27.649891,0.435355,22.847491,0.00010,2000,0,late,bicubic
...,...,...,...,...,...,...,...,...,...,...
8,ixn1eec3,0.691227,27.564955,0.428418,22.590864,0.00010,2000,1600,all,bilinear
1,39n9u56h,0.691164,27.609009,0.431609,22.759145,0.00100,2000,1000,late,bicubic
10,k6aoqok1,0.691155,27.601473,0.434109,22.851518,0.00001,2000,1000,early,bicubic
17,vz8mku1t,0.691064,27.571391,0.432873,22.744473,0.00100,2000,0,late,nearest


### iter 10000 across three samples

In [201]:
# read the 10000-iteration run and reduce it to one ranked row per trial_id
df_10k = agg_sort_df(pd.read_csv(path_results + 'run_20201121-160718.csv'))

# persist the ranked trials as json next to the csv results
save_df_to_json(path_results + 'trials_best_20201121-160718.json', df_10k)

df_10k

Unnamed: 0,trial_id,ssim_dc,psnr_dc,ssim_est,psnr_est,alpha_fm,num_iter,iter_start_fm_loss,weight_method,downsamp_method
1,409jkpvb,0.785700,31.536467,0.624867,28.717667,0.00010,10000,0,late,bicubic
8,lwd6legj,0.779667,31.398600,0.615567,28.503033,0.00001,10000,0,all,nearest
4,autybby9,0.777867,31.275200,0.608067,28.353100,0.00010,10000,0,all,bilinear
2,6efqyxuy,0.773500,31.069100,0.601700,28.163667,0.00010,10000,1000,early,bilinear
9,mfk7h8hd,0.772033,31.006633,0.606233,28.134867,0.00100,10000,0,late,nearest
...,...,...,...,...,...,...,...,...,...,...
0,2lvfzxsn,0.754967,30.366767,0.579600,27.362767,0.00010,10000,1600,all,bilinear
16,wk7lr3tv,0.754333,30.344267,0.578233,27.332367,0.00010,10000,1600,late,bicubic
15,tbsvubnj,0.753200,30.308633,0.568400,27.103633,0.00010,10000,1600,early,nearest
6,hgejliq8,0.752167,30.244700,0.567300,27.123033,0.00100,10000,1000,late,bicubic


In [160]:
# get_1d_results(df_2k_all)
get_1d_results(df_10k)

alpha_fm
1e-05: ssim ~ N(0.7621, 0.0092), psnr ~ N(30.6894, 0.3774), n=6
0.0001: ssim ~ N(0.7636, 0.0116), psnr ~ N(30.7082, 0.4496), n=10
0.001: ssim ~ N(0.7593, 0.0088), psnr ~ N(30.5398, 0.3324), n=4

iter_start_fm_loss
0: ssim ~ N(0.7689, 0.0103), psnr ~ N(30.9399, 0.3837), n=9
1000: ssim ~ N(0.7579, 0.007), psnr ~ N(30.4873, 0.2722), n=8
1600: ssim ~ N(0.7542, 0.0009), psnr ~ N(30.3399, 0.0293), n=3

weight_method
all: ssim ~ N(0.7634, 0.0107), psnr ~ N(30.7282, 0.4254), n=7
early: ssim ~ N(0.7606, 0.0082), psnr ~ N(30.6006, 0.3321), n=6
late: ssim ~ N(0.7627, 0.0121), psnr ~ N(30.668, 0.4578), n=7

downsamp_method
bicubic: ssim ~ N(0.7625, 0.0136), psnr ~ N(30.6642, 0.5194), n=5
bilinear: ssim ~ N(0.7621, 0.0084), psnr ~ N(30.6777, 0.3163), n=9
nearest: ssim ~ N(0.7624, 0.0113), psnr ~ N(30.6595, 0.4618), n=6

