This notebook documents the steps necessary to post-process the set of chaos expansion experiments performed here. It will do a few things:

- Archive all the scripts and intermediate output files for storage/backup elsewhere

`version: April 22, 2015`

In [1]:
import numpy as np
import pandas as pd

import os, pickle, pprint, shutil
from collections import namedtuple, OrderedDict
from itertools import product
from zipfile import ZipFile, ZIP_DEFLATED

First we set up some "big picture" information, such as the names of the experiments and any useful helper functions.

In [2]:
SAVE_DIR = os.path.join(os.getcwd(), "save")
EXP_NAMES = ["SM_OLS", "SM_LARS", "SM_LASSO"]
ARCH_NAME = "SM_experiment.zip"
SAMPLE_FN = "sample.csv"

def load_exp(name, sample="LHS", archived=False):
    """ Load an experiment's dictionary, results, LHS design, and
    LHS results files 
    
    Parameters
    ----------
    archived : boolean
        Look for archived scheme first and fallback on original copy
        if not available.
    """
    
    if not archived:

        with open("%s_exp.dict" % name, 'r') as f:
            exp_dict = pickle.load(f)
        with open("%s_results.dict" % name, 'r') as f:
            results_dict = pickle.load(f)

        design_name = "%s_%s_design" % (name, sample)
        sample_design = pd.read_csv("%s.csv" % design_name,
                                    index_col=0)
        sample_results = pd.read_csv("%s_results.csv" % design_name,
                                     index_col=0)
        ## Rename Neq_ARG/MBN to Nderiv_ARG/MBN
        for bad_key in ["Neq_ARG", "Neq_MBN"]:
            print "Re-named bad key (%s)" % bad_key
            if bad_key in sample_results:
                sample_results.rename(
                    columns={ bad_key: bad_key.replace("Neq","Nderiv") },
                    inplace=True
                )
        
        sampling_results = pd.concat([sample_design, sample_results], axis=1)

    else:
        
        with open("%s.p" % name, "r") as f:
            exp_dict = pickle.load(f)
        results_dict = {}
        
        sample_fn = os.path.join(name, SAMPLE_FN)
        sampling_results = pd.read_csv(sample_fn)
        
    return exp_dict, results_dict, sampling_results

Additionally, we should generate the datastore for running the pcm parameterization from Python and Fortran, which is accomplished by the script `pcm_param.py` run from the command line. Global var `RUNS` in this file contains mappings of experiment names and orders, and for each combination, will produce a file `{EXP_NAME}_{ORDER}.ascii`, which is what the Fortran code looks for. Additionally, it will create and HDF5 Datastore object on disk which retains all the details of the chaos expansions for quickly calling from Python.

In [3]:
%run pcm_param.py
%run pcm_param.py --TEST --BLOCK

SM_LASSO
    expansion_order_2
    expansion_order_3
    expansion_order_4
    expansion_order_5
SM_LARS
    expansion_order_2
    expansion_order_3
    expansion_order_4
    expansion_order_5
SM_OLS
    expansion_order_2
    expansion_order_3
    expansion_order_4
    expansion_order_5
V 1.0
PCM (0.002096470910337105, 490.28014574731134, 0.57680017146742513)
ARG (0.0020518378787395715, [483.37876924093644], [0.56868090498933699])
MBN (0.0023060400580614806, [520.34672851897585], [0.61217262178703047])
------------------------------------------------------------
Locked out


We'll need to move those files generated above into the appropriate directory.

---

Save the results by first organizing into a reliable folder stucture:

    | this_dir/
    | ------->/exp1.p
    | ------->/--->
    | ------->/--->/expansion_order_1
    | ...
    | ------->/--->/expansion_order_n
    | ------->/--->/exp_1.ASCII
    | ...
    | ------->/--->/exp_n.ASCII
    | ------->/--->/sample.csv
    
Note that they will be re-organized so that the raw output for each expansion is easily identifiable.

In [8]:
experiments = {}

print "Collecting experiment data"

all_sampling_results = None

for i, exp_name in enumerate(EXP_NAMES):
    _, method = exp_name.split("_")

    print "   " + exp_name
    exp_dir = os.path.join(os.getcwd(), exp_name)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)

    exp, results, sampling_results = load_exp(exp_name)

    sample_file = os.path.join(exp_dir, SAMPLE_FN)
    sampling_results.to_csv(sample_file, mode='w')

    print "   Copying results set"
    for name, val in product([exp['param_name'], ], exp['param_vals']):
        expansion = name + "_" + str(val)
        
        # Re-name columns in sampling_results for concatting to master
        # file; map "expansion_order" -> "OLS", e.g.
        cols = [ c for c in sampling_results.columns 
                 if c.endswith(expansion) ]
        sampling_results.rename(
            columns={ c: c.replace(name, method) for c in cols },
            inplace=True
        )

        data_loc = os.path.join(SAVE_DIR, results[expansion])
        save_loc = os.path.join(exp_dir, expansion)

        # Copy to top-level dir for archiving here
        if os.path.exists(save_loc):
            print "      Overwriting existing", save_loc
            shutil.rmtree(save_loc)
        shutil.copytree(data_loc, save_loc)
        
        ## ASCII file
        ASCII_fn = "%s_%d.ascii" % (exp_name, val)
        print "      " + ASCII_fn
        shutil.copy2(ASCII_fn, 
                     os.path.join(save_loc, ASCII_fn))
    
    # Join into master sampling results file
    #pprint.pprint(sorted(sampling_results.columns))
    if all_sampling_results is None:
        all_sampling_results = sampling_results.copy()
    else:
        for col in sampling_results:
            if not (col in all_sampling_results):
                print "      appending", col
                a = all_sampling_results.join(sampling_results[col])
                all_sampling_results = a
            
    experiments[exp_name] = exp

all_sampling_results.to_csv("SM_sampling_results.csv")
print "..done"

Collecting experiment data
   SM_OLS
Re-named bad key (Neq_ARG)
Re-named bad key (Neq_MBN)
   Copying results set
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_OLS/expansion_order_2
      SM_OLS_2.ascii
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_OLS/expansion_order_3
      SM_OLS_3.ascii
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_OLS/expansion_order_4
      SM_OLS_4.ascii
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_OLS/expansion_order_5
      SM_OLS_5.ascii
   SM_LARS
Re-named bad key (Neq_ARG)
Re-named bad key (Neq_MBN)
   Copying results set
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_LARS/expansion_order_2
      SM_LARS_2.ascii
      Overwriting existing /home/darothen/workspace/CESM_PCE_exp/8_pce_dakota/single_mode/SM_LARS/expansion_order_3


---

## Statistics

Compute summary statistics for different comparisons between the various chaos expansions and different predicted quantities. A few assumptions go into the output here:

1. Compare different combinations of `Nderiv` from chaos expansions (using $S_\text{max}$ to predict $N_d$) and $N_{eq}$, $N_{kn}$ from the parcel model. Record error statistics/metrics.


In [9]:
from pce_vis import compute_stats, stat_label
from functools import partial

def result_key(output, name, val=0):
    base_key = "%s_%s" % (output, name)
    if val > 0:
        base_key += "_%d" % val
    return base_key

def compute_stats_vs_parcel(df, output, name, val=0, output_parcel=None,
                           power10=False):
    if output_parcel is None:
        output_parcel = output
    
    # Figure out which columns in df to pull
    key_param = result_key(output, name, val)
    key_parcel = result_key(output_parcel, "parcel")
    
    # Grab and clean data
    data_df = pd.DataFrame({key_param: df[key_param],
                            key_parcel: df[key_parcel]})
    data_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    data_df.dropna(inplace=True)
    
    results = {}
    results['log10'] = compute_stats(data_df[key_param], 
                                     data_df[key_parcel])   
    data_df = 10.**(data_df)
    results['normal'] = compute_stats(data_df[key_param], 
                                      data_df[key_parcel])   
    
    return results

stats_df_mi = pd.MultiIndex.from_product(
    [["Smax", "Neq", "Nkn", "Nderiv_Neq", "Nderiv_Nkn"], 
     ["log10", "normal"],
     ["rmse", "nrmse", "mae", "r2", "mre", "mre_std"]],
    names=["result", "scaling", "stat"]
)

all_stats = []
for exp_name, exp in experiments.iteritems():
    print exp_name
    
    _, method = exp_name.split("_")
    
    for val in exp['param_vals']:
        
        stats = {}
        
        ## Smax
        stats['Smax'] = compute_stats_vs_parcel(
            all_sampling_results, "Smax", method, val
        )
        
        ## Nderiv vs Neq
        stats['Nderiv_Neq'] = compute_stats_vs_parcel(
            all_sampling_results, "Nderiv", method, val, output_parcel="Neq"
        )
        
        ## Nderiv vs Nkn
        stats['Nderiv_Nkn'] = compute_stats_vs_parcel(
            all_sampling_results, "Nderiv", method, val, output_parcel="Nkn"
        )
        
        ## Neq vs Neq
        stats['Neq'] = compute_stats_vs_parcel(
            all_sampling_results, "Neq", method, val
        )
        
        ## Nkn vs Nkn
        stats['Nkn'] = compute_stats_vs_parcel(
            all_sampling_results, "Nkn", method, val
        )
                
        stats_vals = []
        for key in stats_df_mi:
            result, scaling, stat = key
            stats_vals.append(stats[result][scaling][stat])
        stats_df = pd.DataFrame(stats_vals, index=stats_df_mi,
                                columns=[(method, val)], )
        all_stats.append(stats_df.T)
        
pces_mi = pd.MultiIndex.from_tuples([df.index[0] for df in all_stats],
                                     names=["method", "order"])
all_df = pd.concat(all_stats)
all_df.set_index(pces_mi, inplace=True)

print "Writing..."
all_df.to_pickle("SM_stats.p")
print "done."

SM_LASSO
SM_LARS
SM_OLS
Writing...
done.


Create a 'master dataset' with all the sampling data, tagged appropriately so that it can be split apart using pandas/seaborn to do factor analysis by PCE method, order, etc. This produces a "tidy" DataFrame in the style of Hadley Wickham; each row is one observation, with all metadata encoded to figure out where it came from.

In [10]:
# Collect the parcel model output
# Note - we've taken 10^x for all fields, because it's assumed that
#        log10(x) is saved
parcel_Smax = 10.**all_sampling_results['Smax_parcel']
parcel_Neq = 10.**all_sampling_results['Neq_parcel']
parcel_Nkn = 10.**all_sampling_results['Nkn_parcel']

rel_errs_Smax, rel_errs_Neq, rel_errs_Nkn = [], [], []
rel_errs_Nd_Neq, rel_errs_Nd_Nkn = [], []
all_methods = []
all_orders = []
for exp_name in EXP_NAMES:
    _, method = exp_name.split("_")
    for order in [ 2, 3, 4, 5 ]:
        print method, order
        key = "%s_%d" % (method, order)
        Smaxes = 10.**all_sampling_results["Smax_"+key]
        Neqs = 10.**all_sampling_results["Neq_"+key]
        Nkns = 10.**all_sampling_results["Nkn_"+key]
        Nderivs = 10.**all_sampling_results["Nderiv_"+key]
        
        n_tot = len(Smaxes)
        
        all_orders.extend([int(order), ]*n_tot)
        all_methods.extend([method, ]*n_tot)
        rel_errs_Smax.extend(100.*(Smaxes - parcel_Smax)/parcel_Smax)
        rel_errs_Neq.extend(100.*(Neqs - parcel_Neq)/parcel_Neq)
        rel_errs_Nkn.extend(100.*(Nkns - parcel_Nkn)/parcel_Nkn)
        rel_errs_Nd_Neq.extend(100.*(Nderivs - parcel_Neq)/parcel_Neq)
        rel_errs_Nd_Nkn.extend(100.*(Nderivs - parcel_Nkn)/parcel_Nkn)
        
for method in ['ARG', 'MBN']:
    print method
    Smaxes = 10.**all_sampling_results["Smax_"+method]
    Nderivs = 10.**all_sampling_results["Nderiv_"+method]
    
    n_tot = len(Smaxes)
        
    all_orders.extend([0, ]*n_tot)
    all_methods.extend([method, ]*n_tot)
    rel_errs_Smax.extend(100.*(Smaxes - parcel_Smax)/parcel_Smax)
    rel_errs_Nd_Neq.extend(100.*(Nderivs - parcel_Neq)/parcel_Neq)
    rel_errs_Nd_Nkn.extend(100.*(Nderivs - parcel_Nkn)/parcel_Nkn)
    rel_errs_Neq.extend([np.NaN, ]*n_tot)
    rel_errs_Nkn.extend([np.NaN, ]*n_tot)
    
df = pd.DataFrame({'rel_err_Smax': rel_errs_Smax, 
                   'rel_err_Neq': rel_errs_Neq, 
                   'rel_err_Nkn': rel_errs_Nkn, 
                   'rel_err_Nd_Neq': rel_errs_Nd_Neq, 
                   'rel_err_Nd_Nkn': rel_errs_Nd_Nkn,
                   'method': all_methods, 'order': all_orders})
df.replace([np.inf, -np.inf], np.nan, inplace=True)

print "Writing..."
df.to_csv("SM_sampling_tidy.csv")

OLS 2
OLS 3
OLS 4
OLS 5
LARS 2
LARS 3
LARS 4
LARS 5
LASSO 2
LASSO 3
LASSO 4
LASSO 5
ARG
MBN
Writing...


Compress into a zip file for archival.

In [12]:
with ZipFile(ARCH_NAME, 'w', compression=ZIP_DEFLATED) as zf:

    for exp_name, exp in experiments.iteritems():
        
        print "   " + exp_name
        exp_dir = os.path.join(os.getcwd(), exp_name)

        exp, results, sampling_results = load_exp(exp_name)
        
        # Save sampling results into zipfile
        sample_file = os.path.join(exp_dir, SAMPLE_FN)
        zf.write(sample_file, 
                 "%s/%s_sample_results.csv" % (exp_name, exp_name))
        
        for name, val in product([exp['param_name'], ], exp['param_vals']):
            expansion = name + "_" + str(val)

            data_loc = os.path.join(SAVE_DIR, results[expansion])
            save_loc = os.path.join(exp_dir, expansion)

            # List and copy into zip file
            print "      compressing..."
            for f in os.listdir(save_loc):
                zf.write(os.path.join(save_loc, f), 
                         "%s/%s/%s" % (exp_name, expansion, f))
                
            # ASCII file
            ASCII_fn = "%s_%d.ascii" % (exp_name, val)
            zf.write(ASCII_fn, "%s/%s" % (exp_name, ASCII_fn))
                        

        # Dictionary for this exp
        print "   Saving experiments dictionary"
        with open("%s.p" % exp_name, 'w') as pf:
            pickle.dump(exp, pf)
        zf.write("%s.p" % exp_name)
        
        print ""
        
    # All sampling results
    print "   Saving combined sampling results and errors"
    zf.write("SM_sampling_results.csv")
    zf.write("SM_sampling_tidy.csv")
    zf.write("SM_stats.p")
    
    # archiving scripts
    print "Archiving scripts"
    for f in ["dakota_poly.py", "dist_tools.py", 
              'pcm_param.py', 'pcm_param.f90', 'pcm_param.h5']:
        print "   " + f
        zf.write(f)
        
    print "..done"

   SM_LASSO
Re-named bad key (Neq_ARG)
Re-named bad key (Neq_MBN)
      compressing...
      compressing...
      compressing...
      compressing...
   Saving experiments dictionary

   SM_LARS
Re-named bad key (Neq_ARG)
Re-named bad key (Neq_MBN)
      compressing...
      compressing...
      compressing...
      compressing...
   Saving experiments dictionary

   SM_OLS
Re-named bad key (Neq_ARG)
Re-named bad key (Neq_MBN)
      compressing...
      compressing...
      compressing...
      compressing...
   Saving experiments dictionary

   Saving combined sampling results and errors
Archiving scripts
   dakota_poly.py
   dist_tools.py
   pcm_param.py
   pcm_param.f90
   pcm_param.h5
..done
