# Generating a dataframe with key statistics from the network-tree comparison
Pascal Lesage, Dec. 13, 2016

In [15]:
import os
import numpy as np
import pickle
import pandas as pd
import sys
from collections import defaultdict

#### Specify location of output of comparison  
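The comparison output is expected in a `raw` subfolder of this location and should contain:
 - a set of `_metadata_<activity>.pickle` files with some data on the trees  
 - a set of `<activity>.pickle` files with arrays containing the results of the Monte Carlo simulations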

In [None]:
# Root folder of the network-tree comparison output. The cells below read the
# per-activity pickles from its 'raw' subfolder and write the summary
# dataframe directly into this folder.
network_tree_comparison_fp = r"E:\network_tree_comparison"

#### Organize filepaths in a dictionary for easy access

In [16]:
# Folder holding the raw comparison output: one data pickle and one
# '_metadata_' pickle per activity.
raw_dir = os.path.join(network_tree_comparison_fp, 'raw')
all_comparison_files = [os.path.join(raw_dir, file) for file in os.listdir(raw_dir)]

# Keys of activities: names of the data files without their '.pickle' extension.
# The checks are made on the file name only (not the full path), so that a
# 'metadata' substring in the root folder name cannot filter everything out,
# and files that are not pickles cannot produce truncated, bogus keys.
pickle_ext = '.pickle'
acts = []
for comparison in all_comparison_files:
    file_name = os.path.basename(comparison)
    if file_name.endswith(pickle_ext) and 'metadata' not in file_name:
        acts.append(file_name[:-len(pickle_ext)])

# Map each activity key to the paths of its 'metadata' and 'data' pickles.
# NOTE: defaultdict() without a default_factory behaves like a plain dict
# (missing keys still raise KeyError); the type is kept unchanged so that any
# code relying on the object type keeps working.
comparison_fp_dict = defaultdict()
for act in acts:
    comparison_fp_dict[act] = {
        'metadata': os.path.join(raw_dir, '_metadata_{}.pickle'.format(act)),
        'data': os.path.join(raw_dir, '{}.pickle'.format(act)),
    }

#### Generate lists that will be used as pandas dataframe indices

In [17]:
# Use the data file of the first activity as a template for the index levels.
# It is loaded once, inside a context manager, so the file handle is closed
# and the pickle is not deserialised twice.
# NOTE(review): presumably every activity file shares the same methods and
# product systems as this first one - confirm.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# the comparison itself.
with open(comparison_fp_dict[acts[0]]['data'], 'rb') as template_file:
    template_data = pickle.load(template_file)

# Top-level keys of the data are the methods; string versions are used as
# dataframe labels.
method_list = [*template_data.keys()]
method_as_string_list = [str(m) for m in method_list]
# Second-level keys (under any method) are the product systems.
product_system_list = [*template_data[method_list[0]].keys()]
# Summary statistics computed for every (method, product system) sample.
metrics = ['GSD2',
           'median',
           'perc 1%',
           'perc 2.5%',
           'perc 5%',
           'perc -1SD',
           'perc +1SD',
           'perc 95%',
           'perc 97.5%',
           'perc 99%',
           'abs spread 99%',
           'abs spread 95%',
           'variance'
           ]

#### Generate and populate dataframe

In [21]:
# Columns: one per (metric, method, model) combination; rows: one per activity.
c_multiindex = pd.MultiIndex.from_product(
    (metrics, method_as_string_list, product_system_list),
    names=['metric', 'method', 'model'],
)
comparison_analysis_df = pd.DataFrame(index=acts, columns=c_multiindex)

# Metric label -> percentile rank stored directly in the dataframe.
# NOTE(review): 'perc -1SD' uses 15.865 while 'perc +1SD' uses 84.1345, which
# are not exactly symmetric about 50 (100 - 84.1345 = 15.8655). The ranks are
# left unchanged so results stay identical - confirm which was intended.
percentile_metrics = (
    ('perc 1%', 1),
    ('perc 2.5%', 2.5),
    ('perc 5%', 5),
    ('perc -1SD', 15.865),
    ('perc +1SD', 84.1345),
    ('perc 95%', 95),
    ('perc 97.5%', 97.5),
    ('perc 99%', 99),
)
# Metric label -> (lower rank, upper rank); the stored value is upper - lower.
spread_metrics = (
    ('abs spread 99%', 0.5, 99.5),
    ('abs spread 95%', 2.5, 97.5),
)
# Every distinct rank needed, so numpy is called once per sample instead of
# once per metric.
all_ranks = sorted(
    {rank for _, rank in percentile_metrics}
    | {rank for _, low, high in spread_metrics for rank in (low, high)}
)

# Populate results df
for i, act in enumerate(acts):
    sys.stdout.write("\rworking on {} of {}".format(i+1, len(acts)))
    sys.stdout.flush()
    # Context manager closes each data file instead of leaking one handle
    # per activity.
    with open(comparison_fp_dict[act]['data'], 'rb') as data_file:
        act_data = pickle.load(data_file)
    for method in method_list:
        method_label = str(method)
        for product_system in product_system_list:
            sample = act_data[method][product_system]
            ln_sample = np.log(sample)
            # GSD2: square of exp(standard deviation of the log-transformed sample)
            comparison_analysis_df.loc[act, ('GSD2', method_label, product_system)] = np.exp(np.std(ln_sample))**2
            # Variance
            comparison_analysis_df.loc[act, ('variance', method_label, product_system)] = np.var(sample)
            # Median
            comparison_analysis_df.loc[act, ('median', method_label, product_system)] = np.median(sample)
            # Percentiles and confidence intervals, from a single percentile call
            percentile_of = dict(zip(all_ranks, np.percentile(sample, all_ranks)))
            for metric, rank in percentile_metrics:
                comparison_analysis_df.loc[act, (metric, method_label, product_system)] = percentile_of[rank]
            for metric, low, high in spread_metrics:
                comparison_analysis_df.loc[act, (metric, method_label, product_system)] = percentile_of[high] - percentile_of[low]
# Sort the column MultiIndex (lexicographically by metric, method, model).
comparison_analysis_df = comparison_analysis_df.sort_index(axis=1)

working on 4087 of 4087

#### Write dataframe to pickle  
Note: pickle was chosen because it preserves the column MultiIndex.

In [23]:
# Save the summary table in the comparison root folder (alongside 'raw').
summary_statistics_fp = os.path.join(
    network_tree_comparison_fp,
    'summary_statistics_dataframe.pickle',
)
comparison_analysis_df.to_pickle(summary_statistics_fp)