# timing_table_draft2017b.ipynb

## Purpose
Look at timing results from project [p17c-marc-comparison](https://github.com/grandey/p17c-marc-comparison) in order to assess computational performance.

## Requirements
- Climate Data Operators (CDO)
- NetCDF Operators (NCO)
- CESM output timing files - see [data_management.org](https://github.com/grandey/p17c-marc-comparison/blob/master/manage_data/data_management.org#syncing-timing-data-from-timing-simulations). These timing files will also be uploaded to figshare.

## Author
Benjamin S. Grandey, 2017

In [1]:
from glob import glob
import numpy as np
import os
import pandas as pd

# Print versions of packages
for module in [np, pd]:
    try:
        print('{}.__version__ = {}'.format(module.__name__, module.__version__))
    except AttributeError:
        pass

numpy.__version__ = 1.13.3
pandas.__version__ = 0.21.0


In [2]:
# List of timing files
timing_dir = os.path.expandvars('$HOME/data/projects/p17c_marc_comparison/output_timing/')
timing_filenames = glob('{}/ccsm_timing.*'.format(timing_dir))

In [3]:
# Read timing data of interest into dataframe
timing_df = pd.DataFrame(columns=['Simulation', 'Model Cost (pe-hrs/myr)',
                                  'Model Throughput (myr/day)'])
for filename in timing_filenames:  # loop over files
    simulation = filename.split('.')[1]
    model_cost, model_throughput = np.nan, np.nan  # in case data not found in file
    with open(filename, 'r') as f:
        for line in f:
            if 'Model Cost:' in line:
                model_cost = float(line.split()[2])
            elif 'Model Throughput:' in line:
                model_throughput = float(line.split()[2])
    timing_df = timing_df.append({'Simulation': simulation, 'Model Cost (pe-hrs/myr)': model_cost,
                                  'Model Throughput (myr/day)': model_throughput},
                                 ignore_index=True)
# Print summary statistics, grouped by simulation
timing_df.groupby('Simulation').describe()

Unnamed: 0_level_0,Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Cost (pe-hrs/myr),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day),Model Throughput (myr/day)
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Simulation,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
p17c_t_mam3_r1,5.0,325.516,1.835873,323.4,323.81,326.21,326.48,327.68,5.0,53.084,0.299132,52.73,52.93,52.97,53.36,53.43
p17c_t_mam3_r2,5.0,362.264,1.71204,361.07,361.1,361.11,363.17,364.87,5.0,47.7,0.223942,47.36,47.58,47.85,47.85,47.86
p17c_t_mam7_r1,5.0,435.616,3.394574,433.59,433.84,434.09,434.94,441.62,5.0,39.67,0.305287,39.13,39.73,39.81,39.83,39.85
p17c_t_mam7_r2,5.0,472.226,2.130899,469.71,471.21,471.3,474.4,474.51,5.0,36.594,0.162573,36.42,36.43,36.66,36.67,36.79
p17c_t_marc_r1,5.0,344.274,1.59544,342.51,342.69,344.78,345.33,346.06,5.0,50.194,0.234585,49.93,50.04,50.12,50.43,50.45
p17c_t_marc_r2,5.0,361.126,1.614088,359.53,359.64,361.24,361.82,363.4,5.0,47.852,0.213471,47.55,47.76,47.84,48.05,48.06


In [4]:
# Calculate means, standard errors, and cost relative to p17c_t_mam3_r1
mean_df = timing_df.groupby('Simulation').mean()
se_df = timing_df.groupby('Simulation').std(ddof=1) / np.sqrt(5)
rel_series = (mean_df['Model Cost (pe-hrs/myr)'] / mean_df['Model Cost (pe-hrs/myr)']['p17c_t_mam3_r1']
              * 100) - 100
rel_series.name = 'Model Cost Relative to p17c_t_mam3_r1 (%)'
mean_se_df = mean_df.join(se_df, how='outer', lsuffix=' Mean', rsuffix=' Standard Error')
mean_se_rel_df = mean_se_df.join(rel_series)
mean_se_rel_df

Unnamed: 0_level_0,Model Cost (pe-hrs/myr) Mean,Model Throughput (myr/day) Mean,Model Cost (pe-hrs/myr) Standard Error,Model Throughput (myr/day) Standard Error,Model Cost Relative to p17c_t_mam3_r1 (%)
Simulation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
p17c_t_mam3_r1,325.516,53.084,0.821027,0.133776,0.0
p17c_t_mam3_r2,362.264,47.7,0.765647,0.10015,11.289153
p17c_t_mam7_r1,435.616,39.67,1.518099,0.136528,33.823222
p17c_t_mam7_r2,472.226,36.594,0.952967,0.072705,45.069981
p17c_t_marc_r1,344.274,50.194,0.713503,0.104909,5.762543
p17c_t_marc_r2,361.126,47.852,0.721842,0.095467,10.939554


In [5]:
# Table showing mean model cost +/- standard error and relative cost
# (I guess this should be doable in one line - if you know how, please do let me know!)
mean_str_series = mean_se_rel_df['Model Cost (pe-hrs/myr) Mean'].map('{:.1f}'.format)
se_str_series = mean_se_rel_df['Model Cost (pe-hrs/myr) Standard Error'].map('{:.1f}'.format)
mean_se_str_series = mean_str_series + r'$\pm$' + se_str_series
mean_se_str_series.name = 'Model cost (processor hours / model year)'
rel_str_series = mean_se_rel_df['Model Cost Relative to p17c_t_mam3_r1 (%)'].map('{:.1f}%'.format)
rel_str_series.name = 'Relative model cost (% above p17c_t_mam3_r1)'
pd.concat([mean_se_str_series, rel_str_series], axis=1)

Unnamed: 0_level_0,Model cost (processor hours / model year),Relative model cost (% above p17c_t_mam3_r1)
Simulation,Unnamed: 1_level_1,Unnamed: 2_level_1
p17c_t_mam3_r1,325.5$\pm$0.8,0.0%
p17c_t_mam3_r2,362.3$\pm$0.8,11.3%
p17c_t_mam7_r1,435.6$\pm$1.5,33.8%
p17c_t_mam7_r2,472.2$\pm$1.0,45.1%
p17c_t_marc_r1,344.3$\pm$0.7,5.8%
p17c_t_marc_r2,361.1$\pm$0.7,10.9%
