When running the LTOP workflow there is a circumstance where ties can occur in the paramater selection process. In these instances, version 0.1.0 is just selecting the first item in the df. However, for reproducability and to improve the param set pick, a more codified score/rank approach is required. This notebook is for working out the logic of that process before it is integrated into the existing workflow. This was developed in the ltop_py env on Islay. 

In [None]:
import pandas as pd 
import os
import sys
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np 
%matplotlib inline 

In [4]:
#read in the intermediate param selection outputs - this path can be changed to a local version
fn = r'/vol/v1/proj/LTOP_FTV_Py/param_selection_testing_outputs/intermediate_testing_output.csv'
df = pd.read_csv(fn)
df.columns

Index(['cluster_id', 'fitted', 'index', 'orig', 'param_num', 'params', 'rmse',
       'vert', 'year', '.geo', 'paramNum', 'vert90', 'vert91', 'vert92',
       'vert93', 'vert94', 'vert95', 'vert96', 'vert97', 'vert98', 'vert99',
       'vert00', 'vert01', 'vert02', 'vert03', 'vert04', 'vert05', 'vert06',
       'vert07', 'vert08', 'vert09', 'vert10', 'vert11', 'vert12', 'vert13',
       'vert14', 'vert15', 'vert16', 'vert17', 'vert18', 'vert19', 'vert20',
       'vert21', 'NRMSE', 'AIC', 'AICc', 'index_cid', 'len_vert', 'len_year',
       'len_fitted', 'len_orig', 'max_rmse', 'rmse_num', 'n_segs', 'vertscore',
       'rankVscore', 'rankAICc', 'combined', 'selected', 'spikeThreshold',
       'maxSegments', 'recoveryThreshold', 'pvalThreshold'],
      dtype='object')

In [9]:
####set some user args#### 
#select a cluster id, not sure if this is what we want to do or if we want to iterate? 
select_cluster = 37
clust_df = df.loc[df.cluster_id == select_cluster]
clust_df.columns
#select the param you want to plot 


Index(['cluster_id', 'fitted', 'index', 'orig', 'param_num', 'params', 'rmse',
       'vert', 'year', '.geo', 'paramNum', 'vert90', 'vert91', 'vert92',
       'vert93', 'vert94', 'vert95', 'vert96', 'vert97', 'vert98', 'vert99',
       'vert00', 'vert01', 'vert02', 'vert03', 'vert04', 'vert05', 'vert06',
       'vert07', 'vert08', 'vert09', 'vert10', 'vert11', 'vert12', 'vert13',
       'vert14', 'vert15', 'vert16', 'vert17', 'vert18', 'vert19', 'vert20',
       'vert21', 'NRMSE', 'AIC', 'AICc', 'index_cid', 'len_vert', 'len_year',
       'len_fitted', 'len_orig', 'max_rmse', 'rmse_num', 'n_segs', 'vertscore',
       'rankVscore', 'rankAICc', 'combined', 'selected', 'spikeThreshold',
       'maxSegments', 'recoveryThreshold', 'pvalThreshold'],
      dtype='object')

In [10]:
def get_max_mean(df1,col_name): 
    '''
    Calculate the mean of the combined rankVscore and rankAICcscore, considering the weighting factors, 
    for each possible value for a given param. Then take the max mean value. 
    '''
    #this assumes you've already subset by cluster_id as is the case in the param selection code 
    #get the mean by possible param values 
    df1 = pd.DataFrame(df1.groupby([col_name])['combined'].mean()).reset_index()
    #get the max mean value for the given param 
    df2 = df1.loc[df1['combined'] == df1['combined'].max()]
    return df2

In [12]:
#do the subsetting 
#first get the max mean value for each param - this will be a one line dataframe for the cluster id in question
#do recoveryThreshold first
#TODO do we want to calculate these means on the whole cluster_id? or just on the tied ones? 
rec_select = get_max_mean(clust_df,'recoveryThreshold')
#next do spikeThreshold
spike_select = get_max_mean(clust_df,'spikeThreshold')
#then maxSegments
max_select = get_max_mean(clust_df,'maxSegments')
#then pvalThreshold 
pval_select = get_max_mean(clust_df,'pvalThreshold')

#then do the sequential subsetting, starting with the full dataframe - in the actual code this is a subset of the df for the cluster
#TODO the clust_df here is not actually correct, that will be the ties not the full df for the cluster_id
df1 = clust_df.loc[clust_df.recoveryThreshold == rec_select.recoveryThreshold.iloc[0]]
df2 = df1.loc[df1.spikeThreshold == df1.spikeThreshold.iloc[0]]
df3 = df2.loc[df2.maxSegments == df2.maxSegments.iloc[0]]
df4 = df3.loc[df3.pvalThreshold == df3.pvalThreshold.iloc[0]]

#this will be the output, which ideally has only one row left? 
df4.shape


(5, 63)