# Percentile Analysis

In [4]:
import sys

import numpy as np
import pandas as pd
from scipy.stats import norm

sys.path.append('..')
from constants import MIN_DAYSAHEAD, MAX_DAYSAHEAD, BIN_FREQ_PER_DAY

In [2]:
# Read one processed CSV per days-ahead value, keyed by that integer value.
dfs = {
    daysahead: pd.read_csv(f'../data/processed/processed_daysahead{daysahead}_R000.csv')
    for daysahead in range(MIN_DAYSAHEAD, MAX_DAYSAHEAD + 1)
}

# Preview the first rows of the table stored under key 1.
dfs[1].head()

Unnamed: 0,current_time,forward_time0,Vp_pred0,Vp_obs0,sigma0,crps0,forward_time1,Vp_pred1,Vp_obs1,sigma1,...,forward_time2,Vp_pred2,Vp_obs2,sigma2,crps2,forward_time3,Vp_pred3,Vp_obs3,sigma3,crps3
0,2010-01-06 18:00:00,2010-01-07 00:00:00,313.1,307.61,64.345831,15.224052,2010-01-07 06:00:00,306.15,293.158333,84.738204,...,2010-01-07 12:00:00,299.2,282.966667,104.161217,25.349213,2010-01-07 18:00:00,310.615385,283.506667,107.132278,27.758344
1,2010-01-07 00:00:00,2010-01-07 06:00:00,306.15,293.158333,67.734154,16.8202,2010-01-07 12:00:00,299.2,282.966667,92.508446,...,2010-01-07 18:00:00,310.615385,283.506667,104.65342,27.242816,2010-01-08 00:00:00,311.9,303.408333,107.500048,25.389683
2,2010-01-07 06:00:00,2010-01-07 12:00:00,299.2,282.966667,58.765539,15.51089,2010-01-07 18:00:00,310.615385,283.506667,82.503682,...,2010-01-08 00:00:00,311.9,303.408333,103.549962,24.476759,2010-01-08 06:00:00,303.8,301.7675,107.472441,25.131104
3,2010-01-07 12:00:00,2010-01-07 18:00:00,310.615385,283.506667,59.2516,18.710244,2010-01-08 00:00:00,311.9,303.408333,83.865706,...,2010-01-08 06:00:00,303.8,301.7675,101.635209,23.767853,2010-01-08 12:00:00,327.0875,286.885,107.960524,31.134194
4,2010-01-07 18:00:00,2010-01-08 00:00:00,311.9,303.408333,57.405524,13.915591,2010-01-08 06:00:00,303.8,301.7675,71.132267,...,2010-01-08 12:00:00,327.0875,286.885,93.399814,28.625937,2010-01-08 18:00:00,303.475,283.535,102.364343,25.466723


In [5]:
# Empirical coverage check.
#
# For every days-ahead table and every nominal percentile, record per row
# whether the observed value lies strictly inside the central interval of a
# normal distribution with mean `Vp_pred` and standard deviation `sigma`.
# `records[(label, percentile)]` is a list of Python bools, one per row.
percentiles = [50, 75, 90, 95]
daysahead_cols = {daysahead: f'{daysahead} Days' for daysahead in dfs.keys()}
records = {}

for idx, colname in daysahead_cols.items():
    print(colname)

    # The column suffix depends only on the table, not on the row or the
    # percentile, so it is computed once per table.
    # NOTE(review): presumably this selects the last forecast bin of the
    # table (zero-based suffixes) -- confirm against the preprocessing step.
    i = BIN_FREQ_PER_DAY * idx - 1

    # Pull the three columns out once as float arrays so the interval test
    # below is vectorised instead of iterating over rows with iterrows().
    Vp_pred = dfs[idx][f'Vp_pred{i}'].to_numpy(dtype=float)
    Vp_obs = dfs[idx][f'Vp_obs{i}'].to_numpy(dtype=float)
    sigma = dfs[idx][f'sigma{i}'].to_numpy(dtype=float)

    for percentile in percentiles:
        # Element-wise endpoints of the central `percentile`% interval.
        left, right = norm.interval(percentile / 100, loc=Vp_pred, scale=sigma)

        # Strict inequalities on both sides; NaN inputs compare as False,
        # i.e. they count as "outside the interval".
        inside = (Vp_obs > left) & (Vp_obs < right)
        records[colname, percentile] = inside.tolist()

1 Days
2 Days
3 Days
4 Days
5 Days
6 Days
7 Days


In [8]:
# Sanity check: show the (column label, percentile) keys present in `records`.
records.keys()

dict_keys([('1 Days', 50), ('1 Days', 75), ('1 Days', 90), ('1 Days', 95), ('2 Days', 50), ('2 Days', 75), ('2 Days', 90), ('2 Days', 95), ('3 Days', 50), ('3 Days', 75), ('3 Days', 90), ('3 Days', 95), ('4 Days', 50), ('4 Days', 75), ('4 Days', 90), ('4 Days', 95), ('5 Days', 50), ('5 Days', 75), ('5 Days', 90), ('5 Days', 95), ('6 Days', 50), ('6 Days', 75), ('6 Days', 90), ('6 Days', 95), ('7 Days', 50), ('7 Days', 75), ('7 Days', 90), ('7 Days', 95)])

In [9]:
# Summarise empirical coverage: one row per nominal percentile and one column
# per days-ahead label. Each cell is the percentage of rows whose observation
# fell inside the corresponding interval (mean of the boolean list times 100).
df_cols = ['percentile', *daysahead_cols.values()]
df_rows = [
    [percentile]
    + [100 * np.mean(records[colname, percentile]) for colname in daysahead_cols.values()]
    for percentile in percentiles
]

# Fixed-point with one decimal. A bare precision such as '{:.3}' means "three
# significant digits" and renders 100.0 as '1e+02', which is unreadable for a
# percentage table.
# NOTE: this is a session-wide pandas display option, so every float shown
# after this cell is also rendered with a trailing '%'.
pd.options.display.float_format = '{:.1f}%'.format

df_output = pd.DataFrame(df_rows, columns=df_cols)
df_output

Unnamed: 0,percentile,1 Days,2 Days,3 Days,4 Days,5 Days,6 Days,7 Days
0,50,52.3%,52.5%,53.1%,54.9%,57.6%,52.4%,52.1%
1,75,74.5%,74.6%,74.4%,76.0%,77.9%,74.3%,73.0%
2,90,88.4%,87.7%,87.3%,87.8%,88.9%,86.4%,85.5%
3,95,93.4%,92.5%,92.4%,92.6%,93.1%,90.9%,90.8%
