# Percentile Analysis

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm

In [2]:
# Read the pre-processed forecast table; the first CSV column becomes the index.
df_dataset = pd.read_csv('../data/processed/processed_daysahead3_R000.csv', index_col=0)

# CSV stores timestamps as text: convert every column whose name contains
# 'time' to a proper datetime column.
time_columns = [name for name in df_dataset.columns if 'time' in name]
for name in time_columns:
    df_dataset[name] = pd.to_datetime(df_dataset[name])

df_dataset.head()

Unnamed: 0,current_time,forward_time0,vp_pred0,vp_obs0,sigma0,crps0,forward_time1,vp_pred1,vp_obs1,sigma1,...,forward_time10,vp_pred10,vp_obs10,sigma10,crps10,forward_time11,vp_pred11,vp_obs11,sigma11,crps11
0,2010-01-08 12:00:00,2010-01-08 18:00:00,320.569697,283.535,66.959643,23.617758,2010-01-09 00:00:00,312.584848,289.951667,89.107926,...,2010-01-11 06:00:00,410.35,353.42,77.615267,34.088606,2010-01-11 12:00:00,524.1,459.845,107.379439,39.991405
1,2010-01-08 18:00:00,2010-01-09 00:00:00,312.584848,289.951667,76.443002,20.518383,2010-01-09 06:00:00,304.6,294.783333,90.828665,...,2010-01-11 12:00:00,524.1,459.845,97.93721,39.127235,2010-01-11 18:00:00,338.315385,473.376667,90.759446,89.321174
2,2010-01-09 00:00:00,2010-01-09 06:00:00,304.6,294.783333,71.339896,17.209824,2010-01-09 12:00:00,318.1,291.255,81.895182,...,2010-01-11 18:00:00,338.315385,473.376667,89.352175,89.750479,2010-01-12 00:00:00,340.570588,478.65,92.195806,91.496123
3,2010-01-09 06:00:00,2010-01-09 12:00:00,318.1,291.255,66.850183,19.86633,2010-01-09 18:00:00,305.4,278.983333,83.825415,...,2010-01-12 00:00:00,340.570588,478.65,92.907464,91.281199,2010-01-12 06:00:00,509.3,469.765,99.331979,29.409278
4,2010-01-09 12:00:00,2010-01-09 18:00:00,305.4,278.983333,72.41926,20.726205,2010-01-10 00:00:00,292.7,267.603333,90.733372,...,2010-01-12 06:00:00,509.3,469.765,102.177195,29.905951,2010-01-12 12:00:00,482.45,452.088,107.247701,28.46965


In [5]:
# Nominal coverage levels, in percent. Each one is turned into the central
# (equal-tailed) interval of a normal distribution below, so "50" means the
# interval that should contain the observation 50% of the time.
percentiles = [50, 75, 90, 95]
# Forecast-step index -> display label. The index selects the
# vp_pred{idx} / vp_obs{idx} / sigma{idx} columns of df_dataset.
forward_cols = {0: '+6 Hours', 3: '+1 Day', 7: '+2 Days', 11: '+3 Days'}
# records[(label, level)] -> list of bool, one entry per row of df_dataset:
# True when the observation fell strictly inside the predicted interval.
records = {}

for idx, colname in forward_cols.items():
    print(colname)

    # Pull the three columns once per lead time; they do not depend on the level.
    vp_pred = df_dataset[f'vp_pred{idx}'].to_numpy()
    vp_obs = df_dataset[f'vp_obs{idx}'].to_numpy()
    sigma = df_dataset[f'sigma{idx}'].to_numpy()

    for percentile in percentiles:
        # norm.interval broadcasts over array-valued loc/scale, so a single
        # call yields the bounds for every row (no iterrows, no per-row
        # frozen distribution object).
        left, right = norm.interval(percentile / 100, loc=vp_pred, scale=sigma)

        # Strict inequalities; any NaN in obs/bounds compares False, i.e. the
        # row counts as "not covered".
        records[colname, percentile] = ((vp_obs > left) & (vp_obs < right)).tolist()

+6 Hours
+1 Day
+2 Days
+3 Days


In [10]:
# Column layout of the summary table: nominal level first, then one column
# per lead-time label, in the order they appear in forward_cols.
df_cols = ['percentile', *forward_cols.values()]

# One row per nominal level: [level, empirical coverage in % per lead time].
# np.mean over a list of bools gives the fraction of True entries.
df_rows = [
    [percentile] + [100 * np.mean(records[colname, percentile])
                    for colname in forward_cols.values()]
    for percentile in percentiles
]

# Fixed one-decimal formatting. The previous '{:.3}' spec meant "3 significant
# digits", which renders 100.0 as '1e+02' and gives values below 10 an extra
# decimal. NOTE: this is a global pandas option, so every float displayed
# after this cell also gets the '%' suffix.
pd.options.display.float_format = '{:.1f}%'.format

df_output = pd.DataFrame(df_rows, columns=df_cols)
df_output

Unnamed: 0,percentile,+6 Hours,+1 Day,+2 Days,+3 Days
0,50,49.3%,53.2%,53.4%,53.1%
1,75,73.0%,75.1%,74.8%,74.4%
2,90,88.6%,88.9%,88.1%,87.3%
3,95,94.1%,93.8%,92.9%,92.4%
