Right now I just want to read the (currently) preprocessed data and start setting up a workflow that calculates the statistics for each event at one station. Later, we can add more stations to the framework and incorporate the rest of the data.

In [1]:
## import libraries
import os, sys
import yaml
import xarray as xr
import pandas as pd
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units

%matplotlib inline

sys.path.append('../modules')
import ar_funcs

In [2]:
# project data -- read only
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'

# output files (numerical results, intermediate datafiles) -- read & write
path_to_out = '../out/'

# figures
path_to_figs = '../figs/'

In [3]:
## read .yaml file with station information
yaml_doc = '../data/ASOS_station_info.yaml'
## open inside a context manager so the file handle is closed once parsing is done
with open(yaml_doc) as yaml_file:
    config = yaml.load(yaml_file, Loader=yaml.SafeLoader)

## build one empty dataframe per station, in the order the stations appear in config
df_lst = []
for i, stationID in enumerate(config):
    print(i, stationID)
    df = ar_funcs.build_empty_df(stationID)
    df_lst.append(df)

0 COOPAHNA2
1 COOPHCSA2
2 COOPHOOA2
3 COOPPECA2
4 CRGA2
5 HONA2
6 JNEA2
7 KTNA2
8 PAGS
9 PAGY
10 PAHN
11 PAJK
12 PAJN
13 PAKT
14 PAKW
15 PAPG
16 PASI
17 PAWG
18 PAYA


In [15]:
## read AR duration file and index it by track ID (the trackID column is kept as well)
duration_df = pd.read_csv('../out/AR_track_duration_SEAK.csv')
duration_df['start_date'] = duration_df['trackID'].map(ar_funcs.get_new_start)
duration_df['end_date'] = pd.to_datetime(duration_df['end_date'])
duration_df = duration_df.set_index('trackID', drop=False)

## track IDs excluded from processing
## NOTE(review): error_desc presumably gives the reason for each ID, in the same order -- confirm
ARID_issues = [200411121210, 200411191202, 200610151213, 200610201812, 201205201201, 201209010004]
error_desc = ['IVT nan', 'IVT nan' ,'prec wrong dates', 'prec wrong dates', 'prec time unsorted', 'freeze level not same datetime as ivt']

## drop the excluded tracks
keep = ~duration_df['trackID'].isin(ARID_issues)
duration_df = duration_df.loc[keep]

## remaining track IDs to process, and how many there are
ARID_lst = duration_df.index.to_numpy()
len(ARID_lst)

1494

In [16]:
%%time
## enumerate through ARIDs
## Each dataset is closed in a `finally` clause so the file handle is released
## even when preprocessing raises for a problematic track.
for ARID in ARID_lst:
    ARID = int(ARID)

    ## open IVT file
    ds = ar_funcs.read_GEFSv12_reforecast_data('ivt', ARID)
    try:
        ## get IVT information
        df_lst = ar_funcs.preprocess_IVT_info(config, ds, ARID, df_lst)
    finally:
        ## close IVT file
        ds.close()

    ## open freezing level file
    ds = ar_funcs.read_GEFSv12_reforecast_data('freezing_level', ARID)
    try:
        ## get freezing level info
        ## NOTE(review): the result is bound to `ds_lst`, not `df_lst`, unlike the
        ## other two steps, and `ds_lst` is not used afterwards. Presumably the
        ## helper updates the frames in df_lst in place -- confirm, and rebind to
        ## df_lst if it returns the updated list.
        ds_lst = ar_funcs.preprocess_freezing_level(config, ds, ARID, df_lst)
    finally:
        ## close freezing level file
        ds.close()

    ## open precipitation file
    ds = ar_funcs.read_GEFSv12_reforecast_data('prec', ARID)
    try:
        ## get precipitation information
        df_lst = ar_funcs.preprocess_prec_GEFS(config, ds, ARID, df_lst)
    finally:
        ## close precipitation file
        ds.close()

CPU times: user 11min, sys: 8min 17s, total: 19min 18s
Wall time: 32min 15s


In [14]:
## inspect the current value of ARID (the track ID most recently assigned in the ARID loop)
ARID

201205201201

In [17]:
## read the AR impact table and index it by the parsed 'Impact dates' column
fname = '/data/projects/Comet/cwp140/downloads/ar_impact_info.csv'
impact_df = pd.read_csv(fname)
impact_df = impact_df.set_index(pd.to_datetime(impact_df['Impact dates']))

## keep only impacts dated 2000-01-01 through 2019-08-31 (both bounds inclusive)
start_date = '2000-01-01'
end_date = '2019-08-31'
idx = (impact_df.index >= start_date) & (impact_df.index <= end_date)
## take an explicit copy so the column assignment below writes to an independent
## frame and does not trigger pandas' SettingWithCopyWarning
impact_df = impact_df.loc[idx].copy()

## fix names of stations to match ASOS/COOP station ID
## (rows with a missing Location are assigned to PAJN)
impact_df['Location'] = impact_df['Location'].replace({'JNNA2': 'PAJN', 'PECA2': 'COOPPECA2', 
                        'HCSA2': 'COOPHCSA2', 'AHNA2': 'COOPAHNA2',
                        'Thorne Bay': 'PAKW', 'Thorne Bay/PAKW': 'PAKW',
                        'Staney/PAKW': 'PAKW', 'HOOA2': 'COOPHOOA2',
                        np.nan: 'PAJN'})

## index label of the last row listed for station PAKT
impact_date = impact_df.loc[(impact_df['Location'] == 'PAKT')].index[-1]

In [18]:
## inspect the dataframe of the first station in config
df_lst[0]

Unnamed: 0_level_0,Unnamed: 0,trackID,start_date,end_date,duration,IVT_max,IVT_max_time,IVT_dir,tIVT,freezing_level,...,ARI_1hr,ARI_3hr,ARI_6hr,ARI_12hr,ARI_24hr,impact_scale,impacts,impact_notes,impact_type,misc
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.000011e+11,0,2.000011e+11,2000-01-06 00:00:00,2000-01-06 18:00:00,18.0,88.773054,2000-01-06T00:00:00.000000000,3.795190,1.006065e+07,,...,,,,,,,0,,,
2.000011e+11,1,2.000011e+11,2000-01-05 00:00:00,2000-01-08 06:00:00,6.0,88.773054,2000-01-06T00:00:00.000000000,3.795190,2.102813e+07,,...,,,,,,,0,,,
2.000012e+11,2,2.000012e+11,2000-01-19 06:00:00,2000-01-19 12:00:00,6.0,29.305255,2000-01-19T06:00:00.000000000,5.057773,6.870235e+06,,...,,,,,,,0,,,
2.000012e+11,3,2.000012e+11,2000-01-24 06:00:00,2000-01-31 00:00:00,90.0,237.418411,2000-01-27T21:00:00.000000000,3.380810,6.413225e+07,,...,,,,,,,0,,,
2.000013e+11,4,2.000013e+11,2000-01-28 12:00:00,2000-01-29 00:00:00,6.0,79.688997,2000-01-29T00:00:00.000000000,2.963235,4.310870e+07,,...,,,,,,,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2.019082e+11,1495,2.019082e+11,2019-08-16 00:00:00,2019-08-21 12:00:00,132.0,274.384125,2019-08-17T03:00:00.000000000,5.342937,5.708361e+07,3076.810059,...,,,,,,,0,,,
2.019082e+11,1496,2.019082e+11,2019-08-21 12:00:00,2019-08-22 06:00:00,18.0,39.802027,2019-08-22T00:00:00.000000000,4.531237,4.672867e+07,1922.530762,...,,,,,,,0,,,
2.019082e+11,1497,2.019082e+11,2019-08-22 12:00:00,2019-08-23 00:00:00,6.0,161.074144,2019-08-23T00:00:00.000000000,2.982816,3.810578e+07,1861.273682,...,,,,,,,0,,,
2.019082e+11,1498,2.019082e+11,2019-08-23 12:00:00,2019-08-24 06:00:00,18.0,79.984345,2019-08-23T12:00:00.000000000,3.184267,2.195518e+07,1779.009399,...,,,,,,,0,,,


In [19]:
from pandas.tseries.offsets import MonthEnd
def add_impact_info(stationIDX, stationID, impact_table=None, station_frames=None):
    """Flag the AR events of one station that coincide with a recorded impact.

    An impact is matched to an AR event when the impact date falls between
    24 hours before the event's ``start_date`` and 24 hours after its
    ``end_date``. Only events lying entirely within the impact's calendar
    month, padded by 15 days on each side, are considered. When several
    impacts match the same event, the last one processed wins.

    Parameters
    ----------
    stationIDX : int
        Position of the station's dataframe in ``station_frames``.
    stationID : str
        Station identifier compared against the ``'Location'`` column of the
        impact table.
    impact_table : pandas.DataFrame, optional
        Impact records indexed by impact date, with the columns ``'Location'``,
        ``'Impact Level'``, ``'Impact'``, ``'Impact Information'`` and
        ``'Notes'``. Defaults to the notebook-level ``impact_df``.
    station_frames : list of pandas.DataFrame, optional
        Per-station AR dataframes with ``'start_date'`` and ``'end_date'``
        columns. Defaults to the notebook-level ``df_lst``.

    Returns
    -------
    subset_df : pandas.DataFrame
        The station dataframe itself (modified in place, not a copy). The
        ``'impact_type'`` and ``'misc'`` columns are reset on every call, and
        ``'impact_scale'``, ``'impacts'``, ``'impact_type'``, ``'impact_notes'``
        and ``'misc'`` are filled in for matched events.
    ar_not_found : set
        Impact dates of this station that matched no AR event.
    """
    if impact_table is None:
        impact_table = impact_df
    if station_frames is None:
        station_frames = df_lst

    station_impacts = impact_table.loc[impact_table['Location'] == stationID]

    ## current station df (the same object as in the list, so edits are in place)
    subset_df = station_frames[stationIDX]
    ## text columns are created with object dtype so that writing strings into
    ## them later does not have to upcast an all-NaN float column
    n_rows = len(subset_df)
    subset_df['impact_type'] = np.full(n_rows, np.nan, dtype=object)  # type of impact
    subset_df['misc'] = np.full(n_rows, np.nan, dtype=object)  # copies other notes over
    if 'impact_notes' in subset_df.columns:
        subset_df['impact_notes'] = subset_df['impact_notes'].astype(object)

    month_pad = pd.DateOffset(days=15)
    event_pad = pd.DateOffset(hours=24)

    ar_impact = []
    for idx_impact, row_impact in station_impacts.iterrows():
        ## subset to year/month of current row in impact dataframe +- 15 days
        month_start = pd.Timestamp(year=idx_impact.year, month=idx_impact.month, day=1)
        start = month_start - month_pad
        end = month_start + MonthEnd(0) + month_pad
        in_window = (subset_df['start_date'] >= start) & (subset_df['end_date'] <= end)

        for index, row in subset_df.loc[in_window].iterrows():
            earliest = row['start_date'] - event_pad
            latest = row['end_date'] + event_pad

            if earliest <= idx_impact <= latest:
                ar_impact.append(idx_impact)
                subset_df.loc[index, 'impact_scale'] = row_impact['Impact Level']
                subset_df.loc[index, 'impacts'] = 1
                subset_df.loc[index, 'impact_type'] = row_impact['Impact']
                subset_df.loc[index, 'impact_notes'] = row_impact['Impact Information']
                subset_df.loc[index, 'misc'] = row_impact['Notes']

    ## get the impact dates not found in AR database
    ## (every matched date comes from station_impacts.index, so a plain set
    ## difference is sufficient)
    ar_not_found = set(station_impacts.index) - set(ar_impact)

    return subset_df, ar_not_found

In [23]:
## attach impact info to every station dataframe and save each result as a .csv
subset_df_lst, ar_impact_lst = [], []
for i, stationID in enumerate(config):
    print(i, stationID)
    subset_df, unmatched_dates = add_impact_info(i, stationID)

    ## keep both the updated frame and the impact dates that matched no AR
    subset_df_lst.append(subset_df)
    ar_impact_lst.append(unmatched_dates)

    csv_name = 'combined_df_{0}.csv'.format(stationID)
    subset_df.to_csv(path_to_out + csv_name)

0 COOPAHNA2
1 COOPHCSA2
2 COOPHOOA2
3 COOPPECA2
4 CRGA2
5 HONA2
6 JNEA2
7 KTNA2
8 PAGS
9 PAGY
10 PAHN
11 PAJK
12 PAJN
13 PAKT
14 PAKW
15 PAPG
16 PASI
17 PAWG
18 PAYA


In [None]:
for i, stationID in enumerate(config):
    pass  # TODO: per-station step not written yet; placeholder keeps the cell runnable

In [21]:
## print each station's set of impact dates that matched no AR, one set per line
for unmatched_dates in ar_impact_lst:
    print(unmatched_dates)

{Timestamp('2018-09-12 00:00:00')}
set()
set()
set()
set()
set()
set()
set()
set()
{Timestamp('2007-07-16 00:00:00')}
set()
set()
{Timestamp('2000-06-28 00:00:00'), Timestamp('2015-08-21 00:00:00'), Timestamp('2013-07-05 00:00:00')}
set()
set()
{Timestamp('2009-06-12 00:00:00')}
set()
{Timestamp('2009-06-12 00:00:00')}
{Timestamp('2000-07-13 00:00:00')}


In [22]:
## list the impact records whose Location is COOPAHNA2
impact_df.loc[(impact_df['Location'] == 'COOPAHNA2')]

Unnamed: 0_level_0,Last Date,Location,Total Hours,Total IVT,Max IVT,Total Precip,Avg. Direction,Return Period,1h ARI,3h ARI,...,1d ARI,Impact Level,Impact dates,Impact,Impact Information,Location.1,Impact Source,Notes,Wick Precip,Wick Return
Impact dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-03,,COOPAHNA2,,,,,,,,,...,,0.0,07/03/2019,Action stage,Chilkat at 7th highest crest,AHNA2,AHPS,,,
2018-09-12,,COOPAHNA2,,,,,,,,,...,,0.0,09/12/2018,Action stage,Chilkat at 4th highest crest,AHNA2,AHPS,,,
2017-09-09,20170907_00Z,COOPAHNA2,24.0,1093.77,649.3,1.48,172.89,,,,...,,2.0,09/09/2017,Moderate flood stage; action flood stage,Taiya at its 10th highest crest; Chilkat at it...,AHNA2,"AHPS, HYDRO, NWS",various ar events in a row,,
2015-01-21,20150122_00Z,COOPAHNA2,36.0,983.83,381.13,4.9,193.23,,,,...,,2.0,01/21/2015,Mudslides,"Mudslides at haines, rockslide in dyea",AHNA2,HYDRO,The 21st and 22nd had the 17th and 14th highes...,,
2014-07-06,,COOPAHNA2,,,,,,,,,...,,2.0,07/06/2014,Action stage; flooding; moderate flood stage,Chilkat river at 5th highest crest; Skagway at...,AHNA2,NWS,,,
2013-06-28,,COOPAHNA2,,,,,,,,,...,,2.0,06/28/2013,Action stage; moderate flood stage,Chilkat at 6th highest crest; Taiyu at 27th hi...,AHNA2,AHPS,,,
2012-06-26,,COOPAHNA2,,,,,,,,,...,,0.0,06/26/2012,Action stage,Chilkat at 2nd highest crest,AHNA2,AHPS,,,
2005-11-24,20051119_00Z,COOPAHNA2,60.0,1901.64,586.62,1.67,222.064,,,,...,,5.0,11/24/2005,Flooding; moderate flood stage,"Disaster declerations in Juneau, Haines, Sitka...",AHNA2,"WFO, NWS, HYDRO",Widespread event; 3 consecutive ar events; 23r...,,
2005-11-24,20051124_12Z,COOPAHNA2,48.0,1940.97,607.99,5.23,189.65,,,,...,,5.0,11/24/2005,Flooding; moderate flood stage,"Disaster declerations in Juneau, Haines, Sitka...",AHNA2,"WFO, NWS, HYDRO",Widespread event; 3 consecutive ar events; 23r...,,"2.06 on the 22nd, 2.47 for AHNA"


In [19]:
## look up the duration record for the track ID currently held in ARID
## (nothing is written to disk here)
duration_df.loc[ARID]

Unnamed: 0                    526
trackID            200610151213.0
start_date    2006-10-17 18:00:00
end_date      2006-10-22 18:00:00
duration                    120.0
Name: 200610151213.0, dtype: object