For now, the goal is to read the data that has been preprocessed so far and to set up a per-event workflow that calculates the statistics for a single station. Later, additional stations can be added to the framework and the rest of the data incorporated.

In [1]:
## import libraries
import os, sys
import yaml
import xarray as xr
import pandas as pd
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units

%matplotlib inline

sys.path.append('../modules')
import ar_funcs

In [2]:
# Directory layout used by the notebook.
# NOTE(review): path_to_data is an absolute, machine-specific location; the two
# others are relative to the notebook's working directory.
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'      # project data -- read only
path_to_out  = '../out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'      # figures

In [3]:
## read .yaml file with station information
yaml_doc = '../data/ASOS_station_info.yaml'
# Use a context manager so the file handle is closed deterministically
# (a bare open() passed straight to the loader is never closed explicitly).
with open(yaml_doc) as yaml_file:
    config = yaml.safe_load(yaml_file)

## build empty dataframes: one per station, in the iteration order of `config`,
## so position i in df_lst corresponds to the i-th station ID printed below
df_lst = []
for i, stationID in enumerate(config):
    print(i, stationID)
    df = ar_funcs.build_empty_df(stationID)
    df_lst.append(df)

0 COOPAHNA2
1 COOPHCSA2
2 COOPHOOA2
3 COOPPECA2
4 CRGA2
5 HONA2
6 JNEA2
7 KTNA2
8 PAGS
9 PAGY
10 PAHN
11 PAJK
12 PAJN
13 PAKT
14 PAKW
15 PAPG
16 PASI
17 PAWG
18 PAYA


In [4]:
## load the AR track-duration table and parse its two date columns
duration_df = pd.read_csv('../out/AR_track_duration_SEAK.csv')
for date_col in ('start_date', 'end_date'):
    duration_df[date_col] = pd.to_datetime(duration_df[date_col])
## index rows by trackID while keeping trackID available as a regular column
duration_df = duration_df.set_index('trackID', drop=False)

## events excluded from processing; error_desc[k] is the reason for ARID_issues[k]
ARID_issues = [
    200411121210, 200411191202, 200610151213,
    200610201812, 201205201201, 201209010004,
]
error_desc = [
    'IVT nan', 'IVT nan', 'prec wrong dates',
    'prec wrong dates', 'prec time unsorted',
    'freeze level not same datetime as ivt',
]
is_problem_event = duration_df['trackID'].isin(ARID_issues)
df_filtered = duration_df.loc[~is_problem_event]

## IDs of the events that remain after the exclusion
ARID_lst = df_filtered.index.to_numpy()
len(ARID_lst)

1494

In [5]:
%%time
## enumerate through ARIDs
for i, ARID in enumerate(ARID_lst):
    ARID = int(ARID)
    ## open IVT file
    ds = ar_funcs.read_GEFSv12_reforecast_data('ivt', ARID)   
    ## get IVT information
    df_lst = ar_funcs.preprocess_IVT_info(config, ds, ARID, df_lst)
    ## close IVT file
    ds.close()
    
    ## open freezing level file
    ds = ar_funcs.read_GEFSv12_reforecast_data('freezing_level', ARID)
    ## get freezing level info
    ds_lst = ar_funcs.preprocess_freezing_level(config, ds, ARID, df_lst)    
    ## close freezing level file
    ds.close()
    
    ## open precipitation file
    ds = ar_funcs.read_GEFSv12_reforecast_data('prec', ARID)    
    ## get precipitation information
    df_lst = ar_funcs.preprocess_prec_GEFS(config, ds, ARID, df_lst)
    ## close precipitation file
    ds.close()

CPU times: user 7min 37s, sys: 1min 14s, total: 8min 52s
Wall time: 35min 14s


In [23]:
## read the impact table and index it by the parsed impact date
## NOTE(review): absolute, machine-specific path -- consider deriving it from a
## configurable data directory.
fname = '/data/projects/Comet/cwp140/downloads/ar_impact_info.csv'
impact_df = pd.read_csv(fname)
impact_df = impact_df.set_index(pd.to_datetime(impact_df['Impact dates']))

## keep only impacts inside the analysis period (both bounds inclusive)
start_date = '2000-01-01'
end_date = '2019-08-31'
idx = (impact_df.index >= start_date) & (impact_df.index <= end_date)
## explicit copy: the column assignment below then modifies an independent frame
## instead of a filtered selection (avoids SettingWithCopyWarning)
impact_df = impact_df.loc[idx].copy()

## fix names of stations to match ASOS/COOP station ID;
## rows with a missing Location are assigned to PAJN
impact_df['Location'] = impact_df['Location'].replace({'JNNA2': 'PAJN', 'PECA2': 'COOPPECA2',
                        'HCSA2': 'COOPHCSA2', 'AHNA2': 'COOPAHNA2',
                        'Thorne Bay': 'PAKW', 'Thorne Bay/PAKW': 'PAKW',
                        'Staney/PAKW': 'PAKW', 'HOOA2': 'COOPHOOA2',
                        np.nan: 'PAJN'})

## date of the last PAKT row in the filtered table
impact_date = impact_df.loc[(impact_df['Location'] == 'PAKT')].index[-1]

In [49]:
## inspect the table of the first station in df_lst (index 0)
df_lst[0]

Unnamed: 0_level_0,Unnamed: 0,trackID,start_date,end_date,duration,IVT_max,IVT_max_time,IVT_dir,freezing_level,ar_scale,...,ASOS_prec_accum,ASOS_prec_max_rate,ARI_1hr,ARI_3hr,ARI_6hr,ARI_12hr,ARI_24hr,impact_scale,impacts,impact_notes
trackID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.000011e+11,0,2.000011e+11,2000-01-06 00:00:00,2000-01-06 18:00:00,18.0,88.773054,2000-01-06T00:00:00.000000000,3.795190,,,...,,,,,,,,,,
2.000011e+11,1,2.000011e+11,2000-01-08 00:00:00,2000-01-08 06:00:00,6.0,96.206675,2000-01-08T15:00:00.000000000,3.243829,,,...,,,,,,,,,,
2.000012e+11,2,2.000012e+11,2000-01-19 06:00:00,2000-01-19 12:00:00,6.0,57.560952,2000-01-18T15:00:00.000000000,4.975841,,,...,,,,,,,,,,
2.000012e+11,3,2.000012e+11,2000-01-27 06:00:00,2000-01-31 00:00:00,90.0,237.418411,2000-01-27T21:00:00.000000000,3.380810,,,...,,,,,,,,,,
2.000013e+11,4,2.000013e+11,2000-01-28 18:00:00,2000-01-29 00:00:00,6.0,237.418411,2000-01-27T21:00:00.000000000,3.380810,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2.019082e+11,1495,2.019082e+11,2019-08-16 00:00:00,2019-08-21 12:00:00,132.0,274.384125,2019-08-17T03:00:00.000000000,5.342937,3076.810059,0.0,...,,,,,,,,,,
2.019082e+11,1496,2.019082e+11,2019-08-21 12:00:00,2019-08-22 06:00:00,18.0,274.384125,2019-08-17T03:00:00.000000000,5.342937,3076.810059,0.0,...,,,,,,,,,,
2.019082e+11,1497,2.019082e+11,2019-08-22 18:00:00,2019-08-23 00:00:00,6.0,274.384125,2019-08-17T03:00:00.000000000,5.342937,3076.810059,0.0,...,,,,,,,,,,
2.019082e+11,1498,2.019082e+11,2019-08-23 12:00:00,2019-08-24 06:00:00,18.0,274.384125,2019-08-17T03:00:00.000000000,5.342937,3076.810059,0.0,...,,,,,,,,,,


In [132]:
## scratch check: first day of the given year/month minus 15 days
year = 2019
month = 9
pd.to_datetime('{0}-{1}-01'.format(year, month)) - pd.DateOffset(days=15)

Timestamp('2019-08-17 00:00:00')

In [77]:
from pandas.tseries.offsets import MonthEnd
def add_impact_info(stationIDX, stationID, impacts=None, station_dfs=None, buffer_hours=24):
    """Flag the AR events of one station that coincide with a reported impact.

    An impact is attributed to an AR event when the impact date lies between
    ``start_date - buffer_hours`` and ``end_date + buffer_hours`` of that event.
    When several impacts match the same event, the last one in table order wins.

    Parameters
    ----------
    stationIDX : int
        Position of the station's table inside ``station_dfs``.
    stationID : str
        Station identifier compared against the ``'Location'`` column of ``impacts``.
    impacts : pandas.DataFrame, optional
        Impact records indexed by impact date, providing the columns
        ``'Location'``, ``'Impact Level'``, ``'Impact'``, ``'Impact Information'``
        and ``'Notes'``. Defaults to the notebook-level ``impact_df``.
    station_dfs : list of pandas.DataFrame, optional
        Per-station AR tables with ``'start_date'`` and ``'end_date'`` columns.
        Defaults to the notebook-level ``df_lst``.
    buffer_hours : int or float, optional
        Tolerance, in hours, added on both sides of each event (default 24).

    Returns
    -------
    subset_df : pandas.DataFrame
        ``station_dfs[stationIDX]`` itself, modified in place: columns
        ``'impact_type'`` and ``'misc'`` are (re)created, and ``'impact_scale'``,
        ``'impacts'``, ``'impact_type'``, ``'impact_notes'`` and ``'misc'`` are
        filled on matching rows.
    ar_not_found : set
        Impact dates of this station that matched no AR event.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if impacts is None:
        impacts = impact_df
    if station_dfs is None:
        station_dfs = df_lst

    station_impacts = impacts.loc[impacts['Location'] == stationID]
    subset_df = station_dfs[stationIDX]

    # Text columns are created with object dtype so that strings can be stored
    # without relying on the deprecated silent float -> object upcast.
    for text_col in ('impact_type', 'misc'):
        subset_df[text_col] = np.full(len(subset_df), np.nan, dtype=object)
    if 'impact_notes' in subset_df.columns:
        subset_df['impact_notes'] = subset_df['impact_notes'].astype(object)

    # Buffered event windows, computed once for all impacts. Testing every event
    # directly replaces the former month +/- 15 day prefilter, which was redundant
    # with this check and could only exclude events that should have matched.
    buffer = pd.Timedelta(hours=buffer_hours)
    window_start = subset_df['start_date'] - buffer
    window_end = subset_df['end_date'] + buffer

    matched_dates = []
    for impact_time, impact_row in station_impacts.iterrows():
        # Positional boolean mask: unaffected by duplicated index labels.
        hit = ((window_start <= impact_time) & (impact_time <= window_end)).to_numpy()
        if not hit.any():
            continue
        matched_dates.append(impact_time)
        subset_df.loc[hit, 'impact_scale'] = impact_row['Impact Level']
        subset_df.loc[hit, 'impacts'] = 1
        subset_df.loc[hit, 'impact_type'] = impact_row['Impact']
        subset_df.loc[hit, 'impact_notes'] = impact_row['Impact Information']
        subset_df.loc[hit, 'misc'] = impact_row['Notes']

    ## get the impact dates not found in AR database
    # (matched dates are a subset of the station's dates, so a plain difference
    # gives the same result as the former symmetric difference)
    ar_not_found = set(station_impacts.index) - set(matched_dates)

    return subset_df, ar_not_found

In [78]:
## run add_impact_info for every station, in the same order as `config`
subset_df_lst = []  # per-station tables returned by add_impact_info
# NOTE(review): despite its name, ar_impact_lst collects the SECOND return value of
# add_impact_info, i.e. the set of impact dates that were NOT matched to any AR.
ar_impact_lst = []
for i, stationID in enumerate(config):
    print(i, stationID)
    subset_df, ar_impact = add_impact_info(i, stationID)
    subset_df_lst.append(subset_df)
    ar_impact_lst.append(ar_impact)

0 COOPAHNA2
1 COOPHCSA2
2 COOPHOOA2
3 COOPPECA2
4 CRGA2
5 HONA2
6 JNEA2
7 KTNA2
8 PAGS
9 PAGY
10 PAHN
11 PAJK
12 PAJN
13 PAKT
14 PAKW
15 PAPG
16 PASI
17 PAWG
18 PAYA


In [130]:
## compare the PAYA entry of ar_impact_lst (position 18 in the station listing
## printed above) with all PAYA impact dates
# NOTE(review): ar_impact_lst stores the unmatched-date sets returned by
# add_impact_info, so the symmetric difference with the full set of PAYA dates
# yields the complement of that set -- confirm this is the intended quantity.
a = ar_impact_lst[18]
b = impact_df.loc[(impact_df['Location'] == 'PAYA')].index

set(a) ^ set(b)

{Timestamp('2000-07-13 00:00:00'),
 Timestamp('2004-06-29 00:00:00'),
 Timestamp('2015-09-29 00:00:00'),
 Timestamp('2016-07-18 00:00:00')}

In [131]:
## show up to the first 20 impact records whose Location is PAYA
impact_df.loc[(impact_df['Location'] == 'PAYA')][:20]

Unnamed: 0_level_0,Last Date,Location,Total Hours,Total IVT,Max IVT,Total Precip,Avg. Direction,Return Period,1h ARI,3h ARI,...,1d ARI,Impact Level,Impact dates,Impact,Impact Information,Location.1,Impact Source,Notes,Wick Precip,Wick Return
Impact dates,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-03-18,20190318_12Z,PAYA,24.0,1128.22,610.07,2.05,174.2,,1.0,1.0,...,1.0,0.0,03/18/2019,Action flooding,Situk was at its 9th highest overflow,PAYA,AHPS,This is not in the AR database,,
2017-12-09,,PAYA,,,,,,,1.0,,...,1.0,0.0,12/09/2017,"Action stage, flooding","multiple rises on srya2 with minor flooding, w...",PAYA,ARDAT,THis is not in the AR database,,
2016-07-18,,PAYA,,,,,,,,,...,,0.0,07/18/2016,Flood stage,Alsk was at its highest overflow,PAYA,AHPS,This is not in the AR database,,
2015-09-29,,PAYA,,,,,,,1.0,,...,1.0,0.0,09/29/2015,Action flooding,13th highest ovewrflow of situk,PAYA,AHPS,This was not deemed AR event but had entry in ...,,
2014-01-23,20140124_00Z,PAYA,24.0,1199.0,770.0,,196.0,,1.0,,...,1.0,0.0,01/23/2014,"Heavy damage, flood stage, landslides","17k in damage in Yakutat, minor to moderate fl...",PAYA,"NWS, AHPS",This was the 4th highest IVT event in Yakutat,,
2013-09-08,20130908_12Z,PAYA,36.0,1688.0,825.0,,171.0,,1.0,1.0,...,1.0,0.0,09/08/2013,"flood stage, action stage",Situk was at its 8th highest overflow (flood s...,PAYA,AHPS,THe 8th was the highest IVT value of all AR ev...,,
2009-08-01,,PAYA,,,,,,,1.0,1.0,...,1.0,0.0,08/01/2009,Action flooding,Alsek river at 9th highest overflow,PAYA,AHPS,This may be this event: date: 20090727_00Z ...,,
2009-01-18,20090118_00Z,PAYA,24.0,790.0,404.0,,176.5,,1.0,1.0,...,5.0,0.0,01/18/2009,Flood stage,Situk river at 6th highest,PAYA,AHPS,,,
2005-11-23,,PAYA,,,,,,,1.0,1.0,...,1.0,0.0,11/23/2005,Action flooding,Situk river at 10th highest,PAYA,AHPS,This may be in AR database as there is an even...,,
2004-12-23,,PAYA,,,,,,,1.0,1.0,...,1.0,0.0,12/23/2004,Flood stage,Situk river at its 4th highest overflow,PAYA,AHPS,This is not in AR database,,


In [35]:
## last PAKT row of the impact table, shown as a Series
impact_sub = impact_df.loc[(impact_df['Location'] == 'PAKT')].iloc[-1]
impact_sub

Last Date                                                  20000725_00Z
Location                                                           PAKT
Total Hours                                                        36.0
Total IVT                                                       1335.43
Max IVT                                                          581.58
Total Precip                                                       6.01
Avg. Direction                                               178.673333
Return Period                                                         1
1h ARI                                                              1.0
3h ARI                                                              1.0
6h ARI                                                                1
12h ARI                                                             NaN
1d ARI                                                              1.0
Impact Level                                                    

In [29]:
## re-open the IVT and freezing-level files of a single event for inspection
# NOTE(review): ARID is not assigned in this cell; it is whatever value the kernel
# currently holds (the processing loop assigns it). These datasets are not closed here.
ivt = ar_funcs.read_GEFSv12_reforecast_data('ivt', ARID)
freeze = ar_funcs.read_GEFSv12_reforecast_data('freezing_level', ARID)

In [30]:
## time coordinate of the IVT dataset
ivt.time.values

array(['2012-08-25T03:00:00.000000000', '2012-08-25T06:00:00.000000000',
       '2012-08-25T09:00:00.000000000', '2012-08-25T12:00:00.000000000',
       '2012-08-25T15:00:00.000000000', '2012-08-25T18:00:00.000000000',
       '2012-08-25T21:00:00.000000000', '2012-08-26T00:00:00.000000000',
       '2012-08-26T03:00:00.000000000', '2012-08-26T06:00:00.000000000',
       '2012-08-26T09:00:00.000000000', '2012-08-26T12:00:00.000000000',
       '2012-08-26T15:00:00.000000000', '2012-08-26T18:00:00.000000000',
       '2012-08-26T21:00:00.000000000', '2012-08-27T00:00:00.000000000',
       '2012-08-27T03:00:00.000000000', '2012-08-27T06:00:00.000000000',
       '2012-08-27T09:00:00.000000000', '2012-08-27T12:00:00.000000000',
       '2012-08-27T15:00:00.000000000', '2012-08-27T18:00:00.000000000',
       '2012-08-27T21:00:00.000000000', '2012-08-28T00:00:00.000000000',
       '2012-08-28T03:00:00.000000000', '2012-08-28T06:00:00.000000000',
       '2012-08-28T09:00:00.000000000', '2012-08-28

In [31]:
## time coordinate of the freezing-level dataset, for comparison with the IVT time axis
freeze.time.values

array(['2012-08-25T03:00:00.000000000', '2012-08-25T06:00:00.000000000',
       '2012-08-25T09:00:00.000000000', '2012-08-25T12:00:00.000000000',
       '2012-08-25T15:00:00.000000000', '2012-08-25T18:00:00.000000000',
       '2012-08-25T21:00:00.000000000', '2012-08-26T00:00:00.000000000',
       '2012-08-26T03:00:00.000000000', '2012-08-26T06:00:00.000000000',
       '2012-08-26T09:00:00.000000000', '2012-08-26T12:00:00.000000000',
       '2012-08-26T15:00:00.000000000', '2012-08-26T18:00:00.000000000',
       '2012-08-26T21:00:00.000000000', '2012-08-27T00:00:00.000000000',
       '2012-08-27T03:00:00.000000000', '2012-08-27T06:00:00.000000000',
       '2012-08-27T09:00:00.000000000', '2012-08-27T12:00:00.000000000',
       '2012-08-27T15:00:00.000000000', '2012-08-27T18:00:00.000000000',
       '2012-08-27T21:00:00.000000000', '2012-08-28T00:00:00.000000000',
       '2012-08-29T03:00:00.000000000', '2012-08-29T06:00:00.000000000',
       '2012-08-29T09:00:00.000000000', '2012-08-29

In [25]:
## select the grid point nearest to a station
# NOTE(review): hidden state -- neither `stationID` nor `test` is defined in this
# cell. `stationID` is left over from an earlier loop, and the only visible
# assignment of `test` derives it from `ts` itself, so this cell fails on a fresh
# Restart & Run All.
lat = float(config[stationID]['lat'])
lon = float(config[stationID]['lon']) % 360  # wrap longitude into [0, 360)
ts = test.sel(lat=lat, lon=lon, method='nearest')

In [28]:
## freezing level of the selected point at one time step
ts.sel(time='2012-08-27T15:00:00.000000000').freezing_level.values

array(3602.4224, dtype=float32)

In [14]:
## position along the time dimension at which ivt is largest
ts.ivt.argmax(dim='time').values

array(20)

In [15]:
## display the ivt variable of the selected point
ts.ivt

In [17]:
## restrict the selected point to the start_date..end_date window of event ARID
# NOTE(review): relies on `ts` and `ARID` already being present in the kernel.
test = ts.sel(time=slice(duration_df.loc[ARID].start_date, duration_df.loc[ARID].end_date))
test

In [19]:
## display the duration record of the current ARID
## NOTE(review): nothing is written to .csv in this cell
duration_df.loc[ARID]

Unnamed: 0                    526
trackID            200610151213.0
start_date    2006-10-17 18:00:00
end_date      2006-10-22 18:00:00
duration                    120.0
Name: 200610151213.0, dtype: object