# LVL1 WX Code

In [69]:
#import external libraries
import pandas as pd
import os
pd.options.mode.chained_assignment = None  # default='warn'; chained index warning
import numpy as np
import imp
import importlib
import datetime
import pytz

#plotting
import seaborn as sns; sns.set(style='whitegrid')
import matplotlib.pyplot as plt
%matplotlib notebook

#import self-written libraries
import LVL1

In [52]:
#Reload LVL1 so that edits to the module take effect without restarting the kernel.
#importlib.reload replaces imp.reload: the imp module is deprecated and was removed in Python 3.12.
importlib.reload(LVL1)

<module 'LVL1' from 'C:\\Users\\ehbaker\\Documents\\Python\\WXmunge\\LVL1.py'>

**Define Location-Specific Baseline Settings**

In [53]:
#--- Location-specific settings ---
Glacier="Wolverine" #or "Gulkana"
Station="990" #or other elevations; this depends on the naming convention of input data
timezone='America/Anchorage' #choose from pytz.all_timezones

#Input and output file names are built from Glacier AND Station, so changing either
#setting above changes which files are read and written (the station was previously hard-coded).
base_pth=r"Q:/Project Data/GlacierData/Benchmark_Program/Data/" +Glacier+ r"/AllYears/Wx/"
file_stem=Glacier.lower()+"_"+Station+"_ALL_15min"

#path to level 0 data
pth=base_pth + "LVL0/" + file_stem + "LVL0.csv"

#time format and column
UTC_time_column_name='UTC_time'
date_format='%Y/%m/%d %H:%M'

#file to which the level 1 output data is saved
save_pth=base_pth + "LVL1/emily/" + file_stem + "LVL1.csv"

### Data import
- read in data 
- set timezone
- set time as index

In [54]:
#Load the level 0 CSV
dat=pd.read_csv(pth)

#NOTE: time handling belongs in level 0 processing, which should deliver timestamps in UTC.
#It is done here only because level 0 does not yet do so; move this block there when possible.
#Per the original note, the column named by UTC_time_column_name actually holds UTC-8, not UTC.
dat['time']=pd.to_datetime(dat[UTC_time_column_name], format=date_format)

#Shift by 8 hours to obtain UTC, then express the same instants in the station's local timezone
local_timezone=pytz.timezone(timezone) #local timezone object (e.g. AK time)
utc_times=dat['time'] + datetime.timedelta(hours=8)
dat['local_time']=utc_times.dt.tz_localize('UTC').dt.tz_convert(local_timezone)

#Index the dataframe by local time
dat=dat.set_index('local_time')

#QC-flag dataframe with the same index and columns as dat - will store QC status of each value
dat_qc_info=pd.DataFrame().reindex_like(dat)

Station-specific Data Adjustments

In [55]:
# Wolverine990: aspirated temperature readings that do not fall on the hour are set to NaN.
# NOTE(review): the original comment said this applies "prior to 2014.04.20", but the mask below
# selects timestamps AFTER that date. The logic is left unchanged here - confirm which is intended.
if Glacier+Station=="Wolverine990":
    off_hour_after_cutoff=(dat.index>'2014/04/20 00:00:00') & (dat.index.minute!=0)
    for aspirated_col in ['TAspirated1', 'TAspirated2']:
        dat.loc[off_hour_after_cutoff, aspirated_col]=np.nan
        dat_qc_info.loc[off_hour_after_cutoff, aspirated_col]=1 #QC flag 1 marks removed values

# Remove this subset after done troubleshooting!!

In [56]:
#Temporary: keep only 2015-03 through 2015-06 so the remaining cells run quickly while troubleshooting
dat=dat.loc['2015-03':'2015-06']

**Remove malfunctioning sensors manually**
* done separately for each time period entered in the look-up table

In [57]:
#Load the look-up table of sensors and time periods that need to be modified
pth_bad_sensor_dates=r"Q:\Project Data\GlacierData\Benchmark_Program\Metadata\weather"
sensor_log_file=os.path.join(pth_bad_sensor_dates, Glacier+Station+"_SensorLog.xlsx")
bad_sensor_dates_dat=pd.read_excel(sensor_log_file)

#Apply the table to dat via LVL1 (per the original note: bad data set to NaN; sensor data moved to the correct column where needed)
dat=LVL1.remove_malfunctioning_sensor_data(dat, bad_sensor_dates_dat)
print("Removed:")
bad_sensor_dates_dat

Removed:


Unnamed: 0,Sensor,Start_Date,End_Date,Action,Correct_Label,Location
0,TAspirated2,2014-04-25 06:45:00,2014-09-04 09:00:00,switch_label,Tpassive2,Wolverine990
1,Tpassive1,2013-05-07 02:15:00,2013-11-06 08:00:00,bad,,Wolverine990
2,Tpassive2,2013-11-06 08:00:00,2014-09-04 09:15:00,bad,,Wolverine990
3,Tpassive2,2016-07-12 16:00:00,2016-08-18 03:59:00,bad,,Wolverine990


**Set hard-coded cutoff values** (station specific)

In [58]:
#Station-specific cutoff values used by the cleaning cells below
if Glacier+Station=="Wolverine990":
    low_temp_cutoff=-40.0 #deg. C
    high_temp_cutoff=30.0

    precip_high_cutoff=0.015 #meters
    precip_drain_cutoff=-0.015
    obvious_error_precip_cutoff=0.3 #if 30 cm in 15 minutes, obviously an error.

    Stage_dies='2015/09/01 06:30:00' #time at which instrument was removed
    TPG_born='2012/10/02 00:00:00' #time at which instrument was installed (data before this is set to NaN below)
else:
    #Fail here with a clear message rather than later with a NameError, or silently
    #reusing cutoffs left in the kernel from a previous run for a different station.
    raise NotImplementedError("No cutoff values defined for " + Glacier+Station)

**Clean Temperature Data**

In [59]:
#Temperature: pass each temperature column through LVL1's error filter with the station cutoffs
temp_columns=['Tpassive1', 'Tpassive2', 'TAspirated1', 'TAspirated2']
for temp_col in temp_columns:
    dat.loc[:,temp_col]=LVL1.remove_error_temperature_values(
        dat.loc[:,temp_col], low_temp_cutoff, high_temp_cutoff)

**Plot original precipitation data**

In [60]:
#Precip
precip_columns=['TPGCumulative', 'StageCumulative']
dat_original=dat.copy() #unaltered copy, kept for later comparison
for precip_col in precip_columns:
    #Shift each series so that it begins at 0; necessary for plots of non-full timeseries.
    #.iloc[0] takes the first row by position explicitly; plain [0] on a datetime-indexed
    #Series relies on a positional fallback that pandas has deprecated.
    dat_original.loc[:, precip_col]=dat_original[precip_col]-dat_original[precip_col].iloc[0]
dat_original[precip_columns].plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25942f17208>

**Clean Precipitation Data**

In [61]:
#Blank out each gage for the period it was not installed (no single gage style covers the period of record)
for precip_col in precip_columns:
    is_tpg='TPG' in precip_col.upper()
    is_stage='stage' in precip_col.lower()
    if is_tpg:
        dat.loc[:TPG_born, precip_col]=np.nan
    if is_stage:
        dat.loc[Stage_dies:, precip_col]=np.nan

#Pass each cumulative series through LVL1's hampel filter (k=3) to remove sensor malfunctions
for precip_col in precip_columns:
    raw_values=dat[precip_col].values
    dat[precip_col]=LVL1.hampel(raw_values, k=3)

#Remove obvious errors using the hard-coded cutoffs set above
for precip_col in precip_columns:
    dat[precip_col]=LVL1.remove_error_precip_values(
        dat[precip_col], obvious_error_precip_cutoff, precip_high_cutoff, precip_drain_cutoff)

data_before_smoothing=dat.copy() #snapshot kept for later plotting

In [62]:
#Overlay the original precip series on the cleaned (pre-smoothing) series for comparison
ax=data_before_smoothing.loc[:, precip_columns].plot()
dat_original.loc[:, precip_columns].plot(ax=ax)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2593e88c208>

**Initial outlier strip - Hampel filter**
- remove drain and fill events

In [12]:
incremental_precip_columns=[] #names of the incremental precip columns; filled in the loop below
for precip_col in precip_columns:
    print("Starting " + precip_col)

    #strip outlier values with LVL1's Hampel filter (k=3)
    print("    Hampel Outlier Filter")
    dat[precip_col]=LVL1.hampel(dat[precip_col].values, k=3)

    #Create an incremental timeseries for precip: change since the previous sample
    dPrecip=precip_col.split("Cumulative")[0]+ "Incremental" #name of the new incremental column
    incremental_precip_columns.append(dPrecip)
    dat[dPrecip]=dat[precip_col].diff().fillna(0) #increments that are NaN are set to 0 (error)

    #Precip drain and fill
    fill_season=(dat.index.month>=8) & (dat.index.month<12) #August through November
    dat.loc[(dat[dPrecip]>precip_high_cutoff) & fill_season, dPrecip]=0 #remove antifreeze fill
    dat.loc[(dat[dPrecip]<precip_drain_cutoff), dPrecip]=0 #remove negative values associated with drain events

    #Sensor malfunction, giving physically impossible high readings.
    #Uses the configured cutoff rather than a repeated literal, so this stays in step with the station settings.
    dat.loc[dat[dPrecip]>obvious_error_precip_cutoff, dPrecip]=0

data_after_first_smooth=dat.copy() #copy of the data at this step, for later plotting comparison

Starting TPGCumulative
    Hampel Outlier Filter
Starting StageCumulative
    Hampel Outlier Filter


**Plot cumulative timeseries:**

In [13]:
#Re-accumulate the cleaned increments and plot the resulting cumulative series
cumulative_after_first_smooth=data_after_first_smooth[incremental_precip_columns].cumsum()
cumulative_after_first_smooth.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1fa7471f128>

In [14]:
#Smooth each incremental precip series with the Nayak 2010 method implemented in LVL1;
#dat is replaced by the returned dataframe after every column.
for incr_precip_col in incremental_precip_columns:
    print("smoothing "+ incr_precip_col)
    dat=LVL1.smooth_precip_Nayak2010(dat, incr_precip_col)

#snapshot of the data once all columns have been smoothed
data_after_second_smooth=dat.copy()

smoothing TPGIncremental
  smoothing data in forward direction; may take a minute
  done with forward smoothing
  smoothing data in reverse direction; may take a minute
  done with backwards
storing TPGIncremental_smooth
smoothing StageIncremental
  smoothing data in forward direction; may take a minute
  done with forward smoothing
  smoothing data in reverse direction; may take a minute
  done with backwards
storing StageIncremental_smooth


**Data After Second Smoothing**

In [15]:
#Names of the post-smoothing columns, then plot unsmoothed and smoothed increments as cumulative sums
smoothed_columns=[name + "_smooth" for name in incremental_precip_columns]
columns_to_plot=incremental_precip_columns+smoothed_columns
dat[columns_to_plot].cumsum().plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1fa78d132e8>

In [16]:
#Convert units from meters to mm
for col in smoothed_columns:
    dat.loc[:,col]=dat[col]*1000

In [19]:
#Restore NaN for periods when a gage was not installed (0 was only necessary for the smoothing)
for precip_col in smoothed_columns:
    is_tpg='TPG' in precip_col.upper()
    is_stage='stage' in precip_col.lower()
    if is_tpg:
        dat.loc[:TPG_born, precip_col]=np.nan
    if is_stage:
        dat.loc[Stage_dies:, precip_col]=np.nan

## Save Edited Data

In [20]:
#Time: column holding local time as a formatted string (as string, not UTC - X hrs)
dat['Local_time']=dat.index.tz_convert(local_timezone).strftime(date_format)
#Column Names
out_columns=['Local_time']+temp_columns+smoothed_columns+['RelHum', 'WindSpeed', 'WindGustSpeed', 'WindDir', 'LoggerTemp','LoggerBattery']
#Rename columns ending in "_smooth" back to their original names.
#The suffix is sliced off explicitly: str.strip("_smooth") removes any of the CHARACTERS
#_, s, m, o, t, h from both ends of the name, which can eat into the column name itself.
suffix="_smooth"
rename_map={col: (col[:-len(suffix)] if col.endswith(suffix) else col) for col in smoothed_columns}
#Subset to the output columns and rename in one step; rename returns a new frame, so dat is
#not modified through a slice. The index is not written to the CSV, so it is left as-is.
save_dat=dat[out_columns].rename(columns=rename_map)

#Save
save_dat.to_csv(save_pth, index=False, float_format='%g')

In [65]:
%qtconsole

In [34]:
#Scratch check: display the precip error cutoff currently held in the kernel
obvious_error_precip_cutoff

0.3

In [14]:
#Scratch check: display dat_original, the copy of dat taken before precip cleaning
dat_original

Unnamed: 0_level_0,Station,UTC_time,AK_time,Tpassive1,Tpassive2,TAspirated1,TAspirated2,RelHum,StageCumulative,TPGCumulative,WindSpeed,WindGustSpeed,WindDir,LoggerTemp,LoggerBattery,time
local_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-03-01 00:00:00-09:00,wolverine_990,2015/03/01 01:00,2015/02/28 16:00,-3.2,,-3.79,,85.0,0.0000,0.0000,0.0,0.0,270.0,2.6,14.2,2015-03-01 01:00:00
2015-03-01 00:15:00-09:00,wolverine_990,2015/03/01 01:15,2015/02/28 16:15,-3.2,,,,87.0,0.0000,0.0005,0.0,0.2,15.0,2.6,14.1,2015-03-01 01:15:00
2015-03-01 00:30:00-09:00,wolverine_990,2015/03/01 01:30,2015/02/28 16:30,-2.9,,,,89.0,0.0000,0.0007,0.1,0.6,64.0,2.6,14.1,2015-03-01 01:30:00
2015-03-01 00:45:00-09:00,wolverine_990,2015/03/01 01:45,2015/02/28 16:45,-3.6,,,,88.0,0.0000,0.0008,0.1,0.4,41.0,2.9,14.0,2015-03-01 01:45:00
2015-03-01 01:00:00-09:00,wolverine_990,2015/03/01 02:00,2015/02/28 17:00,-2.8,,-3.40,,87.0,0.0000,0.0006,0.4,1.1,211.0,2.9,14.0,2015-03-01 02:00:00
2015-03-01 01:15:00-09:00,wolverine_990,2015/03/01 02:15,2015/02/28 17:15,-3.4,,,,87.0,0.0000,0.0014,1.2,1.9,241.0,2.9,13.9,2015-03-01 02:15:00
2015-03-01 01:30:00-09:00,wolverine_990,2015/03/01 02:30,2015/02/28 17:30,-3.6,,,,88.0,0.0000,0.0015,3.0,3.8,236.0,2.9,13.9,2015-03-01 02:30:00
2015-03-01 01:45:00-09:00,wolverine_990,2015/03/01 02:45,2015/02/28 17:45,-4.1,,,,90.0,0.0005,0.0016,1.8,4.0,208.0,2.6,13.8,2015-03-01 02:45:00
2015-03-01 02:00:00-09:00,wolverine_990,2015/03/01 03:00,2015/02/28 18:00,-4.1,,-3.70,,91.0,0.0005,0.0016,1.7,2.7,237.0,2.4,13.7,2015-03-01 03:00:00
2015-03-01 02:15:00-09:00,wolverine_990,2015/03/01 03:15,2015/02/28 18:15,-3.7,,,,92.0,0.0005,0.0016,3.1,4.2,232.0,2.1,13.6,2015-03-01 03:15:00


In [15]:
#Scratch: run LVL1's precip error removal on the full TPG cumulative series
edited_values=LVL1.remove_error_precip_values(
    dat['TPGCumulative'], obvious_error_precip_cutoff, precip_high_cutoff, precip_drain_cutoff)

In [18]:
#Scratch: short window (2015-03-15 through 2015-03-20) of TPG cumulative precip for testing
test_dat=dat.loc['2015-03-15':'2015-03-20', 'TPGCumulative']

In [20]:
#Scratch: run LVL1's precip error removal on the short test window
test_out=LVL1.remove_error_precip_values(
    test_dat,
    obvious_error_precip_cutoff,
    precip_high_cutoff,
    precip_drain_cutoff,
)

In [28]:
#Scratch: plot the result of the error removal on the test window
test_out.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x2593cade5f8>

In [185]:
#Working copy of the first precip column from the original data; edited step by step in the cells below
first_precip_column=precip_columns[0]
precip_edit=dat_original.loc[:, first_precip_column].copy()

In [186]:
#Step 1: use incremental precip to set sensor-malfunction jumps to NaN in the CUMULATIVE timeseries
dPrecip=precip_edit.diff() #incremental precip: change since the previous sample
#Vectorized mask instead of a Python loop with integer-position [] indexing (deprecated on a
#datetime-indexed Series). NaN increments compare False, so those samples are left untouched.
jump_mask=dPrecip.abs()>obvious_error_precip_cutoff
precip_edit[jump_mask]=np.nan
step1=precip_edit.copy() #snapshot after step 1

In [187]:
#Step1B: remove remaining outliers using a one-day (96 samples) rolling median
#Windows with fewer than 96 valid samples give NaN; fill those forward, then backward, so that
#every sample has a reference value. .ffill()/.bfill() replace the deprecated fillna(method=...).
rolling_median=precip_edit.rolling(96).median().ffill().bfill()
difference=(precip_edit - rolling_median).abs()
threshold=0.2 #threshold for difference between median and the given value
outlier_idx=difference>threshold
precip_edit[outlier_idx]=np.nan

In [188]:
#Step2 - fill NaNs in the cumulative series output by instruments using linear interpolation
#limit=96 caps the fill at 96 consecutive samples (one day of 15-minute data)
precip_edit=precip_edit.interpolate(limit=96, method='linear')
step2=precip_edit.copy() #snapshot after step 2

In [190]:
#Step3 - recalculate incremental precip, set values outside expected range to 0
dPrecip=precip_edit.diff() #incremental precip
dPrecip.loc[dPrecip>obvious_error_precip_cutoff]=0 #physically impossible jumps
fill_season=(dPrecip.index.month>=8) & (dPrecip.index.month<=11) #August through November
dPrecip.loc[(dPrecip>precip_high_cutoff) & fill_season]=0 #large increases during the fill season
dPrecip.loc[dPrecip<precip_drain_cutoff]=0 #large decreases
new_precip_cumulative=dPrecip.cumsum()
#Set beginning equal to 0, not NaN as is created with the cumulative sum.
#.iloc[0] addresses the first row by position explicitly; plain [0] on a datetime-indexed
#Series relies on a positional fallback that pandas has deprecated.
new_precip_cumulative.iloc[0]=0
step3=new_precip_cumulative.copy() #snapshot after step 3

In [191]:
#Compare the original TPG cumulative series with the step-3 result on one set of axes
ax=dat_original['TPGCumulative'].plot()
#step1.plot(ax=ax)
step3.plot(ax=ax)

plt.show()

<IPython.core.display.Javascript object>

In [123]:
#Scratch check: is step1 identical to step1_hampel (presumably a Hampel-filtered variant)?
#NOTE(review): step1_hampel is not defined in any cell visible in this notebook; this cell
#relies on leftover kernel state - define it explicitly or remove the cell.
step1.equals(step1_hampel)

True