# Weather Station Data - cleaned final version 
## Level 1 QC

In [1]:
#import external libraries
import pandas as pd
import os
pd.options.mode.chained_assignment = None  # default='warn'; chained index warning
import numpy as np
import imp
import datetime
import pytz

#plotting
import matplotlib.pyplot as plt
%matplotlib notebook

#import self-written libraries
import LVL1

In [2]:
# Plot toggle: drawing the diagnostic figures is slow for large datasets,
# so they are only produced when this flag is switched on.
show_plots = False  # True = show plots

# Fraction of the data drawn in plots (0.3 = 30%).
frac = 0.3

**Define Location-Specific Baseline Settings**

In [3]:
# --- Station identity ---
Glacier = "Wolverine"  # or "Gulkana"
Station = "990"  # or other elevations; depends on the naming convention of the input data
timezone = 'America/Anchorage'  # choose from pytz.all_timezones

# --- Level 0 input file (name is built from the glacier and station) ---
pth = "".join([
    r"Q:/Project Data/GlacierData/Benchmark_Program/Data/",
    Glacier,
    r"/AllYears/Wx/LVL0/emily/",
    Glacier.lower(),
    Station,
    "_all_15minL0.csv",
])
#pth=r"Q:\Project Data\GlacierData\Benchmark_Program\Data\Wolverine\AllYears\Wx\LVL0\emily\wolverine990_2017_15minL0.csv"

# --- Time columns of the input file and their text format ---
UTC_time_column_name = 'UTC_time'
local_time_column_name = 'AK_time'
date_format = '%Y/%m/%d %H:%M'

# --- Level 1 output file ---
save_pth = "".join([
    r"Q:/Project Data/GlacierData/Benchmark_Program/Data/",
    Glacier,
    r"/AllYears/Wx/LVL1/emily/",
    Glacier.lower(),
    Station,
    "_15minLVL1.csv",
])

# --- Columns which contain precipitation data ---
precip_columns = ['TPGCumulative', 'StageCumulative']

### Data import
- read in data 
- set timezone
- set time as index

In [4]:
# Load the level 0 CSV.
dat = pd.read_csv(pth)

# Parse the local-time text column into timestamps, make them the index,
# and attach the station timezone to that index.
dat = (
    dat.assign(local_time=pd.to_datetime(dat[local_time_column_name], format=date_format))
    .set_index('local_time')
    .tz_localize(timezone, ambiguous='infer')
)

In [5]:
# NOTE: time handling belongs in Level 0 processing (i.e. before this notebook);
# Level 0 output should carry time in UTC. The conversion below was written here
# because Level 0 did not yet do so; it is disabled and kept only for reference
# until it can be moved to Level 0 processing.
#
# dat['time']=pd.to_datetime(dat[UTC_time_column_name], format= date_format) #this is not actually UTC time; it's UTC-8
# dat['UTC']=dat.time + datetime.timedelta(hours=8) #This is now UTC time #THIS NAME MUST BE USED!
#
# dat['UTC']=dat.UTC_time
#
# #Convert date to date-time object, and index of dataframe
# dat['UTC']=pd.to_datetime(dat.UTC, format=date_format)
# dat=dat.set_index("UTC") #Set this UTC time object as the index of the dataframe
#
# #Timezone
# local_timezone=pytz.timezone(timezone) #create local timezone object (e.g. AK time)
# dat['local_time'] = dat.index.tz_localize('UTC').tz_convert(local_timezone)
# dat=dat.set_index('local_time')

# QC-flag frame: an empty frame laid out on the same index and columns as `dat`;
# it stores the QC status of each value.
dat_qc_info = pd.DataFrame().reindex(index=dat.index, columns=dat.columns)

Station-specific Data Adjustments

In [6]:
# Wolverine990: prior to 2014.04.20 the aspirated temperature was only aspirated
# on the hour, so every other measurement before that date is set to NaN.
if Glacier + Station == "Wolverine990":
    # Rows before the cutoff date whose timestamp is not on the hour
    off_hour_before_cutoff = (dat.index < '2014/04/20 00:00:00') & (dat.index.minute != 0)
    for aspirated_col in ('TAspirated1', 'TAspirated2'):
        dat.loc[off_hour_before_cutoff, aspirated_col] = np.nan
        dat_qc_info.loc[off_hour_before_cutoff, aspirated_col] = 1  # QC flag 1 indicates a removed value

# Remove this subset after done troubleshooting!!

In [7]:
# #Subset to 3 months, for speed during trouble-shoot

# dat=dat['2013-06':'2013-11-08']
# for precip_col in precip_columns:
#     dat[precip_col]=dat[precip_col]-dat[precip_col][0] #set series so that it begins at 0; only neccesary for subsets

**Remove malfunctioning sensors manually**
* done separately for each timestep entered on look-up table

In [8]:
# Look-up table listing the sensors and time periods that need to be modified;
# the workbook name is built from the glacier and station.
pth_bad_sensor_dates = r"Q:\Project Data\GlacierData\Benchmark_Program\Metadata\weather"
bad_sensor_dates_dat = pd.read_excel(
    os.path.join(pth_bad_sensor_dates, "".join([Glacier, Station, "_SensorLog.xlsx"]))
)

# Hand the table to LVL1: bad data is set to NaN and sensor data is moved to
# the correct column where necessary.
dat = LVL1.remove_malfunctioning_sensor_data(dat, bad_sensor_dates_dat)

# Show the table of what was removed as the cell output
print("Removed:")
bad_sensor_dates_dat

Removed:


Unnamed: 0,Sensor,Start_Date,End_Date,Action,Correct_Label,Location,Comment
0,TAspirated2,2014-04-25 06:45:00,2014-09-04 09:00:00,switch_label,Tpassive2,Wolverine990,mislabeled
1,Tpassive1,2013-05-07 02:15:00,2013-11-06 08:00:00,bad,,Wolverine990,
2,Tpassive2,2013-11-06 08:00:00,2014-09-04 09:15:00,bad,,Wolverine990,
3,Tpassive2,2016-07-12 16:00:00,2016-08-18 03:59:00,bad,,Wolverine990,
4,TPGCumulative,2010-01-01 00:00:00,2011-10-21 15:00:00,bad,,Wolverine990,slow drain
5,StageCumulative,2010-09-15 00:00:00,2010-10-03 00:00:00,bad,,Wolverine990,slow drain


**Set hard-coded cutoff values** (station specific)

In [9]:
# Station-specific QC thresholds and instrument dates.
# Exactly one branch must match. An unsupported station raises here instead of
# leaving the thresholds undefined (NameError in later cells) or silently reusing
# values left in the kernel from a previously processed station.
if Glacier+Station=="Wolverine990":
    low_temp_cutoff=-40.0 #deg. C
    high_temp_cutoff=30.0

    precip_high_cutoff=0.015 #meters
    precip_drain_cutoff=-0.015
    obvious_error_precip_cutoff=0.3 #if 30 cm in 15 minutes, obviously an error.

    noise_limit=0.0025

    Stage_dies='2015/09/01 06:30:00' #time at which instrument was removed
    TPG_born='2012/10/02 00:00:00' #time at which instrument was installed; earlier TPG data is set to NaN

elif Glacier+Station=="Gulkana1480":
    low_temp_cutoff=-40
    high_temp_cutoff=40

    precip_high_cutoff=0.015 #in meters
    precip_drain_cutoff=-0.015
    obvious_error_precip_cutoff=0.3 #if 30 cm in 15 minutes, obviously an error.
    noise_limit=0.0025

    Stage_dies='2015/09/25 11:30:00' #time at which instrument was removed
    TPG_born='2011/07/15 18:30:00' #time at which instrument was installed; earlier TPG data is set to NaN

else:
    raise ValueError("No cutoff values are defined for station '" + Glacier + Station + "'; add a settings block for it.")

In [10]:
# Keep a snapshot of the data as it stands here, so later cells can compare
# the edited values against it.
dat_original = dat.copy(deep=True)

**Clean Temperature Data**

In [11]:
# Temperature columns, each passed through the LVL1 error-value filter together
# with the station's low and high temperature cutoffs.
temp_columns = ['Tpassive1', 'Tpassive2', 'TAspirated1', 'TAspirated2']
for temp_col in temp_columns:
    temps = dat.loc[:, temp_col]
    cleaned_temps = LVL1.remove_error_temperature_values(temps, low_temp_cutoff, high_temp_cutoff)
    dat.loc[:, temp_col] = cleaned_temps

#### Plot temperature data, before and after

In [12]:
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_original.sample(frac=0.2), df_new=dat.sample(frac=0.2), data_col_name='TAspirated1')

**Plot original precipitation data**

In [13]:
# Plot the precipitation columns of the unaltered snapshot.
if show_plots:
    # Select the precipitation columns for this plot only. `dat_original` is not
    # reassigned, so it keeps every column whether or not plots are enabled and
    # the rest of the notebook sees the same state either way.
    dat_original[precip_columns].sample(frac=frac).plot()

In [14]:
# A single gauge style does not cover the period of record, so each gauge's
# column is set to NaN for the time that gauge was not installed.

# TPG columns: everything up to TPG_born
for precip_col in [c for c in precip_columns if 'TPG' in c.upper()]:
    dat.loc[:TPG_born, precip_col] = np.nan

# Stage columns: everything from Stage_dies onward
for precip_col in [c for c in precip_columns if 'stage' in c.lower()]:
    dat.loc[Stage_dies:, precip_col] = np.nan

**Remove Gauge Drain and Fill Maintenance**

In [15]:
# Remove unrealistically large instantaneous jumps to create a continuous time series.
# NOTE(review): the input is read from `dat_original`, not from `dat`, so edits made
# to the precipitation columns of `dat` after that snapshot was taken are not part of
# the input and are overwritten by the assignment below -- confirm this is intended.
for precip_col in precip_columns:
    jumps_removed = LVL1.precip_remove_obvious_sensor_malfunctions(
        dat_original[precip_col],
        obvious_error_precip_cutoff,
        noise_cutoff=precip_high_cutoff,
    )
    dat.loc[:, precip_col] = jumps_removed

# Snapshot of this stage, kept for plotting
dat_step0 = dat[precip_columns].copy()

In [16]:
#Plot
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_original.sample(frac=0.3), df_new=dat_step0.sample(frac=0.3), data_col_name=precip_columns[0], label_old="original", label_new="drain and fill removed")

In [17]:
#Plot other precip gage
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_original.sample(frac=0.2), df_new=dat_step0.sample(frac=0.2), data_col_name=precip_columns[1], label_old="original", label_new="drain and fill removed")

In [18]:
# Fill any gaps that remain (sensor fallout, etc.) via the LVL1 under-one-day
# interpolation helper. The result is written back into dat_step0 itself.
for precip_col in precip_columns:
    gap_filled = LVL1.precip_interpolate_gaps_under1day(dat_step0[precip_col])
    dat_step0.loc[:, precip_col] = gap_filled

#### Remove high-amplitude noise related to wind

In [19]:
# Remove high-amplitude noise (related to wind, etc.) from each precipitation column.
# The noise threshold comes from the station setting `noise_limit` instead of a
# literal repeated here, so a station-specific value actually takes effect.
for precip_col in precip_columns:
    dat.loc[:, precip_col]=LVL1.precip_remove_high_frequency_noiseNayak2010(
        dat_step0[precip_col],
        noise=noise_limit,
        bucket_fill_drain_cutoff=obvious_error_precip_cutoff,
        n_forward_noise_free=10,
    )
dat_step1=dat[precip_columns].copy() #save for plotting

noise starts at 2011-10-22 04:45:00-08:00 ; 69091
     single value removed at 2011-10-22 04:45:00-08:00
     skipping iteration2011-10-22 05:00:00-08:00
noise starts at 2011-10-22 12:00:00-08:00 ; 69120
     single value removed at 2011-10-22 12:00:00-08:00
     skipping iteration2011-10-22 12:15:00-08:00
noise starts at 2011-10-23 00:45:00-08:00 ; 69171
     single value removed at 2011-10-23 00:45:00-08:00
     skipping iteration2011-10-23 01:00:00-08:00
noise starts at 2011-10-23 15:00:00-08:00 ; 69228
     interpolated noise at locations 2011-10-23 15:00:00-08:00:2011-10-23 16:45:00-08:00
noise starts at 2011-10-25 08:45:00-08:00 ; 69395
     single value removed at 2011-10-25 08:45:00-08:00
     skipping iteration2011-10-25 09:00:00-08:00
noise starts at 2011-10-27 00:45:00-08:00 ; 69555
     single value removed at 2011-10-27 00:45:00-08:00
     skipping iteration2011-10-27 01:00:00-08:00
noise starts at 2011-10-27 04:45:00-08:00 ; 69571
     single value removed at 2011-10-27 0

In [20]:
#Plot results
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_step0.sample(frac=0.3), df_new=dat_step1.sample(frac=0.3), data_col_name=precip_columns[0], label_old="original", label_new="high amplitude noise removed")

In [21]:
# Strip the remaining outlier values by running each noise-filtered
# precipitation column through the LVL1 Hampel filter.
for precip_col in precip_columns:
    hampel_filtered = LVL1.hampel(dat_step1[precip_col])
    dat.loc[:, precip_col] = hampel_filtered

# Snapshot of this stage, kept for plotting
dat_step2 = dat[precip_columns].copy()

In [22]:
#Plot results
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_step1.sample(frac=frac), df_new=dat_step2.sample(frac=frac), data_col_name=precip_columns[0], label_old="original", label_new="after hampel outlier filter")

In [23]:
#Interpolate gaps under one day in length
# for precip_col in precip_columns:
#     dat[precip_col]=LVL1.precip_interpolate_gaps_under1day(dat_step2[precip_col])

**Smooth Data with method from Nayak (2010):**

In [24]:
# Smooth each precipitation column with the method from Nayak 2010,
# announcing the column being processed first.
for precip_col in precip_columns:
    print("smoothing "+ precip_col)
    smoothed = LVL1.smooth_precip_Nayak2010(dat_step2[precip_col])
    dat[precip_col] = smoothed

# Snapshot of this stage, kept for plotting
dat_step3 = dat[precip_columns].copy()

smoothing TPGCumulative
  smoothing data in forward direction; may take a minute
  smoothing data in reverse direction; may take a minute
UGHyeahblergh
smoothing StageCumulative
  smoothing data in forward direction; may take a minute
  smoothing data in reverse direction; may take a minute
UGHyeahblergh


In [25]:
#Plot
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_step2.sample(frac=frac), df_new=dat_step3.sample(frac=frac), data_col_name=precip_columns[0], label_old="original", label_new="after smoothing")

In [26]:
if show_plots:
    %matplotlib notebook
    LVL1.plot_comparrison(df_old=dat_step2.sample(frac=frac), df_new=dat_step3.sample(frac=frac), data_col_name=precip_columns[1], label_old="original", label_new="after smoothing")

In [27]:
#Final Comparrison Plot
if show_plots:
    %matplotlib notebook
    frac=0.1
    ax=dat_original[precip_columns[0]].sample(frac=frac).plot(color='black', label='original')
    dat_step0[precip_columns[0]].sample(frac=frac).plot(color='blue', ax=ax, label='fills removed')
    dat_step3[precip_columns[0]].sample(frac=frac).plot(color='red', ax=ax, label='final')
    plt.legend()

Final Steps

In [28]:
#Convert units from meters to mm
# The converted values are derived from the smoothed snapshot `dat_step3` (the copy of
# these columns taken right after smoothing), not from `dat` itself, so re-running this
# cell gives the same result instead of multiplying by 1000 again.
for precip_col in precip_columns:
    dat.loc[:,precip_col]=dat_step3[precip_col]*1000

#Set no-data values before TPG installed, and after rocket removed to NAN (not 0)
for precip_col in precip_columns:
    if 'TPG' in precip_col.upper():
        dat.loc[:TPG_born, precip_col]=np.nan
    if 'stage' in precip_col.lower():
        dat.loc[Stage_dies:, precip_col]=np.nan

In [29]:
# Show a bounded preview of the result rather than the entire frame
dat.head(10)

Unnamed: 0_level_0,UTC_time,AK_time,Tpassive1,Tpassive2,TAspirated1,TAspirated2,RelHum,StageCumulative,TPGCumulative,WindSpeed,WindGustSpeed,WindDir,LoggerTemp,LoggerBattery
local_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-10-01 00:00:00-08:00,2009/10/01 08:00,2009/10/01 00:00,,0.03,,,,279.3,,,,208.0,,
2009-10-01 00:15:00-08:00,2009/10/01 08:15,2009/10/01 00:15,0.27,0.10,,,,279.3,,,,290.0,,
2009-10-01 00:30:00-08:00,2009/10/01 08:30,2009/10/01 00:30,0.34,0.17,,,,279.3,,,,26.0,,
2009-10-01 00:45:00-08:00,2009/10/01 08:45,2009/10/01 00:45,0.54,0.41,,,,279.3,,,,318.0,,
2009-10-01 01:00:00-08:00,2009/10/01 09:00,2009/10/01 01:00,0.71,0.57,,,,279.3,,,,279.0,,
2009-10-01 01:15:00-08:00,2009/10/01 09:15,2009/10/01 01:15,0.85,0.70,,,,279.3,,,,313.0,,
2009-10-01 01:30:00-08:00,2009/10/01 09:30,2009/10/01 01:30,0.95,0.81,,,,279.3,,,,312.0,,
2009-10-01 01:45:00-08:00,2009/10/01 09:45,2009/10/01 01:45,0.97,0.82,,,,279.3,,,,291.0,,
2009-10-01 02:00:00-08:00,2009/10/01 10:00,2009/10/01 02:00,1.07,0.93,,,,279.3,,,,239.0,,
2009-10-01 02:15:00-08:00,2009/10/01 10:15,2009/10/01 02:15,1.02,0.85,,,,279.3,,,,261.0,,


## Save Edited Data

In [30]:
# Local time as a text column: the index is expressed in the station timezone and
# formatted with the same date format as the input (true local time as a string,
# not UTC - X hrs).
local_timezone = pytz.timezone(timezone)  # local timezone object (e.g. AK time)
local_index = dat.index.tz_convert(local_timezone)
dat['Local_time'] = local_index.strftime(date_format)

# Columns written to the output file, in order
out_columns = (
    ['Local_time']
    + temp_columns
    + precip_columns
    + ['RelHum', 'WindSpeed', 'WindGustSpeed', 'WindDir', 'LoggerTemp', 'LoggerBattery']
)

# Keep only those columns (drops the unwanted ones)
save_dat = dat[out_columns]

# Write the CSV without the index
save_dat.to_csv(save_pth, index=False, float_format='%g')
#save_dat.to_csv(index=False, float_format='%g')
