In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Pseudo-code
GOAL: take in RAW, not quality controlled data and clean it up. This does NOT include interpolation of missing data points, but does include:
1. Compare to sensor health data or logs of known failure periods and remove data in those windows
2. Plausible value check. Removes any values that are beyond standard range for each variable
3. Remove noise. Higher-level processing but could follow USGS methods
4. Compare site-best. If multiple sensors exist for a given data variable, compare values and fill gaps 
5. Combine data files. If data is separated into multiple datafiles, resample to get on the same time basis (can resample to hourly here, or the lowest resolution of what is available). Combine files with consistent naming into one dataframe for checks

In [75]:
AWS_fp = '~/research/climate_data/AWS/Raw/'
data_fns = ['SouthGlacier_AWS_HalfHourData.csv','SouthGlacier_AWS_FiveMinData.csv','SouthGlacier_AWS_HealthData.csv']
time_vn = 'TIMESTAMP'

In [81]:
df = pd.read_csv(AWS_fp+data_fns[1],index_col=time_vn)
df.columns

  df = pd.read_csv(AWS_fp+data_fns[1],index_col=time_vn)


Index(['RECORD', 'WS_ms_S_WVT', 'WindDir_D1_WVT', 'WindDir_SD1_WVT',
       'WS_ms_Max', 'AirTC', 'NR_Wm2_Avg', 'CNR_Wm2_Avg', 'RH', 'SWin_Wm2_Avg',
       'SWout_Wm2_Avg', 'cnr4_T_C_Avg', 'short_up_Avg', 'short_dn_Avg',
       'long_up_corr_Avg', 'long_dn_corr_Avg'],
      dtype='object')

Rename variables to have consistency

*** How to handle when there are multiple columns with the same data? i.e. two SWin terms

In [None]:
names = {'temp':['site_temp_USGS','temperature','Tair_aws','temp','TA_2.0m','T','AirTC'],
            'tp':['Precip_Weighing_Incremental','precipitation','Ptotal_aws','tp','P','Rain_mm_tot'],
            'rh':['RelHum','RH','rh','rH','RH_aws','RH_2.0m'],
            'SWin':['RadiationIn','SWin','SWin_aws','SW_IN','short_dn_Avg'],
            'SWout':['RadiationOut','SWout','SWout_aws','SW_out','SW_OUT','short_up_Avg'],
            'LWin':['LWRadiationIn','LWin','LWin_aws','LW_in','LW_IN'],
            'LWout':['LWRadiationOut','LWout','LWout_aws','LW_OUT'],
            'wind':['WindSpeed','wind','Wind','ws_aws','WS','WS_ms_S_WVT'],
            'winddir':['VecAvgWindDir','WindDir','Winddir','winddir','WD','WindDir_D1_WVT'],
            'sp':['barom','sp','press','Press_aws','Barom','BP'],
            'tcc':['cloud_fraction','tcc','CCF','CCF_aws']}
# RENAMING
drop_vars = []
all_vars = ['temp','tp','rh','SWin','SWout','LWin','LWout','wind','winddir','sp','tcc']
for var in df.columns.to_numpy():
    renamed = False
    for var_check in all_vars:
        if var in names[var_check]:
            df = df.rename(columns={var:var_check})
            all_vars.remove(var_check)
            renamed = True
    if not renamed:
        drop_vars.append(var)
if len(drop_vars) > 0:
    print('Variables were not renamed, including:')
    print(drop_vars)
else:
    drop_vars = [0]

Check datatypes to sort out random strings or non-float values

In [69]:
for fn in data_fns:
    df = pd.read_csv(AWS_fp+fn,index_col=time_vn)
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(float)

  df = pd.read_csv(AWS_fp+fn,index_col=time_vn)


Sensor malfunctions to remove time periods where the sensor is known to be malfunctioning, according to some indicator (panel temperature, voltage, etc.) and healthy limit for said indicator

In [74]:
# Load in health dataset and specify bounds to remove datapoints
health_df = pd.read_csv(AWS_fp+health_fn,index_col=time_vn)
indicator = 'Panel_Temp_Max'
healthy_limit = 10
unhealthy_idx = health_df[indicator] > healthy_limit
# print(unhealthy_idx)
print(unhealthy_idx.value_counts())


False    3085
True      251
Name: Panel_Temp_Max, dtype: int64


Plausible values

In [None]:
# Define boundaries for each variable
bounds = {'temp':[-40,30],'precip':[0,60],'wind':[0,75],'winddir':[0,360],
            'sp':[50,110],'sw':[0,1600],'lw':[-100,400],'rh':[0,100],'tcc':[0,100]}
units = {'temp':'C','precip':'mm hr-1','wind':'m s-1','winddir':'deg',
            'sp':'kPa','sw':'W m-2','lw':'W m-2','rh':'%','tcc':'%'}

Multiple data files

In [53]:
# def merge_files(fns):
fns = [data1_fn,data2_fn]
df = pd.read_csv(AWS_fp+fns[0],index_col=time_vn)
timestep_original = pd.to_datetime(df.index[1]) - pd.to_datetime(df.index[0])
for fn in fns[1:]:
    df_load = pd.read_csv(AWS_fp+fn,encoding='ISO-8859-1')
    timestep_load = pd.to_datetime(df_load.index[1]) - pd.to_datetime(df_load.index[0])
    if timestep_load < timestep_original:
        print(timestep_load.seconds)

  df_load = pd.read_csv(AWS_fp+fn,encoding='ISO-8859-1')


0


In [32]:
def basic_stats(fn,header,droprows):
    df = pd.read_csv(AWS_fp+fn,header=header,encoding = 'ISO-8859-1')
    df = df.drop(droprows,axis=0)
    df = df.set_index(time_vn)
    for column in df.columns:
        print(column,'Nonzero count:',df[column].count())

In [33]:
basic_stats(data2_fn,1,[0,1])

  df = pd.read_csv(AWS_fp+fn,header=header,encoding = 'ISO-8859-1')


RECORD Nonzero count: 956668
WS_ms_S_WVT Nonzero count: 956668
WindDir_D1_WVT Nonzero count: 956668
WindDir_SD1_WVT Nonzero count: 956668
WS_ms_Max Nonzero count: 956668
AirTC Nonzero count: 956668
NR_Wm2_Avg Nonzero count: 956668
CNR_Wm2_Avg Nonzero count: 191532
RH Nonzero count: 956668
SWin_Wm2_Avg Nonzero count: 457951
SWout_Wm2_Avg Nonzero count: 457951
cnr4_T_C_Avg Nonzero count: 307187
short_up_Avg Nonzero count: 307187
short_dn_Avg Nonzero count: 307187
long_up_corr_Avg Nonzero count: 307187
long_dn_corr_Avg Nonzero count: 307187
