In [None]:
# Notebook setup: configure the project environment, choose the directory that
# holds the daily pickle files, and map each well to its transducer serials.
import os
import header
# NOTE(review): this call runs before the remaining imports; presumably it
# configures the environment they rely on (e.g. the import path used by
# libWellData) — confirm before reordering the imports.
paths = header.setup_environment()
import platform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#print(paths)
# macOS uses a hard-coded absolute path; every other platform derives the
# directory from paths['DROPBOX_DATA_TOP'].
# NOTE(review): the macOS directory is spelled 'O5_DAILY' (letter O) while the
# other branch uses '05_DAILY' (digit zero) — confirm which spelling is on disk.
if platform.system()=='Darwin':
    INPUTDIR = '/Users/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/O5_DAILY'
else:
    INPUTDIR = os.path.join(paths['DROPBOX_DATA_TOP'], 'WellData', '05_DAILY')
# Listing the directory also fails immediately if INPUTDIR does not exist.
print(os.listdir(INPUTDIR))
import libWellData as LLE
transducersDF = LLE.get_transducers_dataframe(paths)
serials = LLE.watercolumns
# Group the serials by well using fixed positions in LLE.watercolumns.
# NOTE(review): the position lists [0,1,4] and [2,3,5] assume a specific
# ordering of LLE.watercolumns — verify against libWellData.
sensors = {'shallow':[serials[i] for i in [0,1,4]], 'intermediate':[serials[i] for i in [2,3,5]]}
print(sensors)

def median_despike(data, window_size, threshold=None):
    """Despike a pandas Series using a moving window median filter.

    The overall median of ``data`` is removed, a centred rolling median is
    computed, and every sample whose absolute departure from that rolling
    median exceeds ``threshold`` is replaced by the rolling median. The overall
    median is added back before returning.

    Parameters
    ----------
    data : pandas.Series
        Samples to despike.
    window_size : int
        Window size passed to ``Series.rolling`` (centred, ``min_periods=1``).
    threshold : float, optional
        Largest allowed absolute departure from the rolling median, in the
        same units as ``data``. ``None`` (the default) selects an adaptive
        threshold: three times the rolling median of the absolute departures.
        An explicit ``0`` is honoured as a real threshold.

    Returns
    -------
    pandas.Series
        Despiked samples with the same index as ``data``, so the result can be
        assigned back to the originating DataFrame column without misalignment.
    """
    m = data.median()
    centred = data - m
    rolling_median = centred.rolling(window_size, center=True, min_periods=1).median()
    difference = np.abs(centred - rolling_median)
    # Compare against None explicitly: `not threshold` would treat 0 as
    # "unset" and raises for array-like thresholds.
    if threshold is None:
        mad = difference.rolling(window_size, center=True, min_periods=1).median()
        threshold = 3 * mad  # Adjust threshold as needed
    despiked_data = np.where(difference > threshold, rolling_median, centred)
    # np.where returns a bare ndarray, so re-attach the caller's index.
    return pd.Series(despiked_data, index=data.index) + m


# Load the manually measured water levels (one worksheet per well), build a
# combined 'datetime' column, and plot both wells on a single axis.
wells = ['shallow', 'intermediate']

# NOTE(review): machine-specific absolute path (and, unlike INPUTDIR, it has no
# macOS variant) — consider deriving it from `paths`.
WATERLEVELS_XLSX = '/home/thompsong/Dropbox/waterlevels.xlsx'

wldf = {}
fh, axs = plt.subplots(1,1)
for well in wells:
    measured = pd.read_excel(WATERLEVELS_XLSX, sheet_name=well)
    # Strip spaces so the column names are easy to reference below.
    measured.columns = measured.columns.str.replace(' ', '')
    # Combine the separate Date and Time columns into one timestamp column.
    measured['datetime'] = pd.to_datetime(measured['Date'].astype(str) + ' ' + measured['Time'].astype(str))
    display(measured)
    measured = measured.rename(columns={'CorrectedElevationinFeet(NAVD)':'NAVD88_Feet'})
    measured.plot(ax=axs, x='datetime', y='NAVD88_Feet', label=well, style='o', ylabel='Water Level (NAVD88 Feet)')
    wldf[well] = measured

# get list of unique dates
# Series.unique() can return raw numpy datetime64 values (which have no
# .strftime) depending on the pandas version; pd.to_datetime guarantees that
# iterating over `dates` yields pandas Timestamps.
dates = pd.to_datetime(wldf['shallow']['Date'].unique())
print(dates)

In [None]:
# For every day that has a manual measurement: load the raw daily pickle,
# despike each sensor, convert to feet (optionally subtracting the transducer
# set depth), save the result as <YYYYMMDD>_feet.pkl, and compare the 1-minute
# medians against the manual measurements.
correct_for_setdepth = True

lod = []  # one dict per (measurement, sensor) comparison
for d in dates:
    # Normalise to a pandas Timestamp so .strftime and Timedelta arithmetic
    # work regardless of how `dates` was built; skip missing dates.
    d = pd.Timestamp(d)
    if pd.isna(d):
        continue

    # Build the file name outside the try block so it is always defined when
    # the error message is printed.
    pklfile = os.path.join(INPUTDIR, f'{d.strftime("%Y%m%d")}.pkl')
    try:
        daydf = pd.read_pickle(pklfile)
    except Exception as exc:
        print(f'Pickle file {pklfile} for {d} could not be read ({exc})')
        if d != pd.to_datetime('2022-12-03'):
            continue
        # Special case for 2022-12-03: 2022-12-02 is bad, so fall back two days.
        filed = d - pd.Timedelta(hours=48)
        pklfile = os.path.join(INPUTDIR, f'{filed.strftime("%Y%m%d")}.pkl')
        print(f'Using {pklfile} for {d} instead')
        daydf = pd.read_pickle(pklfile)

    daydf.drop(columns=daydf.columns[daydf.columns.str.startswith('Therm')], inplace=True)
    display(daydf)
    for well in wells:
        for serial in sensors[well]:
            print('\ndespiking', d, well, serial)
            # 30 minutes of data. 1 inch max departure from median in that time.
            # NOTE(review): the 1/12 threshold is described in feet, but
            # despiking runs before LLE.psi2feet(), i.e. on unconverted
            # values — confirm the intended units.
            daydf[serial] = median_despike(daydf[serial], 30*100*60, threshold=1/12)

            level_ft = LLE.psi2feet(daydf[serial])
            if correct_for_setdepth:
                this_transducer = LLE.get_transducer_metadata(serial, transducersDF)
                setdepth = this_transducer['set_depth_ft']
                level_ft = level_ft - abs(setdepth)
            daydf[serial] = level_ft

    # The output name always uses the measurement day `d`, even when the data
    # came from the fallback file.
    pkloutfile = os.path.join(INPUTDIR, f'{d.strftime("%Y%m%d")}_feet.pkl')
    daydf.to_pickle(pkloutfile)

    # downsample to 1-minute for plotting purposes
    daydf = daydf.set_index('datetime').resample('1min').median().reset_index()

    # plot
    fh, axs = plt.subplots(2,1)
    day_end = d + pd.Timedelta(hours=24)
    for i, well in enumerate(wells):
        print('*******************')
        print(f'**** {well} Well ****')
        daydf.plot(ax=axs[i], x='datetime', y=sensors[well])
        truedf = wldf[well]
        # Start the window at whichever is earlier: the day itself or the
        # first sample actually loaded (matters for the fallback file).
        mindt = min( [d,daydf['datetime'].min()] )
        trueday = truedf[(truedf['datetime']>mindt) & (truedf['datetime']<day_end)]
        trueday.plot(ax=axs[i], x='datetime',y='NAVD88_Feet', style='o', label='measured', ylabel='Water Level (NAVD88 Feet)', xlim=[mindt,day_end], title=well)

        # NaN-free view of each sensor, built once per well instead of once
        # per measurement row.
        sensor_frames = {serial: daydf.dropna(subset=[serial]) for serial in sensors[well]}

        # difference between estimated and measured times
        for j, row in trueday.iterrows():
            print(f'Measured {row["NAVD88_Feet"]} at {row["datetime"]}')
            for serial in sensors[well]:
                serialdf = sensor_frames[serial]
                if serialdf.empty:
                    # Nothing to compare against; idxmin() would raise.
                    print(f'- No data from sensor {serial}')
                    continue
                timediffs = (serialdf['datetime'] - row['datetime']).abs()
                timediff = timediffs.min()
                datadf = serialdf.loc[timediffs.idxmin()]  # nearest 1-minute sample
                print(f'- Estimated {datadf[serial]:.3f} feet at {datadf["datetime"]} from sensor {serial}: time diff={timediff}')
                lod.append({'well':well, 'serial':serial, 'measured_datetime':row['datetime'], 'measured_wl':row['NAVD88_Feet'], 'estimated_datetime':datadf['datetime'], 'estimated_wl':datadf[serial], 'datetime_diff':timediff, 'wl_diff':row['NAVD88_Feet']-datadf[serial]})
    plt.show()
    plt.close(fh)  # release the figure; one is created per day

# Build the comparison table once, after every day has been processed.
waterleveldf = pd.DataFrame(lod)
display(waterleveldf)

In [None]:
# Show the table of measured-versus-estimated water level comparisons.
display(waterleveldf)


In [None]:

# Save the comparison table, then turn the sparse measured-minus-estimated
# differences of each sensor into a 1-minute correction series written to
# <serial>_height_correction_feet.csv in the current working directory.
waterleveldf.to_csv(os.path.join(INPUTDIR, 'waterleveldf.csv'))

display(transducersDF)
transducersDFnew = transducersDF.copy()
heightshiftdfs = {}  # serial -> interpolated 1-minute correction frame
for well in wells:
    fig, axs = plt.subplots(len(sensors[well]),1)
    dfwell = waterleveldf[waterleveldf['well']==well]

    for i,serial in enumerate(sensors[well]):
        dfsensor = dfwell[dfwell['serial']==serial]
        dfsensor.plot(ax=axs[i], x='estimated_datetime', y='wl_diff', ylabel='Feet', legend=False, title=f'Water Level Difference (M-E) for sensor {serial}', style='o-', grid=True)
        print('interpolating water level correction for ',serial)

        # resample measured data from intermittent sparse to 1-minute
        # Chained (non-inplace) calls avoid mutating a slice of waterleveldf,
        # which triggers pandas' SettingWithCopy warning.
        # NOTE(review): Resampler.interpolate() first conforms the data to the
        # 1-minute grid, so measurements that are not on a whole minute may
        # not contribute — confirm the measured times are minute-aligned.
        thisdf = (
            dfsensor[['measured_datetime', 'wl_diff']]
            .set_index('measured_datetime')
            .resample('1min')
            .interpolate()
            .reset_index()
        )
        heightshiftdfs[serial] = thisdf
        thisdf.plot(ax=axs[i], x='measured_datetime',y='wl_diff', label='interp')
        thisdf.to_csv(f'{serial}_height_correction_feet.csv', index=False)

# Group by the column name (not a one-element list) so each group key is the
# plain serial rather than a 1-tuple on recent pandas versions.
grouped = waterleveldf.groupby('serial')
for name,group in grouped:
    print(f'Water level estimated for {name} is on average too low compared to measured value by {group["wl_diff"].mean():.3f} feet')


In [None]:
# Re-run the measured-versus-estimated comparison after applying the
# per-sensor height corrections to each <YYYYMMDD>_feet.pkl file.
# Note: this cell rebuilds `lod` and `waterleveldf`, replacing the
# uncorrected comparison held in those names.
lod = []
for d in dates:
    # Normalise to a pandas Timestamp so .strftime and Timedelta arithmetic
    # work regardless of how `dates` was built; skip missing dates.
    d = pd.Timestamp(d)
    if pd.isna(d):
        continue

    # Build the file name outside the try block so it is always defined when
    # the error message is printed.
    pklfile = os.path.join(INPUTDIR, f'{d.strftime("%Y%m%d")}_feet.pkl')
    try:
        daydf = pd.read_pickle(pklfile)
    except Exception as exc:
        print(f'Pickle file {pklfile} for {d} could not be read ({exc})')
        continue

    display(daydf)
    # for each sensor in each well, apply the height correction
    for well in wells:
        for serial in sensors[well]:
            print('\ncorrecting', d, well, serial)
            # load height correction data, which is every minute, and interpolate
            csvfile = f'{serial}_height_correction_feet.csv'
            hcdf = pd.read_csv(csvfile, index_col=None)
            hcdf['measured_datetime'] = pd.to_datetime(hcdf['measured_datetime'])
            hcdf = hcdf.set_index('measured_datetime')
            # Keep only the corrections that fall within this day's samples.
            hcdf = hcdf.loc[daydf['datetime'].min():daydf['datetime'].max()]
            display(hcdf)
            newcol = f'{serial}_wldiff'
            hcdf = (
                hcdf.resample('10ms')
                .interpolate()
                .reset_index()
                .rename(columns={'wl_diff': newcol})
            )
            # join
            # Inner join: samples whose timestamp has no exact match on the
            # interpolated 10 ms grid are dropped from daydf.
            daydf = daydf.merge(hcdf, left_on='datetime', right_on='measured_datetime', how='inner')
            daydf = daydf.drop(columns='measured_datetime')
            daydf[serial] = daydf[serial] + daydf[newcol]

    # downsample for plotting
    daydf = daydf.set_index('datetime').resample('1min').median().reset_index()
    display(daydf)

    fh, axs = plt.subplots(2,1)
    day_end = d + pd.Timedelta(hours=24)
    for i, well in enumerate(wells):
        print('*******************')
        print(f'**** {well} Well ****')
        daydf.plot(ax=axs[i], x='datetime', y=sensors[well])
        truedf = wldf[well]
        trueday = truedf[(truedf['datetime']>d) & (truedf['datetime']<day_end)]
        trueday.plot(ax=axs[i], x='datetime',y='NAVD88_Feet', style='o', label='measured', ylabel='Water Level (NAVD88 Feet)', xlim=[d,day_end], title=well)

        # NaN-free view of each sensor, built once per well instead of once
        # per measurement row.
        sensor_frames = {serial: daydf.dropna(subset=[serial]) for serial in sensors[well]}

        # difference between estimated and measured times
        for j, row in trueday.iterrows():
            print(f'Measured {row["NAVD88_Feet"]} at {row["datetime"]}')
            for serial in sensors[well]:
                serialdf = sensor_frames[serial]
                if serialdf.empty:
                    # Nothing to compare against; idxmin() would raise.
                    print(f'- No data from sensor {serial}')
                    continue
                timediffs = (serialdf['datetime'] - row['datetime']).abs()
                timediff = timediffs.min()
                datadf = serialdf.loc[timediffs.idxmin()]  # nearest 1-minute sample
                print(f'- Estimated {datadf[serial]:.3f} feet at {datadf["datetime"]} from sensor {serial}: time diff={timediff}')
                lod.append({'well':well, 'serial':serial, 'measured_datetime':row['datetime'], 'measured_wl':row['NAVD88_Feet'], 'estimated_datetime':datadf['datetime'], 'estimated_wl':datadf[serial], 'datetime_diff':timediff, 'wl_diff':row['NAVD88_Feet']-datadf[serial]})
    plt.show()
    plt.close(fh)  # release the figure; one is created per day

# Build the comparison table once, after every day has been processed.
waterleveldf = pd.DataFrame(lod)
        

In [None]:
# this works. so now load each day and process it a similar way
#
# For every day in [startdate, enddate): despike each available sensor,
# convert to feet relative to the set depth, apply the height correction,
# save <YYYYMMDD>_final.pkl and a matching .png of the 1-minute medians.

# loop over each pklfile
startdate = pd.to_datetime('2022-07-21')
enddate = pd.to_datetime('2022-12-02')
# enddate itself is excluded, matching a `while d < enddate` loop.
days = [day for day in pd.date_range(startdate, enddate, freq='D') if day < enddate]
for d in days:
    pkloutfile = os.path.join(INPUTDIR, f'{d.strftime("%Y%m%d")}_final.pkl')
    pngfile = pkloutfile.replace('.pkl', '.png')

    # check if this day already processed (the PNG is written last)
    if os.path.isfile(pngfile):
        continue

    pklfile = os.path.join(INPUTDIR, f'{d.strftime("%Y%m%d")}.pkl')
    try:
        daydf = pd.read_pickle(pklfile)
    except Exception as exc:
        print(f'Pickle file {pklfile} for {d} could not be read ({exc})')
        continue

    daydf.drop(columns=daydf.columns[daydf.columns.str.startswith('Therm')], inplace=True)
    for well in wells:
        for serial in sensors[well]:

            if serial not in daydf.columns:
                continue
            print('\ndespiking', d, well, serial)
            # 30 minutes of data. 1 inch max departure from median in that time.
            # NOTE(review): the 1/12 threshold is described in feet, but
            # despiking runs before LLE.psi2feet(), i.e. on unconverted
            # values — confirm the intended units.
            daydf[serial] = median_despike(daydf[serial], 30*100*60, threshold=1/12)

            this_transducer = LLE.get_transducer_metadata(serial, transducersDF)
            setdepth = this_transducer['set_depth_ft']
            daydf[serial] = LLE.psi2feet(daydf[serial]) - abs(setdepth)

            # load height correction data, which is every minute, and interpolate
            print('\ncorrecting', d, well, serial)
            csvfile = f'{serial}_height_correction_feet.csv'
            hcdf = pd.read_csv(csvfile, index_col=None)
            hcdf['measured_datetime'] = pd.to_datetime(hcdf['measured_datetime'])
            hcdf = hcdf.set_index('measured_datetime')
            # Keep only the corrections that fall within this day's samples.
            hcdf = hcdf.loc[daydf['datetime'].min():daydf['datetime'].max()]
            newcol = f'{serial}_wldiff'
            hcdf = (
                hcdf.resample('10ms')
                .interpolate()
                .reset_index()
                .rename(columns={'wl_diff': newcol})
            )
            # join
            # Inner join: samples whose timestamp has no exact match on the
            # interpolated 10 ms grid are dropped from daydf.
            daydf = daydf.merge(hcdf, left_on='datetime', right_on='measured_datetime', how='inner')
            daydf = daydf.drop(columns='measured_datetime')
            daydf[serial] = daydf[serial] + daydf[newcol]

    daydf.to_pickle(pkloutfile)

    # downsample for plotting
    daydf = daydf.set_index('datetime').resample('1min').median().reset_index()

    fh, axs = plt.subplots(2,1)
    for i, well in enumerate(wells):
        print('*******************')
        print(f'**** {well} Well ****')
        # Preserve the order of sensors[well] so line colours and legend
        # order are the same on every run (a set intersection is unordered).
        available_sensors = [serial for serial in sensors[well] if serial in daydf.columns]
        if not available_sensors:
            # Plotting with an empty column list would raise; leave the
            # panel empty instead of aborting the whole batch.
            print(f'No sensors available for {well} well on {d}')
            axs[i].set_title(well)
            continue
        daydf.plot(ax=axs[i], x='datetime', y=available_sensors, ylabel='Water Level (NAVD88 Feet)', xlim=[d,d+pd.Timedelta(hours=24)], title=well)
    fh.savefig(pngfile)
    plt.show()
    plt.close(fh)  # one figure per day; close it so figures do not accumulate



In [None]:
# Debug check: compare the columns of the most recently assigned `daydf`
# with the list of shallow-well sensor serials.
print(daydf.columns)
print(sensors['shallow'])

In [None]:
# Debug check: shallow-well serials that are present as columns of `daydf`
# (computed as a set intersection, so the printed order is arbitrary).
print(list(set(daydf.columns) & set(sensors['shallow'])) )