# Compare barometric pressure recorded by analog barometers and vibrating wire sensors
Here we use the 4-hourly summary data.
* Analog barometers are temperature-corrected, so they can be considered a direct measure of true barometric pressure. However, they still need correcting to a height of 0 feet.
* Vibrating wire sensors need to be corrected with the corresponding temperature data, and height corrected to 0 feet.

## 1. Set up and Load transducers metadata

In [None]:
# Notebook setup: imports, the machine-specific input directory, and the
# transducer metadata table used by the correction steps in later cells.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import datetime as dt
import header
# NOTE(review): setup_environment() runs before libWellData is imported below;
# presumably it prepares the paths that import relies on, so keep this
# ordering - TODO confirm against header.py.
paths = header.setup_environment()
import platform
# On macOS ('Darwin') the merged well data is read from a hard-coded Dropbox
# folder; on every other platform it is read from the '03_merge_inventories'
# folder under paths['new_data'].
if platform.system()=='Darwin':
    INPUTDIR = '/Users/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/MERGED'
else:
    INPUTDIR = os.path.join(paths['new_data'], '03_merge_inventories')
import libWellData as LLE
# Transducer metadata; passed to the LLE helper calls in the cells that follow.
transducersDF = LLE.get_transducers_dataframe(paths)
# display is not imported here; presumably the notebook (IPython) built-in.
display(transducersDF)


## 2. Subset the summary of all files dataframe by Baro, 20 Hz, and 100 Hz subdirectories
Display the columns of each, after dropping empty columns.
* Baro only contains data columns for AirPressureShallow and AirPressureDeep.
* 20 Hz contains data columns for 1226423 and 2151692, plus corresponding temperature and stdev data
* 100 Hz contains data columns for '1226421', '1226419', '1226420', '2149882','2151691', and '1226429', plus corresponding temperature and stdev data

We plot:
- the barometric data
- the 100 Hz temperature data

None of these needs to be corrected before plotting (although later we will adjust the barometric data to a height of 0 feet).

In [20]:
def _subset_by_subdir(summary, subdir):
    """Return the rows of *summary* whose 'subdir' equals *subdir*, tidied up.

    The subset gets a parsed 'datetime' column built from 'TIMESTAMP', loses
    every column that is entirely empty for that subset, and loses any column
    whose name contains 'unnamed' (case-insensitive).

    Parameters:
        summary: DataFrame with at least 'subdir' and 'TIMESTAMP' columns.
        subdir: value of the 'subdir' column to keep, e.g. "Baro".

    Returns:
        A new, independent DataFrame; *summary* is not modified.
    """
    # Filter first, then copy: only the selected rows are duplicated, and the
    # result is safe to modify in place.
    subset = summary[summary['subdir'] == subdir].copy()
    subset['datetime'] = pd.to_datetime(subset['TIMESTAMP'])
    subset.dropna(how='all', axis=1, inplace=True)
    unnamed = subset.columns[subset.columns.str.contains('unnamed', case=False)]
    subset.drop(columns=unnamed, inplace=True)
    return subset


# Load in the summary of all files - raw 4 hourly data
dfall2 = pd.read_csv(os.path.join(INPUTDIR, 'all2.csv'))

# Split into baro, 20Hz, 100Hz
dfbaro = _subset_by_subdir(dfall2, "Baro")
df20hz = _subset_by_subdir(dfall2, "20hz")
df100hz = _subset_by_subdir(dfall2, "100hz")




## 3. Examine thermometer data 
For each vibrating wire pressure transducer, there is a corresponding temperature record.
Find these columns and rename them to the transducer serial number + "_temp".

In [None]:
# Keep only the rows in which the 'Therm(6)' reading is present
df100hz_thermal = df100hz.copy().dropna(subset=['Therm(6)'])
#display(df100hz_thermal)

# Transducer data columns are the string-named columns starting with '12' or
# '21'. Snapshot them up front so the renames below cannot disturb the loop.
serial_columns = [
    name for name in df100hz_thermal.columns
    if isinstance(name, str) and name[:2] in ('12', '21')
]

depth_and_tstd = []
for therm_number, serial in enumerate(serial_columns, start=1):
    # The n-th transducer column is paired with the column named 'Therm(n)'
    therm_name = f'Therm({therm_number})'
    temp_name = f'{serial}_temp'
    print(therm_name,'->',temp_name)
    df100hz_thermal.rename(columns={therm_name: temp_name}, inplace=True)
    metadata = LLE.get_transducer_metadata(serial, transducersDF)
    depth_and_tstd.append({
        'depth': metadata['set_depth_ft'],
        'Tstd': df100hz_thermal[temp_name].std(),
    })

# Plot every renamed temperature column against time
thermal_columns = [name for name in df100hz_thermal.columns if name.endswith('_temp')]
temperature_axes = df100hz_thermal.plot(x='datetime', y=thermal_columns, kind='line', ylabel='Temperature (C)')
temperature_axes.legend(bbox_to_anchor=(1.0,1.0), fontsize='small')

# Standard deviation of each temperature record against the 'set_depth_ft' metadata value
dftemp = pd.DataFrame(depth_and_tstd)
dftemp.plot(x='depth', y='Tstd', style='o', ylabel='stdev(Temperature), C ')

## 4. Plot raw barometric data


In [None]:
# Uncorrected traces: the two analog barometer columns first, then the two
# vibrating wire columns that are compared against them later in the notebook.
analog_columns = ['AirPressureShallow', 'AirPressureDeep']
dfbaro.plot(kind='line', x='datetime', y=analog_columns)
vibrating_wire_columns = ['1226420', '1226429']
df100hz.plot(kind='line', x='datetime', y=vibrating_wire_columns)

## 5. Correct all barometers and vibrating wire sensor data
- barometers are corrected for height only
- vibrating wire sensors in air are corrected for height also
- all vibrating wire sensors are corrected for sensitivity, and temperature
- vibrating wire sensors in water are corrected for barometric pressure too, from vibrating wire sensor in air

In [None]:
# Analog barometers: height correction requested, temperature correction not.
dfbaro_elevationRemoved_PSI = LLE.correctBarometricData(
    dfbaro,
    ['AirPressureShallow', 'AirPressureDeep'],
    transducersDF,
    temperatureCorrect=False,
    heightCorrect=True,
)
dfbaro_elevationRemoved_PSI.plot(kind='line', x='datetime', y=['AirPressureShallow', 'AirPressureDeep'])

# Vibrating wire columns 1226420 and 1226429: temperature and height correction
# requested. The input is df100hz_thermal, the frame holding the '<serial>_temp' columns.
df100hz_elevationRemoved_aircolumnonly = LLE.correctBarometricData(
    df100hz_thermal,
    ['1226420', '1226429'],
    transducersDF,
    temperatureCorrect=True,
    heightCorrect=True,
)

# Second pass over the whole frame via rawdf2psidf (temperature and air-pressure
# correction requested, depth correction not).
df100hz_elevationRemoved_PSI = LLE.rawdf2psidf(
    df100hz_elevationRemoved_aircolumnonly,
    transducersDF,
    temperatureCorrect=True,
    airpressureCorrect=True,
    depthCorrect=False,
)
df100hz_elevationRemoved_PSI.plot(kind='line', x='datetime', y=['1226420', '1226429'])

## 6. Merge the dataframes after rounding timestamps to nearest hour 
All time series from all2.csv are 4-hourly, since we have only 1 file per 4 hours. So we can round timestamps to the nearest hour to align times that are generally within 1 s of each other.

Drop columns we do not need

In [None]:
# Round both time axes to the nearest hour so the 4-hourly records of the two
# frames share a common merge key. The lowercase 'h' alias is used because the
# uppercase 'H' alias is deprecated since pandas 2.2.
df100hz_elevationRemoved_PSI['nearesthour'] = df100hz_elevationRemoved_PSI['datetime'].dt.round('h')
dfbaro_elevationRemoved_PSI['nearesthour'] = dfbaro_elevationRemoved_PSI['datetime'].dt.round('h')

# Inner join on the rounded hour; columns present in both frames come back
# with pandas' default '_x' / '_y' suffixes.
dfmerged = pd.merge(df100hz_elevationRemoved_PSI, dfbaro_elevationRemoved_PSI, on='nearesthour')

# Drop what we do not need: the suffixed duplicate columns and the DynStdDev* columns.
merged_names = dfmerged.columns
unwanted = (
    merged_names.str.endswith('_x')
    | merged_names.str.endswith('_y')
    | merged_names.str.startswith('DynStdDev')
)
dfmerged = dfmerged.loc[:, ~unwanted]
display(dfmerged)


## 7. Correlate the air pressure columns - and compare their means too


In [None]:
cols = ['AirPressureShallow', 'AirPressureDeep', '1226420', '1226429']

# xcorr[i, j] is the correlation (pandas Series.corr, default method) between
# cols[i] and cols[j] in the merged frame.
xcorr = np.array(
    [[dfmerged[row_name].corr(dfmerged[col_name]) for col_name in cols] for row_name in cols],
    dtype=float,
)

# dcshift[i, j] is mean(cols[i]) - mean(cols[j]); each mean is computed once
# and the pairwise differences come from broadcasting.
column_means = np.array([dfmerged[name].mean() for name in cols], dtype=float)
dcshift = column_means[:, np.newaxis] - column_means[np.newaxis, :]

display(xcorr)
display(dcshift)

## 8. Repeat the analysis, but don't make a temperature correction
Dropping the temperature correction improves the correlation between 1226420 and the analog barometers by 2%, but degrades the correlation between 1226429 and the analog barometers by almost 1%.
It improves the correlation between 1226420 and 1226429 by 4%.
However, if we stick to using 1226429, this loss in performance is fine.
Best shifts are now:
* +14.42556 PSI to align 1226420 with AirPressureShallow
* +14.55743 PSI to align 1226429 with AirPressureShallow

In [None]:
# Same height correction as before for 1226420 and 1226429, but with the
# temperature correction switched off.
df100hz_elevationRemoved_aircolumnonly_noT = LLE.correctBarometricData(
    df100hz_thermal,
    ['1226420', '1226429'],
    transducersDF,
    temperatureCorrect=False,
    heightCorrect=True,
)

# Merge with the analog barometer frame on the timestamp rounded to the hour.
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated since pandas 2.2.
df100hz_elevationRemoved_aircolumnonly_noT['nearesthour'] = df100hz_elevationRemoved_aircolumnonly_noT['datetime'].dt.round('h')
dfmerged2 = pd.merge(df100hz_elevationRemoved_aircolumnonly_noT, dfbaro_elevationRemoved_PSI, on='nearesthour')

# Drop the suffixed duplicate columns ('_x' / '_y') and the DynStdDev* columns.
merged2_names = dfmerged2.columns
unwanted2 = (
    merged2_names.str.endswith('_x')
    | merged2_names.str.endswith('_y')
    | merged2_names.str.startswith('DynStdDev')
)
dfmerged2 = dfmerged2.loc[:, ~unwanted2]

cols = ['AirPressureShallow', 'AirPressureDeep', '1226420', '1226429']

# xcorr2[i, j]: correlation between cols[i] and cols[j];
# dcshift2[i, j]: mean(cols[i]) - mean(cols[j]).
xcorr2 = np.array(
    [[dfmerged2[row_name].corr(dfmerged2[col_name]) for col_name in cols] for row_name in cols],
    dtype=float,
)
column_means2 = np.array([dfmerged2[name].mean() for name in cols], dtype=float)
dcshift2 = column_means2[:, np.newaxis] - column_means2[np.newaxis, :]

display(xcorr2)
display(dcshift2)
# Change relative to the temperature-corrected results (xcorr, dcshift) from the previous section
display(xcorr2-xcorr)
display(dcshift2-dcshift)

In [None]:
## 9. Compare with these shifts applied
dfmerged3 = dfmerged2.copy()

# Constant offset added to each vibrating wire column (the values quoted in
# the section 8 text as aligning each sensor with AirPressureShallow).
for serial, shift in (('1226420', 14.42556), ('1226429', 14.55743)):
    dfmerged3[serial] = dfmerged3[serial] + shift

# xcorr3[i, j]: correlation between cols[i] and cols[j] after shifting;
# dcshift3[i, j]: mean(cols[i]) - mean(cols[j]) after shifting.
xcorr3 = np.array(
    [[dfmerged3[row_name].corr(dfmerged3[col_name]) for col_name in cols] for row_name in cols],
    dtype=float,
)
column_means3 = np.array([dfmerged3[name].mean() for name in cols], dtype=float)
dcshift3 = column_means3[:, np.newaxis] - column_means3[np.newaxis, :]

display(xcorr3)
display(dcshift3)
# Change relative to the unshifted results (xcorr2, dcshift2)
display(xcorr3-xcorr2)
display(dcshift3-dcshift2)

## 10. Full workflow for 100 Hz data


In [None]:
air_sensors = ['1226420', '1226429']
water_sensors = ['1226419', '1226421', '2151691', '2149882']
all_sensors = air_sensors + water_sensors

# 10.1: Show the raw 100 Hz data.
# NOTE(review): the original comment said spikes are removed here, but this
# step only displays the frame; the clipping happens in 10.4.
display(df100hz)

# 10.2: Correct the two air sensors (height, calibration digits->PSI) with a PSI shift applied.
# NOTE(review): temperatureCorrect=True is passed, while the section 8 text
# reports these shifts for the run without temperature correction, and the
# input is df100hz rather than df100hz_thermal (the frame with the
# '<serial>_temp' columns) - confirm this is intended.
df100hz_correctedAirSensors = LLE.correctBarometricData(
    df100hz,
    list(air_sensors),
    transducersDF,
    temperatureCorrect=True,
    heightCorrect=True,
    dcshifts=[14.42556,14.55743],
)
display(df100hz_correctedAirSensors[air_sensors])

# 10.3: Correct digital water transducers for calibration and barometric pressure
df100hz_correctedAllSensors = LLE.rawdf2psidf(
    df100hz_correctedAirSensors,
    transducersDF,
    temperatureCorrect=True,
    airpressureCorrect=True,
    depthCorrect=False,
)
display(df100hz_correctedAllSensors[all_sensors])

# 10.4: All sensors contain spikes, so clip every transducer column (string
# name starting with '12' or '21') to within 0.5 of its own median.
transducer_columns = [
    name for name in df100hz_correctedAllSensors
    if isinstance(name, str) and name[:2] in ('12', '21')
]
for name in transducer_columns:
    centre = df100hz_correctedAllSensors[name].median()
    df100hz_correctedAllSensors[name] = df100hz_correctedAllSensors[name].clip(centre-0.5, centre+0.5)

display(df100hz_correctedAllSensors[all_sensors])

# 10.5: Plot the air sensors and the water sensors separately
df100hz_correctedAllSensors.plot(x='datetime', y=air_sensors, style='.')
df100hz_correctedAllSensors.plot(x='datetime', y=water_sensors, style='.')

## 11. To do
In this notebook, we have examined temperature data and air pressure data, computed cross-correlations and DC shifts, and we explored how to correct data.

What we need to do next is locate the other script I had written to load data between two datetimes, using all2.csv as a file lookup table. Then we need to add in these additional processing steps.

We also need to add in steps to convert to Pascals and metres of water.