# Analyze summary data, one line per file 
Here we use the 4-hourly summary data.
* Analog barometers are temperature-corrected, so they can be considered a direct measure of true barometric pressure. However, they still need correcting to a height of 0 feet.
* Vibrating wire sensors need to be corrected with the corresponding temperature data, and height corrected to 0 feet.

## 1. Set up and Load transducers metadata

In [None]:
# Setup cell: import libraries, choose the input directory, and load transducer metadata.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Project-local helper module; setup_environment() returns `paths`, which is indexed
# by string keys below (e.g. paths['new_data']).
# NOTE(review): presumably setup_environment() also makes libWellData importable
# (e.g. by extending sys.path) -- confirm before moving the imports around.
import header
paths = header.setup_environment()
import platform
# On macOS ('Darwin') read from a hard-coded local Dropbox directory; on any other
# platform use the '03_merge_inventories' folder under paths['new_data'].
if platform.system()=='Darwin':
    INPUTDIR = '/Users/thompsong/Dropbox/PROFESSIONAL/RESEARCH/3_Project_Documents/NASAprojects/201602_Rocket_Seismology/DATA/2022_DATA/WellData/MERGED'
else:
    INPUTDIR = os.path.join(paths['new_data'], '03_merge_inventories')
import libWellData as LLE
# Transducer metadata table; it is passed to the LLE correction helpers in later cells.
transducersDF = LLE.get_transducers_dataframe(paths)
# `display` is not imported in this cell; it relies on the notebook (IPython) session.
display(transducersDF)

## 2. Subset the summary of all files dataframe by Baro, 20 Hz, and 100 Hz subdirectories
Display the columns of each, after dropping empty columns.
* Baro only contains data columns for AirPressureShallow and AirPressureDeep.
* 20 Hz contains data columns for 1226423 and 2151692, plus corresponding temperature and stdev data
* 100 Hz contains data columns for '1226421', '1226419', '1226420', '2149882','2151691', and '1226429', plus corresponding temperature and stdev data

We plot:
- the barometric data
- the 100 Hz temperature data

We plot these because none of them needs to be corrected (although later we will adjust the barometric data to a height of 0 feet).

In [None]:
# Load the summary CSV split into separate dataframes and leave them un-merged;
# skipping the re-merge means the 1 Hz data is returned as well.
summary_csv = os.path.join(INPUTDIR, 'all2.csv')
all_dataframes = LLE.load_summary_csv(summary_csv, split=True, remerge=False)
print('**** RETURNED ******')
# List the columns held by each returned dataframe, keyed by its name.
for subset_name, subset_df in all_dataframes.items():
    print(subset_name, subset_df.columns)
#display(all_dataframes['1hz'])

In [None]:
# Load the same summary CSV again, this time with remerge=True, and report
# how many rows it has and which columns it carries.
summarydf = LLE.load_summary_csv(
    os.path.join(INPUTDIR, 'all2.csv'),
    split=True,
    remerge=True,
)
print(len(summarydf))
display(summarydf.columns)

## 3. Examine thermometer data 
For each vibrating wire pressure transducer there is a corresponding thermal record, which the code above renamed from Therm(1)..Therm(6) etc. to serialno+'_temp'.

In [None]:
# Load the summary with temperatureData=True (presumably this keeps the
# '<serialno>_temp' thermistor columns -- confirm against libWellData).
temperaturedf = LLE.load_summary_csv(os.path.join(INPUTDIR, 'all2.csv'), split=True, remerge=True, temperatureData=True)
#temperaturedf = temperaturedf.reindex(sorted(temperaturedf.columns), axis=1) # for some reason sorting columns here or in qc_dataframe throws off the datetime plot
temperaturedf['nearestminute'] = pd.to_datetime(temperaturedf['nearestminute'])

#thermalcolumns = [item for item in temperaturedf.columns if item.endswith('_temp')]
thermalcolumns = temperaturedf.columns[temperaturedf.columns.str.endswith('_temp')]

# Every line shares the time span of the whole table, so work it out once
# instead of on each loop iteration (guarded so an empty table does not fail here).
time_limits = None
if len(temperaturedf) > 0:
    time_limits = [temperaturedf.iloc[0]['nearestminute'], temperaturedf.iloc[-1]['nearestminute']]

# Plot 1: temperature time series, one line per thermistor, on a shared axis.
fh, ax = plt.subplots(1, 1)
lod = []  # one summary record per thermistor: depth, mean, standard deviation, label
for tempcol in thermalcolumns:
    col = tempcol.replace('_temp', '')  # transducer serial number
    this_transducer = LLE.get_transducer_metadata(col, transducersDF)
    depth = this_transducer['set_depth_ft']  # set depth; feet, judging by the key name
    lod.append({'depth': depth, 'mean': temperaturedf[tempcol].mean(), 'Tstd': temperaturedf[tempcol].std(), 'label': col})
    # Select just the two needed columns and drop gaps without mutating a slice
    # in place (avoids copying the whole table for every thermistor).
    this_df = temperaturedf[['nearestminute', tempcol]].dropna(subset=[tempcol])
    line_ax = this_df.plot(ax=ax, x='nearestminute', y=tempcol, kind='line', xlabel='Date', ylabel='Temperature (C)', label=col, grid=True, xlim=time_limits)
    line_ax.legend(bbox_to_anchor=(1.0, 1.0), fontsize='small')
# Naming the columns explicitly keeps dftemp['label'] valid even when no
# '_temp' columns were found.
dftemp = pd.DataFrame(lod, columns=['depth', 'mean', 'Tstd', 'label'])
plt.show()

# Plots 2 and 3: mean and standard deviation of temperature against set depth.
# Each thermistor is plotted separately so that it gets its own legend entry.
for stat_column, stat_ylabel in (('mean', 'Mean Temperature (C)'), ('Tstd', 'St. Dev. Temperature (C)')):
    fh, ax = plt.subplots(1, 1)
    for thislabel in dftemp['label']:
        thisdf = dftemp[dftemp['label'] == thislabel]
        thisdf.plot(ax=ax, x='depth', y=stat_column, style='o', ylabel=stat_ylabel, label=thislabel, grid=True)
    plt.show()

display(temperaturedf[thermalcolumns].mean())


## 4. Plot raw barometric data


In [None]:
#all_dataframes['baro'].plot(x='datetime', y=['AirPressureShallow', 'AirPressureDeep'], kind='line')
#all_dataframes['100hz'].plot(x='datetime', y=['1226420', '1226429'], kind='line')
# Air-pressure channels: the two Baro columns followed by two 100 Hz columns.
aircolumns = ['AirPressureShallow', 'AirPressureDeep', '1226420', '1226429']
summarydf.plot(x='datetime', y=aircolumns, kind='line')

# Report the row count, then the last and first timestamps in the table.
print(len(summarydf))
last_row = summarydf.iloc[-1]
first_row = summarydf.iloc[0]
print(last_row['datetime'])
print(first_row['datetime'])

# Show which file each timestamp came from.
display(summarydf[['datetime', 'basename']])


## 5. Correct all barometers and vibrating wire sensor data
- barometers are corrected for height only
- vibrating wire sensors in air are corrected for height also
- all vibrating wire sensors are corrected for sensitivity, and temperature
- vibrating wire sensors in water are corrected for barometric pressure too, from vibrating wire sensor in air

In [None]:
# The triple-quoted string below is an unused string literal holding an earlier
# version of this workflow; it is never executed, so none of the names it
# assigns (e.g. dfbaro_elevationRemoved_PSI) are created by this cell.
'''
dfbaro_elevationRemoved_PSI = LLE.correctBarometricData(all_dataframes['baro'], aircolumns[:2], transducersDF, temperatureCorrect=False, heightCorrect=True)  
dfbaro_elevationRemoved_PSI.plot(x='datetime', y=['AirPressureShallow', 'AirPressureDeep'], kind='line')
df100hz_elevationRemoved_aircolumnonly = LLE.correctBarometricData(df100hz_thermal, aircolumns[2:], transducersDF, temperatureCorrect=True, heightCorrect=True)
df100hz_elevationRemoved_PSI = LLE.rawdf2psidf(df100hz_elevationRemoved_aircolumnonly, transducersDF, temperatureCorrect=True, airpressureColumn='1226429', depthCorrect=False)
df100hz_elevationRemoved_PSI.plot(x='datetime', y=aircolumns[2:], kind='line')
'''
# Correct all four air-pressure columns of the merged summary with
# heightCorrect=True and temperatureCorrect=False, then plot them on `ax`.
correcteddf = LLE.correctBarometricData(summarydf, aircolumns, transducersDF, temperatureCorrect=False, heightCorrect=True)  
ax=correcteddf.plot(x='datetime', y=aircolumns, kind='line')
# Summary statistics (count, mean, std, quartiles, ...) for each air column.
airstatsdf = correcteddf[aircolumns].describe()
#display(correcteddf[aircolumns].mean())
#display(correcteddf[aircolumns[2:]])
#correcteddf = LLE.correctBarometricData(correcteddf, aircolumns[2:], transducersDF, temperatureCorrect=False, heightCorrect=False)
#correcteddf = LLE.rawdf2psidf(correcteddf, transducersDF, temperatureCorrect=True, airpressureColumn='1226429', depthCorrect=False)
#correcteddf.plot(x='datetime', y=aircolumns[2:], kind='line')
#display(correcteddf[aircolumns[2:]])

# Reference series, plotted as 'KSC weather tower': read from a CSV in the current
# working directory, resampled to 4-hour medians, and overlaid on the same axes.
dfnasa = pd.read_csv('Barometric Pressure.csv')
dfnasa['datetime']=pd.to_datetime(dfnasa['datetime'])
dfnasa = dfnasa.set_index('datetime')
dfnasa = dfnasa.resample('4h').median()
dfnasa.reset_index(inplace=True)
dfnasa.plot(ax=ax, x='datetime', y='PSI', label='KSC weather tower', ylabel='Air Pressure (PSI)', xlabel='Date')
plt.show()
# Add the reference series' statistics as an extra 'KSC' column for comparison.
airstatsdf['KSC']=dfnasa['PSI'].describe()
display(airstatsdf)

# Offset-removed plot: note that each series has its MEDIAN subtracted,
# even though the variables are named "demeaned".
demeaneddf = pd.DataFrame()
demeaneddf['datetime'] = correcteddf['datetime']
for col in aircolumns:
    demeaneddf[col] = correcteddf[col] - correcteddf[col].median()
ax=demeaneddf.plot(x='datetime', y=aircolumns, kind='line')
# Same median removal for the reference series, overlaid on the same axes.
demeaneddf2 = pd.DataFrame()
demeaneddf2['datetime'] = dfnasa['datetime']
demeaneddf2['PSI'] = dfnasa['PSI'] - dfnasa['PSI'].median()
demeaneddf2.plot(ax=ax,x='datetime', y='PSI', kind='line', label='KSC weather tower', ylabel='Air Pressure (PSI)', xlabel='Date')
plt.show()

# Convert the remaining sensor columns with temperatureCorrect=True and plot the
# six water-column transducers.
# NOTE(review): the section text says water sensors are corrected using a vibrating
# wire sensor in air, but airpressureColumn here is 'AirPressureShallow' (a Baro
# column) -- confirm this is intended.
watercolumns = ['1226419', '1226421', '2151691', '2149882', '1226423', '2151692']
correcteddf = LLE.rawdf2psidf(correcteddf, transducersDF, temperatureCorrect=True, airpressureColumn='AirPressureShallow', depthCorrect=False)
correcteddf.plot(x='datetime', y=watercolumns, ylabel='Water Pressure PSI', xlabel='Date')

## 6. Merge the dataframes after rounding timestamps to nearest minute
All time series from all2.csv are 4-hourly, since we have only one file per 4 hours. So we can round to the nearest minute to align times that are generally within 1 s of each other.

Drop columns we do not need

In [None]:
# Round both dataframes' timestamps to the minute, merge them on 'nearestminute',
# and plot the air-pressure columns of the merged result.
# NOTE(review): df100hz_elevationRemoved_PSI and dfbaro_elevationRemoved_PSI are only
# assigned inside the disabled triple-quoted string in section 5 of this notebook, so
# this cell appears to raise NameError unless they were created elsewhere -- confirm.
# NOTE(review): dfmerged1 is reassigned from correcteddf at the start of section 7,
# so the result of this cell is not what the later cells use.
# The return value of round_datetime is ignored; presumably it adds the
# 'nearestminute' column in place -- verify in libWellData.
LLE.round_datetime(df100hz_elevationRemoved_PSI, freq='min' )
LLE.round_datetime(dfbaro_elevationRemoved_PSI, freq='min')
dfmerged1 = LLE.merge_and_drop(df100hz_elevationRemoved_PSI, dfbaro_elevationRemoved_PSI, on='nearestminute')
display(dfmerged1)
dfmerged1.plot(x='nearestminute', y=aircolumns, kind='line')

## 7. Correlate the air pressure columns - and compare their means too


In [None]:
# Cross-correlate the air-pressure columns of the corrected table, apply the
# resulting DC shifts, and plot the shifted series.
print(len(correcteddf))
dfmerged1 = correcteddf.copy()  # work on a copy so correcteddf itself is left alone

xcorr_outputs = LLE.xcorr_columns(dfmerged1, aircolumns)
xcorrdf1, dcshiftdf1 = xcorr_outputs

shift_outputs = LLE.apply_dcshifts(dfmerged1, xcorrdf1, dcshiftdf1)
dfshifted1, dcshifts1 = shift_outputs

dfshifted1.plot(
    x='nearestminute',
    y=aircolumns,
    kind='line',
    ylabel='PSI',
)

## 8. Repeat the analysis, but don't make a temperature correction
Skipping the temperature correction improves the correlation between 1226420 and the analog barometers by 2%, but degrades the correlation between 1226429 and the same barometers by almost 1%.
It also improves the correlation between 1226420 and 1226429 by 4%.
However, if we stick to using 1226429, this loss in performance is fine.
Best shifts are now:
* +14.423795 PSI to align 1226420 with AirPressureShallow
* +14.556290 PSI to align 1226429 with AirPressureShallow

In [None]:
# Repeat the air-column correction with temperatureCorrect=False, merge with the
# barometer dataframe, redo the cross-correlation, and compare against run 1.
# NOTE(review): df100hz_thermal is not assigned by any live code visible in this
# notebook, and dfbaro_elevationRemoved_PSI is only assigned inside the disabled
# triple-quoted string in section 5 -- this cell appears to raise NameError unless
# both were created elsewhere; confirm.
df100hz_elevationRemoved_aircolumnonly_noT = LLE.correctBarometricData(df100hz_thermal, ['1226420', '1226429'], transducersDF, temperatureCorrect=False, heightCorrect=True)

# Round timestamps to the minute so the two dataframes can be merged on 'nearestminute'.
LLE.round_datetime(df100hz_elevationRemoved_aircolumnonly_noT, freq='min' )
dfmerged2 = LLE.merge_and_drop(df100hz_elevationRemoved_aircolumnonly_noT, dfbaro_elevationRemoved_PSI, on='nearestminute')
display(dfmerged2)

# Second cross-correlation / DC-shift run, on the data without temperature correction.
xcorrdf2, dcshiftdf2 = LLE.xcorr_columns(dfmerged2, aircolumns)
dfshifted2, dcshifts2 = LLE.apply_dcshifts(dfmerged2, xcorrdf2, dcshiftdf2)
dfshifted2.plot(x='nearestminute', y=aircolumns, kind='line', ylabel='PSI')

# Element-wise differences (run 2 minus run 1); xcorrdf1 and dcshiftdf1 come from
# the section 7 cell, which must have been run first.
dfxcorrdiff = xcorrdf2-xcorrdf1
dfxcorrdiff_styled = dfxcorrdiff.style.set_caption('Difference in cross-correlation')
dfshiftdiff = dcshiftdf2-dcshiftdf1
dfshiftdiff_styled = dfshiftdiff.style.set_caption('Difference in DC levels')

display(dfxcorrdiff_styled)
display(dfshiftdiff_styled)

## 9. Full workflow for 100 Hz data


In [None]:
# 9.1: Get raw 100 Hz data and remove anything with spikes
#      (this cell contains no code for that step)

# 9.2: Correct the Baro and 100 Hz air-column transducers: height correction and the
#      DC shifts from the no-temperature-correction run (no temperature correction)
air_correction_options = dict(temperatureCorrect=False, heightCorrect=True, dcshifts=dcshifts2)
dfbaro_dcshifted = LLE.correctBarometricData(
    all_dataframes['baro'], aircolumns[:2], transducersDF, **air_correction_options
)
df100hz_dcshifted = LLE.correctBarometricData(
    all_dataframes['100hz'], aircolumns[2:], transducersDF, **air_correction_options
)

# 9.3: Round timestamps to the minute, merge on them, and plot the air columns
for air_frame in (df100hz_dcshifted, dfbaro_dcshifted):
    LLE.round_datetime(air_frame, freq='min')
dfmerged_dcshifted = LLE.merge_and_drop(
    dfbaro_dcshifted, df100hz_dcshifted, on='nearestminute', drop=False
)
display(dfmerged_dcshifted)
point_plot_options = dict(x='nearestminute', style='.')
dfmerged_dcshifted.plot(y=aircolumns, ylabel='PSI', **point_plot_options)

# 9.4: Convert the water-column transducers to PSI, using air column 1226429 as the
#      barometric reference (no temperature or depth correction)
correctedAllSensorsPSI = LLE.rawdf2psidf(
    dfmerged_dcshifted,
    transducersDF,
    temperatureCorrect=False,
    airpressureColumn='1226429',
    depthCorrect=False,
)
watercolumns = ['1226419', '1226421', '2151691', '2149882']
display(correctedAllSensorsPSI[watercolumns])

# 9.5: Plot air and water columns together in PSI
correctedAllSensorsPSI.plot(y=aircolumns+watercolumns, ylabel='PSI', **point_plot_options)

# 9.6: Convert the water columns to water levels in meters
correctedAllSensorsMeters = LLE.psi2meters(correctedAllSensorsPSI, watercolumns)
correctedAllSensorsMeters.plot(y=watercolumns, ylabel='Meters', **point_plot_options)

# 9.7: Water levels in meters relative to the set depth measured by Steve Krupa
relativeAllSensorsMeters = LLE.relative_to_set_depth(correctedAllSensorsMeters, transducersDF, watercolumns)
relativeAllSensorsMeters.plot(y=watercolumns, ylabel='Meters', **point_plot_options)

# 9.8: Estimate correct set depths from the median of each, and shift by this amount
estimatedAllSensorsMeters = LLE.estimate_sensor_depths(correctedAllSensorsMeters, watercolumns)
estimatedAllSensorsMeters.plot(y=watercolumns, ylabel='Meters', **point_plot_options)