# Exploratory Data Analysis

## Load Required Libraries

In [1]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
import matplotlib.dates as mdates
import datetime as dt

In [2]:
## (Optional chunk)
# Current session information
import session_info
session_info.show(dependencies=False)

## Load data

Files of interest:
- `weir_calibration.csv` includes calibration points for the weir
- `bci_lutzweir_combined.csv` includes raw runoff measurement, corrected runoff measurement, data source (*Chart measurements can be removed)
- `bci_cl_ra_elect2.CSV` has corrected rainfall (`ra`) in mm with measurements of `0` as `NA`s (`bci_cl_ra_elect.csv` has `0`s)
- `bci_lutz_deep_gsm_man.csv`, `bci_lutz_shallow_gsm_man.csv` have soil moisture measurements (water by wet weight and water by dry weight; one can be chosen for analysis as they are linearly related)
<!-- `bci_cl_ra_elect.csv` has corrected rainfall (`ra`) in mm, contains `0`s (large file) -->

All values level values are in mm, and datetime is in UTC-5 (Panama time zone).


### Import

In [3]:
## Calibrations dataset
data_calibrations = pd.read_csv(
    "data/weir_calibration.csv",
    usecols = ['datetime', 'weir_level'], # weir_hour is a repeat of the time in datetime and can be skipped
    parse_dates=['datetime'],
    date_format='%d/%m/%Y %H:%M:%S',
    index_col='datetime'
)

data_calibrations.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6465 entries, 1994-01-03 08:46:00 to 2025-09-02 08:50:00
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   weir_level  6465 non-null   int64
dtypes: int64(1)
memory usage: 101.0 KB


In [4]:
# Combined data
# # Checking if the dataset is already loaded into the workspace
# try:
#     if data_combined.empty == False:
#         print("Data loaded, random sample shown below")
#         print(data_combined.sample(n=5))
# except NameError:
#     print("Data has not yet been read in, loading now...")
#     data_combined = pd.read_csv(
#         "data/bci_lutzweir_combined.csv",
#         usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
#         parse_dates=['datetime'],
#         dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
#         date_format='%d/%m/%Y %H:%M:%S'
#     )

data_all_combined = pd.read_csv(
    "data/bci_lutzweir_combined.csv",
    usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
    parse_dates=['datetime'],
    dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
    date_format='%d/%m/%Y %H:%M:%S',
    index_col='datetime'
)

data_all_combined.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3951119 entries, 1972-01-01 01:00:00 to 1977-03-06 23:45:00
Data columns (total 6 columns):
 #   Column    Dtype   
---  ------    -----   
 0   level     float64 
 1   raw       float64 
 2   chk_note  category
 3   chk_fail  object  
 4   comment   object  
 5   source    category
dtypes: category(2), float64(2), object(2)
memory usage: 158.3+ MB


In [5]:
# Rainfall dataset

# This data set includes the 0 readings:
# data_rainfall_zeroes = pd.read_csv(
#         "data/bci_elect_cl_ra/bci_cl_ra_elect.csv",
#         usecols = ['datetime', 'ra', 'raw', 'chk_note', 'chk_fail'],
#         # "data/bci_elect_cl_ra/bci_cl_ra_elect2.CSV",
#         # usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
#         parse_dates=['datetime'],
#         dtype = {'chk_note':'category', 'chk_fail':'str'},
#         # dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
#         date_format='%d/%m/%Y %H:%M:%S'
#     )

# This data set skips the 0 readings (therefore much smaller):
data_all_rainfall = pd.read_csv(
    "data/bci_elect_cl_ra/bci_cl_ra_elect2.CSV",
    parse_dates=['datetime'],
    dtype = {'chk_note':'category', 'chk_fail':'str'},
    date_format='%d/%m/%Y %H:%M:%S',
    index_col='datetime'
)

data_all_rainfall.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 179640 entries, 1929-01-02 08:00:00 to 2025-08-04 11:55:00
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   ra        179640 non-null  float64 
 1   raw       179640 non-null  float64 
 2   chk_note  179640 non-null  category
 3   chk_fail  29 non-null      object  
dtypes: category(1), float64(2), object(1)
memory usage: 5.7+ MB


In [6]:
# Soil datasets

# Shallow
data_all_soil_shallow = pd.read_csv(
    "data/bci_manual_soilh/bci_lutz_shallow_gsm_man.csv",
    parse_dates=['date'],
    usecols = ['date', 'depth', 'sample', 'h2o_by_wet', 'chk_note', 'chk_fail'],
    dtype = {'depth':'category', 'sample':'category', 'chk_note':'category', 'chk_fail':'str'},
    date_format='%d/%m/%Y',
    index_col='date'
)

# Deep
data_all_soil_deep = pd.read_csv(
    "data/bci_manual_soilh/bci_lutz_deep_gsm_man.csv",
    parse_dates=['date'],
    usecols = ['date', 'depth', 'sample', 'h2o_by_wet', 'chk_note', 'chk_fail'],
    dtype = {'depth':'category', 'sample':'category', 'chk_note':'category', 'chk_fail':'str'},
    date_format='%d/%m/%Y',
    index_col='date'
)

# print(data_soil_shallow['sample'].value_counts(dropna = False))
# print(data_soil_shallow['depth'].value_counts(dropna = False))
# print(data_soil_deep['depth'].value_counts(dropna = False))
# print(data_soil_deep['sample'].value_counts(dropna = False))

# print("Shallow",
#       data_all_soil_shallow['sample'].value_counts(dropna = False),
#       data_all_soil_shallow['depth'].value_counts(dropna = False)
#     )
# print("Deep",
#       data_all_soil_deep['depth'].value_counts(dropna = False),
#       data_all_soil_deep['sample'].value_counts(dropna = False)
#       )

data_all_soil_shallow.info()
data_all_soil_deep.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 18556 entries, 1972-03-03 to 2025-06-26
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   depth       18556 non-null  category
 1   sample      18556 non-null  category
 2   h2o_by_wet  18556 non-null  float64 
 3   chk_note    18556 non-null  category
 4   chk_fail    178 non-null    object  
dtypes: category(3), float64(1), object(1)
memory usage: 490.8+ KB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15637 entries, 1972-03-03 to 2025-06-26
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   depth       15637 non-null  category
 1   sample      15637 non-null  category
 2   h2o_by_wet  15637 non-null  float64 
 3   chk_note    15637 non-null  category
 4   chk_fail    20 non-null     object  
dtypes: category(3), float64(1), object(1)
memory usage: 413.4+ KB


### Cleanup

In [7]:
# Get earliest and latest dates of sources

cat_source = data_all_combined.sort_index()['source'].unique().tolist()
print("Earliest", "\t    ", "Latest", "\t\t", "Source")
for cat in cat_source:
    if pd.isna(cat) == True:
        temp_subset = data_all_combined[data_all_combined["source"].isnull()]
    else:
        temp_subset = data_all_combined[data_all_combined["source"]==cat]
    temp_subset = temp_subset.sort_index()
    print(temp_subset.index[0], "", temp_subset.index[-1], cat)
    # print(min(temp_subset['datetime']), "", max(temp_subset['datetime']), cat)

# Save space, remove no longer needed items
del cat_source, cat, temp_subset

Earliest 	     Latest 		 Source
1972-01-01 01:00:00  2015-03-18 14:15:00 CHART
1972-09-16 00:15:00  2025-08-01 13:00:00 nan
1989-07-19 11:55:00  1996-10-01 23:55:00 CHART+AF
1996-10-02 00:00:00  2013-01-13 05:50:00 ISCO
2012-04-23 08:30:00  2012-04-24 08:35:00 ESTIMATED
2014-08-22 10:30:00  2021-05-19 09:40:00 RADAR
2018-08-31 10:05:00  2018-09-05 12:55:00 TROLL


In [8]:
# print(data_combined.dtypes)
# print("Source:", data_combined['source'].cat.categories.tolist())
# print("Notes:", data_combined['chk_note'].cat.categories.tolist())
# print("Fail mode:", data_combined['chk_fail'].unique())
# print("Comments:", data_combined['comment'].unique())

In [9]:
# Filtering data sets for relevant dates

# Exclude old chart data
# data_combined = data_all_combined[~data_all_combined['source'].str.contains("CHART", na=False)]
# data_combined = data_all_combined[~data_all_combined['source']=="CHART"]
data_combined = data_all_combined[data_all_combined['source']!='CHART']
# Remove missing values
data_combined = data_combined[data_combined['chk_note']!='missing']

# Arrange for visualization & indexing
data_combined = data_combined.sort_index()
# Remove a few extra points
data_combined = data_combined['1978-01-01 00:00:01':]

# Get earliest and latest dates
date_weir_start = data_combined.index[0]
date_weir_end = data_combined.index[-1]

# Create function to filter dates
def filter_dates(input_dataset, input_date_start = date_weir_start, input_date_end = date_weir_end):
    # Sort the dataframe
    data_subset = input_dataset.sort_index()
    # Filter between dates
    data_subset = data_subset.loc[input_date_start:input_date_end]
    return data_subset

# Apply filter
data_rainfall = filter_dates(data_all_rainfall)
data_soil_deep = filter_dates(data_all_soil_deep)
data_soil_shallow = filter_dates(data_all_soil_shallow)
# data_nochart_soil_shallow[~data_nochart_soil_shallow['sample'].isin(["1","2","3","4","5","6","7","8","9","10"])]

In [None]:
# Remove old stuff
del data_all_combined, data_all_rainfall, data_all_soil_shallow, data_all_soil_deep

In [10]:
data_shallow_explore = data_soil_shallow[~data_soil_shallow['sample'].isin(["1","2","3","4","5","6","7","8","9","10"])]
# data_shallow_explore.index.unique()[0]
data_shallow_explore = data_soil_shallow.loc[data_shallow_explore.index.unique()[0]]
data_shallow_explore['sample'] = data_shallow_explore['sample'].astype('int')
data_shallow_explore.sort_values(by='sample')
# data_shallow_explore

# data_shallow_explore


# data_soil_shallow['2005-03-02 00:00:00':'2005-03-02 23:59:59'].sort_values(by='sample')
# data_soil_shallow['2005-03-02 00:00:00':'2005-03-02 23:59:59']
# data_soil_shallow[data_shallow_explore.index[0]]
# print("Shallow",
#       data_soil_shallow['sample'].value_counts(),
#       data_soil_shallow['depth'].value_counts(),
#       sep="\n"
#     )
# print("----------")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_shallow_explore['sample'] = data_shallow_explore['sample'].astype('int')


Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-03-02,1-10,3,38.7,good,
2005-03-02,1-10,4,35.1,good,
2005-03-02,1-10,5,41.3,good,
2005-03-02,1-10,6,40.1,good,
2005-03-02,1-10,7,36.4,good,
2005-03-02,1-10,8,36.4,good,
2005-03-02,1-10,9,29.1,good,
2005-03-02,1-10,10,29.9,good,
2005-03-02,1-10,68,39.5,good,
2005-03-02,1-10,70,43.5,good,


In [11]:
# data_deep_explore
data_deep_explore = data_soil_deep[data_soil_deep['depth']=='0-10']
data_deep_explore = data_soil_deep.loc[data_deep_explore.index.unique()]
data_deep_explore['sample'] = data_deep_explore['sample'].astype('int')
data_deep_explore = data_deep_explore.sort_values(by='sample')
data_deep_explore.sort_index()
# print("Deep",
#       data_soil_deep['depth'].value_counts(),
#       data_soil_deep['sample'].value_counts(),
#       sep="\n"
#       )

Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1989-09-08,30-40,10,32.5,good,
1989-09-08,0-10,2,41.9,good,
1989-09-08,30-40,2,38.3,good,
1989-09-08,0-10,7,44.8,good,
1989-09-08,0-10,3,43.9,good,
...,...,...,...,...,...
2006-03-24,30-40,8,27.6,good,
2006-03-24,0-10,9,32.5,good,
2006-03-24,0-10,8,40.5,good,
2006-03-24,30-40,4,38.6,good,


In [None]:
# Identify all rows that are duplicated based on the 'date' index and 'categoryB' column
# data_deep_explore['sample'] = data_deep_explore['sample'].astype('category')
# data_deep_explore.index.name
temp_df = data_deep_explore.reset_index()
# temp_df
duplicates_mask = temp_df.duplicated(subset=['date', "sample"], keep=False)

# Filter the DataFrame to show only the entries that have duplicates

filtered_df = temp_df[duplicates_mask]

print("Filtered DataFrame showing entries with repeated categoryB on the same date:")
filtered_df = filtered_df.set_index('date')
filtered_df['sample'] = filtered_df['sample'].astype('int')
filtered_df = filtered_df.sort_values(by='sample')
filtered_df.sort_index()


Filtered DataFrame showing entries with repeated categoryB on the same date:


Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-03-24,0-10,1,38.3,good,
2006-03-24,30-40,1,37.3,good,
2006-03-24,0-10,2,39.9,good,
2006-03-24,30-40,2,33.5,good,
2006-03-24,0-10,3,39.4,good,
...,...,...,...,...,...
2005-06-16,30-40,8,24.0,good,
2005-06-16,30-40,9,26.5,good,
2005-06-16,0-10,9,33.2,good,
2005-06-16,30-40,10,27.8,good,


In [99]:
# data_deep_explore
# pd.merge(data_soil_deep, data_soil_shallow, left_index=True, right_index=True, how='inner')
data_deep_join = data_soil_deep.reset_index()
data_deep_join['source'] = 'deep'
data_shallow_join = data_soil_shallow.reset_index()
data_shallow_join['source'] = 'shallow'
# pd.merge(data_deep_join, data_shallow_join, left_on = ["date", "depth"], right_on = ["date", "depth"])
# pd.merge(data_deep_join, data_shallow_join, on='date', how='inner')
# data_joined = pd.concat([data_deep_join, data_shallow_join], names=['source'])
data_joined = pd.concat([data_deep_join, data_shallow_join])
data_joined = data_joined.sort_values(by=['date', 'sample'])

# print("Filtered DataFrame showing entries with repeated categoryB on the same date:")
# data_joined = data_joined.set_index('date')
# filtered_df['sample'] = filtered_df['sample'].astype('int')
# filtered_df = filtered_df.sort_values(by='sample')
# filtered_df = filtered_df.sort_index()
data_joined = data_joined[data_joined['depth'].isin(["0-10","1-10"])]
data_joined = data_joined.set_index('date')
data_joined = data_joined.loc[data_deep_explore.index.unique()]
data_joined
# data_soil_shallow.loc[filtered_df.index.unique()].sort_index()

Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail,source
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-03-24,0-10,1,38.3,good,,deep
2006-03-24,1-10,1,38.3,duplicate,,shallow
2006-03-24,0-10,10,31.3,good,,deep
2006-03-24,1-10,10,31.3,duplicate,,shallow
2006-03-24,0-10,2,39.9,good,,deep
...,...,...,...,...,...,...
2005-06-16,1-10,7,32.7,good,,shallow
2005-06-16,0-10,8,28.8,good,,deep
2005-06-16,1-10,8,28.1,good,,shallow
2005-06-16,0-10,9,33.2,good,,deep


In [100]:
data_deep_filtered = data_soil_deep[data_soil_deep['depth'] == "0-10"].sort_index()
data_deep_filtered = data_deep_filtered.drop('depth', axis=1)
data_deep_filtered = data_deep_filtered.reset_index()

data_shallow_filtered = data_soil_shallow.sort_index()[data_deep_filtered.index[0]:data_deep_filtered.index[-1]]
data_shallow_filtered = data_shallow_filtered.drop('depth', axis=1)
data_shallow_filtered = data_shallow_filtered.reset_index()
# # data_shallow_filtered['depth'].unique()
# pd.merge(data_deep_filtered, data_shallow_filtered, on=['date', 'sample', 'depth'], how='inner')
# data_deep_filtered.compare(data_shallow_filtered)
# pd.concat([data_deep_filtered, data_shallow_filtered]).drop_duplicates(keep=False)

# data_joined_mini = data_joined.drop('depth', axis=1).drop('source',axis=1)
data_joined_mini = data_joined.drop('depth', axis=1)
# data_joined_mini = data_joined_mini.drop_duplicates(keep=False)
data_joined_mini = data_joined_mini.drop_duplicates(subset=data_joined_mini.columns.difference(['source']), keep=False)
data_joined_mini = data_joined_mini[['sample', 'source', 'h2o_by_wet', 'chk_note', 'chk_fail']]
data_joined_mini

Unnamed: 0_level_0,sample,source,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-03-24,1,deep,38.3,good,
2006-03-24,1,shallow,38.3,duplicate,
2006-03-24,10,deep,31.3,good,
2006-03-24,10,shallow,31.3,duplicate,
2006-03-24,2,deep,39.9,good,
2006-03-24,2,shallow,39.9,duplicate,
2006-03-24,3,deep,39.4,good,
2006-03-24,3,shallow,39.4,duplicate,
2006-03-24,4,shallow,39.5,duplicate,
2006-03-24,5,deep,43.0,good,


In [144]:
data_deep_match = data_soil_deep[data_soil_deep["depth"] == "0-10"].sort_index().drop('depth', axis=1)
data_deep_match = data_deep_match.reset_index()
# dates_deep = data_deep_match.index
data_shallow_match = data_soil_shallow[data_soil_shallow.sort_index().index.isin(data_deep_match.index)].drop('depth', axis=1)
data_shallow_match = data_shallow_match.reset_index()

# data_deep_match
# pd.merge(data_deep_match, data_shallow_match, on=["date", "sample"], suffixes=("_deep", "_shallow"), how="inner")
match_result = pd.merge(data_deep_match, data_soil_shallow.reset_index().drop('depth', axis=1), on=["date", "sample"], suffixes=("_deep", "_shallow"), how="inner")
match_result["match"] = (match_result["h2o_by_wet_deep"] == match_result["h2o_by_wet_shallow"])
match_result["sample"] = match_result["sample"].astype('int')
match_result = match_result.sort_values(by=['date', 'sample'])
match_result = match_result.drop(['chk_fail_shallow', 'chk_fail_deep'], axis=1)
match_result[match_result["match"]==False]
# match_result[["date", "match"]]


Unnamed: 0,date,sample,h2o_by_wet_deep,chk_note_deep,h2o_by_wet_shallow,chk_note_shallow,match
129,2005-06-16,1,40.8,good,43.0,good,False
128,2005-06-16,2,36.5,good,37.8,good,False
127,2005-06-16,3,36.4,good,36.1,good,False
126,2005-06-16,4,38.6,good,38.4,good,False
125,2005-06-16,5,37.5,good,37.7,good,False
124,2005-06-16,6,37.5,good,37.9,good,False
123,2005-06-16,7,34.1,good,32.7,good,False
122,2005-06-16,8,28.8,good,28.1,good,False
121,2005-06-16,9,33.2,good,34.2,good,False
120,2005-06-16,10,32.2,good,32.1,good,False


In [139]:
deep_result = data_soil_deep.sort_index().loc["2005-06-16 00:00:00"]
deep_result["sample"] = deep_result["sample"].astype('int')
deep_result.sort_values(by=['date', 'depth', 'sample'])

Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-06-16,0-10,1,40.8,good,
2005-06-16,0-10,2,36.5,good,
2005-06-16,0-10,3,36.4,good,
2005-06-16,0-10,4,38.6,good,
2005-06-16,0-10,5,37.5,good,
2005-06-16,0-10,6,37.5,good,
2005-06-16,0-10,7,34.1,good,
2005-06-16,0-10,8,28.8,good,
2005-06-16,0-10,9,33.2,good,
2005-06-16,0-10,10,32.2,good,


In [140]:
data_soil_shallow.sort_index().loc["2006-03-24 00:00:00"]

Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-03-24,1-10,10,31.3,duplicate,
2006-03-24,1-10,9,32.5,duplicate,
2006-03-24,1-10,8,40.5,duplicate,
2006-03-24,1-10,6,38.0,duplicate,
2006-03-24,1-10,7,38.9,duplicate,
2006-03-24,1-10,4,39.5,duplicate,
2006-03-24,1-10,3,39.4,duplicate,
2006-03-24,1-10,2,39.9,duplicate,
2006-03-24,1-10,1,38.3,duplicate,
2006-03-24,1-10,5,43.0,duplicate,


In [163]:
shallow_all = pd.read_csv(
    "data/bci_manual_soilh/bci_lutz_shallow_gsm_man.csv",
    parse_dates=['date'],
    # nrows=100,
    usecols = ['date', 'depth', 'sample', 'h2o_by_wet', 'h2o_by_dry', 'chk_note', 'chk_fail'],
    dtype = {'depth':'category', 'sample':'category', 'chk_note':'category', 'chk_fail':'str'},
    date_format='%d/%m/%Y',
    index_col='date'
)

# shallow_all = shallow_all.sort_index().loc[date_weir_start:date_weir_end]
shallow_all = filter_dates(shallow_all)
# shallow_all = shallow_all.reset_index()

# # Deep
deep_all = pd.read_csv(
    "data/bci_manual_soilh/bci_lutz_deep_gsm_man.csv",
    parse_dates=['date'],
    usecols = ['date', 'depth', 'sample', 'h2o_by_wet', 'h2o_by_dry', 'chk_note', 'chk_fail'],
    dtype = {'depth':'category', 'sample':'category', 'chk_note':'category', 'chk_fail':'str'},
    date_format='%d/%m/%Y',
    index_col='date'
)

deep_all = filter_dates(deep_all)
deep_all = deep_all[deep_all["depth"] != "30-40"]#.reset_index()

# Filter set to only be of dates where deep set has shallow values
shallow_all = shallow_all[shallow_all.index.isin(deep_all.index)]
shallow_all.reset_index()

match_all = pd.merge(deep_all.reset_index(), shallow_all.reset_index(), on=["date", "sample"], suffixes=("_deep", "_shallow"), how="inner")
match_all["match_wet"] = (match_all["h2o_by_wet_deep"] == match_all["h2o_by_wet_shallow"])
match_all["match_dry"] = (match_all["h2o_by_dry_deep"] == match_all["h2o_by_dry_shallow"])
match_all["sample"] = match_all["sample"].astype('int')
match_all = match_all.sort_values(by=['date', 'sample'])
match_all = match_all.drop(['chk_fail_shallow', 'chk_fail_deep'], axis=1)
# match_all[match_all["match_wet"] & match_all["match_dry"]]
match_all = match_all[((match_all["match_wet"]==False) | (match_all["match_wet"]==False))]
match_all = match_all[['date', 'depth_shallow', 'depth_deep', 'sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep', 'h2o_by_dry_shallow', 'h2o_by_dry_deep', 'chk_note_shallow', 'chk_note_deep']]
# match_all = match_all.drop(["depth_deep", "depth_shallow"],axis=1)
# match_all = match_all[['date', 'sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep', 'h2o_by_dry_shallow', 'h2o_by_dry_deep', 'chk_note_shallow', 'chk_note_deep']]

# match_all = match_all[['date', 'sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep', 'match_wet', 'h2o_by_dry_shallow', 'h2o_by_dry_deep', 'match_dry', 'chk_note_shallow', 'chk_note_deep']]
# # data_deep_match = data_soil_deep[data_soil_deep["depth"] == "0-10"].sort_index().drop('depth', axis=1)
# # data_deep_match = data_deep_match.reset_index()
# # dates_deep = data_deep_match.index
# # data_shallow_match = data_soil_shallow[data_soil_shallow.sort_index().index.isin(data_deep_match.index)].drop('depth', axis=1)
# data_shallow_match = data_shallow_match.reset_index()

# # data_deep_match
# # pd.merge(data_deep_match, data_shallow_match, on=["date", "sample"], suffixes=("_deep", "_shallow"), how="inner")
# match_result = pd.merge(data_deep_match, data_soil_shallow.reset_index().drop('depth', axis=1), on=["date", "sample"], suffixes=("_deep", "_shallow"), how="inner")
# match_result["match"] = (match_result["h2o_by_wet_deep"] == match_result["h2o_by_wet_shallow"])
# match_result["sample"] = match_result["sample"].astype('int')
# match_result = match_result.sort_values(by=['date', 'sample'])
# match_result = match_result.drop(['chk_fail_shallow', 'chk_fail_deep'], axis=1)
# match_result[match_result["match"]==False]
# # match_result[["date", "match"]]
match_all


Unnamed: 0,date,depth_shallow,depth_deep,sample,h2o_by_wet_shallow,h2o_by_wet_deep,h2o_by_dry_shallow,h2o_by_dry_deep,chk_note_shallow,chk_note_deep
129,2005-06-16,1-10,0-10,1,43.0,40.8,75.5,69.1,good,good
128,2005-06-16,1-10,0-10,2,37.8,36.5,60.8,57.6,good,good
127,2005-06-16,1-10,0-10,3,36.1,36.4,56.5,57.1,good,good
126,2005-06-16,1-10,0-10,4,38.4,38.6,62.2,63.0,good,good
125,2005-06-16,1-10,0-10,5,37.7,37.5,60.4,60.1,good,good
124,2005-06-16,1-10,0-10,6,37.9,37.5,60.9,59.9,good,good
123,2005-06-16,1-10,0-10,7,32.7,34.1,48.7,51.8,good,good
122,2005-06-16,1-10,0-10,8,28.1,28.8,39.1,40.4,good,good
121,2005-06-16,1-10,0-10,9,34.2,33.2,51.9,49.8,good,good
120,2005-06-16,1-10,0-10,10,32.1,32.2,47.3,47.6,good,good


In [162]:
data_all_soil_shallow

Unnamed: 0_level_0,depth,sample,h2o_by_wet,chk_note,chk_fail
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1972-03-03,0-5,1,36.8,good,
1972-03-03,0-5,2,43.5,good,
1972-03-03,0-5,3,41.5,good,
1972-03-03,0-5,4,35.7,good,
1972-03-03,0-5,5,34.2,good,
...,...,...,...,...,...
2025-06-26,1-10,6,47.6,nc,
2025-06-26,1-10,7,33.5,nc,
2025-06-26,1-10,8,46.9,nc,
2025-06-26,1-10,9,40.0,nc,


## General Variable Exploration

In [None]:
# Explore weir combined data, comments, etc.

# print(
#     data_nochart_combined.dtypes,
#     # "\n\n", "Source:", data_nochart_combined['source'].cat.categories.tolist(),
#     # "\n\n", "Notes:", data_nochart_combined['chk_note'].cat.categories.tolist(),
#     "\n\n", "Source:", data_nochart_combined['source'].unique(),
#     "\n\n", "Notes:", data_nochart_combined['chk_note'].unique(),
#     "\n\n", "Comments:", data_nochart_combined['comment'].unique(),
#     "\n\n", "Fail mode:", data_nochart_combined['chk_fail'].unique()
# )

# Counts of each 'source' type
print(
    data_combined['source'].value_counts(dropna = False),
    data_combined['chk_note'].value_counts(dropna = False),
    data_combined['comment'].value_counts(dropna=False),
    data_combined['chk_fail'].unique(),
    sep="\n\n"
)

In [None]:
# soil info

# print(data_nochart_soil_shallow['sample'].value_counts(dropna = False),
#       data_nochart_soil_shallow['depth'].value_counts(dropna = False),
#       data_nochart_soil_deep['sample'].value_counts(dropna = False),
#       data_nochart_soil_deep['depth'].value_counts(dropna = False),
#       sep = "\n\n"
#       )

# print(data_nochart_soil_deep[~data_nochart_soil_deep['depth'].str.contains("30-40", na=False)])

# # data_soil_deep[~data_soil_deep['sample'].str.contains("1|2|3|4|5|6|7|8|9|10", na=False)]
# data_nochart_soil_shallow[~data_nochart_soil_shallow['sample'].isin(["1","2","3","4","5","6","7","8","9","10"])]

# # print(data_soil_deep['sample'].value_counts(dropna = False))
# # print(data_soil_deep['depth'].value_counts(dropna = False))

## Visualization

In [None]:
def plot_between(input_date_start, input_date_end):
    data_subset = data_combined.loc[input_date_start:input_date_end]

    data_subset_rain = data_rainfall.sort_index()
    data_subset_rain = data_subset_rain.loc[input_date_start:input_date_end]

    data_subset_cal = data_calibrations.sort_index()
    data_subset_cal = data_subset_cal.loc[input_date_start:input_date_end]
    # plt.figure(figsize = (10,6))
    # plt.plot(data_subset.index, data_subset['raw'])

    fig, ax = plt.subplots(figsize=(10, 6))
    # ax.figure(figsize=(10, 6))
    plt.axhline(y=0, color = "grey", linestyle = ":")
    ax.vlines(data_subset_rain.index, ymin=0, ymax=data_subset_rain['ra']*3, color = "blue", label = "Rain (x3)")
    ax.plot(data_subset.index, data_subset['level'], color = "red", label = "Adjusted")
    ax.plot(data_subset.index, data_subset['raw'], color = "green", label = "Raw")
    ax.plot(data_subset_cal.index, data_subset_cal['weir_level'], linestyle='none', marker='x', color="red", label = "Calibration")
    # plt.plot(data_subset_rain.index, data_subset_rain['ra']*3, color = "blue", linestyle='none', marker='o')
    # plt.plot(data_calibrations.index, data_calibrations['level'], color = "red")
    ax.set_xlabel('Date (YYYY-MM-DD)')
    ax.set_ylabel('Level (mm)')
    # ax.set_title('Simple Time Series Plot')
    # ax.set_ylim(bottom=0) 
    # ax.grid(True)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.legend(loc = 'upper right')
    return plt.show()

In [None]:
# plot_between('2010-05-20 00:00:01','2010-05-25 23:59:59')
plot_between('2020-05-31 00:00:01','2020-06-16 23:59:59')
# plot_between('2023-05-14 00:00:01','2023-06-15 23:59:59')
# plot_between('2002-07-30 00:00:01','2002-08-02 23:59:59')

In [None]:
# print('chk_note:', data_combined_sources['chk_note'].cat.categories.tolist())
# print('chk_fail:', data_combined_sources['chk_fail'].cat.categories.tolist())
# print('source:', data_combined_sources['source'].cat.categories.tolist())

# data_combined_sources[data_combined_sources['source']=='TROLL']
# data_combined_sources.T['source']
# data_combined_sources.sample(10)["source"]

# counts of each 'source' type
print(data_combined['chk_note'].value_counts(dropna = False))
print(data_combined['comment'].value_counts(dropna=False))
print(data_combined['source'].value_counts(dropna = False))

# data_combined.info()


## Statistics

In [None]:
# data_dec_explore = data_all_combined.sort_index()['1989-01-01 00:00:01':'1989-12-12 23:59:59']
# data_dec_explore = data_dec_explore[data_dec_explore['source']!='CHART']
# data_dec_explore

In [None]:
# time_series = pd.Series(data_combined, index='datetime')
# time_series
# pd.DatetimeIndex.to_series(data_combined)
# time_series = pd.to_datetime(data_combined.index())

# data_combined.index.year
# data_mini = data_combined['2010-05-20 00:00:01':'2010-05-25 23:59:59']
# data_mini = data_combined['2010-05-01 00:00:01':'2010-10-31 23:59:59']
# data_mini['raw'].resample('1ME').agg(['mean','std'])

# data_mini.rolling(2, on='raw').sum()
# data_mini.rolling('1D', on='raw').sum()
# data_mini['raw'].rolling('1D').mean()
# data_mini['raw'].rolling('1D', closed='left').mean()
# data_mini['raw'].resample('1D').mean()

# data_mini['raw'].resample('1ME').mean()
# data_combined['raw'].dropna()
# data_combined['raw'].resample('YE').agg(['mean','std'])
# data_mini = data_combined['raw'].dropna()
# data_combined['1972-01-01 00:00:01':'1973-12-31 23:59:59']

# # Remove missing values
# data_mini = data_combined[data_combined['chk_note']!='missing']
# # Remove a few extra points
# data_mini = data_mini['1978-01-01 00:00:01':]
# data_mini.resample('YE').agg(['mean','std'])

# data_sumstats = data_mini['1989-01-01 00:00:01':'1989-12-31 23:59:59']
data_sumstats = data_combined['raw'].dropna().resample('6ME').agg(['mean','std']).dropna()
data_sumstats
# plt.figure(figsize=(12,6))
# plt.plot(data_sumstats.index, data_sumstats['mean'])
# plt.plot(data_sumstats.index, data_sumstats['std'])
# plt.show()

# print(data_mini['source'].value_counts(dropna = False))
# data_mini.rolling('1D').sum()
# data_mini.rolling(5).sum()

In [None]:
#Plot

fig, ax = plt.subplots(figsize=(10, 6))
    # ax.figure(figsize=(10, 6))
plt.axhline(y=0, color = "grey", linestyle = ":")
# ax.vlines(data_subset_rain.index, ymin=0, ymax=data_subset_rain['ra']*3, color = "blue", label = "Rain (x3)")
ax.plot(data_sumstats.index, data_sumstats['mean'], color = "red", label = "Mean", marker = 'x')
# ax.plot(data_sumstats.index, data_sumstats['std'], color = "green", label = "std")
ax.fill_between(data_sumstats.index, data_sumstats['mean']-data_sumstats['std'], data_sumstats['mean']+data_sumstats['std'], color = "pink", label = "std")
# ax.plot(data_subset_cal.index, data_subset_cal['weir_level'], linestyle='none', marker='x', color="red", label = "Calibration")
# plt.plot(data_subset_rain.index, data_subset_rain['ra']*3, color = "blue", linestyle='none', marker='o')
# plt.plot(data_calibrations.index, data_calibrations['level'], color = "red")
ax.set_xlabel('Year')
ax.set_ylabel('Level (mm)')
ax.set_title('Average raw values every 6mo')
# ax.set_title('Simple Time Series Plot')
# ax.set_ylim(bottom=0) 
ax.set_xlim(left = dt.date(1989, 1, 1), right=dt.date(2026, 1, 1))
# ax.grid(True)
# ax.xaxis.set_major_locator(mdates.MonthLocator(interval=6)) # Show ticks every month
ax.xaxis.set_major_locator(mdates.YearLocator(month=1)) # Show ticks every month
ax.xaxis.set_minor_locator(mdates.YearLocator(month=7)) # Show ticks every month
# plt.xticks(np.arange(min(data_sumstats.index), max(data_sumstats.index)+1, 1.0),rotation=45, ha='right')
plt.xticks(rotation=90)
# ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
plt.tight_layout()
plt.legend(loc = 'upper right')