In [None]:
import numpy as np
import pandas as pd

In [None]:
# Combined data
# # Checking if the dataset is already loaded into the workspace
# try:
#     if data_combined.empty == False:
#         print("Data loaded, random sample shown below")
#         print(data_combined.sample(n=5))
# except NameError:
#     print("Data has not yet been read in, loading now...")
#     data_combined = pd.read_csv(
#         "data/bci_lutzweir_combined.csv",
#         usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
#         parse_dates=['datetime'],
#         dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
#         date_format='%d/%m/%Y %H:%M:%S'
#     )

# Load the combined weir record, indexed by its timestamp column.
# The timestamps in the file are day-first ("dd/mm/YYYY HH:MM:SS"), so the
# format is spelled out instead of being left to inference.
data_all_combined = pd.read_csv(
    filepath_or_buffer="data/bci_lutzweir_combined.csv",
    index_col="datetime",
    parse_dates=["datetime"],
    date_format="%d/%m/%Y %H:%M:%S",
    usecols=["datetime", "level", "raw", "chk_note", "chk_fail", "comment", "source"],
    dtype={
        "chk_note": "category",
        "source": "category",
        "chk_fail": "str",
        "comment": "str",
    },
)

# Summary of dtypes, non-null counts and memory use of the loaded frame
data_all_combined.info()

In [None]:
# Soil datasets
#
# The shallow and deep manual soil-moisture files are read with identical
# options (same columns, dtypes and day-first date format), so both reads are
# driven from a single call site. The first path is the shallow file, the
# second the deep file, matching the order of the names being assigned.
data_all_soil_shallow, data_all_soil_deep = (
    pd.read_csv(
        soil_csv_path,
        index_col='date',
        parse_dates=['date'],
        date_format='%d/%m/%Y',
        usecols=['date', 'depth', 'sample', 'h2o_by_wet', 'chk_note', 'chk_fail'],
        dtype={
            'depth': 'category',
            'sample': 'category',
            'chk_note': 'category',
            'chk_fail': 'str',
        },
    )
    for soil_csv_path in (
        "data/bci_manual_soilh/bci_lutz_shallow_gsm_man.csv",
        "data/bci_manual_soilh/bci_lutz_deep_gsm_man.csv",
    )
)

# Optional sanity check of the category levels (left disabled):
# print("Shallow",
#       data_all_soil_shallow['sample'].value_counts(dropna = False),
#       data_all_soil_shallow['depth'].value_counts(dropna = False)
#     )
# print("Deep",
#       data_all_soil_deep['depth'].value_counts(dropna = False),
#       data_all_soil_deep['sample'].value_counts(dropna = False)
#       )

# Summaries of both loaded frames
data_all_soil_shallow.info()
data_all_soil_deep.info()

In [None]:
# Filtering data sets for relevant dates
#
# Build the working weir record in one pass:
#   1. drop rows whose source is 'CHART' (old chart data),
#   2. drop rows whose check note is 'missing',
#   3. order by timestamp so label slicing and first/last lookups are valid,
#   4. keep everything from one second past midnight on 1978-01-01 onwards,
#      which removes a few extra earlier points.
data_combined = (
    data_all_combined
    .loc[data_all_combined['source'] != 'CHART']
    .loc[lambda frame: frame['chk_note'] != 'missing']
    .sort_index()
    .loc['1978-01-01 00:00:01':]
)

# Earliest and latest timestamps of the retained record; the index was sorted
# above, so these are its first and last entries.
date_weir_start, date_weir_end = data_combined.index[0], data_combined.index[-1]

# Create function to filter dates
def filter_dates(input_dataset, input_date_start=None, input_date_end=None):
    """Return the rows of ``input_dataset`` whose index falls inside a date range.

    Parameters
    ----------
    input_dataset : pandas.DataFrame or pandas.Series
        Data indexed by date. It is not modified; a sorted copy is sliced.
    input_date_start, input_date_end : optional
        Bounds of the range; label slicing with ``.loc`` includes both ends.
        When left as ``None`` they fall back to the notebook-level
        ``date_weir_start`` / ``date_weir_end``. The fallback is resolved when
        the function is *called*, so recomputing the weir bounds takes effect
        without having to re-run this definition.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The input sorted by index and cut down to the requested range.
    """
    if input_date_start is None:
        input_date_start = date_weir_start
    if input_date_end is None:
        input_date_end = date_weir_end

    # A stable sort keeps rows that share a date in their original relative
    # order; the default algorithm gives no such guarantee.
    data_subset = input_dataset.sort_index(kind='stable')
    # Filter between dates
    return data_subset.loc[input_date_start:input_date_end]

# Apply filter
# Both soil tables are clipped with filter_dates' default bounds, i.e. the
# start and end of the weir record computed above. Deep is processed first,
# then shallow.
# data_rainfall = filter_dates(data_all_rainfall)
data_soil_deep, data_soil_shallow = (
    filter_dates(soil_frame)
    for soil_frame in (data_all_soil_deep, data_all_soil_shallow)
)
# data_nochart_soil_shallow[~data_nochart_soil_shallow['sample'].isin(["1","2","3","4","5","6","7","8","9","10"])]

In [None]:
# Find the first date on which a shallow record carries a sample label other
# than "1".."10", then pull every shallow record taken on that date.
data_shallow_explore = data_soil_shallow[
    ~data_soil_shallow['sample'].isin([str(label) for label in range(1, 11)])
]
# Selecting with a (length 0 or 1) index instead of a scalar label always
# yields a DataFrame, even if the date holds a single row, and yields an empty
# frame rather than an IndexError when no unexpected label exists.
# `.assign` then builds a new frame instead of writing a column into a
# selection of `data_soil_shallow`.
data_shallow_explore = (
    data_soil_shallow
    .loc[data_shallow_explore.index.unique()[:1]]
    .assign(sample=lambda frame: frame['sample'].astype('int'))
)
# Display ordered by the now-numeric sample id (1, 2, ..., 10, 11, ...)
data_shallow_explore.sort_values(by='sample')
# data_shallow_explore

# data_shallow_explore


# data_soil_shallow['2005-03-02 00:00:00':'2005-03-02 23:59:59'].sort_values(by='sample')
# data_soil_shallow['2005-03-02 00:00:00':'2005-03-02 23:59:59']
# data_soil_shallow[data_shallow_explore.index[0]]
# print("Shallow",
#       data_soil_shallow['sample'].value_counts(),
#       data_soil_shallow['depth'].value_counts(),
#       sep="\n"
#     )
# print("----------")

In [None]:
# data_deep_explore
# Every deep-probe record taken on a date that has at least one "0-10" depth
# reading, with the sample id cast to integer and rows ordered by sample.
data_deep_explore = (
    data_soil_deep
    .loc[data_soil_deep.loc[data_soil_deep['depth'] == '0-10'].index.unique()]
    .assign(sample=lambda frame: frame['sample'].astype('int'))
    .sort_values(by='sample')
)
# Shown ordered by date; the stored frame stays ordered by sample
data_deep_explore.sort_index()
# print("Deep",
#       data_soil_deep['depth'].value_counts(),
#       data_soil_deep['sample'].value_counts(),
#       sep="\n"
#       )

In [None]:
# Identify all rows that are duplicated on the combination of the 'date'
# index and the 'sample' column, i.e. a sample id recorded more than once on
# the same date.
# `duplicated` only looks at columns, so the date index is first turned into
# a regular column.
temp_df = data_deep_explore.reset_index()
duplicates_mask = temp_df.duplicated(subset=['date', "sample"], keep=False)

# Filter the DataFrame to show only the entries that have duplicates
filtered_df = temp_df[duplicates_mask]

print("Filtered DataFrame showing entries with a repeated sample on the same date:")
filtered_df = filtered_df.set_index('date')
filtered_df['sample'] = filtered_df['sample'].astype('int')
filtered_df = filtered_df.sort_values(by='sample')
# A stable sort by date keeps the sample ordering established above within
# each date; the default sort algorithm does not guarantee that.
filtered_df.sort_index(kind='stable')


In [None]:
# Stack the deep and shallow tables into one long table, tagging each row
# with the table it came from, so the surface-layer readings of both can be
# inspected side by side.
data_deep_join = data_soil_deep.reset_index().assign(source='deep')
data_shallow_join = data_soil_shallow.reset_index().assign(source='shallow')

# Alternatives tried before settling on a plain concat:
# pd.merge(data_soil_deep, data_soil_shallow, left_index=True, right_index=True, how='inner')
# pd.merge(data_deep_join, data_shallow_join, left_on = ["date", "depth"], right_on = ["date", "depth"])
# pd.merge(data_deep_join, data_shallow_join, on='date', how='inner')
data_joined = (
    pd.concat([data_deep_join, data_shallow_join])
    .sort_values(by=['date', 'sample'])
    # keep only the surface-layer depth labels used by the two tables
    .loc[lambda frame: frame['depth'].isin(["0-10", "1-10"])]
    .set_index('date')
    # restrict to the dates present in data_deep_explore
    .loc[data_deep_explore.index.unique()]
)
data_joined
# data_soil_shallow.loc[filtered_df.index.unique()].sort_index()

In [None]:
# Deep-probe records of the "0-10" layer, in date order. The dates are kept as
# the index for now because the shallow table is cut to the same date span.
data_deep_filtered = (
    data_soil_deep[data_soil_deep['depth'] == "0-10"]
    .sort_index()
    .drop('depth', axis=1)
)

# Shallow records between the first and last "0-10" deep date (inclusive).
# The bounds must be read while the dates are still the index: after
# `reset_index()` the index is 0..n-1, and slicing with those integers selects
# rows by position instead of by date.
data_shallow_filtered = (
    data_soil_shallow
    .sort_index()
    .loc[data_deep_filtered.index[0]:data_deep_filtered.index[-1]]
    .drop('depth', axis=1)
    .reset_index()
)
data_deep_filtered = data_deep_filtered.reset_index()
# # data_shallow_filtered['depth'].unique()
# pd.merge(data_deep_filtered, data_shallow_filtered, on=['date', 'sample', 'depth'], how='inner')
# data_deep_filtered.compare(data_shallow_filtered)
# pd.concat([data_deep_filtered, data_shallow_filtered]).drop_duplicates(keep=False)

# Records that are NOT mirrored between the deep and shallow tables.
# `drop_duplicates` ignores the index, so the date is moved into a column
# before comparing; otherwise identical readings taken on different dates
# would be treated as duplicates and removed. Every column except 'source' is
# compared, and `keep=False` discards all members of each duplicated group.
data_joined_mini = (
    data_joined
    .drop('depth', axis=1)
    .reset_index()
    .pipe(
        lambda frame: frame.drop_duplicates(
            subset=frame.columns.difference(['source']), keep=False
        )
    )
    .set_index('date')
)
data_joined_mini = data_joined_mini[['sample', 'source', 'h2o_by_wet', 'chk_note', 'chk_fail']]
data_joined_mini

In [None]:
# Deep-probe records of the "0-10" layer, with the date as a regular column
data_deep_match = (
    data_soil_deep[data_soil_deep["depth"] == "0-10"]
    .sort_index()
    .drop('depth', axis=1)
    .reset_index()
)

# Shallow records taken on any of those dates. The dates are compared against
# the 'date' column: after `reset_index()` the index of data_deep_match is just
# 0..n-1, so testing membership in it never matches a date. The mask is also
# built from the very frame it filters, so it cannot be misaligned.
data_shallow_match = (
    data_soil_shallow[data_soil_shallow.index.isin(data_deep_match['date'])]
    .drop('depth', axis=1)
    .reset_index()
)

# Pair deep and shallow readings of the same sample on the same date and flag
# whether the wet-basis water content agrees exactly.
match_result = pd.merge(
    data_deep_match,
    data_shallow_match,
    on=["date", "sample"],
    suffixes=("_deep", "_shallow"),
    how="inner",
)
# NOTE: NaN never compares equal, so a reading missing on both sides is
# flagged as a mismatch.
match_result["match"] = (match_result["h2o_by_wet_deep"] == match_result["h2o_by_wet_shallow"])
match_result["sample"] = match_result["sample"].astype('int')
match_result = match_result.sort_values(by=['date', 'sample'])
match_result = match_result.drop(['chk_fail_shallow', 'chk_fail_deep'], axis=1)
# Show only the pairs that disagree
match_result[~match_result["match"]]
# match_result[["date", "match"]]


In [None]:
# All deep-probe records for 2005-06-16, with the sample id as an integer.
# `.assign` builds a new frame rather than writing a column into the `.loc`
# selection, which avoids the view-versus-copy ambiguity of assigning into a
# slice.
deep_result = (
    data_soil_deep
    .sort_index()
    .loc["2005-06-16 00:00:00"]
    .assign(sample=lambda frame: frame["sample"].astype('int'))
)
# 'date' is the index level name; sort_values accepts it alongside columns
deep_result.sort_values(by=['date', 'depth', 'sample'])

In [None]:
# Display the shallow soil record(s) indexed at 2006-03-24 00:00:00, looked up
# by label on a date-sorted copy (data_soil_shallow itself is left unchanged).
data_soil_shallow.sort_index().loc["2006-03-24 00:00:00"]

In [None]:
# Compare the shallow and deep soil-moisture tables on both the wet-basis and
# dry-basis water content, and list every (date, sample) pair that disagrees.

# Read options shared by the two files; unlike the earlier load, this one
# also keeps the `h2o_by_dry` column.
soil_moisture_csv_options = dict(
    parse_dates=['date'],
    usecols=['date', 'depth', 'sample', 'h2o_by_wet', 'h2o_by_dry', 'chk_note', 'chk_fail'],
    dtype={'depth': 'category', 'sample': 'category', 'chk_note': 'category', 'chk_fail': 'str'},
    date_format='%d/%m/%Y',
    index_col='date',
)

# Shallow, clipped to the weir record's date range
shallow_all = filter_dates(
    pd.read_csv("data/bci_manual_soilh/bci_lutz_shallow_gsm_man.csv", **soil_moisture_csv_options)
)

# Deep, clipped the same way and without the "30-40" layer
deep_all = filter_dates(
    pd.read_csv("data/bci_manual_soilh/bci_lutz_deep_gsm_man.csv", **soil_moisture_csv_options)
)
deep_all = deep_all[deep_all["depth"] != "30-40"]

# Keep only the shallow records taken on dates that also appear in the deep set
shallow_all = shallow_all[shallow_all.index.isin(deep_all.index)]

# Pair the readings of the same sample on the same date
match_all = pd.merge(
    deep_all.reset_index(),
    shallow_all.reset_index(),
    on=["date", "sample"],
    suffixes=("_deep", "_shallow"),
    how="inner",
)
# NOTE: NaN never compares equal, so a value missing on both sides counts as
# a mismatch.
match_all["match_wet"] = (match_all["h2o_by_wet_deep"] == match_all["h2o_by_wet_shallow"])
match_all["match_dry"] = (match_all["h2o_by_dry_deep"] == match_all["h2o_by_dry_shallow"])
match_all["sample"] = match_all["sample"].astype('int')
match_all = match_all.sort_values(by=['date', 'sample'])
match_all = match_all.drop(['chk_fail_shallow', 'chk_fail_deep'], axis=1)

# Keep the pairs where the wet OR the dry value disagrees. The previous filter
# tested `match_wet` twice, so a pair differing only in the dry-basis value
# was never reported.
match_all = match_all[~(match_all["match_wet"] & match_all["match_dry"])]
match_all = match_all[['date', 'depth_shallow', 'depth_deep', 'sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep', 'h2o_by_dry_shallow', 'h2o_by_dry_deep', 'chk_note_shallow', 'chk_note_deep']]
match_all
