# Data Separation

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 21 October 2025

In [1]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.\n'

## Setup

### Packages

In [2]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# # import matplotlib.ticker as ticker
# import matplotlib.dates as mdates
# import datetime as dt
from sklearn.model_selection import TimeSeriesSplit, train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, f1_score

In [3]:
## (Optional chunk)
# Current session information
# Displays the session report produced by `session_info.show`, called here
# with `dependencies=False` (presumably to leave transitive dependencies out
# of the report -- confirm against the session_info documentation).
import session_info
session_info.show(dependencies=False)

### Data

In [4]:
# Load the two cleaned tables from the parquet files under data/clean/
united_soil = pd.read_parquet('data/clean/soil.parquet')
united_water = pd.read_parquet('data/clean/water.parquet')

## Prepare

## Feature Engineering

In [5]:
# Keep only the columns of interest: the raw rain column and the check /
# comment / source bookkeeping columns are left out of the working frame
data_water = united_water.drop(
    columns=[
        'raw_rain',
        'chk_note_rain',
        'chk_fail_rain',
        'chk_note_ro',
        'chk_fail_ro',
        'comment_ro',
        'source_ro',
    ]
)

In [6]:
### Note ###
# Remove this later -- just a smaller subset for feature engineering testing!!
# Restrict the working frame to calendar year 2015 (label-based slice on the
# datetime index). The explicit `.copy()` gives the subset its own data, so the
# column assignments in the following cells write to this frame directly
# instead of to a slice of another frame (avoids SettingWithCopyWarning).
data_water = data_water.loc['2015-01-01 00:00:00':'2015-12-31 23:59:59'].copy()
######

Create feature which tracks how recent a calibration was conducted.

In [7]:
# Running count of calibration points seen so far: the id increments on every
# row that has a recorded calibration, so each row shares an id with the most
# recent calibration (rows before the first calibration fall into group 0)
cal_group_id = data_water['weir_level_cal'].notna().cumsum()
# Position of each row inside its group = number of records since that
# calibration point, restarting at 0 on the calibration row itself
data_water['records_since_cal'] = data_water.groupby(cal_group_id).cumcount()

# Clean up environment
del cal_group_id

# data_water

Create feature which tracks how recent a rain event occurred.

In [8]:
# Running count of rain records seen so far: the id increments on every row
# with a non-missing rain value, so each row shares an id with the most recent
# rain record (rows before the first rain record fall into group 0)
rain_group_id = data_water['ra_rain'].notna().cumsum()
# Position of each row inside its group = number of records since that rain
# record, restarting at 0 on the rain row itself
data_water['records_since_rain'] = data_water.groupby(rain_group_id).cumcount()

# Clean up environment
del rain_group_id

# Missing rain values become 0 -- this must happen after the counter above,
# which relies on the missing values to tell rain records apart
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

data_water.sample(10)
# data_water.dropna(subset='raw_ro')

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,records_since_cal,records_since_rain
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 08:55:00,,0.0,42.0,42.0,False,False,False,False,False,107,107
2015-07-18 01:50:00,,0.0,16.1,16.1,False,False,False,False,False,184,77
2015-10-31 17:00:00,,0.0,76.5,76.5,False,False,False,False,False,389,32
2015-07-25 08:40:00,,0.0,14.0,14.0,False,False,False,False,False,290,522
2015-02-02 11:10:00,,0.0,20.2,20.2,False,False,False,False,False,28,282
2015-11-02 07:40:00,,0.0,45.1,48.3,True,False,False,False,False,853,190
2015-03-01 07:35:00,,0.0,14.0,14.0,False,False,False,False,False,565,2552
2015-07-05 14:15:00,,0.0,0.0,0.0,False,False,False,False,False,645,1611
2015-08-24 07:25:00,,0.0,36.17,44.1,True,False,False,False,False,849,53
2015-12-22 03:20:00,,0.0,34.11,51.3,True,False,False,False,False,224,108


Lag features: runoff (`raw_ro`) and rain (`ra_rain`)

In [9]:
def lag_feats(input_df, input_cols, input_lags):
    """Return a copy of `input_df` with lagged versions of selected columns.

    For every column in `input_cols` and every lag in `input_lags`, a new
    column named ``<col>_lag<lag>`` is appended holding that column shifted
    down by `lag` rows. Lags are counted in rows, not in elapsed time, so the
    first `lag` rows of each new column are missing.

    Args:
        input_df: Source DataFrame; it is not modified.
        input_cols: Iterable of column names to lag.
        input_lags: Iterable of integer row offsets.

    Returns:
        A new DataFrame with the original columns followed by the lag columns,
        ordered by column and then by lag.
    """
    output_df = input_df.copy()
    # Enumerate the (column, lag) pairs up front, column-major, so the new
    # columns are appended in the same order as a nested loop would give
    col_lag_pairs = [(col, lag) for col in input_cols for lag in input_lags]
    for col, lag in col_lag_pairs:
        output_df[f"{col}_lag{lag}"] = output_df[col].shift(lag)
    return output_df

In [10]:
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag2']]
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag24']]

# Columns to get temporal stats on (also reused for the rolling features below)
cols_to_shift = ['raw_ro', 'ra_rain']
# Data are at 5-min increments, so these row offsets correspond to values
# 5m, 10m, 15m, 30m, 1h, and 2h prior
lags_of_interest = [1, 2, 3, 6, 12, 24]

data_water = lag_feats(
    input_df=data_water,
    input_cols=cols_to_shift,
    input_lags=lags_of_interest,
)

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,records_since_cal,...,raw_ro_lag3,raw_ro_lag6,raw_ro_lag12,raw_ro_lag24,ra_rain_lag1,ra_rain_lag2,ra_rain_lag3,ra_rain_lag6,ra_rain_lag12,ra_rain_lag24
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-13 04:35:00,,0.0,19.3,19.3,False,False,False,False,False,517,...,20.7,18.2,18.6,18.7,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-27 12:55:00,,0.0,12.0,12.0,False,False,False,False,False,52,...,12.0,12.0,12.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-08-19 02:05:00,,0.0,19.65,27.9,True,False,False,False,False,210,...,27.8,28.0,27.5,26.5,0.0,0.0,0.0,0.0,0.0,0.0
2015-12-08 19:30:00,,0.0,55.6,65.6,True,False,False,False,False,420,...,65.1,64.7,64.4,64.7,0.0,0.0,0.0,0.0,0.0,0.0
2015-06-08 12:15:00,,0.0,19.9,19.9,False,False,False,False,False,43,...,20.5,20.7,21.8,22.6,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-25 11:20:00,,0.0,27.0,27.0,False,False,False,False,False,610,...,26.0,26.0,26.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-04-18 01:00:00,,0.0,0.0,0.0,False,False,False,False,False,197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-11-27 20:55:00,,0.0,47.7,47.7,False,False,False,False,False,149,...,47.6,47.5,47.3,47.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-04-11 22:35:00,,0.0,0.0,0.0,False,False,False,False,False,740,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-12-12 04:30:00,,0.0,43.63,52.2,True,False,False,False,False,244,...,52.0,52.1,52.7,50.9,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def rolling_feats(input_df, input_cols, input_windows):
    """Return a copy of `input_df` with rolling-window statistics appended.

    For every column in `input_cols` and every window length in
    `input_windows` (counted in rows), three columns are added:

    * ``<col>_rollmean_<window>``  -- rolling mean
    * ``<col>_rollstd_<window>``   -- rolling standard deviation
    * ``<col>_rollslope_<window>`` -- slope of a degree-1 least-squares fit of
      the window's values against their positions 0..n-1

    Args:
        input_df: Source DataFrame; it is not modified.
        input_cols: Iterable of column names to summarise.
        input_windows: Iterable of integer window lengths.

    Returns:
        A new DataFrame with the original columns followed by the new ones.
    """
    def _fitted_slope(values):
        # First coefficient of the degree-1 polynomial fit = slope of the line
        return np.polyfit(range(len(values)), values, 1)[0]

    output_df = input_df.copy()
    for col in input_cols:
        for window in input_windows:
            # One rolling view per (column, window), shared by all three stats
            roller = output_df[col].rolling(window)
            output_df[f"{col}_rollmean_{window}"] = roller.mean()
            output_df[f"{col}_rollstd_{window}"] = roller.std()
            output_df[f"{col}_rollslope_{window}"] = roller.apply(_fitted_slope, raw=True)
    return output_df

In [12]:
# data_water_mini = data_water['1990-01-01 00:00:00':'1990-01-30 23:59:59']
# rolling_feats(data_water_mini, cols_to_lag, [6, 12, 36])

# Window lengths in rows; at 5-min increments these span 30m, 1h, 6h
windows_of_interest = [6, 12, 72]

# Same columns as for the lag features
data_water = rolling_feats(
    input_df=data_water,
    input_cols=cols_to_shift,
    input_windows=windows_of_interest,
)

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,records_since_cal,...,raw_ro_rollslope_72,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12,ra_rain_rollmean_72,ra_rain_rollstd_72,ra_rain_rollslope_72
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-11-09 10:35:00,,0.0,39.86,41.9,True,False,False,False,False,22,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-09-23 13:05:00,,0.0,40.6,40.6,False,False,False,False,False,45,...,-0.033412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-07-25 08:25:00,,0.0,14.0,14.0,False,False,False,False,False,287,...,0.056193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-09-15 11:35:00,,0.0,29.4,29.4,False,False,False,False,False,35,...,,0.338667,0.499462,0.043543,0.169333,0.380358,0.047958,0.028222,0.162642,0.002181
2015-05-11 05:15:00,,0.0,0.0,0.0,False,False,False,False,False,822,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-02-07 02:45:00,,0.0,19.2,19.2,False,False,False,False,False,217,...,-0.006769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-08-30 04:50:00,,0.0,15.58,25.5,True,False,False,False,False,528,...,0.041895,0.0,0.0,0.0,0.0,0.0,0.0,0.003528,0.029934,0.00011
2015-09-24 16:40:00,,0.0,50.1,50.1,False,False,False,False,False,102,...,-0.03903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-11 22:10:00,,0.0,41.0,41.0,False,False,False,False,False,1030,...,0.142919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-08-29 21:35:00,,0.0,16.46,21.8,True,False,False,False,False,441,...,0.009332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Change since last value

In [13]:
# Step change in the raw runoff reading relative to the previous record
data_water['raw_ro_change'] = data_water['raw_ro'].diff()

# Gap between the calibration reading and the raw runoff reading on the same
# row; missing wherever no calibration value was recorded
# NOTE(review): the column name 'diff_ro_call' looks like a typo for
# 'diff_ro_cal' -- left unchanged so existing outputs stay valid
data_water['diff_ro_call'] = data_water['weir_level_cal'].sub(data_water['raw_ro'])
# data_water['rain_diff']

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,records_since_cal,...,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12,ra_rain_rollmean_72,ra_rain_rollstd_72,ra_rain_rollslope_72,raw_ro_change,diff_ro_call
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-02 12:45:00,,0.0,19.7,19.7,False,False,False,False,False,47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-01-24 15:35:00,,0.0,31.0,31.0,False,False,False,False,False,373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-08-04 16:55:00,,0.0,0.0,0.0,False,False,False,False,False,102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-11-06 11:30:00,,0.0,40.5,40.5,False,False,False,False,False,36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,
2015-07-29 22:40:00,,0.0,9.0,9.0,False,False,False,False,False,168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-02-05 19:00:00,,0.0,24.09,28.1,False,False,False,True,False,705,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.1,
2015-07-06 16:05:00,,0.0,0.0,0.0,False,False,False,False,False,93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2015-01-12 10:40:00,,0.0,31.0,31.0,False,False,False,False,False,27,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
2015-06-24 01:20:00,,0.0,9.1,9.1,False,False,False,False,False,199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.2,
2015-12-12 08:30:00,,0.0,43.21,51.1,True,False,False,False,False,292,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [None]:
# data_water = data_water.dropna(subset='obstruction_ro')

## DRAFTING

In [None]:
# Draft: one-month sample of the calibration and rain columns.
# Select rows and columns in a single `.loc` call and copy only that subset,
# instead of copying the whole frame first; the copy also means the column
# assignment below writes to an independent frame, not to a chained slice.
u_w_mini = united_water.loc[
    '2015-01-01 00:00:00':'2015-02-01 00:00:00',
    ['weir_level_cal', 'ra_rain'],
].copy()
# print(u_w_mini.info())

# Rows share a group id with the most recent calibration point; the position
# within the group is the number of records since that calibration
is_cal = u_w_mini['weir_level_cal'].notna()
g_id = is_cal.cumsum()
# g_id
u_w_mini['records_since_cal'] = u_w_mini.groupby(g_id).cumcount()
del is_cal, g_id

u_w_mini

In [None]:
# water_m = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]['2010-12-28 10:00:00':'2011-01-05 23:59:59']
# Work on an explicit copy: the cell adds several columns, and assigning onto
# a bare column-subset of `united_water` triggers SettingWithCopyWarning
water_m = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']].copy()

# A new group starts at every missing rain value, so a run of consecutive
# non-missing rain records shares one id; the cumulative sum within the group
# is the running rainfall total of that run
null_mask = water_m['ra_rain'].isnull()
g_id_event = null_mask.cumsum()
water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# Number of records since the most recent non-missing rain value
is_rain = water_m['ra_rain'].notna()
g_id = is_rain.cumsum()
# g_id
water_m['since_rain'] = water_m.groupby(g_id).cumcount()
# Exponential decay weight: 1 on a rain record, shrinking with each record since
water_m['dec'] = np.exp(-0.1*water_m['since_rain'])
# Carry the last running total forward over the rows without rain
water_m['rain_fill'] = water_m['r_event_sum'].ffill()
# data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# Decayed rainfall: last running total weighted by time since the rain record
water_m['rain_dec'] = (water_m['rain_fill']*water_m['dec'])

# water_m.drop(columns=['level_ro'])
# water_m = water_mini['2010-12-28 10:00:00':'2011-01-05 23:59:59']
# water_m['1992-04-23 18:00:00':'1992-04-23 23:59:59']

In [None]:
# Attach the decayed-rain features to the core water columns, aligning on the
# shared index and keeping every timestamp present in either frame
water_mini = pd.merge(
    left=united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']],
    right=water_m[['rain_dec', 'since_rain']],
    how="outer",
    left_index=True,
    right_index=True,
)
# g_id
# is_rain_event = (water_mini['since_rain'] == 0)
# g_id_rain = is_rain_event.cumsum()
# water_mini.groupby(g_id_rain).cumcount()

# Clean up the intermediates from the previous cell
del g_id, is_rain, g_id_event, null_mask, water_m

In [None]:
## Subset for testing
# water_mini = united_water['2010-01-01 00:00:00':'2012-12-31 23:59:59']
# water_mini = united_water.copy()
# water_mini
# water_mini = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]

# Treat missing rain as 0, then discard any row that still has a missing value
water_mini = (
    water_mini
    .assign(ra_rain=water_mini['ra_rain'].fillna(0))
    .dropna()
)
water_mini.head()

Calculate distance from previous rainfall

In [None]:
# w_m = water_mini['1989-07-19 11:55:00':'1991-07-19 11:55:00'].copy()

In [None]:
# # is_rain = (w_m['ra_rain'] != 0)
# # g_id = is_rain.cumsum()
# # w_m['since_rain'] = w_m.groupby(g_id).cumcount()
# # w_m

# is_rain = (water_mini['ra_rain'] != 0)
# g_id = is_rain.cumsum()
# water_mini['since_rain'] = water_mini.groupby(g_id).cumcount()
# water_mini.head()

# # del is_rain, g_id

In [None]:
# # g_id
# is_rain_event = (water_mini['since_rain'] == 0)
# g_id_rain = is_rain_event.cumsum()
# water_mini.groupby(g_id_rain).cumcount()

Calculate the difference from previous value

In [None]:
# Difference from the previous record; the first row has no predecessor, so
# its missing difference is replaced with 0
water_mini['diff_ro'] = water_mini['raw_ro'].diff().fillna(0)
water_mini[['raw_ro', 'diff_ro']].head()

In [None]:
# Spot-check a half-hour window of the draft frame
water_mini.loc['1992-04-23 18:00:00':'1992-04-23 18:30:00']

In [None]:
# # soil_mini = united_soil[['sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep']]
# soil_mini = united_soil.copy()
# # soil_mini = soil_mini.reset_index()
# # # soil_mini['sample'] = soil_mini['sample'].astype('category')
# # # soil_mini['dup'] = soil_mini.duplicated()
# # print(len(soil_mini[soil_mini.duplicated()==False]))
# # print(len(soil_mini.drop_duplicates()))
# # # united_soil[united_soil['chk_note_shallow']=='doubtful']
# # # soil_mini['sample'] = soil_mini['sample'].astype('category')
# # # soil_mini = soil_mini[['sample', 'h2o_by_wet_shallow', 'h2o_by_wet_deep']]
# # # soil_mini

In [None]:
# water_mini = water_mini[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]
# water_mini

In [None]:
# len(soil_mini)

## Soil

Pivot the soil data such that each sample has its own columns, and separated by depth.

In [14]:
# Shallow readings: drop the deep column, then pivot wider so each sample
# becomes its own column (rows stay keyed by the existing index)
data_soil_shallow = (
    united_soil
    .drop(columns='h2o_by_wet_deep')
    .pivot(columns='sample', values='h2o_by_wet_shallow')
)

# Deep readings: same treatment with the shallow column dropped
data_soil_deep = (
    united_soil
    .drop(columns='h2o_by_wet_shallow')
    .pivot(columns='sample', values='h2o_by_wet_deep')
)

# Combine on the index; the sample columns collide between the two frames, so
# the suffixes tell the depths apart
data_soil = pd.merge(
    left=data_soil_shallow,
    right=data_soil_deep,
    how="outer",
    left_index=True,
    right_index=True,
    suffixes=("_shallow", "_deep"),
)

del data_soil_shallow, data_soil_deep

data_soil.sample(10)

sample,1_shallow,2_shallow,3_shallow,4_shallow,5_shallow,6_shallow,7_shallow,8_shallow,9_shallow,10_shallow,1_deep,2_deep,3_deep,4_deep,5_deep,6_deep,7_deep,8_deep,9_deep,10_deep
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1994-12-09,36.5,45.9,44.1,40.8,47.9,37.9,38.5,44.8,32.2,35.7,34.8,38.4,37.6,34.9,40.9,29.7,36.6,27.9,30.1,32.0
2008-03-27,31.8,31.9,35.0,31.5,31.4,35.8,33.4,33.8,26.6,26.4,32.4,31.7,25.8,32.8,37.6,35.1,32.5,29.0,27.4,26.1
2017-05-25,36.0,42.2,46.1,39.0,42.8,46.2,38.8,44.6,35.7,33.0,38.7,30.7,43.7,35.7,43.9,44.3,33.2,41.1,32.0,30.9
2015-02-27,36.2,32.2,34.9,33.3,34.3,36.7,30.3,37.0,29.9,25.8,36.8,34.3,36.2,31.9,37.4,39.6,30.8,28.9,30.9,25.4
2010-09-16,40.6,41.1,40.3,42.3,38.8,45.4,45.2,51.1,38.3,36.7,36.4,38.6,48.6,38.2,39.5,47.0,37.4,36.4,31.8,30.1
2023-04-21,28.4,29.9,30.7,25.1,31.5,35.0,29.3,28.0,23.5,29.4,38.5,29.2,33.2,27.7,24.9,36.0,23.8,32.8,31.1,25.3
2017-11-09,36.7,38.1,48.8,46.2,48.4,,38.4,46.7,35.9,36.3,36.9,35.2,42.7,39.1,46.9,46.8,33.0,29.6,33.9,29.5
2010-09-02,41.5,45.1,45.3,42.4,43.7,47.4,45.9,,39.3,38.1,36.9,36.6,31.9,39.1,40.5,44.9,37.9,33.4,31.8,28.4
2021-11-18,53.4,51.5,51.7,43.0,43.6,50.3,49.5,46.4,45.1,44.4,36.6,43.4,41.0,39.4,38.5,39.6,38.0,36.9,34.1,34.3
1992-12-03,42.1,46.1,42.6,41.2,49.6,36.8,47.4,38.9,37.6,42.5,37.9,38.9,34.3,33.1,39.1,30.6,38.7,22.7,32.4,31.1


In [None]:
# soil_mini_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
# soil_mini_shallow = soil_mini_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# soil_mini_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)
# soil_mini_deep = soil_mini_deep.pivot(columns='sample', values='h2o_by_wet_deep')

# soil_mini = pd.merge(
#     soil_mini_shallow,
#     soil_mini_deep,
#     left_index=True,
#     right_index=True,
#     # soil_mini_shallow.reset_index(),
#     # soil_mini_deep.reset_index(),
#     # on = ["date", "sample"],
#     suffixes = ("_shallow", "_deep"),
#     how = "outer"
#     )

# soil_mini.head()

## Unite

In [15]:
# Join water and soil on their indexes, keeping every timestamp from either
data_united = pd.merge(
    left=data_water,
    right=data_soil,
    how='outer',
    left_index=True,
    right_index=True,
)

# Extend soil vals: carry each soil reading forward over the following rows
# until the next reading appears
cols_to_fill = [col for col in data_united.columns if col.endswith(('shallow', 'deep'))]
data_united[cols_to_fill] = data_united[cols_to_fill].ffill()

data_united.sample(10)

Unnamed: 0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,records_since_cal,...,1_deep,2_deep,3_deep,4_deep,5_deep,6_deep,7_deep,8_deep,9_deep,10_deep
2015-05-19 20:05:00,,0.0,0.2,0.2,False,False,False,False,False,139.0,...,36.0,28.0,33.8,30.4,35.8,37.6,33.2,26.8,31.0,27.9
2015-06-15 21:10:00,,0.0,14.99,22.0,True,False,False,False,False,153.0,...,36.9,33.1,37.3,38.0,38.2,43.7,34.8,34.1,37.7,30.3
2015-01-05 19:05:00,,0.0,36.0,36.0,False,False,False,False,False,128.0,...,31.8,31.9,35.8,35.0,34.1,39.1,32.5,22.8,30.0,28.3
2015-11-28 23:20:00,,0.0,49.5,49.5,False,False,False,False,False,466.0,...,37.1,38.4,42.5,35.6,37.8,43.3,37.6,33.8,32.8,31.7
2015-09-16 10:00:00,,0.0,33.67,42.5,True,False,False,False,False,304.0,...,40.4,35.8,42.1,36.2,38.2,43.0,35.1,27.4,35.1,37.8
2015-02-07 08:20:00,,0.0,24.4,24.4,False,False,False,False,False,284.0,...,34.5,32.3,37.7,33.7,37.3,39.5,36.9,22.1,30.8,28.3
2015-12-16 08:25:00,,0.0,37.6,37.6,False,False,False,False,False,1.0,...,38.8,39.3,45.1,47.1,38.5,44.4,36.2,28.7,33.6,30.7
2015-09-26 03:05:00,,0.0,39.59,42.0,True,False,False,False,False,213.0,...,37.7,36.1,40.2,33.1,38.4,43.3,34.7,24.1,34.6,30.8
2015-06-22 09:50:00,,0.0,14.1,14.1,False,False,False,False,False,8.0,...,37.0,38.1,40.8,34.2,37.8,42.5,34.9,27.9,32.6,30.5
2015-06-15 12:25:00,,0.0,13.7,13.7,False,False,False,False,False,48.0,...,36.9,33.1,37.3,38.0,38.2,43.7,34.8,34.1,37.7,30.3


### DRAFTING2

In [None]:
# water_mini_test = water_mini['1990-01-01 00:00:00':'1992-12-31 23:23:59']
# print(len(water_mini_test))
# soil_mini_test = soil_mini['1990-01-01 00:00:00':'1992-12-31 23:23:59']

In [None]:
# Draft merge of the draft water frame with the pivoted soil frame.
# `soil_mini` only exists in commented-out code, so this cell raised a
# NameError on a fresh kernel; `data_soil` is built by the same pivot/merge
# steps and is used in its place.
# NOTE(review): running this cell rebinds `data_united`, replacing the frame
# built in the "Unite" section -- skip it (or rename the result) if the
# feature-engineered frame is the one wanted downstream.
data_united = pd.merge(
    water_mini,
    data_soil,
    left_index=True,
    right_index=True,
    # soil_mini_shallow.reset_index(),
    # soil_mini_deep.reset_index(),
    # on = ["date", "sample"],
    # suffixes = ("_shallow", "_deep"),
    how = "outer"
    )

# mini_mini_test = mini_test['1992-04-09 00:00:00':'1992-04-09 23:23:59']
# mini_mini_test
# mini_test['soil_stale'] = 
data_united.head()

In [None]:
# Short window of the united frame for experimenting with soil decay weights
data_u = data_united.loc['1989-07-19 11:55:00':'1989-08-15 00:00:00']

In [None]:
# data_u = data_united['1989-07-19 11:55:00':'1989-08-15 00:00:00']
# Copy the column subset: the cell adds columns below, and assigning onto a
# slice of `data_united` triggers SettingWithCopyWarning
data_u = data_u[['raw_ro', '1_shallow', '2_shallow']].copy()
# cols_to_check = [col for col in data_u.columns if (col.endswith('shallow') | col.endswith('deep'))]
# is_soil = data_u[cols_to_check].notna()
# g_id = is_soil.cumsum()
# g_id

# Number of records since the most recent non-missing reading of one soil column
col_to_check = "1_shallow"
is_soil = data_u[col_to_check].notna()
g_id = is_soil.cumsum()
# g_id
data_u['since_soil'] = data_u.groupby(g_id).cumcount()
# Exponential decay weight: 1 on a reading, shrinking with each record since
data_u['dec'] = np.exp(-0.001*data_u['since_soil'])
# Forward-filled reading, and the same reading down-weighted by its staleness
data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
data_u['1_shallow_w'] = (data_u['1_shallow_f']*data_u['dec'])
# Not rendered: only the last expression of a cell is displayed
data_u

# finding most freq
# g_id['most'] = g_id.mode(axis=1)[0]
# data_u['since_soil'] = data_u.groupby(g_id).cumcount()
# data_u

# g_id = is_rain.cumsum()
# water_mini['since_rain'] = water_mini.groupby(g_id).cumcount()
# water_mini.head()
del col_to_check, is_soil, g_id, data_u

In [None]:
# Carry each soil reading forward until the next one
cols_to_fill = [col for col in data_united.columns if col.endswith(('shallow', 'deep'))]
data_united[cols_to_fill] = data_united[cols_to_fill].ffill()
## Only fill for the exact day the measurement was taken
# data_united[cols_to_fill] = data_united[cols_to_fill].groupby(pd.Grouper(freq='D')).ffill()
# Keep only the rows that have an obstruction flag
data_united = data_united.dropna(subset='obstruction_ro')
data_united.head()

In [None]:
# # data_united['1989-07-19 11:55:00':'1989-07-30 11:55:00']
# # data_united[data_united['obstruction_ro']==True]
# data_u = data_united['2010-12-28 10:00:00':'2011-01-05 23:59:59']
# data_u['rain_event']
# # data_united[data_united['since_rain']==0]

### Train/Test (80/20)

In [None]:
# # mini_xy = water_mini[['level_ro', 'raw_ro', 'chk_note_ro', 'source_ro']].dropna()
# mini_xy = water_mini.copy().drop('level_ro', axis=1).dropna()
# mini_y = mini_xy['obstruction_ro']
# # mini_x = mini_xy[['raw_ro', 'chk_note_ro', 'source_ro']]
# mini_x = mini_xy.drop('obstruction_ro', axis=1)
# mini_xy
# Target column, and the columns excluded from the predictors: the target
# itself, `level_ro`, and the other flag columns
var_of_interest = 'obstruction_ro'
y_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']

# Only rows with a known target can be used
data_filtered = data_united.dropna(subset=var_of_interest)
# y_drops.remove(var_of_interest)

united_y = data_filtered[var_of_interest]
# united_x = data_united.drop([var_of_interest, 'level_ro'], axis=1)
united_x = data_filtered.drop(columns=y_drops)
# united_x.info()

del data_filtered

# united_x.info()

# united_x.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100537 entries, 2015-01-01 00:00:00 to 2015-12-31 23:55:00
Data columns (total 57 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   weir_level_cal        89 non-null      float64
 1   ra_rain               100537 non-null  float64
 2   raw_ro                100537 non-null  float64
 3   records_since_cal     100537 non-null  float64
 4   records_since_rain    100537 non-null  float64
 5   raw_ro_lag1           100406 non-null  float64
 6   raw_ro_lag2           100404 non-null  float64
 7   raw_ro_lag3           100402 non-null  float64
 8   raw_ro_lag6           100396 non-null  float64
 9   raw_ro_lag12          100384 non-null  float64
 10  raw_ro_lag24          100360 non-null  float64
 11  ra_rain_lag1          100536 non-null  float64
 12  ra_rain_lag2          100535 non-null  float64
 13  ra_rain_lag3          100534 non-null  float64
 14  ra_rain_lag6      

In [27]:
# Expected partition sizes for a 20/80 test/train split, followed by a check
# that the two rounded sizes add back up to the total
for label, share in (("Test:\t20p of", .2), ("Train:\t80p of", .8)):
    print(label, len(united_y), "is", round(share*len(united_y)))
print(round(.2*len(united_y)) + round(.8*len(united_y)))
del label, share

# mini_x.index[1]

Test:	20p of 100537 is 20107
Train:	80p of 100537 is 80430
100537


In [28]:
# Chronological hold-out: with shuffle=False the split keeps row order, so the
# first part of the rows becomes the training set and the final 20% the test set
x_train, x_test, y_train, y_test = train_test_split(united_x, united_y, test_size = 0.2, shuffle=False)

# Report the size and the first/last index label of each partition
print(
    "Train:\t", len(x_train), "\t", x_train.index[0], "thru", x_train.index[-1],
    "\nTest:\t", len(x_test), "\t\t", x_test.index[0], "thru", x_test.index[-1]
    # len(x_train), len(x_test), "\n",
    # x_train.index[-1]
)

Train:	 80429 	 2015-01-01 00:00:00 thru 2015-10-23 04:15:00 
Test:	 20108 		 2015-10-23 04:20:00 thru 2015-12-31 23:55:00


### Sliding Window

In [29]:
# Expanding-window cross-validation splitter over the training rows.
# NOTE(review): the choice of 29 splits is not explained here -- consider
# naming it as a constant and recording why it was chosen.
tscv = TimeSeriesSplit(n_splits=29)
# Show the splitter's configuration (gap, max_train_size, test_size defaults)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=29, test_size=None)


In [30]:
# print(tscv)
# List the positional row indices of every fold; the "Test" line shows the
# validation rows of that fold
for i, (train_index, val_index) in enumerate(tscv.split(x_train)):
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={val_index}",
        # print("  Train: index=", mini_x.index[train_index])
        "--------------------------------------------------",
        sep="\n",
    )

del i, train_index, val_index

Fold 0:
  Train: index=[   0    1    2 ... 2706 2707 2708]
  Test:  index=[2709 2710 2711 ... 5386 5387 5388]
--------------------------------------------------
Fold 1:
  Train: index=[   0    1    2 ... 5386 5387 5388]
  Test:  index=[5389 5390 5391 ... 8066 8067 8068]
--------------------------------------------------
Fold 2:
  Train: index=[   0    1    2 ... 8066 8067 8068]
  Test:  index=[ 8069  8070  8071 ... 10746 10747 10748]
--------------------------------------------------
Fold 3:
  Train: index=[    0     1     2 ... 10746 10747 10748]
  Test:  index=[10749 10750 10751 ... 13426 13427 13428]
--------------------------------------------------
Fold 4:
  Train: index=[    0     1     2 ... 13426 13427 13428]
  Test:  index=[13429 13430 13431 ... 16106 16107 16108]
--------------------------------------------------
Fold 5:
  Train: index=[    0     1     2 ... 16106 16107 16108]
  Test:  index=[16109 16110 16111 ... 18786 18787 18788]
-------------------------------------------

In [None]:
# val_tracker = y_train.copy().to_frame()
# val_tracker['pred'] = .5
# val_tracker.head()

In [None]:
# # preds
# y_t = y_t.to_frame()
# y_t['preds'] = preds
# pd.concat(y_t)

## Model

In [None]:
# tscv = TimeSeriesSplit(n_splits=15)
# val_tracker = y_train.copy()
# val_tracker['pred'] = .5
# val_tracker = y_train.copy().to_frame()
# Per-fold validation frames are collected in a list and concatenated once
# after the loop: growing a DataFrame with pd.concat inside the loop re-copies
# all earlier folds on every iteration, and seeding it from an empty frame
# relies on deprecated concat behaviour.
fold_results = []

for train_index, val_index in tscv.split(x_train):
    # Positional split of predictors and target for this fold
    x_t, X_val = x_train.iloc[train_index], x_train.iloc[val_index]
    y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # A fresh classifier is fitted on each fold's training rows
    # model = xgb.XGBRegressor(enable_categorical=True, tree_method="hist")
    model = xgb.XGBClassifier(enable_categorical=True, tree_method="hist")
    model.fit(x_t, y_t)
    preds = model.predict(X_val)

    # True labels next to the predicted class and its boolean form
    y_val_out = y_val.copy().to_frame()
    y_val_out['pred'] = preds
    y_val_out['pred_tf'] = y_val_out['pred'] == 1
    fold_results.append(y_val_out)

    # Fold-level scores
    mse = mean_squared_error(y_val, preds)
    f1 = f1_score(y_val_out[var_of_interest].tolist(), y_val_out['pred_tf'].tolist())
    # print("Validation RMSE:", mean_squared_error(y_val, preds, squared=False))
    print("Validation MSE:", mse, "\tRMSE:", np.sqrt(mse), "\tF1:", f1)

# All validation predictions, in fold order
val_tracker = pd.concat(fold_results)

# val_tracker['pred_tf'] = np.where(val_tracker['pred'] >= 0.5, True, False)

del x_t, X_val, y_t, y_val, model, preds, mse, f1, fold_results

In [None]:
# val_tracker.head()
# y_val.to_list()
# y_val_out['obstruction_ro']
# f1_score(y_val_out['obstruction_ro'].tolist(), y_val_out['pred_tf'].tolist())
# f1_score(y_val_out['obstruction_ro'], y_val_out['pred_tf'])
# y_val_out['pred_tf'].tolist()

In [None]:
# Take the final (largest) fold explicitly, rather than running an empty loop
# and relying on the loop variables left over from its last iteration
train_index, val_index = list(tscv.split(x_train))[-1]

# print(train_index, "\n", val_index)

# Positional split of predictors and target for that fold
x_t, X_val = x_train.iloc[train_index], x_train.iloc[val_index]
y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

# model = xgb.XGBRegressor(enable_categorical=True, tree_method="hist")
model = xgb.XGBClassifier(enable_categorical=True, tree_method="hist")
model.fit(x_t, y_t)

# Score the predicted classes against the validation labels
preds = model.predict(X_val)
mse = mean_squared_error(y_val, preds)
# f1 = f1_score(y_val_out['obstruction_ro'].tolist(), y_val_out['pred_tf'].tolist())
print("Validation MSE:", mse, "\tRMSE:", np.sqrt(mse))

In [None]:
# Pair the last fold's true labels with the model's predictions.
# The frame is built straight from `y_val`, so it keeps the original index;
# the previous reset_index / set_index round-trip discarded the result of
# `set_index`, leaving a positional index and a stray 'index' column.
mini_val = y_val.copy().to_frame()
mini_val['pred'] = preds
# Boolean form of the predicted class, comparable with the target column
mini_val['pred_tf'] = mini_val['pred'] == 1
mini_val

In [None]:
# Strip plot comparing true and predicted flags along the frame's index
fig, ax = plt.subplots(figsize=(20, 1.5))
# Blue ticks: true values of the target column
ax.scatter(mini_val.index, mini_val[var_of_interest], s=25, color='blue', marker="|")
# Orange ticks: predictions, shifted down by 0.06 so they do not overlap the true values
ax.scatter(mini_val.index, mini_val['pred_tf']-.06, s=25, color='orange', marker="|")
# NOTE(review): the figure has no title, axis labels, or legend

plt.show()

In [None]:
# True and predicted labels as plain lists, shared by both metrics below
y_true = mini_val[var_of_interest].tolist()
y_pred = mini_val['pred_tf'].tolist()

# Compute the confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

# F1
print("F1:\n", f1_score(y_true, y_pred))

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

del cm, disp, y_true, y_pred