# Data Separation

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 21 October 2025

In [1]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.\n'

## Setup

### Packages

In [2]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# # import matplotlib.ticker as ticker
# import matplotlib.dates as mdates
# import datetime as dt
from sklearn.model_selection import TimeSeriesSplit, train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score

In [3]:
## (Optional chunk)
# Current session information
import session_info
session_info.show(dependencies=False)

### Data

In [4]:
# Load the cleaned water and soil tables (paths are relative to the working directory).
# NOTE(review): presumably these files are written by 01_eda.ipynb -- confirm.
united_water = pd.read_parquet('data/clean/water.parquet')
united_soil = pd.read_parquet('data/clean/soil.parquet')

## Prepare

## Feature Engineering

In [5]:
# Keep the columns of interest by discarding the raw-rain, check, comment, and source columns
data_water = united_water.drop(
    [
        'raw_rain',
        'chk_note_rain',
        'chk_fail_rain',
        'chk_note_ro',
        'chk_fail_ro',
        'comment_ro',
        'source_ro',
    ],
    axis='columns',
)

In [6]:
### Note ###
# TODO: remove later -- temporary smaller subset (2000 through 2015) for feature engineering testing
# data_water = data_water['2015-01-01 00:00:00':'2016-12-31 23:59:59']
data_water = data_water.loc['2000-01-01 00:00:00':'2015-12-31 23:59:59']
######

Create a feature that tracks how recently a calibration was conducted (the number of records since the last calibration point).

In [7]:
def since_feat(input_df, input_col):
    """Add a counter of records elapsed since the last non-null value of a column.

    Args:
        input_df (DataFrame): Source data; it is not modified.
        input_col (str): Column whose non-null entries mark the events.

    Returns:
        DataFrame: Copy of ``input_df`` with an added ``since_<input_col>``
        column. The counter is 0 on each row holding a non-null value and
        grows by one per row until the next one. Rows before the first
        non-null value are counted from the first row of the frame.
    """
    # The running total of the non-null flags labels each row with the
    # number of events seen so far, i.e. the run it belongs to
    run_labels = input_df[input_col].notna().cumsum()
    # Position of each row inside its run; restarts at 0 on every event
    elapsed = input_df.groupby(run_labels).cumcount()

    result = input_df.copy()
    result[f"since_{input_col}"] = elapsed
    return result

In [8]:
# # since_feat(data_water[['ra_rain', 'raw_ro']], 'ra_rain')
# # data_water[['ra_rain', 'raw_ro']]
# data_w_test = data_water.copy()[['ra_rain', 'raw_ro']]
# data_w_test['ra_rain'] = data_w_test['ra_rain'].replace(0, np.nan)

# data_w_test = since_feat(data_w_test, 'ra_rain')
# data_w_test.head()

In [9]:
# # Create index of instances where there is a calibration point
# cal_instances = data_water['weir_level_cal'].notna()
# # Create groupings based on most recent instance
# cal_group_id = cal_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_cal'] = data_water.groupby(cal_group_id).cumcount()

# # Clean up environment
# del cal_instances, cal_group_id

# # data_water

# Adds 'since_weir_level_cal': records elapsed since the last non-null calibration reading
data_water = since_feat(data_water, 'weir_level_cal')

Create a feature that tracks how recently a rain event occurred (the number of records since the last rain record).

In [10]:
# # Create index of instances where there is a calibration point
# rain_instances = data_water['ra_rain'].notna()
# # Create groupings based on most recent instance
# rain_group_id = rain_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_rain'] = data_water.groupby(rain_group_id).cumcount()

# # Clean up environment
# del rain_instances, rain_group_id

# # Replace NAs with 0
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# data_water.sample(10)
# # data_water.dropna(subset='raw_ro')

# Adds 'since_ra_rain': records elapsed since the last non-null rain value
data_water = since_feat(data_water, 'ra_rain')
data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,since_weir_level_cal,since_ra_rain
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2012-08-27 10:05:00,,,44.63,45.2,True,False,False,False,False,8,270
2006-04-24 07:35:00,,,11.4,11.4,False,False,False,False,False,828,168
2002-11-02 19:05:00,,,73.5,73.5,False,False,False,False,False,411,674
2005-12-11 17:30:00,,,46.7,46.7,False,False,False,False,False,667,205
2009-11-04 16:25:00,,,82.2,97.2,True,False,False,False,False,91,25
2012-09-22 19:45:00,,,56.9,56.9,False,False,False,False,False,416,378
2012-01-04 20:40:00,,,50.2,50.2,False,False,False,False,False,138,218
2003-10-24 01:40:00,,,68.8,68.8,False,False,False,False,False,202,62
2009-12-04 02:35:00,,,68.4,68.4,False,False,False,False,False,208,763
2012-12-17 11:10:00,,,109.6,109.6,False,False,False,False,False,25,108


Lag features: raw runoff and rain

In [11]:
def lag_feats(input_df, input_cols, input_lags):
    """Add lagged copies of the selected columns.

    Args:
        input_df (DataFrame): Source data; it is not modified.
        input_cols (list): Names of the columns to lag.
        input_lags (list): Lags, in number of records, to apply to each column.

    Returns:
        DataFrame: Copy of ``input_df`` with one ``<col>_lag<lag>`` column per
        (column, lag) pair, holding the column's value ``lag`` records earlier.
    """
    lagged = input_df.copy()
    for source_name in input_cols:
        source = lagged[source_name]
        for steps in input_lags:
            # shift() moves values down by `steps` rows, leaving NaN at the start
            lagged[f"{source_name}_lag{steps}"] = source.shift(steps)
    return lagged

In [12]:
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag2']]
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag24']]

# Columns to get temporal stats on (also reused for the rolling features)
cols_to_shift = ['raw_ro', 'ra_rain']
# data at 5-min increments -- lag to record values at 5m, 10m, 15m, 30m, 1h, and 2h prior
lags_of_interest = [1, 2, 3, 6, 12, 24]

# Adds one '<col>_lag<k>' column per (column, lag) pair
data_water = lag_feats(data_water, cols_to_shift, lags_of_interest)

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,since_weir_level_cal,...,raw_ro_lag3,raw_ro_lag6,raw_ro_lag12,raw_ro_lag24,ra_rain_lag1,ra_rain_lag2,ra_rain_lag3,ra_rain_lag6,ra_rain_lag12,ra_rain_lag24
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-03-12 10:55:00,,,17.0,17.0,False,False,False,False,False,600,...,17.0,17.0,17.0,17.0,,,,,,
2009-01-02 11:50:00,,,40.6,51.2,True,False,False,False,False,894,...,51.2,51.3,51.3,51.5,,,,,,
2003-03-21 10:45:00,,,0.0,0.0,False,False,False,False,False,31,...,0.0,0.0,0.0,0.0,,,,,,
2008-01-24 23:55:00,,,24.2,24.2,False,False,False,False,False,178,...,24.4,24.3,24.3,24.3,,,,,,
2015-08-21 13:30:00,,,13.2,13.2,False,False,False,False,False,58,...,13.3,13.3,13.2,14.0,,,,,,
2003-11-11 04:00:00,,,82.1,82.1,False,False,False,False,False,1089,...,82.1,82.1,82.1,82.4,,,,,,
2011-04-16 04:25:00,,,12.1,12.1,False,False,False,False,False,232,...,12.1,12.2,11.9,11.9,,,,,,
2010-11-11 18:10:00,,,77.6,77.6,False,False,False,False,False,107,...,77.8,77.8,77.9,78.0,,,,,,
2001-11-06 22:05:00,,,37.0,37.0,False,False,False,False,False,1022,...,37.0,37.0,37.1,37.1,,,,,,
2005-10-29 08:20:00,,,42.7,42.7,False,False,False,False,False,278,...,42.9,43.0,43.1,43.0,,,,,,


In [13]:
def rolling_feats(input_df, input_cols, input_windows):
    """Add rolling mean, standard deviation, and slope features.

    Args:
        input_df (DataFrame): Source data; it is not modified.
        input_cols (list): Names of the columns to summarise.
        input_windows (list): Rolling window sizes, passed to ``Series.rolling``.

    Returns:
        DataFrame: Copy of ``input_df`` with ``<col>_rollmean_<w>``,
        ``<col>_rollstd_<w>``, and ``<col>_rollslope_<w>`` columns for every
        (column, window) pair. The slope is the least-squares slope of the
        window's values against their position (0, 1, 2, ...) in the window.
    """
    def _ls_slope(values):
        # Closed-form least-squares slope: sum((t - t_mean) * x) / sum((t - t_mean)^2).
        # Same value as np.polyfit(range(n), x, 1)[0] up to floating-point rounding,
        # without a least-squares solve per window.
        n_obs = len(values)
        if n_obs < 2:
            # A slope is undefined for fewer than two points
            return np.nan
        centered = np.arange(n_obs) - (n_obs - 1) / 2
        return np.dot(centered, values) / np.dot(centered, centered)

    output_df = input_df.copy()
    for col in input_cols:
        for window in input_windows:
            # One rolling object serves all three statistics
            roller = output_df[col].rolling(window)
            output_df[f"{col}_rollmean_{window}"] = roller.mean()
            output_df[f"{col}_rollstd_{window}"] = roller.std()
            output_df[f"{col}_rollslope_{window}"] = roller.apply(_ls_slope, raw=True)
    return output_df

In [14]:
# data_water_mini = data_water['1990-01-01 00:00:00':'1990-01-30 23:59:59']
# rolling_feats(data_water_mini, cols_to_lag, [6, 12, 36])
# Rolling window lengths in records; at 5-min increments these span 10m, 30m, 1h, and 6h
windows_of_interest = [2, 6, 12, 72]


# Adds rolling mean, std, and slope columns for each (column, window) pair
data_water = rolling_feats(data_water, cols_to_shift, windows_of_interest)

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,since_weir_level_cal,...,ra_rain_rollslope_2,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12,ra_rain_rollmean_72,ra_rain_rollstd_72,ra_rain_rollslope_72
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-11-10 14:05:00,,,53.5,53.5,False,False,False,False,False,636,...,,,,,,,,,,
2005-06-26 10:05:00,,,59.6,59.6,False,False,False,False,False,596,...,,,,,,,,,,
2011-10-19 04:10:00,,,74.3,74.3,False,False,False,False,False,230,...,,,,,,,,,,
2010-03-21 12:20:00,,,20.2,20.2,False,False,False,False,False,613,...,,,,,,,,,,
2004-03-30 00:20:00,,,6.7,6.7,False,False,False,False,False,180,...,,,,,,,,,,
2001-01-03 23:05:00,,,82.7,82.7,False,False,False,False,False,164,...,,,,,,,,,,
2009-05-31 17:25:00,,0.254,20.0,20.0,False,False,False,False,False,674,...,,,,,,,,,,
2008-12-31 08:05:00,,,38.6,38.6,False,False,False,False,False,273,...,,,,,,,,,,
2008-03-15 21:05:00,,,11.3,11.3,False,False,False,False,False,430,...,,,,,,,,,,
2007-06-11 02:10:00,,,59.9,59.9,False,False,False,False,False,774,...,,,,,,,,,,


Change since last value

In [15]:
# Record-to-record change in the raw runoff level
data_water['raw_ro_change'] = data_water['raw_ro'].diff()

# Gap between the calibration reading and the raw level;
# NaN on every record where either value is missing
data_water['diff_ro_cal'] = data_water['weir_level_cal'].sub(data_water['raw_ro'])

data_water.sample(10)

Unnamed: 0_level_0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,since_weir_level_cal,...,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12,ra_rain_rollmean_72,ra_rain_rollstd_72,ra_rain_rollslope_72,raw_ro_change,diff_ro_cal
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-06-23 17:00:00,,,39.9,39.9,False,False,False,False,False,87,...,,,,,,,,,-0.4,
2008-02-13 15:40:00,,,17.9,17.9,False,False,False,False,False,83,...,,,,,,,,,-0.1,
2001-10-03 17:55:00,,,29.3,29.3,False,False,False,False,False,107,...,,,,,,,,,0.0,
2008-04-14 03:00:00,,,9.6,9.6,False,False,False,False,False,789,...,,,,,,,,,0.4,
2000-12-12 17:05:00,,,91.4,91.4,False,False,False,False,False,101,...,,,,,,,,,1.6,
2002-03-07 12:10:00,,,17.8,17.8,False,False,False,False,False,38,...,,,,,,,,,-0.1,
2008-02-08 01:40:00,,,22.0,22.0,False,False,False,False,False,484,...,,,,,,,,,0.0,
2001-04-25 07:25:00,,,6.4,6.4,False,False,False,False,False,272,...,,,,,,,,,0.0,
2005-07-25 00:05:00,,,35.0,35.0,False,False,False,False,False,756,...,,,,,,,,,0.2,
2003-01-01 23:10:00,,,26.4,26.4,False,False,False,False,False,457,...,,,,,,,,,0.0,


In [16]:
# data_water = data_water.dropna(subset='obstruction_ro')

## Soil

Pivot the soil data so that each sample has its own column, with separate columns for the shallow and deep measurements.

In [17]:
# Pivot each depth wider (one column per sample, rows keyed by the existing index),
# then combine the two tables on that index. Naming `values` makes pivot ignore
# the other depth's column, so nothing has to be dropped beforehand.
data_soil = pd.merge(
    united_soil.pivot(columns='sample', values='h2o_by_wet_shallow'),
    united_soil.pivot(columns='sample', values='h2o_by_wet_deep'),
    how = "outer",
    left_index = True,
    right_index = True,
    # Sample ids occur in both tables, so each column is tagged with its depth
    suffixes = ("_shallow", "_deep"),
)

data_soil.sample(10)

sample,1_shallow,2_shallow,3_shallow,4_shallow,5_shallow,6_shallow,7_shallow,8_shallow,9_shallow,10_shallow,1_deep,2_deep,3_deep,4_deep,5_deep,6_deep,7_deep,8_deep,9_deep,10_deep
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1996-05-09,39.2,41.1,40.4,37.0,39.9,41.3,37.7,47.3,33.4,33.4,35.7,39.4,31.0,33.1,38.0,29.4,35.6,31.0,31.8,30.1
2018-06-07,41.4,39.1,43.1,37.6,46.3,48.4,40.1,45.0,40.1,41.3,33.1,43.9,42.5,36.1,38.1,47.6,40.2,34.7,33.6,35.7
1999-01-29,33.1,43.4,36.8,34.7,40.2,34.9,32.9,40.5,28.7,29.4,34.2,38.4,31.7,35.6,37.8,30.1,45.4,24.2,28.4,27.8
1992-01-09,36.5,45.0,43.4,38.0,46.4,40.7,40.2,45.4,31.0,35.2,34.2,35.1,35.8,32.6,38.6,29.7,33.7,26.8,30.6,28.3
1991-07-18,38.4,38.7,41.0,39.2,44.4,44.1,43.0,48.0,36.2,36.7,36.1,38.6,37.4,40.8,40.8,35.0,37.5,30.4,33.0,32.7
2012-03-23,31.7,28.6,30.5,29.5,30.1,35.0,32.7,31.8,27.3,24.4,33.0,29.8,35.9,27.5,31.3,36.9,28.7,28.5,29.2,24.6
2014-09-04,37.9,35.0,41.5,38.3,39.0,41.9,37.8,45.6,33.8,35.0,36.6,36.7,35.5,34.6,37.9,43.5,34.7,26.4,34.1,31.6
2008-02-21,37.4,38.4,39.9,38.4,40.3,42.1,36.0,48.8,30.3,30.7,38.1,35.1,35.2,34.7,40.4,41.5,34.8,25.8,31.0,27.6
1996-10-07,40.7,41.9,42.7,36.4,45.1,44.8,38.6,48.3,37.9,37.9,35.2,35.9,34.7,34.4,40.0,28.5,35.3,35.1,30.6,30.1
1997-10-02,37.5,43.2,44.3,40.1,40.4,39.4,39.6,48.9,35.9,36.0,35.2,39.9,37.8,33.5,39.0,29.5,36.0,25.0,29.3,28.6


In [18]:
# soil_mini_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
# soil_mini_shallow = soil_mini_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# soil_mini_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)
# soil_mini_deep = soil_mini_deep.pivot(columns='sample', values='h2o_by_wet_deep')

# soil_mini = pd.merge(
#     soil_mini_shallow,
#     soil_mini_deep,
#     left_index=True,
#     right_index=True,
#     # soil_mini_shallow.reset_index(),
#     # soil_mini_deep.reset_index(),
#     # on = ["date", "sample"],
#     suffixes = ("_shallow", "_deep"),
#     how = "outer"
#     )

# soil_mini.head()

## Unite

In [19]:
# data_u_test = pd.merge(
#     data_water,
#     data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
#     left_index = True,
#     right_index = True,
#     how = 'outer'
# )

# data_u_test

In [20]:
# # def since_feat(input_df, input_col):
# #     output_df = input_df.copy()
# #     # Create index of instances where there is a data point
# #     instances = output_df[input_col].notna()
# #     # Create groupings based on most recent instance
# #     group_id = instances.cumsum()
# #     # Create new column to count number of records since the point
# #     # which resets to 0 at each new point
# #     output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
# #     return output_df

# cols_soil = [col for col in data_u_test.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_u_test[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test[cols_soil].notna().cumsum().max(axis=1)

In [28]:
# TODO: REMOVE LATER -- the soil data is cut to the same temporary 2000-2015 window as the water data
# soil_window = data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59']
# soil_window = data_soil
soil_window = data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59']

# Outer join on the index keeps every water record and every soil sample
data_united = data_water.merge(
    soil_window,
    how = 'outer',
    left_index = True,
    right_index = True,
)

del soil_window

In [29]:
# Create feature to track soil value staleness
# Soil columns are the ones carrying a depth suffix
cols_soil = [col for col in data_united.columns if col.endswith(('shallow', 'deep'))]
# Per-column running count of non-null soil values; the row-wise maximum
# labels each record with the latest soil measurement seen so far
soil_group_id = data_united[cols_soil].notna().cumsum().max(axis=1)
# Records elapsed within each label
data_united["since_soil"] = data_united.groupby(soil_group_id).cumcount()

del soil_group_id

In [30]:
# Extend soil vals: forward-fill so each record carries the most recent soil measurement
data_united[cols_soil] = data_united[cols_soil].ffill()

del cols_soil
data_united.sample(10)

Unnamed: 0,weir_level_cal,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,since_weir_level_cal,...,2_deep,3_deep,4_deep,5_deep,6_deep,7_deep,8_deep,9_deep,10_deep,since_soil
2005-12-12 13:40:00,,,61.0,61.0,False,False,False,False,False,63.0,...,41.1,38.6,38.5,37.4,46.3,37.1,26.0,32.0,32.5,3048
2011-01-08 22:00:00,,,83.7,83.7,False,False,False,False,False,435.0,...,37.2,38.1,38.0,41.8,42.5,38.8,30.0,33.2,31.7,553
2004-06-23 10:40:00,,,42.2,42.2,False,False,False,False,False,11.0,...,40.1,29.8,33.2,35.1,44.5,35.4,27.3,31.6,31.9,1570
2009-11-30 21:30:00,,,96.5,96.5,False,False,False,False,False,147.0,...,37.0,31.0,37.8,39.9,43.5,33.1,28.5,34.1,36.2,3144
2006-10-14 01:25:00,,,62.8,62.8,False,False,False,False,False,192.0,...,34.7,37.5,39.4,42.3,44.5,38.3,24.3,32.6,31.1,595
2004-07-03 16:25:00,,,51.4,51.4,False,False,False,False,False,370.0,...,40.1,29.8,33.2,35.1,44.5,35.4,27.3,31.6,31.9,4521
2005-12-16 13:35:00,,,47.0,47.0,False,False,False,False,False,59.0,...,40.0,33.4,37.4,37.0,44.5,37.6,28.8,31.6,31.6,451
2002-09-13 09:10:00,,,45.0,45.0,False,False,False,False,False,5.0,...,38.8,31.3,24.7,39.4,43.3,36.6,25.1,31.5,31.2,110
2002-02-07 06:05:00,,,31.4,31.4,False,False,False,False,False,256.0,...,37.2,33.3,33.1,35.5,38.5,32.8,21.0,29.3,27.1,73
2011-06-07 13:55:00,,,67.9,67.9,False,False,False,False,False,59.0,...,29.6,34.2,39.1,33.4,38.5,42.1,32.7,30.8,32.1,3623


### Train/Test (80/20)

In [31]:
# Target column, plus every column that must be kept out of the predictors
var_of_interest = 'obstruction_ro'
y_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']

# Keep only the records where the target is known (dropna already returns a new frame)
data_filtered = data_united.dropna(subset = var_of_interest)

united_y = data_filtered[var_of_interest]
united_x = data_filtered.drop(columns = y_drops)

del data_filtered

# united_x.info()

# united_x.info()

In [32]:
# Expected sizes of an 80/20 split, and a check that they add up to the total
n_records = len(united_y)
n_test = round(.2 * n_records)
n_train = round(.8 * n_records)
print("Test:\t20p of", n_records, "is", n_test)
print("Train:\t80p of", n_records, "is", n_train)
print(n_test + n_train)

del n_records, n_test, n_train

# mini_x.index[1]

Test:	20p of 1486142 is 297228
Train:	80p of 1486142 is 1188914
1486142


In [33]:
# Chronological 80/20 split: with shuffle=False the last 20% of records form the test set
x_train, x_test, y_train, y_test = train_test_split(united_x, united_y, shuffle=False, test_size = 0.2)

# Size and date span of each partition
split_report = (
    "Train:\t", len(x_train), "\t", x_train.index[0], "thru", x_train.index[-1],
    "\nTest:\t", len(x_test), "\t\t", x_test.index[0], "thru", x_test.index[-1],
)
print(*split_report)

del split_report

Train:	 1188913 	 2000-01-12 14:55:00 thru 2011-06-18 18:00:00 
Test:	 297229 		 2011-06-18 18:05:00 thru 2015-12-31 23:55:00


### Sliding Window

In [34]:
# Time-ordered cross-validation splitter with 20 folds; printed to show its settings
tscv = TimeSeriesSplit(n_splits=20)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=20, test_size=None)


In [35]:
# Show the positional row ranges that each fold trains and validates on
for fold_no, (fit_rows, holdout_rows) in enumerate(tscv.split(x_train)):
    print(
        f"Fold {fold_no}:",
        f"  Train: index={fit_rows}",
        f"  Test:  index={holdout_rows}",
        "--------------------------------------------------",
        sep="\n",
    )

del fold_no, fit_rows, holdout_rows

Fold 0:
  Train: index=[    0     1     2 ... 56630 56631 56632]
  Test:  index=[ 56633  56634  56635 ... 113244 113245 113246]
--------------------------------------------------
Fold 1:
  Train: index=[     0      1      2 ... 113244 113245 113246]
  Test:  index=[113247 113248 113249 ... 169858 169859 169860]
--------------------------------------------------
Fold 2:
  Train: index=[     0      1      2 ... 169858 169859 169860]
  Test:  index=[169861 169862 169863 ... 226472 226473 226474]
--------------------------------------------------
Fold 3:
  Train: index=[     0      1      2 ... 226472 226473 226474]
  Test:  index=[226475 226476 226477 ... 283086 283087 283088]
--------------------------------------------------
Fold 4:
  Train: index=[     0      1      2 ... 283086 283087 283088]
  Test:  index=[283089 283090 283091 ... 339700 339701 339702]
--------------------------------------------------
Fold 5:
  Train: index=[     0      1      2 ... 339700 339701 339702]
  Test:  i

In [None]:
# val_tracker = y_train.copy().to_frame()
# val_tracker['pred'] = .5
# val_tracker.head()

In [None]:
# # preds
# y_t = y_t.to_frame()
# y_t['preds'] = preds
# pd.concat(y_t)

## Model

In [60]:
# Sanity check: the training target should contain both classes.
# (The earlier `y_val` probe depended on a variable that does not exist yet
# when the notebook is run top to bottom.)
y_train.nunique()

2

In [61]:
def evaluate_folds(splitter, features, target, target_name):
    """Fit and score one XGBoost classifier per cross-validation fold.

    Args:
        splitter: Object with a ``split(features)`` method yielding
            ``(train_index, val_index)`` positional index arrays.
        features (DataFrame): Predictor columns.
        target (Series): Target values, aligned with ``features``.
        target_name (str): Name of the target column in the per-fold output.

    Returns:
        tuple: ``(predictions, metrics)``. ``predictions`` stacks every scored
        fold's validation rows with the true value, ``pred``, and ``pred_tf``
        columns. ``metrics`` has one row per scored fold with the columns
        ``fold``, ``mse``, ``rmse``, ``f1``, and ``acc``.
    """
    # Results are collected in lists and turned into frames once at the end,
    # rather than growing a DataFrame on every iteration
    fold_frames = []
    fold_metrics = []

    for fold, (train_index, val_index) in enumerate(splitter.split(features)):
        x_t, x_val = features.iloc[train_index], features.iloc[val_index]
        y_t, y_val = target.iloc[train_index], target.iloc[val_index]

        # Skip folds whose training target does not hold exactly two classes
        if len(y_t.unique()) != 2:
            print("Skipping fold", fold)
            continue

        model = xgb.XGBClassifier(tree_method="hist")
        model.fit(x_t, y_t)
        preds = model.predict(x_val)

        # Keep the validation truth next to the predictions for later inspection
        y_val_out = y_val.copy().to_frame()
        y_val_out['pred'] = preds
        y_val_out['pred_tf'] = y_val_out['pred'] == 1
        fold_frames.append(y_val_out)

        y_true = y_val_out[target_name].tolist()
        y_flag = y_val_out['pred_tf'].tolist()
        mse = mean_squared_error(y_val, preds)
        rmse = np.sqrt(mse)
        f1 = f1_score(y_true, y_flag)
        accuracy = accuracy_score(y_true, y_flag)
        print(fold, "\tMSE:", round(mse, 4), "\tRMSE:", round(rmse, 4), "\tF1:", round(f1, 4), "\tAcc:", round(accuracy, 4))
        fold_metrics.append({"fold": fold, "mse": mse, "rmse": rmse, "f1": f1, "acc": accuracy})

    # An empty frame is returned when every fold was skipped
    predictions = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
    metrics = pd.DataFrame(fold_metrics, columns=["fold", "mse", "rmse", "f1", "acc"])
    return predictions, metrics


val_tracker, win_tracker = evaluate_folds(tscv, x_train, y_train, var_of_interest)

Skipping fold 0
1 	MSE: 0.0038 	RMSE: 0.0613 	F1: 0.0 	Acc: 0.9962
2 	MSE: 0.0126 	RMSE: 0.1124 	F1: 0.0 	Acc: 0.9874
3 	MSE: 0.0017 	RMSE: 0.0416 	F1: 0.0 	Acc: 0.9983
4 	MSE: 0.0022 	RMSE: 0.0466 	F1: 0.0 	Acc: 0.9978
5 	MSE: 0.0057 	RMSE: 0.0755 	F1: 0.0 	Acc: 0.9943
6 	MSE: 0.0017 	RMSE: 0.0418 	F1: 0.0 	Acc: 0.9983
7 	MSE: 0.0089 	RMSE: 0.0943 	F1: 0.0 	Acc: 0.9911
8 	MSE: 0.0249 	RMSE: 0.1577 	F1: 0.0 	Acc: 0.9751
9 	MSE: 0.0043 	RMSE: 0.0655 	F1: 0.0 	Acc: 0.9957
10 	MSE: 0.254 	RMSE: 0.504 	F1: 0.0755 	Acc: 0.746
11 	MSE: 0.2366 	RMSE: 0.4865 	F1: 0.26 	Acc: 0.7634
12 	MSE: 0.0707 	RMSE: 0.2658 	F1: 0.3247 	Acc: 0.9293
13 	MSE: 0.1365 	RMSE: 0.3695 	F1: 0.1478 	Acc: 0.8635
14 	MSE: 0.2194 	RMSE: 0.4684 	F1: 0.1481 	Acc: 0.7806
15 	MSE: 0.2328 	RMSE: 0.4824 	F1: 0.4557 	Acc: 0.7672
16 	MSE: 0.1234 	RMSE: 0.3513 	F1: 0.3517 	Acc: 0.8766
17 	MSE: 0.2581 	RMSE: 0.508 	F1: 0.2802 	Acc: 0.7419
18 	MSE: 0.0764 	RMSE: 0.2765 	F1: 0.3317 	Acc: 0.9236
19 	MSE: 0.1102 	RMSE: 0.332 	F1: 0.

In [None]:
val_tracker.head()

In [None]:
# Per-fold metrics, rounded for display
print(win_tracker.round(4))

In [None]:
# val_tracker.head()
# y_val.to_list()
# y_val_out['obstruction_ro']
# f1_score(y_val_out['obstruction_ro'].tolist(), y_val_out['pred_tf'].tolist())
# f1_score(y_val_out['obstruction_ro'], y_val_out['pred_tf'])
# y_val_out['pred_tf'].tolist()

In [36]:
# Walk the splitter to its end; afterwards the loop variables hold the final fold
for train_index, val_index in tscv.split(x_train):
    pass

x_t, y_t = x_train.iloc[train_index], y_train.iloc[train_index]
X_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]

# model = xgb.XGBRegressor(enable_categorical=True, tree_method="hist")
model = xgb.XGBClassifier(enable_categorical=True, tree_method="hist")
model.fit(x_t, y_t)

# Score the final fold's validation records
preds = model.predict(X_val)
mse = mean_squared_error(y_val, preds)
print("Validation MSE:", mse, "\tRMSE:", np.sqrt(mse))

del train_index, val_index

Validation MSE: 0.11020242342883385 	RMSE: 0.33196750357351823


In [41]:
# Pair the final fold's true values with its predictions and a boolean prediction flag
y_val_out2 = y_val.to_frame().assign(pred=preds)
y_val_out2['pred_tf'] = y_val_out2['pred'].eq(1)

# F1 of the final fold
f1_score(y_val_out2[var_of_interest].tolist(), y_val_out2['pred_tf'].tolist())

# del y_val_out2

0.465610278372591

In [None]:
# Move the timestamps into a regular column. Naming the axis first guarantees
# the column is called 'index' even if the index carries a name of its own.
mini_val = y_val.rename_axis('index').reset_index()
mini_val['pred'] = preds
# Boolean version of the prediction, comparable with the boolean target.
# (The earlier `mini_val.set_index('index')` call discarded its result and has
# been removed; later cells read 'index' as a column.)
mini_val['pred_tf'] = mini_val['pred'] == 1
mini_val.head()

In [None]:
fig, ax = plt.subplots(figsize=(20, 1.5))
# One strip of tick marks per series; predictions are nudged down by 0.06
# so they do not sit on top of the true values
for tick_values, tick_colour in (
    (mini_val[var_of_interest], 'blue'),
    (mini_val['pred_tf']-.06, 'orange'),
):
    ax.scatter(mini_val['index'], tick_values, s=25, color=tick_colour, marker="|")

plt.show()

del fig, ax, tick_values, tick_colour

In [None]:
def plot_preds(input_date_start, input_date_end, include_preds=True, include_calibration=True):
    """Plot values between two dates in the style of the Visual FoxPro interface.

    Reads the notebook-level ``data_united`` frame and, when ``include_preds``
    is set, ``mini_val`` and ``var_of_interest``.

    Args:
        input_date_start (Timestamp): The start date.
        input_date_end (Timestamp): The end date.
        include_preds (boolean): Include tick marks for the true and predicted flags.
        include_calibration (boolean): Include X-markers for the calibration points.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Slice first so that only the requested window is handled
    data_subset = data_united.loc[input_date_start:input_date_end]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axhline(y=0, color ='grey', linestyle = ':')
    # Plot the rain as a bar chart with a multiplier for visibility
    ax.vlines(data_subset.index, ymin=0, ymax=data_subset['ra_rain']*3, color = 'blue', label = "Rain (x3)")
    ax.plot(data_subset.index, data_subset['level_ro'], color = 'red', label = "Adjusted")
    ax.plot(data_subset.index, data_subset['raw_ro'], color = 'green', label = "Raw")
    # Include calibration points unless otherwise specified or unless there are none in the subset.
    # The column exists on every row, so check for non-null values rather than emptiness.
    if include_calibration and data_subset['weir_level_cal'].notna().any():
        ax.plot(data_subset.index, data_subset['weir_level_cal'], linestyle='none', marker='x', color='red', label = "Calibration")
    if include_preds:
        mini_val_subset = mini_val.set_index('index').loc[input_date_start:input_date_end]
        # Offsets and scaling place the tick marks below the zero line:
        # for boolean flags, true values land at -30/-20 and predictions at -50/-40
        ax.scatter(mini_val_subset.index, (mini_val_subset[var_of_interest]-3)*10, color='blue', marker="|", label = "Observed " + var_of_interest)
        ax.scatter(mini_val_subset.index, (mini_val_subset['pred_tf']-5)*10, color='orange', marker="|", label = "Predicted " + var_of_interest)

    # Plot labels
    ax.set_xlabel("Date (YYYY-MM-DD)")
    ax.set_ylabel("Level (mm)")
    ax.set_title("Runoff time series from " + str(input_date_start) + " through " + str(input_date_end))
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # Reverse the order of the legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc='upper right')
    plt.show()

In [None]:
# Plot the first eight days of the validation fold. The previous hard-coded
# August 2016 window lies outside the 2000-2015 data loaded above, so it
# produced an empty plot.
# plot_preds(X_val.index[0], X_val.index[-1])
plot_preds(X_val.index[0], X_val.index[0] + pd.Timedelta(days=8))

In [None]:
# data_subset = data_united[X_val.index[0]:X_val.index[-1]]

# fig, ax = plt.subplots(figsize=(10, 6))
# plt.axhline(y=0, color ='grey', linestyle = ':')
# # Plot the rain as a bar chart with a multiplier for visibility
# ax.vlines(data_subset.index, ymin=0, ymax=data_subset['ra_rain']*3, color = 'blue', label = "Rain (x3)")
# ax.plot(data_subset.index, data_subset['level_ro'], color = 'red', label = "Adjusted")
# ax.plot(data_subset.index, data_subset['raw_ro'], color = 'green', label = "Raw")
# # Include calibration points unless otherwise specified or unless there are none in the subset
# # if include_calibration == True and not data_subset_cal.empty:
# ax.plot(data_subset.index, data_subset['weir_level_cal'], linestyle='none', marker='x', color='red', label = "Calibration")

# # Plot labels
# ax.set_xlabel("Date (YYYY-MM-DD)")
# ax.set_ylabel("Level (mm)")
# # ax.set_title('Simple Time Series Plot')
# # ax.set_title("Runoff time series from " + input_date_start + " through " + input_date_end)
# # ax.set_ylim(bottom=0) 
# # ax.grid(True)
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# # Reverse the order of the legend
# handles, labels = ax.get_legend_handles_labels()
# ax.legend(handles[::-1], labels[::-1], loc='upper right')
# # plt.legend(loc = 'upper right')
# plt.show()

# del data_subset, fig, ax, handles, labels

In [None]:
# True and predicted flags of the final fold, as plain lists
y_true = mini_val[var_of_interest].tolist()
y_pred = mini_val['pred_tf'].tolist()

# Confusion matrix
metric_cm = confusion_matrix(y_true, y_pred)

# Precision, recall, F1, and accuracy (the share of all predictions that are correct)
metric_precision, metric_recall, metric_f1, metric_accuracy = (
    scorer(y_true, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print(
    "\nConfusion Matrix:\n", metric_cm,
    "\nPrecision:\t", metric_precision,
    "\nRecall:\t\t", metric_recall,
    "\nF1 Score:\t", metric_f1,
    "\nAccuracy:\t", metric_accuracy
)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(display_labels=['Negative', 'Positive'], confusion_matrix=metric_cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Environment cleanup
del y_true, y_pred, metric_cm, metric_precision, metric_recall, metric_f1, metric_accuracy, disp