# Data Separation

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 21 October 2025

In [1]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_eda.ipynb completion.\n'

## Setup

### Packages

In [2]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# # import matplotlib.ticker as ticker
# import matplotlib.dates as mdates
# import datetime as dt
# from datetime import date
from sklearn.model_selection import TimeSeriesSplit, train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score

In [3]:
## (Optional chunk)
# Current session information
import session_info
session_info.show(dependencies=False)

### Data

In [4]:
# united_water = pd.read_parquet('data/clean/water.parquet')
# united_soil = pd.read_parquet('data/clean/soil.parquet')

In [5]:
united_water = pd.read_parquet('data/clean/water_nocal.parquet')
data_cal = pd.read_parquet('data/clean/calibration.parquet')
data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

united_soil = pd.read_parquet('data/clean/soil.parquet')

## Prepare

## Feature Engineering

In [6]:
# Select columns of interest
data_water = united_water.drop(columns=['raw_rain', 'chk_note_rain', 'chk_fail_rain', 'chk_note_ro', 'chk_fail_ro', 'comment_ro', 'source_ro'])

# Cleanup
data_water = data_water.reset_index().drop_duplicates(keep='first').set_index('datetime')

In [7]:
### Note ###
# Remove this later -- just a smaller subset for feature engineering testing!!
# data_water = data_water['2015-01-01 00:00:00':'2016-12-31 23:59:59']
data_water = data_water['2000-01-01 00:00:00':'2015-12-31 23:59:59']
######

In [8]:
del united_water

### Distance from Event

In [9]:
def since_feat(input_df, input_col):
    """Add a ``since_<input_col>`` column counting records since the last value.

    The counter is 0 on every row where ``input_col`` holds a value and
    increases by one for each following row until the next value appears.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified.
    input_col : str
        Column whose non-null entries mark the events.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the extra ``since_<input_col>`` column.
    """
    result = input_df.copy()
    # The running count of observed values labels each row with the event
    # that most recently preceded it.
    event_id = result[input_col].notna().cumsum()
    # Label 0 means "no event seen yet"; blank it out so those leading rows
    # are not grouped as if an event happened right before the first record.
    event_id = event_id.replace(0, np.nan)
    # Position within each event group == number of records since the event.
    result[f"since_{input_col}"] = result.groupby(event_id).cumcount()
    return result

In [10]:
def minsince_feat(input_df, input_col):
    """Add a ``minsince_<input_col>`` column: minutes since the last value.

    For every row, the elapsed time is measured from the earliest index
    timestamp of the group started by the most recent non-null entry of
    ``input_col``. Rows that precede the first non-null entry get NaN.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame whose index can be converted with ``pd.to_datetime``.
        It is copied, not modified.
    input_col : str
        Column whose non-null entries mark the events.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the extra ``minsince_<input_col>`` column.
    """
    output_df = input_df.copy()
    instances = output_df[input_col].notna()
    # Create groupings based on most recent instance
    group_id = instances.cumsum()
    # Work on standalone Series instead of temporary 'timestamp'/'ts_start'
    # columns, so input columns that happen to carry those names (or
    # input_col itself) are never overwritten and dropped.
    timestamps = pd.Series(pd.to_datetime(output_df.index), index=output_df.index)
    # Get start timestamp of the group
    ts_start = timestamps.groupby(group_id).transform('min')
    # Exclude the first grouping (id 0), otherwise it assumes there was an
    # event just prior to the first entry
    ts_start = ts_start.where(group_id != 0)
    # Distance in minutes; resets to 0 at each new point
    output_df[f"minsince_{input_col}"] = (timestamps - ts_start).dt.total_seconds() / 60
    return output_df

In [11]:
def daysince_feat(input_df, input_col):
    output_df = input_df.copy()#[input_col].to_frame()
    instances = output_df[input_col].notna()
    # Create groupings based on most recent instance
    group_id = instances.cumsum()
    # Exclude the first grouping
    # otherwise it assumes there was an event just prior to the first entry
    group_id = group_id.replace(0, np.nan)
    # Create new column to count the distance in days since the point
    # which resets to 0 at each new point
    output_df['timestamp'] = pd.to_datetime(output_df.index)
    # Get start timestamp of the group
    output_df['ts_start'] = output_df.groupby(group_id)['timestamp'].transform('min')
    # Calculate the distance
    # output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds()/(24 * 60 * 60)
    output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.days
    # Remove extra cols
    output_df = output_df.drop(columns=['timestamp', 'ts_start'])
    return output_df

In [12]:
def timesince_feat(input_df, input_col, input_unit):
    """Add a column with the time elapsed since the last value of a column.

    For every row, the elapsed time is measured from the earliest index
    timestamp of the group started by the most recent non-null entry of
    ``input_col``. Rows that precede the first non-null entry get NaN.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame whose index can be converted with ``pd.to_datetime``.
        It is copied, not modified.
    input_col : str
        Column whose non-null entries mark the events.
    input_unit : {"minutes", "days"}
        ``"minutes"`` adds ``minsince_<input_col>`` (fractional minutes);
        ``"days"`` adds ``daysince_<input_col>`` (whole days).

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the extra column.

    Raises
    ------
    ValueError
        If ``input_unit`` is not one of the supported units. Previously the
        frame was returned without any new column, which only surfaced later
        as a missing-column error in the caller.
    """
    if input_unit not in ("minutes", "days"):
        raise ValueError(
            f"Unsupported input_unit {input_unit!r}; expected 'minutes' or 'days'"
        )

    output_df = input_df.copy()
    instances = output_df[input_col].notna()
    # Create groupings based on most recent instance
    group_id = instances.cumsum()
    # Work on standalone Series instead of temporary 'timestamp'/'ts_start'
    # columns, so input columns that happen to carry those names (or
    # input_col itself) are never overwritten and dropped.
    timestamps = pd.Series(pd.to_datetime(output_df.index), index=output_df.index)
    # Get start timestamp of the group
    ts_start = timestamps.groupby(group_id).transform('min')
    # Exclude the first grouping (id 0), otherwise it assumes there was an
    # event just prior to the first entry
    ts_start = ts_start.where(group_id != 0)
    # Distance since the point, which resets to 0 at each new point
    elapsed = timestamps - ts_start
    if input_unit == "minutes":
        output_df[f"minsince_{input_col}"] = elapsed.dt.total_seconds() / 60
    else:
        output_df[f"daysince_{input_col}"] = elapsed.dt.days
    return output_df

In [13]:
# daysince_feat(data_mini, '1_shallow')[['1_shallow', 'level_ro', 'daysince_1_shallow']]
# timesince_feat(data_mini, '1_shallow', "days")[['1_shallow', 'level_ro', 'daysince_1_shallow']]

#### Calibration
Create a feature which tracks how recently a calibration was conducted.

In [14]:
# # since_feat(data_water[['ra_rain', 'raw_ro']], 'ra_rain')
# # data_water[['ra_rain', 'raw_ro']]
# data_w_test = data_water.copy()[['ra_rain', 'raw_ro']]
# data_w_test['ra_rain'] = data_w_test['ra_rain'].replace(0, np.nan)

# data_w_test = since_feat(data_w_test, 'ra_rain')
# data_w_test.head()

In [15]:
# # Create index of instances where there is a calibration point
# cal_instances = data_water['weir_level_cal'].notna()
# # Create groupings based on most recent instance
# cal_group_id = cal_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_cal'] = data_water.groupby(cal_group_id).cumcount()

# # Clean up environment
# del cal_instances, cal_group_id

# # data_water

# data_water = since_feat(data_water, 'weir_level_cal')

## HERE
# data_water = minsince_feat(data_water, 'weir_level_cal')

#### Rain
Create a feature which tracks how recently a rain event occurred.

In [16]:
data_water = timesince_feat(data_water, 'ra_rain', "minutes")

In [17]:
# # Create index of instances where there is a calibration point
# rain_instances = data_water['ra_rain'].notna()
# # Create groupings based on most recent instance
# rain_group_id = rain_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_rain'] = data_water.groupby(rain_group_id).cumcount()

# # Clean up environment
# del rain_instances, rain_group_id

# # Replace NAs with 0
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# data_water.sample(10)
# # data_water.dropna(subset='raw_ro')

# data_water = since_feat(data_water, 'ra_rain')
# data_water = minsince_feat(data_water, 'ra_rain')
# data_water.sample(10)

Fill missing rain values with 0

In [18]:
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)
# data_water.sample(10)

### Rain event

Keep track of cumulative rainfall during a specific event.

In [19]:
# Mark the records that separate rain events: no rainfall reading on the row
# and at least 5 minutes since the most recent reading. The former extra
# `!= 0` test was redundant (a value >= 5.0 is never 0), and rows before the
# first reading have a NaN distance, which compares False and so never breaks.
rain_event = data_water['ra_rain'].isnull() & (data_water['minsince_ra_rain'] >= 5.0)
# Every break starts a new group, so consecutive readings share one event id
rain_event_id = rain_event.cumsum()
# Running rainfall total within each event
data_water['eventsum_ra_rain'] = data_water.groupby(rain_event_id)['ra_rain'].cumsum()

del rain_event, rain_event_id

In [20]:
# data_water[data_water['minsince_ra_rain'] > 0]
# data_water
# data_water[(data_water['minsince_weir_level_cal'] < 5) & (data_water['minsince_ra_rain'] != 0)]
# data_water[(data_water['minsince_weir_level_cal'] < 5) & (data_water['minsince_ra_rain'] == 2.0)]
# data_water[(data_water['minsince_weir_level_cal'] < 6) & (data_water['eventsum_ra_rain'].notnull())][['ra_rain', 'eventsum_ra_rain','minsince_ra_rain', 'minsince_weir_level_cal']]
# data_water['2008-10-15 00:00:00':'2008-11-04 10:30:00']
# data_water['2006-06-21 09:15:00':'2006-06-21 09:30:00']

In [21]:
# data_water[['ra_rain', 'since_ra_rain', 'rain_event_cumsum']]
# data_water
# 475

In [22]:
# water_mini = data_water.copy()[['weir_level_cal', 'ra_rain', 'raw_ro', 'since_weir_level_cal', 'since_ra_rain']]
# water_mini.sample(10)

In [23]:
# # water_mini = data_water.copy()[['ra_rain', 'level_ro']]
# water_mini = data_water.copy()['ra_rain'].to_frame()
# instances = water_mini['ra_rain'].notna()
# # Create groupings based on most recent instance
# group_id = instances.cumsum()
# # Exclude the first grouping
# # otherwise it assumes there was an event just prior to the first entry
# group_id = group_id.replace(0, np.nan)
# # Create new column to count number of records since the point
# # which resets to 0 at each new point
# # output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
# # group_id

# water_mini['timestamp'] = pd.to_datetime(water_mini.index)
# water_mini['ts_start'] = water_mini.groupby(group_id)['timestamp'].transform('min')
# water_mini['ts_dist'] = (water_mini['timestamp'] - water_mini['ts_start']).dt.total_seconds()/60
# water_mini

# # water_mini = water_mini.reset_index()
# # water_mini['ts_start'] = water_mini.groupby(group_id)['datetime'].transform('min')
# # water_mini['ts_dist'] = (water_mini['datetime'] - water_mini['ts_start']).dt.total_seconds() / 60
# # water_mini.set_index('datetime')

In [24]:
# # Create index of instances where there is a data point
# instances = water_mini['ra_rain'].notna()
# # Create groupings based on most recent instance
# group_id = instances.cumsum()
# # Create new column to count number of records since the point
# # which resets to 0 at each new point
# # del group_id, instances
# # water_mini
# # group_id = group_id.replace(0, np.nan)
# # water_mini['since_ra_rain2'] = water_mini.groupby(group_id).cumcount()
# # water_mini
# # water_mini.info()
# water_mini['rain_event'] = water_mini.groupby(group_id)['ra_rain'].cumsum()

# # rain_null_mask = water_mini['ra_rain'].isnull()
# # rain_group_id = rain_null_mask.cumsum()
# # water_mini.groupby(rain_group_id)['ra_rain'].cumsum()
# # # rain_null_mask
# # # water_mini['rain_event_cumsum'] = water_mini.groupby(rain_group_id)['ra_rain'].cumsum()

# # # g_id_event = null_mask.cumsum()
# # # water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# # # del rain_null_mask, rain_group_id
# # # data_water['rain_event_sum'] = data_water.groupby(g_id_event)['ra_rain'].cumsum()

In [25]:
# water_mini

In [26]:
# # israin = water_mini['ra_rain'].notna()
# # israin_group_id = israin.cumsum()
# # # g_id
# # water_mini['since_rain_2'] = water_mini.groupby(israin_group_id).cumcount()
# water_mini['dec'] = np.exp(-0.1*water_mini['since_ra_rain'])
# water_mini['rain_fill'] = water_mini['rain_event_cumsum'].ffill()
# # data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# water_mini['rain_dec'] = (water_mini['rain_fill']*water_mini['dec'])
# # del israin, israin_group_id
# water_mini

### Decay

In [31]:
def decay_feat(input_df, input_col, input_dec_rate = -0.1):
    """Add exponentially decayed versions of a sparsely populated column.

    Three columns are appended, in this order:

    * ``decayrate<rate>_<input_col>`` -- ``exp(rate * minutes since value)``
    * ``ffill_<input_col>`` -- ``input_col`` forward-filled
    * ``decay<rate>_<input_col>`` -- product of the two columns above

    If ``minsince_<input_col>`` is not already present it is first created
    with ``timesince_feat(..., input_unit="minutes")``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified.
    input_col : str
        Column to decay.
    input_dec_rate : float, default -0.1
        Multiplier applied to the elapsed minutes inside the exponential.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the additional columns.
    """
    elapsed_col = f"minsince_{input_col}"
    rate_col = f"decayrate{input_dec_rate}_{input_col}"
    filled_col = f"ffill_{input_col}"
    decayed_col = f"decay{input_dec_rate}_{input_col}"

    output_df = input_df.copy()
    # Only derive the elapsed-minutes column when the caller has not done so
    if elapsed_col not in output_df.columns:
        output_df = timesince_feat(input_df = output_df, input_col = input_col, input_unit = "minutes")

    # Weight applied to the last known value at each row
    output_df[rate_col] = np.exp(input_dec_rate * output_df[elapsed_col])
    # Carry the last known value forward so it can be weighted on later rows
    output_df[filled_col] = output_df[input_col].ffill()
    output_df[decayed_col] = output_df[filled_col] * output_df[rate_col]

    return output_df

# water_m = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]

# null_mask = water_m['ra_rain'].isnull()
# g_id_event = null_mask.cumsum()
# water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# is_rain = water_m['ra_rain'].notna()
# g_id = is_rain.cumsum()
# # g_id
# water_m['since_rain'] = water_m.groupby(g_id).cumcount()
# water_m['dec'] = np.exp(-0.1*water_m['since_rain'])
# water_m['rain_fill'] = water_m['r_event_sum'].ffill()
# # data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# water_m['rain_dec'] = (water_m['rain_fill']*water_m['dec'])

In [28]:
# water_mini = data_water.copy()
# water_mini['ra_rain'] = water_mini['ra_rain'].fillna(0)

In [29]:
# water_mini = data_water.copy()
# water_mini['ra_rain'] = water_mini['ra_rain'].fillna(0)
# water_mini = decay_feat(water_mini, 'eventsum_ra_rain')
# selected_columns = water_mini.columns[water_mini.columns.str.contains('ra_rain')]
# water_mini[selected_columns]

In [32]:
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)
data_water = decay_feat(data_water, 'eventsum_ra_rain')

### Fill rain values

In [None]:
# data_water.dropna(subset=['obstruction_ro'])

In [None]:
# Replace NAs in rain with 0s
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

### Lag features

#### Consistent cols

Modify the rows to prevent inappropriate data shifts

In [None]:
# # data_mini_min = data_water.copy()['1996-11-01 00:00:00':'1997-01-31 23:59:59']
# # data_mini_min = data_water.copy()['1993-01-01 00:00:00':'1997-12-25 00:00:00']
# # data_water.head()
# data_mini_min = data_water.copy()

# print(len(data_mini_min))

In [None]:
# data_mini_min.sort_index()['2000-04-14 02:35:00':'2000-04-14 03:00:00']

In [None]:
# # full_range = pd.date_range(start = '1996-11-01 00:00:00', end = '1997-02-01 00:00:00', freq = '5min')
# # full_range.duplicated().any()
# # data_mini_min.reindex(full_range)
# # data_mini_min.drop_duplicates(keep='first').asfreq(freq='5min')['2000-04-14 02:35:00':'2000-04-14 03:00:00']


# # Assuming your df has a unique DatetimeIndex already
# # Ensure the index is sorted first (good practice for time series ops)
# # df = df.sort_index()

# # 1. Define the complete range (e.g., every minute)
# new_index = pd.date_range(start=data_mini_min.index.min(), 
#                           end=data_mini_min.index.max(), 
#                           freq='5min')

# # 2. Reindex to this full range
# # data_mini_min.drop_duplicates(keep='first').reindex(new_index)['2000-04-14 02:35:00':'2000-04-14 03:00:00']
# data_mini_min.reset_index().drop_duplicates(keep='first').set_index('datetime').reindex(new_index)['2000-04-14 02:35:00':'2000-04-14 03:00:00']


# # data_mini_min = data_mini_min.reset_index()


# # 3. Use ffill/bfill to fill the new NaNs
# # df_complete = df_complete.ffill() 


# # print(f"Index type: {type(data_mini_min.index)}")
# # print(f"Index dtype: {data_mini_min.index.dtype}")
# # print(f"Index has duplicates: {data_mini_min.index.duplicated().any()}")
# # print(f"Number of rows: {len(data_mini_min)}")
# # print(f"Number of unique index values: {len(data_mini_min.index.unique())}")

# # # data_mini_min.index.duplicated().any()

# # duplicated_index_mask = data_mini_min.index.duplicated(keep=False)

# # # Filter the DataFrame using the boolean mask
# # data_mini_min[duplicated_index_mask]

# # # type(data_mini_min.index)
# # # full_range

In [33]:
# Build the gap-free 5-minute grid spanning the first to the last record
full_grid = pd.date_range(data_water.index.min(), data_water.index.max(), freq='5min')

# Conform the data to that grid so a row shift always means a 5-minute step;
# grid timestamps with no record become all-NaN rows.
# NOTE(review): reindex keeps only labels present in the new index, so any
# record whose timestamp is off this grid would be dropped -- confirm the
# source data sits on 5-minute marks.
data_water = data_water.reindex(full_grid)

del full_grid

In [34]:
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

Get values from other recent time stamps.

In [None]:
def lag_feats(input_df, input_cols, input_lags):
    """Append shifted ("lagged") copies of the selected columns.

    For every column/lag pair a column named ``<col>_lag<lag>`` is added that
    holds the value of ``col`` from ``lag`` rows earlier.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified.
    input_cols : iterable of str
        Columns to lag.
    input_lags : iterable of int
        Row offsets passed to ``Series.shift``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the lag columns appended column-by-column,
        then lag-by-lag.
    """
    lagged = input_df.copy()
    for source_col in input_cols:
        for n_rows in input_lags:
            target_col = f"{source_col}_lag{n_rows}"
            lagged[target_col] = lagged[source_col].shift(n_rows)
    return lagged

In [None]:
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag2']]
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag24']]

# Columns to get temporal stats on
cols_to_shift = ['raw_ro', 'ra_rain']
# data at 5-min increments -- lag to record values at 5m, 10m, 15m, 30m, 1h, and 2h prior
lags_of_interest = [1, 2, 3, 6, 12, 24]

data_water = lag_feats(data_water, cols_to_shift, lags_of_interest)

data_water.sample(10)

### Rolling stats

Get stat values from range of recent time stamps.

In [35]:
def rolling_feats(input_df, input_cols, input_windows):
    """Append rolling mean, standard deviation and slope for selected columns.

    For every column/window pair three columns are added:
    ``<col>_rollmean_<window>``, ``<col>_rollstd_<window>`` and
    ``<col>_rollslope_<window>``. The slope is the first-degree coefficient
    of a ``np.polyfit`` of the window values against their positions
    ``0 .. len-1``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified.
    input_cols : iterable of str
        Columns to summarise.
    input_windows : iterable
        Window specifications passed to ``Series.rolling``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the rolling-statistic columns appended.
    """
    def _window_slope(window_values):
        # Least-squares line through (position, value); keep only the slope
        positions = np.arange(len(window_values))
        return np.polyfit(positions, window_values, 1)[0]

    result = input_df.copy()
    for col in input_cols:
        for window in input_windows:
            prefix = f"{col}_roll"
            result[f"{prefix}mean_{window}"] = result[col].rolling(window).mean()
            result[f"{prefix}std_{window}"] = result[col].rolling(window).std()
            # raw=True hands each window to the helper as a plain ndarray
            result[f"{prefix}slope_{window}"] = result[col].rolling(window).apply(_window_slope, raw=True)
    return result

In [38]:
# data_water_mini = data_water['1990-01-01 00:00:00':'1990-01-30 23:59:59']
# rolling_feats(data_water_mini, cols_to_lag, [6, 12, 36])
# 10m, 30m, 1h, 6h
# windows_of_interest = [2, 6, 12, 72]


windows_of_interest = [2, 6, 12]
cols_to_shift = ['raw_ro', 'ra_rain']
data_water_slow = rolling_feats(data_water['2000-01-01 00:00:00':'2000-12-31 23:59:59'], cols_to_shift, windows_of_interest)

# data_water_slow.sample(10)
# data_water = rolling_feats(data_water, cols_to_shift, windows_of_interest)

# data_water.sample(10)

In [41]:
data_water_slow

Unnamed: 0,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,minsince_ra_rain,eventsum_ra_rain,...,raw_ro_rollslope_12,ra_rain_rollmean_2,ra_rain_rollstd_2,ra_rain_rollslope_2,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12
2000-01-01 00:00:00,1.270,,,,,,,,0.0,1.270,...,,,,,,,,,,
2000-01-01 00:05:00,0.508,,,,,,,,0.0,1.778,...,,0.889,0.538815,-0.762,,,,,,
2000-01-01 00:10:00,2.032,,,,,,,,0.0,3.810,...,,1.270,1.077631,1.524,,,,,,
2000-01-01 00:15:00,0.762,,,,,,,,0.0,4.572,...,,1.397,0.898026,-1.270,,,,,,
2000-01-01 00:20:00,0.254,,,,,,,,0.0,4.826,...,,0.508,0.359210,-0.508,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000-12-31 23:35:00,0.000,163.6,163.6,False,False,False,False,False,135.0,,...,-0.379720,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:40:00,0.000,163.1,163.1,False,False,False,False,False,140.0,,...,-0.370280,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:45:00,0.000,161.9,161.9,False,False,False,False,False,145.0,,...,-0.415035,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:50:00,0.000,161.2,161.2,False,False,False,False,False,150.0,,...,-0.460839,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# import pandas as pd
# import numpy as np

def optimized_rolling_feats_vectorized(input_df, input_cols, input_windows):
    """Append rolling mean, standard deviation and slope using vectorized ops.

    For every column/window pair three columns are added:
    ``<col>_rollmean_<window>``, ``<col>_rollstd_<window>`` and
    ``<col>_rollslope_<window>``. The slope of the least-squares line through
    a window is computed as ``Cov(Y, X) / Var(X)``, where ``X`` is the row
    position, instead of fitting a polynomial per window.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified.
    input_cols : iterable of str
        Columns to summarise.
    input_windows : iterable
        Window specifications passed to ``Series.rolling``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the rolling-statistic columns appended.
    """
    output_df = input_df.copy()

    # Row positions (0, 1, 2, ... N-1) act as the regression's x axis; only
    # differences between positions matter, so one global series suffices.
    x_series = pd.Series(np.arange(len(output_df)), index=output_df.index)

    # Var(X) depends on the window only, not on the data column, so it is
    # computed once per window and reused for every column.
    var_x_by_window = {}

    for col in input_cols:
        for window in input_windows:
            rolling_y = output_df[col].rolling(window)

            # 1. Mean and standard deviation
            output_df[f"{col}_rollmean_{window}"] = rolling_y.mean()
            output_df[f"{col}_rollstd_{window}"] = rolling_y.std()

            # 2. Slope = Cov(Y, X) / Var(X)
            if window not in var_x_by_window:
                var_x_by_window[window] = x_series.rolling(window).var()
            # Rows whose window is not yet full are NaN in both numerator
            # and denominator, so the slope is NaN there as well.
            output_df[f"{col}_rollslope_{window}"] = rolling_y.cov(x_series) / var_x_by_window[window]

    return output_df

cols_to_shift = ['raw_ro', 'ra_rain']
windows_of_interest = [2, 6, 12]
data_owater = optimized_rolling_feats_vectorized(data_water['2000-01-01 00:00:00':'2000-12-31 23:59:59'], cols_to_shift, windows_of_interest)
data_owater

Unnamed: 0,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,minsince_ra_rain,eventsum_ra_rain,...,raw_ro_rollslope_12,ra_rain_rollmean_2,ra_rain_rollstd_2,ra_rain_rollslope_2,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12
2000-01-01 00:00:00,1.270,,,,,,,,0.0,1.270,...,,,,,,,,,,
2000-01-01 00:05:00,0.508,,,,,,,,0.0,1.778,...,,0.889,0.538815,-0.762,,,,,,
2000-01-01 00:10:00,2.032,,,,,,,,0.0,3.810,...,,1.270,1.077631,1.524,,,,,,
2000-01-01 00:15:00,0.762,,,,,,,,0.0,4.572,...,,1.397,0.898026,-1.270,,,,,,
2000-01-01 00:20:00,0.254,,,,,,,,0.0,4.826,...,,0.508,0.359210,-0.508,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000-12-31 23:35:00,0.000,163.6,163.6,False,False,False,False,False,135.0,,...,-0.379720,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:40:00,0.000,163.1,163.1,False,False,False,False,False,140.0,,...,-0.370280,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:45:00,0.000,161.9,161.9,False,False,False,False,False,145.0,,...,-0.415035,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0
2000-12-31 23:50:00,0.000,161.2,161.2,False,False,False,False,False,150.0,,...,-0.460839,0.000,0.000000,0.000,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
data_water_slow.round(2).equals(data_owater.round(2))

True

In [44]:
# data_water_slow.equals(data_owater)
data_water_slow.round(2).head()

Unnamed: 0,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,minsince_ra_rain,eventsum_ra_rain,...,raw_ro_rollslope_12,ra_rain_rollmean_2,ra_rain_rollstd_2,ra_rain_rollslope_2,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12
2000-01-01 00:00:00,1.27,,,,,,,,0.0,1.27,...,,,,,,,,,,
2000-01-01 00:05:00,0.51,,,,,,,,0.0,1.78,...,,0.89,0.54,-0.76,,,,,,
2000-01-01 00:10:00,2.03,,,,,,,,0.0,3.81,...,,1.27,1.08,1.52,,,,,,
2000-01-01 00:15:00,0.76,,,,,,,,0.0,4.57,...,,1.4,0.9,-1.27,,,,,,
2000-01-01 00:20:00,0.25,,,,,,,,0.0,4.83,...,,0.51,0.36,-0.51,,,,,,


In [43]:
data_owater.head()

Unnamed: 0,ra_rain,level_ro,raw_ro,obstruction_ro,gap_fill_ro,weir_cleaning_ro,spike_ro,calibration_ro,minsince_ra_rain,eventsum_ra_rain,...,raw_ro_rollslope_12,ra_rain_rollmean_2,ra_rain_rollstd_2,ra_rain_rollslope_2,ra_rain_rollmean_6,ra_rain_rollstd_6,ra_rain_rollslope_6,ra_rain_rollmean_12,ra_rain_rollstd_12,ra_rain_rollslope_12
2000-01-01 00:00:00,1.27,,,,,,,,0.0,1.27,...,,,,,,,,,,
2000-01-01 00:05:00,0.508,,,,,,,,0.0,1.778,...,,0.889,0.538815,-0.762,,,,,,
2000-01-01 00:10:00,2.032,,,,,,,,0.0,3.81,...,,1.27,1.077631,1.524,,,,,,
2000-01-01 00:15:00,0.762,,,,,,,,0.0,4.572,...,,1.397,0.898026,-1.27,,,,,,
2000-01-01 00:20:00,0.254,,,,,,,,0.0,4.826,...,,0.508,0.35921,-0.508,,,,,,


In [None]:
def optimized_rolling_feats(input_df, input_cols, input_windows):
    """Append rolling mean, standard deviation and slope using vectorized ops.

    For every column/window pair three columns are added:
    ``<col>_rollmean_<window>``, ``<col>_rollstd_<window>`` and
    ``<col>_rollslope_<window>``. The slope of the least-squares line through
    a window is computed as ``Cov(Y, X) / Var(X)``, where ``X`` is the row
    position.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Source frame; it is copied, not modified. It may have fewer rows than
        a window, in which case that window's columns are all NaN.
    input_cols : iterable of str
        Columns to summarise.
    input_windows : iterable
        Window specifications passed to ``Series.rolling``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` with the rolling-statistic columns appended.
    """
    output_df = input_df.copy()

    # Row positions (0, 1, 2, ... N-1) act as the regression's x axis. Built
    # once: it does not depend on the column or the window.
    # (The former per-window `x_values` helper was never used and raised a
    # length-mismatch ValueError whenever the frame was shorter than the
    # window, so it has been removed.)
    x_series = pd.Series(np.arange(len(output_df)), index=output_df.index)

    for col in input_cols:
        for window in input_windows:
            rolling_y = output_df[col].rolling(window)

            # 1. Mean and standard deviation
            output_df[f"{col}_rollmean_{window}"] = rolling_y.mean()
            output_df[f"{col}_rollstd_{window}"] = rolling_y.std()

            # 2. Slope = Cov(Y, X) / Var(X); NaN until the window is full
            rolling_cov = rolling_y.cov(x_series)
            rolling_var_x = x_series.rolling(window).var()
            output_df[f"{col}_rollslope_{window}"] = rolling_cov / rolling_var_x

    return output_df

cols_to_shift = ['raw_ro', 'ra_rain']
windows_of_interest = [2, 6, 12]
data_owater = optimized_rolling_feats(data_water, cols_to_shift, windows_of_interest)
data_owater

Change since last value

In [None]:
# Change in raw_ro relative to the previous row
data_water['raw_ro_change'] = data_water['raw_ro'].diff()

# cal_na_mask = data_water['weir_level_cal'].notna() & data_water['raw_ro'].notna()
# # cal_na_mask
# (data_water['weir_level_cal'] - data_water['raw_ro']).dropna()
# del cal_na_mask
# Offset between the calibration reading and the raw runoff value on the same row.
# NOTE(review): 'weir_level_cal' is produced by renaming a column of data_cal,
# and data_cal is only merged (into data_united) further down the notebook --
# confirm data_water really carries this column here, otherwise this line
# raises KeyError.
data_water['diff_ro_cal'] = (data_water['weir_level_cal'] - data_water['raw_ro'])
# data_water['rain_diff']

In [None]:
data_water.sample(10)

## Soil

Pivot the soil data so that each sample has its own column, with separate columns for each depth.

In [None]:
# Pivot wider once per depth: one column per sample, holding that depth's
# moisture value. Selecting `values` already ignores the other depth's column.
soil_by_depth = {
    depth: united_soil.pivot(columns='sample', values=f'h2o_by_wet_{depth}')
    for depth in ('shallow', 'deep')
}

# Combine side by side on the shared index; the suffixes tell the two depths
# apart for sample names present in both tables.
data_soil = pd.merge(
    soil_by_depth['shallow'],
    soil_by_depth['deep'],
    how = "outer",
    left_index = True,
    right_index = True,
    suffixes = ("_shallow", "_deep")
)

# Clean up environment
del soil_by_depth

data_soil.sample(10)

In [None]:
# soil_mini_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
# soil_mini_shallow = soil_mini_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# soil_mini_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)
# soil_mini_deep = soil_mini_deep.pivot(columns='sample', values='h2o_by_wet_deep')

# soil_mini = pd.merge(
#     soil_mini_shallow,
#     soil_mini_deep,
#     left_index=True,
#     right_index=True,
#     # soil_mini_shallow.reset_index(),
#     # soil_mini_deep.reset_index(),
#     # on = ["date", "sample"],
#     suffixes = ("_shallow", "_deep"),
#     how = "outer"
#     )

# soil_mini.head()

## Unite

In [None]:
# data_u_test = pd.merge(
#     data_water,
#     data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
#     left_index = True,
#     right_index = True,
#     how = 'outer'
# )

# data_u_test

In [None]:
# # def since_feat(input_df, input_col):
# #     output_df = input_df.copy()
# #     # Create index of instances where there is a data point
# #     instances = output_df[input_col].notna()
# #     # Create groupings based on most recent instance
# #     group_id = instances.cumsum()
# #     # Create new column to count number of records since the point
# #     # which resets to 0 at each new point
# #     output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
# #     return output_df

# cols_soil = [col for col in data_u_test.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_u_test[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test[cols_soil].notna().cumsum().max(axis=1)

In [None]:
# data_united = pd.merge(
#     data_water,
#     # REMOVE LATER
#     # data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
#     data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
#     # data_soil,
#     #
#     left_index = True,
#     right_index = True,
#     how = 'outer'
# )

# Outer-join the water features with the calibration readings and then the
# soil samples on the shared index, keeping every timestamp found in any table
data_united = (
    data_water
    .merge(data_cal, left_index = True, right_index = True, how = 'outer')
    .merge(data_soil, left_index = True, right_index = True, how = 'outer')
)

# Track how long ago (in minutes) the most recent calibration was recorded
data_united = minsince_feat(data_united, 'weir_level_cal')

In [None]:
# for col in data_united.columns:
#     if (col.endswith('shallow') | col.endswith('deep')):
#         data_united = minsince_feat(data_united, col)

In [None]:
# # Create feature to track soil value staleness
# cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_united[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_united["since_soil"] = data_united.groupby(soil_group_id).cumcount()

# del soil_instances, soil_group_id

In [None]:
# Soil columns are identified by their depth suffix
cols_soil = [col for col in data_united.columns if col.endswith(('shallow', 'deep'))]

# Staleness features: minutes since each soil column was last measured.
# This must run before the forward fill below, while the gaps are still NaN.
for col in cols_soil:
    data_united = minsince_feat(data_united, col)

# Extend each soil value forward until the next measurement
data_united[cols_soil] = data_united[cols_soil].ffill()

del cols_soil
data_united.sample(10)

In [None]:
# # Extend soil vals
# # cols_to_fill = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# # data_united[cols_to_fill] = data_united[cols_to_fill].ffill()
# data_united[cols_soil] = data_united[cols_soil].ffill()

# del cols_soil
# data_united.sample(10)

### Train/Test (80/20)

In [None]:
# # mini_xy = water_mini[['level_ro', 'raw_ro', 'chk_note_ro', 'source_ro']].dropna()
# mini_xy = water_mini.copy().drop('level_ro', axis=1).dropna()
# mini_y = mini_xy['obstruction_ro']
# # mini_x = mini_xy[['raw_ro', 'chk_note_ro', 'source_ro']]
# mini_x = mini_xy.drop('obstruction_ro', axis=1)
# mini_xy
# Target column, and the columns removed from the feature matrix
# (the target itself plus the other listed *_ro columns).
var_of_interest = 'obstruction_ro'
y_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']

# Keep only rows where the target is known. dropna() already returns a new
# frame, so data_united is left untouched without copying it first.
data_filtered = data_united.dropna(subset=[var_of_interest])

united_y = data_filtered[var_of_interest]
united_x = data_filtered.drop(columns=y_drops)

del data_filtered

# united_x.info()

# united_x.info()

In [None]:
# Preview the row counts a 20/80 split of the filtered data would give,
# then print the two rounded counts added together.
print(f"Test:\t20p of {len(united_y)} is {round(.2*len(united_y))}")
print(f"Train:\t80p of {len(united_y)} is {round(.8*len(united_y))}")
print(sum(round(share*len(united_y)) for share in (.2, .8)))

# mini_x.index[1]

In [None]:
# shuffle=False keeps the existing row order: the first 80% of rows become the
# training set and the remaining 20% the test set.
x_train, x_test, y_train, y_test = train_test_split(
    united_x,
    united_y,
    test_size=0.2,
    shuffle=False,
)

# Report the size and index span of each partition
print(
    f"Train:\t {len(x_train)} \t {x_train.index[0]} thru {x_train.index[-1]} "
    f"\nTest:\t {len(x_test)} \t\t {x_test.index[0]} thru {x_test.index[-1]}"
)

### Sliding Window

In [None]:
# Time-ordered cross-validation splitter with 20 splits, reused by the cells below;
# printed to show its configuration.
tscv = TimeSeriesSplit(n_splits=20)
print(tscv)

In [None]:
# print(tscv)
# Show the positional train / validation indices that each fold of the
# training set receives.
for i, (train_index, val_index) in enumerate(tscv.split(x_train)):
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={val_index}",
        "--------------------------------------------------",
        sep="\n",
    )

# Environment cleanup
del i, train_index, val_index

In [None]:
# val_tracker = y_train.copy().to_frame()
# val_tracker['pred'] = .5
# val_tracker.head()

In [None]:
# # preds
# y_t = y_t.to_frame()
# y_t['preds'] = preds
# pd.concat(y_t)

## Model

In [None]:
# tscv = TimeSeriesSplit(n_splits=15)
# val_tracker = y_train.copy()
# val_tracker['pred'] = .5
# val_tracker = y_train.copy().to_frame()
# Fit one classifier per TimeSeriesSplit fold of the training set.
#   val_tracker: actual vs. predicted value for every validated row
#   win_tracker: one row of scores per evaluated fold
val_frames = []  # per-fold prediction frames, concatenated once after the loop
win_tracker = pd.DataFrame(columns=["fold", "mse", "rmse", "f1", "acc"])

for i, (train_index, val_index) in enumerate(tscv.split(x_train)):
    x_t, X_val = x_train.iloc[train_index], x_train.iloc[val_index]
    y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    # The class ratio below needs both classes present in the training labels
    if len(y_t.unique()) != 2:
        print("Skipping fold", i)
        continue

    class_counts = y_t.value_counts()
    model = xgb.XGBClassifier(
        tree_method="hist",
        # learning_rate=0.1,
        # max_depth=5,
        random_state=42,
        # handle class imbalance -- sum(negative instances) / sum(positive instances)
        scale_pos_weight=(class_counts[False] / class_counts[True]).item(),
    )
    model.fit(x_t, y_t)
    preds = model.predict(X_val)

    # Keep the fold's labels next to the raw and boolean predictions
    y_val_out = y_val.copy().to_frame()
    y_val_out['pred'] = preds
    y_val_out['pred_tf'] = np.where(y_val_out['pred'] == 1, True, False)
    val_frames.append(y_val_out)

    # Fold scores
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)
    f1 = f1_score(y_val_out[var_of_interest].tolist(), y_val_out['pred_tf'].tolist())
    accuracy = accuracy_score(y_val_out[var_of_interest].tolist(), y_val_out['pred_tf'].tolist())
    print(i, "\tMSE:", round(mse, 4), "\tRMSE:", round(rmse, 4), "\tF1:", round(f1, 4), "\tAcc:", round(accuracy, 4))
    win_tracker.loc[len(win_tracker)] = {"fold": i, "mse": mse, "rmse": rmse, "f1": f1, "acc": accuracy}

# A single concat avoids re-copying the accumulated rows on every fold and avoids
# concatenating onto an empty placeholder frame.
val_tracker = pd.concat(val_frames) if val_frames else pd.DataFrame()

# Environment cleanup (y_val_out, train_index and val_index stay bound, as before)
del i, x_t, X_val, y_t, y_val, class_counts, model, preds, mse, rmse, f1, accuracy, val_frames

In [None]:
# Peek at the first rows of the collected per-row validation predictions
val_tracker.head()

In [None]:
# Per-fold score table, rounded to 4 decimal places for display
print(win_tracker.round(4))

In [None]:
# val_tracker.head()
# y_val.to_list()
# y_val_out['obstruction_ro']
# f1_score(y_val_out['obstruction_ro'].tolist(), y_val_out['pred_tf'].tolist())
# f1_score(y_val_out['obstruction_ro'], y_val_out['pred_tf'])
# y_val_out['pred_tf'].tolist()

In [None]:
# Refit on the final fold only; x_t / X_val / y_t / y_val, the model and its
# predictions stay in the namespace for the inspection cells that follow.
train_index, val_index = list(tscv.split(x_train))[-1]

x_t = x_train.iloc[train_index]
X_val = x_train.iloc[val_index]
y_t = y_train.iloc[train_index]
y_val = y_train.iloc[val_index]

model = xgb.XGBClassifier(
    tree_method="hist",
    random_state=42,
    # class imbalance weight: negative count / positive count in the training labels
    scale_pos_weight=(y_t.value_counts()[False] / y_t.value_counts()[True]).item(),
)
model.fit(x_t, y_t)

preds = model.predict(X_val)
mse = mean_squared_error(y_val, preds)
print("Validation MSE:", mse, "\tRMSE:", np.sqrt(mse))

# Environment cleanup
del train_index, val_index

In [None]:
# f1_score(y_val, preds)
# preds[1]
# y_val_out2 = pd.DataFrame()
# Last fold: labels alongside the raw prediction and its boolean form,
# followed by the fold's F1 score as the cell output.
y_val_out2 = y_val.copy().to_frame().assign(
    pred=preds,
    pred_tf=lambda frame: np.where(frame['pred'] == 1, True, False),
)
f1_score(y_val_out2[var_of_interest].tolist(), y_val_out2['pred_tf'].tolist())

# del y_val_out2

In [None]:
# Last fold's validation labels with their predictions.
# reset_index() moves the row index into a regular 'index' column; it is kept as
# a column because the plotting cells and plot_preds() read mini_val['index'].
# (The earlier bare `mini_val.set_index('index')` discarded its result, so it
# never changed mini_val and has been removed.)
mini_val = y_val.reset_index()
mini_val['pred'] = preds
mini_val['pred_tf'] = np.where(mini_val['pred'] == 1, True, False)
mini_val.head()

In [None]:
# Tick-mark strip: actual values in blue, predictions in orange shifted down
# by 0.06 so the two rows of ticks do not overlap.
fig, ax = plt.subplots(figsize=(20, 1.5))
tick_style = {"s": 25, "marker": "|"}
ax.scatter(mini_val['index'], mini_val[var_of_interest], color='blue', **tick_style)
ax.scatter(mini_val['index'], mini_val['pred_tf'] - .06, color='orange', **tick_style)

plt.show()

# Environment cleanup
del fig, ax, tick_style

In [None]:
def plot_preds(input_date_start, input_date_end, include_preds=True, include_calibration=True):
    """Plot values between two dates in the style of the Visual FoxPro interface.

    Reads the notebook-level ``data_united`` frame and, when ``include_preds``
    is set, the notebook-level ``mini_val`` frame and ``var_of_interest`` name.

    Args:
        input_date_start (Timestamp or str): The start date (inclusive slice bound).
        input_date_end (Timestamp or str): The end date (inclusive slice bound).
        include_preds (boolean): Overlay tick marks for the actual (blue) and
            predicted (orange) values held in ``mini_val``.
        include_calibration (boolean): Include X-markers for the calibration points.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Slice the window directly; nothing below modifies the data, so no copy is needed
    data_subset = data_united.loc[input_date_start:input_date_end]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axhline(y=0, color='grey', linestyle=':')
    # Plot the rain as a bar chart with a multiplier for visibility
    ax.vlines(data_subset.index, ymin=0, ymax=data_subset['ra_rain']*3, color='blue', label="Rain (x3)")
    ax.plot(data_subset.index, data_subset['level_ro'], color='red', label="Adjusted")
    ax.plot(data_subset.index, data_subset['raw_ro'], color='green', label="Raw")

    # Include calibration points unless otherwise specified or unless there are
    # none in the subset. The column is NaN wherever no calibration was taken, so
    # test for actual values: Series.empty is only True for a zero-length series.
    if include_calibration and data_subset['weir_level_cal'].notna().any():
        ax.plot(data_subset.index, data_subset['weir_level_cal'], linestyle='none', marker='x', color='red', label="Calibration")

    if include_preds:
        mini_val_subset = mini_val.set_index('index').loc[input_date_start:input_date_end]
        # The offsets place both tick rows below the zero line:
        # actual values at -30 / -20, predictions at -50 / -40
        ax.scatter(mini_val_subset.index, (mini_val_subset[var_of_interest]-3)*10, color='blue', marker="|")
        ax.scatter(mini_val_subset.index, (mini_val_subset['pred_tf']-5)*10, color='orange', marker="|")

    # Plot labels
    ax.set_xlabel("Date (YYYY-MM-DD)")
    ax.set_ylabel("Level (mm)")
    ax.set_title("Runoff time series from " + str(input_date_start) + " through " + str(input_date_end))
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    # Reverse the order of the legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc='upper right')
    plt.show()

In [None]:
# Draw one window with plot_preds; the commented calls are alternative windows
# and scratch checks kept for reference.
# plot_preds(X_val.index[0], X_val.index[-1])
# plot_preds('2016-08-01 00:00:00', '2016-08-09 00:00:00')
# plot_preds('2011-01-16 14:00:00', '2011-01-16 22:00:00')
# plot_preds(max(mini_val['index']))
# print(min(mini_val['index']), max(mini_val['index']))
plot_preds('2011-06-01 00:00:00', '2011-06-15 23:59:59')

In [None]:
# data_subset = data_united[X_val.index[0]:X_val.index[-1]]

# fig, ax = plt.subplots(figsize=(10, 6))
# plt.axhline(y=0, color ='grey', linestyle = ':')
# # Plot the rain as a bar chart with a multiplier for visibility
# ax.vlines(data_subset.index, ymin=0, ymax=data_subset['ra_rain']*3, color = 'blue', label = "Rain (x3)")
# ax.plot(data_subset.index, data_subset['level_ro'], color = 'red', label = "Adjusted")
# ax.plot(data_subset.index, data_subset['raw_ro'], color = 'green', label = "Raw")
# # Include calibration points unless otherwise specified or unless there are none in the subset
# # if include_calibration == True and not data_subset_cal.empty:
# ax.plot(data_subset.index, data_subset['weir_level_cal'], linestyle='none', marker='x', color='red', label = "Calibration")

# # Plot labels
# ax.set_xlabel("Date (YYYY-MM-DD)")
# ax.set_ylabel("Level (mm)")
# # ax.set_title('Simple Time Series Plot')
# # ax.set_title("Runoff time series from " + input_date_start + " through " + input_date_end)
# # ax.set_ylim(bottom=0) 
# # ax.grid(True)
# plt.xticks(rotation=45, ha='right')
# plt.tight_layout()
# # Reverse the order of the legend
# handles, labels = ax.get_legend_handles_labels()
# ax.legend(handles[::-1], labels[::-1], loc='upper right')
# # plt.legend(loc = 'upper right')
# plt.show()

# del data_subset, fig, ax, handles, labels

In [None]:
# f1_score, precision_score, recall_score, accuracy_score
# Score the predictions held in mini_val
y_true = mini_val[var_of_interest].tolist()
y_pred = mini_val['pred_tf'].tolist()

# Compute the confusion matrix
metric_cm = confusion_matrix(y_true, y_pred)

# Precision, recall, F1 and accuracy (accuracy = share of predictions the
# model got right), evaluated in that order
metric_precision, metric_recall, metric_f1, metric_accuracy = (
    scorer(y_true, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

# Label / value pairs, printed as one space-separated sequence
report = [
    ("\nConfusion Matrix:\n", metric_cm),
    ("\nPrecision:\t", metric_precision),
    ("\nRecall:\t\t", metric_recall),
    ("\nF1 Score:\t", metric_f1),
    ("\nAccuracy:\t", metric_accuracy),
]
print(*(part for pair in report for part in pair))

# Display the confusion matrix
disp = ConfusionMatrixDisplay(
    confusion_matrix=metric_cm,
    display_labels=['Negative', 'Positive'],
)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Environment cleanup
del y_true, y_pred, metric_cm, metric_precision, metric_recall, metric_f1, metric_accuracy, disp, report