# Data Splitting

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 20 November 2025

In [1]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.\n'

## Setup

In [2]:
var_of_interest = "obstruction_ro"

### Packages

In [3]:
# General packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import randint, uniform
# # import matplotlib.ticker as ticker
# import matplotlib.dates as mdates
# import datetime as dt
# from datetime import date
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve, make_scorer

from skforecast.recursive import ForecasterRecursive
from skforecast.model_selection import backtesting_forecaster, TimeSeriesFold

# For saving models
import joblib

# For data importing
from helper_utils import get_path

In [4]:
## (Optional chunk)
# Current session information
import session_info
session_info.show(dependencies=False)

In [5]:
# To make it easier to tell when processes have completed
from playsound3 import playsound

In [6]:
# 3+3

# _ = playsound(get_path('completed.mp3', 'code'), block=False)

### Data

In [7]:
# united_water = pd.read_parquet('data/clean/water_nocal.parquet')
# data_cal = pd.read_parquet('data/clean/calibration.parquet')
# data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

# united_soil = pd.read_parquet('data/clean/soil.parquet')

united_water = pd.read_parquet(get_path('clean/water_nocal.parquet'))
united_soil = pd.read_parquet(get_path('clean/soil.parquet'))

# united_water = pd.read_parquet('data/clean/water_nocal.parquet')
data_cal = pd.read_parquet(get_path('clean/calibration.parquet'))
data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

# united_soil = pd.read_parquet('data/clean/soil.parquet')

### Cleanup

Small amount of data wrangling for memory improvements (some as a consequence of importing).

#### Memory improvements

In [8]:
# ## Reduce size
# def mod_float(input_df):
#     output_df = input_df#.copy()
#     for col in output_df.select_dtypes(include = [np.float64, 'Float64']).columns:
#         # print(col)
#         output_df[col] = output_df[col].astype(np.float32)
#     return output_df

In [9]:
# ## Convert T/F
# def mod_obj(input_df):
#     output_df = input_df#.copy()
#     for col in output_df.select_dtypes(include=['object']).columns:
#         u_vals = output_df[col].unique()
#         u_vals_nonna = [u for u in u_vals if pd.notna(u)]
#         isboolean = all(isinstance(u_n, bool) for u_n in u_vals_nonna)
#         if isboolean and len(u_vals_nonna) <= 2:
#             output_df[col] = output_df[col].astype('boolean')
#         # print(col, u_vals_nonna)
#     return output_df

In [10]:
# Select columns of interest
data_water = united_water.drop(columns=['raw_rain', 'chk_note_rain', 'chk_fail_rain', 'chk_note_ro', 'chk_fail_ro', 'comment_ro', 'source_ro'])

# Cleanup
del united_water

# Remove duplicate entries
data_water = data_water.reset_index().drop_duplicates(keep='first').set_index('datetime')

In [11]:
water_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']
water_drops.remove(var_of_interest)

data_water = data_water.drop(water_drops, axis=1)

del water_drops

data_water.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3581782 entries, 1989-06-21 13:00:00 to 2025-08-01 13:00:00
Data columns (total 3 columns):
 #   Column          Dtype  
---  ------          -----  
 0   ra_rain         float32
 1   raw_ro          float32
 2   obstruction_ro  boolean
dtypes: boolean(1), float32(2)
memory usage: 61.5 MB


In [12]:
united_soil['sample'] = united_soil['sample'].astype('category')

In [13]:
# data_water = mod_float(data_water)
# united_soil = mod_float(united_soil)

# data_water = mod_obj(data_water)

# data_cal['weir_level_cal'] = data_cal['weir_level_cal'].astype(np.float32)

## Prepare

In [14]:
# data_water['1999-12-31 23:00:00':'2000-01-13 00:00:00']
# Missing, inclusive:
# 1999-12-31 23:25:00
# 2000-01-11 14:05:00
# united_water[united_water['level_ro'].isna()]


In [15]:
### Note ###
# REMOVE this later -- just a smaller subset for feature engineering testing!!!
# temp_subset_start = '2000-01-01 00:00:00'
temp_subset_start = '2001-02-01 00:00:00'
temp_subset_end = '2011-12-31 23:59:59'
# data_water = data_water['2015-01-01 00:00:00':'2016-12-31 23:59:59']
data_water = data_water[temp_subset_start:temp_subset_end]
######

In [16]:
# del united_water
# del mod_float, mod_obj

## Feature Engineering

### Distance from Event

In [17]:
# def since_feat(input_df, input_col):
#     output_df = input_df#.copy()
#     # Create index of instances where there is a data point
#     instances = output_df[input_col].notna()
#     # Create groupings based on most recent instance
#     group_id = instances.cumsum()
#     # Exclude the first grouping
#     # otherwise it assumes there was an event just prior to the first entry
#     group_id = group_id.replace(0, np.nan)
#     # Create new column to count number of records since the point
#     # which resets to 0 at each new point
#     output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
#     return output_df

In [18]:
# def minsince_feat(input_df, input_col):
#     output_df = input_df#.copy()#[input_col].to_frame()
#     instances = output_df[input_col].notna()
#     # Create groupings based on most recent instance
#     group_id = instances.cumsum()
#     # Exclude the first grouping
#     # otherwise it assumes there was an event just prior to the first entry
#     group_id = group_id.replace(0, np.nan)
#     # Create new column to count the distance in minutes since the point
#     # which resets to 0 at each new point
#     output_df['timestamp'] = pd.to_datetime(output_df.index)
#     # Get start timestamp of the group
#     output_df['ts_start'] = output_df.groupby(group_id)['timestamp'].transform('min')
#     # Calculate the distance
#     output_df[f"minsince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds()/60
#     # Remove extra cols
#     output_df = output_df.drop(columns=['timestamp', 'ts_start'])
#     return output_df

In [19]:
# def daysince_feat(input_df, input_col):
#     output_df = input_df#.copy()#[input_col].to_frame()
#     instances = output_df[input_col].notna()
#     # Create groupings based on most recent instance
#     group_id = instances.cumsum()
#     # Exclude the first grouping
#     # otherwise it assumes there was an event just prior to the first entry
#     group_id = group_id.replace(0, np.nan)
#     # Create new column to count the distance in days since the point
#     # which resets to 0 at each new point
#     output_df['timestamp'] = pd.to_datetime(output_df.index)
#     # Get start timestamp of the group
#     output_df['ts_start'] = output_df.groupby(group_id)['timestamp'].transform('min')
#     # Calculate the distance
#     # output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds()/(24 * 60 * 60)
#     output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.days
#     # Remove extra cols
#     output_df = output_df.drop(columns=['timestamp', 'ts_start'])
#     return output_df

In [20]:
def timesince_feat(input_df, input_col, input_unit):
    # output_df = input_df.copy()#[input_col].to_frame()
    output_df = input_df
    instances = output_df[input_col].notna()
    # Create groupings based on most recent instance
    group_id = instances.cumsum()
    # Exclude the first grouping
    # otherwise it assumes there was an event just prior to the first entry
    group_id = group_id.replace(0, np.nan)
    # Create new column to count the distance in days since the point
    # which resets to 0 at each new point
    output_df['timestamp'] = pd.to_datetime(output_df.index)
    # Get start timestamp of the group
    output_df['ts_start'] = output_df.groupby(group_id)['timestamp'].transform('min')
    # Calculate the distance
    if input_unit == "minutes":
        output_df[f"minsince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds().div(60).astype('Int32')
        # output_df[f"minsince_{input_col}"] = output_df[f"minsince_{input_col}"].astype(np.float32)
    elif input_unit == "days":
        output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.days.astype('Int32')
        # output_df[f"minsince_{input_col}"] = output_df[f"minsince_{input_col}"].astype(np.float32)
        # output_df[f"daysince_{input_col}"] = output_df[f"daysince_{input_col}"].astype('Int32')
    # Remove extra cols
    output_df = output_df.drop(columns=['timestamp', 'ts_start'])
    return output_df

In [21]:
# daysince_feat(data_mini, '1_shallow')[['1_shallow', 'level_ro', 'daysince_1_shallow']]
# timesince_feat(data_mini, '1_shallow', "days")[['1_shallow', 'level_ro', 'daysince_1_shallow']]

In [22]:
def timeuntil_feat(input_df, input_col, input_unit):
    output_df = input_df#.copy()
    output_df['timestamp'] = output_df.index

    # find where the point occurred (not null)
    instances = output_df[input_col].notna()

    # Create groupings based on the next instance
    # bfill on cumsum to associate ea row with the group ending at the next event
    group_id = instances[::-1].cumsum()[::-1].replace(0, np.nan)

    # Get the end timestamp of the group
    output_df['ts_end'] = output_df.groupby(group_id)['timestamp'].transform('max')

    # Calculate the distance
    if input_unit == "minutes":
        output_df[f"minuntil_{input_col}"] = (output_df['ts_end'] - output_df['timestamp']).dt.total_seconds().div(60).astype('Int32')
        # output_df[f"minuntil_{input_col}"] = output_df[f"minuntil_{input_col}"].astype('Int32')
    elif input_unit == "days":
        output_df[f"dayuntil_{input_col}"] = (output_df['ts_end'] - output_df['timestamp']).dt.days.astype('Int32')
        # output_df[f"dayuntil_{input_col}"] = output_df[f"dayuntil_{input_col}"].astype('Int32')

    # Remove extra cols
    output_df = output_df.drop(columns=['timestamp', 'ts_end'])

    return output_df


In [23]:
# data_water_mini = data_water.copy()['2000-01-01 00:00:00':'2000-06-01 00:00:00'][['ra_rain', 'level_ro']]

# data_water_mini.head()

In [24]:
# timeuntil_feat(data_water_mini, 'ra_rain', 'minutes')

In [25]:
# # since_feat(data_water[['ra_rain', 'raw_ro']], 'ra_rain')
# # data_water[['ra_rain', 'raw_ro']]
# data_w_test = data_water.copy()[['ra_rain', 'raw_ro']]
# data_w_test['ra_rain'] = data_w_test['ra_rain'].replace(0, np.nan)

# data_w_test = since_feat(data_w_test, 'ra_rain')
# data_w_test.head()

In [26]:
# # Create index of instances where there is a calibration point
# cal_instances = data_water['weir_level_cal'].notna()
# # Create groupings based on most recent instance
# cal_group_id = cal_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_cal'] = data_water.groupby(cal_group_id).cumcount()

# # Clean up environment
# del cal_instances, cal_group_id

# # data_water

# data_water = since_feat(data_water, 'weir_level_cal')

## HERE
# data_water = minsince_feat(data_water, 'weir_level_cal')

#### Rain
Create feature which tracks how recent a rain event occurred.

In [27]:
data_water = timesince_feat(data_water, 'ra_rain', "minutes")

In [28]:
# # Create index of instances where there is a calibration point
# rain_instances = data_water['ra_rain'].notna()
# # Create groupings based on most recent instance
# rain_group_id = rain_instances.cumsum()
# # Create new column to count number of records since the calibration point
# # which resets to 0 at each new calibration
# data_water['records_since_rain'] = data_water.groupby(rain_group_id).cumcount()

# # Clean up environment
# del rain_instances, rain_group_id

# # Replace NAs with 0
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# data_water.sample(10)
# # data_water.dropna(subset='raw_ro')

# data_water = since_feat(data_water, 'ra_rain')
# data_water = minsince_feat(data_water, 'ra_rain')
# data_water.sample(10)

In [29]:
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)
# data_water.sample(10)

### Rain event

Keep track of cumulative rainfall during a specific event.

In [30]:
# Create index of instances where there is a data point
# rain_event = data_water['ra_rain'].isnull()
# rain_event = (data_water['ra_rain'].isnull() & ((data_water['minsince_ra_rain'] >= 5.0) & (data_water['minsince_ra_rain'] != 0)))
rain_event = (data_water['ra_rain'].isnull() & ((data_water['minsince_ra_rain'] >= 5.0) & (data_water['minsince_ra_rain'] != 0)))
# Create groupings based on most recent instance
rain_event_id = rain_event.cumsum()
# Create new column to count number of records since the point
# which resets to 0 at each new point
# del group_id, instances
# water_mini
# group_id = group_id.replace(0, np.nan)
# water_mini['since_ra_rain2'] = water_mini.groupby(group_id).cumcount()
# water_mini
# water_mini.info()
data_water['eventsum_ra_rain'] = data_water.groupby(rain_event_id)['ra_rain'].cumsum()

del rain_event, rain_event_id

In [31]:
# data_water[data_water['minsince_ra_rain'] > 0]
# data_water
# data_water[(data_water['minsince_weir_level_cal'] < 5) & (data_water['minsince_ra_rain'] != 0)]
# data_water[(data_water['minsince_weir_level_cal'] < 5) & (data_water['minsince_ra_rain'] == 2.0)]
# data_water[(data_water['minsince_weir_level_cal'] < 6) & (data_water['eventsum_ra_rain'].notnull())][['ra_rain', 'eventsum_ra_rain','minsince_ra_rain', 'minsince_weir_level_cal']]
# data_water['2008-10-15 00:00:00':'2008-11-04 10:30:00']
# data_water['2006-06-21 09:15:00':'2006-06-21 09:30:00']

In [32]:
# data_water[['ra_rain', 'since_ra_rain', 'rain_event_cumsum']]
# data_water
# 475

In [33]:
# water_mini = data_water.copy()[['weir_level_cal', 'ra_rain', 'raw_ro', 'since_weir_level_cal', 'since_ra_rain']]
# water_mini.sample(10)

In [34]:
# # water_mini = data_water.copy()[['ra_rain', 'level_ro']]
# water_mini = data_water.copy()['ra_rain'].to_frame()
# instances = water_mini['ra_rain'].notna()
# # Create groupings based on most recent instance
# group_id = instances.cumsum()
# # Exclude the first grouping
# # otherwise it assumes there was an event just prior to the first entry
# group_id = group_id.replace(0, np.nan)
# # Create new column to count number of records since the point
# # which resets to 0 at each new point
# # output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
# # group_id

# water_mini['timestamp'] = pd.to_datetime(water_mini.index)
# water_mini['ts_start'] = water_mini.groupby(group_id)['timestamp'].transform('min')
# water_mini['ts_dist'] = (water_mini['timestamp'] - water_mini['ts_start']).dt.total_seconds()/60
# water_mini

# # water_mini = water_mini.reset_index()
# # water_mini['ts_start'] = water_mini.groupby(group_id)['datetime'].transform('min')
# # water_mini['ts_dist'] = (water_mini['datetime'] - water_mini['ts_start']).dt.total_seconds() / 60
# # water_mini.set_index('datetime')

In [35]:
# # Create index of instances where there is a data point
# instances = water_mini['ra_rain'].notna()
# # Create groupings based on most recent instance
# group_id = instances.cumsum()
# # Create new column to count number of records since the point
# # which resets to 0 at each new point
# # del group_id, instances
# # water_mini
# # group_id = group_id.replace(0, np.nan)
# # water_mini['since_ra_rain2'] = water_mini.groupby(group_id).cumcount()
# # water_mini
# # water_mini.info()
# water_mini['rain_event'] = water_mini.groupby(group_id)['ra_rain'].cumsum()

# # rain_null_mask = water_mini['ra_rain'].isnull()
# # rain_group_id = rain_null_mask.cumsum()
# # water_mini.groupby(rain_group_id)['ra_rain'].cumsum()
# # # rain_null_mask
# # # water_mini['rain_event_cumsum'] = water_mini.groupby(rain_group_id)['ra_rain'].cumsum()

# # # g_id_event = null_mask.cumsum()
# # # water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# # # del rain_null_mask, rain_group_id
# # # data_water['rain_event_sum'] = data_water.groupby(g_id_event)['ra_rain'].cumsum()

In [36]:
# # israin = water_mini['ra_rain'].notna()
# # israin_group_id = israin.cumsum()
# # # g_id
# # water_mini['since_rain_2'] = water_mini.groupby(israin_group_id).cumcount()
# water_mini['dec'] = np.exp(-0.1*water_mini['since_ra_rain'])
# water_mini['rain_fill'] = water_mini['rain_event_cumsum'].ffill()
# # data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# water_mini['rain_dec'] = (water_mini['rain_fill']*water_mini['dec'])
# # del israin, israin_group_id
# water_mini

### Decay

In [37]:
def decay_feat(input_df, input_col, input_dec_rate = -0.1):
    output_df = input_df#.copy()
    # output_df = since_feat(input_df = output_df, input_col = input_col)
    if f"minsince_{input_col}" not in output_df.columns:
        # output_df = minsince_feat(input_df = output_df, input_col = input_col)
        output_df = timesince_feat(input_df = output_df, input_col = input_col, input_unit = "minutes")
    
    output_df[f"decayrate{input_dec_rate}_{input_col}"] = np.exp(input_dec_rate * output_df[f"minsince_{input_col}"]).astype(np.float32)
    output_df[f"ffill_{input_col}"] = output_df[input_col].ffill()
    output_df[f"decay{input_dec_rate}_{input_col}"] = (output_df[f"ffill_{input_col}"] * output_df[f"decayrate{input_dec_rate}_{input_col}"])

    return output_df

# water_m = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]

# null_mask = water_m['ra_rain'].isnull()
# g_id_event = null_mask.cumsum()
# water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# is_rain = water_m['ra_rain'].notna()
# g_id = is_rain.cumsum()
# # g_id
# water_m['since_rain'] = water_m.groupby(g_id).cumcount()
# water_m['dec'] = np.exp(-0.1*water_m['since_rain'])
# water_m['rain_fill'] = water_m['r_event_sum'].ffill()
# # data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# water_m['rain_dec'] = (water_m['rain_fill']*water_m['dec'])

In [38]:
# water_mini = data_water.copy()
# water_mini['ra_rain'] = water_mini['ra_rain'].fillna(0)

In [39]:
# water_mini = data_water.copy()
# water_mini['ra_rain'] = water_mini['ra_rain'].fillna(0)
# water_mini = decay_feat(water_mini, 'eventsum_ra_rain')
# selected_columns = water_mini.columns[water_mini.columns.str.contains('ra_rain')]
# water_mini[selected_columns]

In [40]:
# Replace NAs in rain with 0
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# Apply decay function
data_water = decay_feat(data_water, 'eventsum_ra_rain')

# Drop extra column
# minutes since rain event will be the same as minutes since most recent rain
data_water = data_water.drop('minsince_eventsum_ra_rain', axis=1)

### Lag features

#### Consistent cols

Modify the rows to prevent inappropriate data shifts

In [41]:
# # data_mini_min = data_water.copy()['1996-11-01 00:00:00':'1997-01-31 23:59:59']
# # data_mini_min = data_water.copy()['1993-01-01 00:00:00':'1997-12-25 00:00:00']
# # data_water.head()
# data_mini_min = data_water.copy()

# print(len(data_mini_min))

In [42]:
# data_mini_min.sort_index()['2000-04-14 02:35:00':'2000-04-14 03:00:00']

In [43]:
# # full_range = pd.date_range(start = '1996-11-01 00:00:00', end = '1997-02-01 00:00:00', freq = '5min')
# # full_range.duplicated().any()
# # data_mini_min.reindex(full_range)
# # data_mini_min.drop_duplicates(keep='first').asfreq(freq='5min')['2000-04-14 02:35:00':'2000-04-14 03:00:00']


# # Assuming your df has a unique DatetimeIndex already
# # Ensure the index is sorted first (good practice for time series ops)
# # df = df.sort_index()

# # 1. Define the complete range (e.g., every minute)
# new_index = pd.date_range(start=data_mini_min.index.min(), 
#                           end=data_mini_min.index.max(), 
#                           freq='5min')

# # 2. Reindex to this full range
# # data_mini_min.drop_duplicates(keep='first').reindex(new_index)['2000-04-14 02:35:00':'2000-04-14 03:00:00']
# data_mini_min.reset_index().drop_duplicates(keep='first').set_index('datetime').reindex(new_index)['2000-04-14 02:35:00':'2000-04-14 03:00:00']


# # data_mini_min = data_mini_min.reset_index()


# # 3. Use ffill/bfill to fill the new NaNs
# # df_complete = df_complete.ffill() 


# # print(f"Index type: {type(data_mini_min.index)}")
# # print(f"Index dtype: {data_mini_min.index.dtype}")
# # print(f"Index has duplicates: {data_mini_min.index.duplicated().any()}")
# # print(f"Number of rows: {len(data_mini_min)}")
# # print(f"Number of unique index values: {len(data_mini_min.index.unique())}")

# # # data_mini_min.index.duplicated().any()

# # duplicated_index_mask = data_mini_min.index.duplicated(keep=False)

# # # Filter the DataFrame using the boolean mask
# # data_mini_min[duplicated_index_mask]

# # # type(data_mini_min.index)
# # # full_range

In [44]:
# data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

In [45]:
original_indices = data_water.index.copy()

new_index = pd.date_range(start = data_water.index.min(), 
                          end = data_water.index.max(), 
                          freq = '5min')

# Reindex
data_water = data_water.reindex(new_index)

# Cleanup
del new_index

# # Return
# data_water = data_water.loc[original_indices]
# del original_indices

In [46]:
# new_index = pd.date_range(start = data_water.index.min(), 
#                           end = data_water.index.max(), 
#                           freq = '5min')

# # 2. Reindex to this full range
# # data_mini_min.drop_duplicates(keep='first').reindex(new_index)['2000-04-14 02:35:00':'2000-04-14 03:00:00']
# data_water = data_water.reindex(new_index)

# del new_index

Get values from other recent time stamps.

In [47]:
def lag_feats(input_df, input_cols, input_lags):
    output_df = input_df#.copy()
    for col in input_cols:
        for lag in input_lags:
            output_df[f"{col}_lag{lag}"] = output_df[col].shift(lag)
    return output_df

In [48]:
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag2']]
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag24']]

# Columns to get temporal stats on
cols_to_shift = ['raw_ro', 'ra_rain']
# # data at 5-min increments -- lag to record values at 5m, 10m, 15m, 30m, 1h, and 2h prior
# lags_of_interest = [1, 2, 3, 6, 12, 24]
# data at 5-min increments -- lag to record values at 5m, 10m, 15m, 20m, 25m, 30m, 1h, 2h, 3h prior
lags_of_interest = [1, 2, 3, 4, 5, 6, 12, 24, 36]

data_water = lag_feats(data_water, cols_to_shift, lags_of_interest)

# data_water.sample(10)

#### Risky

In [49]:
# ## risky?
# var_of_interest = 'obstruction_ro'
# data_water = lag_feats(data_water, [var_of_interest], [1])
# # data_water = lag_feats(data_water, [var_of_interest], lags_of_interest)
# ##

### Rolling stats

Get stat values from range of recent time stamps.

In [50]:
# def rolling_feats(input_df, input_cols, input_windows):
#     output_df = input_df.copy()
#     for col in input_cols:
#         for window in input_windows:
#             output_df[f"{col}_rollmean_{window}"] = output_df[col].rolling(window).mean()
#             output_df[f"{col}_rollstd_{window}"] = output_df[col].rolling(window).std()
#             output_df[f"{col}_rollslope_{window}"] = (output_df[col].rolling(window).apply(lambda x: np.polyfit(range(len(x)), x, 1)[0], raw=True))
#     return output_df

In [51]:
# # data_water_mini = data_water['1990-01-01 00:00:00':'1990-01-30 23:59:59']
# # rolling_feats(data_water_mini, cols_to_lag, [6, 12, 36])
# # 10m, 30m, 1h, 6h
# windows_of_interest = [2, 6, 12, 72]


# # windows_of_interest = [2, 6, 12]
# # cols_to_shift = ['raw_ro', 'ra_rain']
# # data_water_slow = rolling_feats(data_water['2000-01-01 00:00:00':'2000-12-31 23:59:59'], cols_to_shift, windows_of_interest)

# # data_water_slow.sample(10)
# data_water = rolling_feats(data_water, cols_to_shift, windows_of_interest)

# # data_water.sample(10)

In [52]:
def rolling_feats(input_df, input_cols, input_windows, input_mtype = "mean"):
    output_df = input_df#.copy()
    
    # Create a dummy series of index values (0, 1, 2, ... N) once
    # 'x' represents the position within the dataframe for the regression calculation
    x_series = pd.Series(np.arange(len(output_df)), index=output_df.index)
    
    for col in input_cols:
        for window in input_windows:
            # 1. Calculate Mean and Std Dev (already fast)
            if input_mtype == "mean":
                output_df[f"{col}_rollmean_{window}"] = output_df[col].rolling(window).mean().astype(np.float32)
            elif input_mtype == "sum":
                output_df[f"{col}_rollsum_{window}"] = output_df[col].rolling(window).sum().astype(np.float32)
            elif input_mtype == "both":
                output_df[f"{col}_rollmean_{window}"] = output_df[col].rolling(window).mean().astype(np.float32)
                output_df[f"{col}_rollsum_{window}"] = output_df[col].rolling(window).sum().astype(np.float32)
            output_df[f"{col}_rollstd_{window}"] = output_df[col].rolling(window).std().astype(np.float32)

            # 2. Calculate Slope using vectorized operations (Fast)
            # Slope = Cov(Y, X) / Var(X)
            
            # Calculate Covariance of Y (your data) vs X (the index position)
            rolling_cov = output_df[col].rolling(window).cov(x_series)
            
            # Calculate Variance of X (index position)
            rolling_var_x = x_series.rolling(window).var()
            
            # The slope is Cov(X, Y) / Var(X)
            output_df[f"{col}_rollslope_{window}"] = (rolling_cov / rolling_var_x).astype(np.float32)
            
            # Note on edge cases: 
            # The initial 'window-1' values for rolling_var_x will be NaN/incorrect. 
            # Pandas automatically handles alignment, so the division result will also be NaN where appropriate.
            # This method works very well for standard time series analysis.
    # output_df = mod_float(output_df)
    return output_df

# cols_to_shift = ['raw_ro', 'ra_rain']
# # # 10m, 30m, 1h, 6h
# windows_of_interest = [2, 6, 12]
# data_owater = optimized_rolling_feats_vectorized(data_water['2000-01-01 00:00:00':'2000-12-31 23:59:59'], cols_to_shift, windows_of_interest)
# data_owater

# optimized_rolling_feats_vectorized(data_water['2000-01-01 00:00:00':'2000-12-31 23:59:59'], cols_to_shift, windows_of_interest)

In [53]:
# cols_to_shift = ['raw_ro', 'ra_rain']
# Inclusive of current point--
# 10m, 15m, 20m, 25m, 30m, 1h, 3h, 6h, 12h, 24h
windows_of_interest = [2, 3, 4, 5, 6, 12, 36, 72, 144, 288]
# data_water = rolling_feats(data_water, cols_to_shift, windows_of_interest)

# data_water = rolling_feats(data_water, ['raw_ro'], windows_of_interest, "mean")
data_water = rolling_feats(data_water, ['raw_ro'], windows_of_interest, "both")
data_water = rolling_feats(data_water, ['ra_rain'], windows_of_interest, "sum")

Change since last value

In [54]:
data_water['raw_ro_change'] = data_water['raw_ro'].diff()

# cal_na_mask = data_water['weir_level_cal'].notna() & data_water['raw_ro'].notna()
# # cal_na_mask
# (data_water['weir_level_cal'] - data_water['raw_ro']).dropna()
# del cal_na_mask
# data_water['diff_ro_cal'] = (data_water['weir_level_cal'] - data_water['raw_ro'])
# data_water['rain_diff']

In [55]:
# Return
data_water = data_water.loc[original_indices]

del original_indices

## Soil

Pivot the soil data such that each sample has its own columns, and separated by depth.

In [56]:
# Drop irrelevant column
data_soil_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
data_soil_shallow['sample'] = data_soil_shallow['sample'].astype('float32')
# Pivot wider
data_soil_shallow = data_soil_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# Drop irrelevant column
data_soil_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)

data_soil_deep['sample'] = data_soil_deep['sample'].astype('float32')
# Pivot wider
data_soil_deep = data_soil_deep.pivot(columns='sample', values='h2o_by_wet_deep')

In [57]:
data_soil = pd.merge(
    data_soil_shallow,
    data_soil_deep,
    left_index = True,
    right_index = True,
    suffixes = ("_shallow", "_deep"),
    how = "outer"
)

del data_soil_shallow, data_soil_deep
del united_soil

In [58]:
# soil_mini_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
# soil_mini_shallow = soil_mini_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# soil_mini_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)
# soil_mini_deep = soil_mini_deep.pivot(columns='sample', values='h2o_by_wet_deep')

# soil_mini = pd.merge(
#     soil_mini_shallow,
#     soil_mini_deep,
#     left_index=True,
#     right_index=True,
#     # soil_mini_shallow.reset_index(),
#     # soil_mini_deep.reset_index(),
#     # on = ["date", "sample"],
#     suffixes = ("_shallow", "_deep"),
#     how = "outer"
#     )

# soil_mini.head()

## Unite

In [59]:
# data_u_test = pd.merge(
#     data_water,
#     data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
#     left_index = True,
#     right_index = True,
#     how = 'outer'
# )

# data_u_test

In [60]:
# # def since_feat(input_df, input_col):
# #     output_df = input_df.copy()
# #     # Create index of instances where there is a data point
# #     instances = output_df[input_col].notna()
# #     # Create groupings based on most recent instance
# #     group_id = instances.cumsum()
# #     # Create new column to count number of records since the point
# #     # which resets to 0 at each new point
# #     output_df[f"since_{input_col}"] = output_df.groupby(group_id).cumcount()
# #     return output_df

# cols_soil = [col for col in data_u_test.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_u_test[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test["since_soil"] = data_u_test.groupby(soil_group_id).cumcount()
# # data_u_test[cols_soil].notna().cumsum().max(axis=1)

In [61]:
# data_united = pd.merge(
#     data_water,
#     # REMOVE LATER
#     # data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
#     data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
#     # data_soil,
#     #
#     left_index = True,
#     right_index = True,
#     how = 'outer'
# )

data_united = pd.merge(
    data_water,
    # REMOVE LATER
    # data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
    # data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
    # data_cal,
    # data_cal['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
    data_cal[temp_subset_start:temp_subset_end],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

data_united = pd.merge(
    data_united,
    # REMOVE LATER
    # data_soil['2015-01-01 00:00:00':'2016-12-31 23:59:59'],
    # data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
    # data_soil,
    # data_soil['2000-01-01 00:00:00':'2015-12-31 23:59:59'],
    data_soil[temp_subset_start:temp_subset_end],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

# data_united['diff_ro_cal'] = (data_united['weir_level_cal'] - data_united['raw_ro'])
# data_united = minsince_feat(data_united, 'weir_level_cal')

### United features

In [62]:
data_united['diff_ro_cal'] = (data_united['weir_level_cal'] - data_united['raw_ro'])
data_united['diff_ro_cal'] = data_united['diff_ro_cal'].astype(np.float32)
data_united = timesince_feat(data_united, 'weir_level_cal', "minutes")

In [63]:
# timeuntil_feat(data_united_m, 'weir_level_cal', "minutes")[['ra_rain', 'weir_level_cal', 'minuntil_weir_level_cal']]

### Temporal features
Modify temporal features to be based on sine and cosine transformations, which allows for the model to be based on the cyclical patterns of time rather than abrupt distances

(e.g., the raw values Day 365 of the year is 'far' from Day 001, but in reality they are very near)

In [64]:
# data_u_mini = data_united['2001-01-01 00:00:00':'2002-01-01 00:00:00']

In [65]:
def temporal_feat(input_df, input_unit):
    output_df = input_df
    if input_unit=='day':
        cycle_length = 365.25
        value = output_df.index.dayofyear
    elif input_unit=='month':
        cycle_length = 12
        value = output_df.index.month
    elif input_unit=='hour':
        cycle_length = 24
        value = output_df.index.hour
    elif input_unit=='minute':
        cycle_length = 60
        value = output_df.index.minute
    
    output_df[f'{input_unit}_sin'] = np.sin(2 * np.pi * value / cycle_length).astype(np.float32)
    output_df[f'{input_unit}_cos'] = np.cos(2 * np.pi * value / cycle_length).astype(np.float32)

    return output_df

In [66]:
# temporal_feat(data_united['2001-01-01 00:00:00':'2002-01-01 00:00:00'], 'day').info()

In [67]:
data_united = temporal_feat(data_united, 'minute')
data_united = temporal_feat(data_united, 'hour')
data_united = temporal_feat(data_united, 'day')
data_united = temporal_feat(data_united, 'month')

In [68]:
# for col in data_united.columns:
#     if (col.endswith('shallow') | col.endswith('deep')):
#         data_united = minsince_feat(data_united, col)

In [69]:
# # Create feature to track soil value staleness
# cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_united[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_united["since_soil"] = data_united.groupby(soil_group_id).cumcount()

# del soil_instances, soil_group_id

In [70]:
# create features to track soil value staleness
cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]

for col in cols_soil:
# for col in data_united.columns:
    # if (col.endswith('shallow') | col.endswith('deep')):
    # data_united = minsince_feat(data_united, col)
    data_united = timesince_feat(data_united, col, "days")

# Extend soil vals
data_united[cols_soil] = data_united[cols_soil].ffill()

# Cutoff
# cols_soil_days = [col for col in data_united.columns if (col.startswith('daysince_') & (col.endswith('_shallow') | col.endswith('_deep')))]
# data_united['daysince_soil'] = data_united[cols_soil_days].min(axis=1)


del col, cols_soil
# data_united.sample(10)

In [71]:
# data_united_m = data_united.copy()['2001-01-01 00:00:00':'2002-01-01 00:00:00']

# cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# shallow_cols = [col for col in data_united.columns if (col.endswith('_shallow') & col.startswith('daysince_'))]
# [col for col in data_united.columns if ((col.endswith('_shallow') | col.endswith('_deep')) & col.startswith('daysince_'))]
# print(shallow_cols)
# data_united['2002-01-01 00:00:00':'2003-01-01 00:00:00'][shallow_cols].min(axis=1)

In [72]:
# cols_soil_days = [col for col in data_united_m.columns if (col.startswith('daysince_') & (col.endswith('_shallow') | col.endswith('_deep')))]
# data_united_m['daysince_soil'] = data_united_m[cols_soil_days].min(axis=1)
# data_united_m.drop(cols_soil_days, axis=1)

In [73]:
# # Extend soil vals
# # cols_to_fill = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# # data_united[cols_to_fill] = data_united[cols_to_fill].ffill()
# data_united[cols_soil] = data_united[cols_soil].ffill()

# del cols_soil
# data_united.sample(10)

last param?

In [74]:
# data_u_mini = data_united[['raw_ro', 'weir_level_cal']]['2004-01-01 00:00:00':'2005-01-01 00:00:00'].copy()
# data_u_mini = timeuntil_feat(data_u_mini, 'weir_level_cal', 'minutes')
# data_u_mini = data_u_mini.dropna(subset='minuntil_weir_level_cal')

# split_date = '2004-06-15 12:30:30' # Define your cutoff date

# # Create independent copies using .copy()
# train_df = data_u_mini.loc[:split_date].copy()
# test_df = data_u_mini.loc[split_date:].copy()

# # The test_df index minimum confirms the exact moment of the split
# test_start_time = test_df.index.min()

# del data_u_mini, train_df, test_df, test_start_time, split_date


In [75]:
# test_df[test_df['minuntil_weir_level_cal'] == 0].index.min()

# if train_df.iloc[-1]['minuntil_weir_level_cal'] != 0:

    # print(train_df.iloc[-1])

## Train/Test split

80/20 initial split, with expanding sliding window for training/validation for hyperparameters, model stability, and feature selection.

In [76]:
# # Var to predict
# var_of_interest = 'obstruction_ro'

# # For if doing forward-interpretations
# # i_to_drop = data_united.loc[data_united[var_of_interest].isnull()].index.tolist()

# # Columns to be removed for training
# # Corrected RO value assumed not available, since raw RO is what is the largest indicator
# y_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']

# # Remove if doing forward-features
# X_all = data_united.dropna(subset=[var_of_interest]).drop(y_drops, axis=1).copy()
# y_all = data_united.dropna(subset=[var_of_interest])[var_of_interest].copy()

# # For if doing forward-interpretations
# i_to_drop = data_united.loc[data_united[var_of_interest].isnull()].index.tolist()
# # X_all = data_united.drop(y_drops, axis=1).copy()
# # y_all = data_united[var_of_interest].copy()

# REMOVE NAs
data_united = data_united.dropna(subset=[var_of_interest])

X_all = data_united.drop(var_of_interest, axis=1).copy()
y_all = data_united[var_of_interest].copy()

In [77]:
y_len = len(y_all)

print(
    y_len, "\n",
    (round(.2*y_len) + round(.8*y_len)),
    "\nTrain:\t80p of ", y_len, " is ", round(.8*y_len),
    "\nTest:\t20p of ", y_len, " is ", round(.2*y_len),
    sep=""
)

del y_len

1134443
1134443
Train:	80p of 1134443 is 907554
Test:	20p of 1134443 is 226889


Unlike the typical approach for train/test splits, temporal data in this context must _not_ be randomly split as it would lead to severe leakage.

In [78]:
# Conduct the split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, shuffle=False)

# Cleanup
del X_all, y_all

print(
    "Train:\t", len(X_train), "\t", X_train.index[0], "thru", X_train.index[-1],
    "\nTest:\t", len(X_test), "\t", X_test.index[0], "thru", X_test.index[-1]
    # len(x_train), len(x_test), "\n",
    # x_train.index[-1]
)

Train:	 907554 	 2001-02-01 00:00:00 thru 2009-10-26 18:05:00 
Test:	 226889 	 2009-10-26 18:10:00 thru 2011-12-31 23:55:00


### Sliding Window

In [79]:
# Initialize the split function
tscv = TimeSeriesSplit(n_splits=3)
# print(tscv)

for i, (train_index, val_index) in enumerate(tscv.split(X_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={val_index}")
    # print("  Train: index=", mini_x.index[train_index])
    # print(f"  Test:  index={val_index}")
    print("------------------------------------------------------------")

del i, train_index, val_index

Fold 0:
  Train: index=[     0      1      2 ... 226887 226888 226889]
  Test:  index=[226890 226891 226892 ... 453775 453776 453777]
------------------------------------------------------------
Fold 1:
  Train: index=[     0      1      2 ... 453775 453776 453777]
  Test:  index=[453778 453779 453780 ... 680663 680664 680665]
------------------------------------------------------------
Fold 2:
  Train: index=[     0      1      2 ... 680663 680664 680665]
  Test:  index=[680666 680667 680668 ... 907551 907552 907553]
------------------------------------------------------------


## Hyperparameter tuning

In [80]:
param_dist = {
    'n_estimators': randint(50, 500), # early stopping control the actual number
    'learning_rate': uniform(0.01, 0.29),
    'max_depth': randint(3, 7),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': [0, 0.1, 0.2]
}

xgb_model = xgb.XGBClassifier(
    early_stopping_rounds=50,
    objective='binary:logistic',
    tree_method='hist',
    n_jobs=-1,
    eval_metric='logloss',
    scale_pos_weight = (np.sum(y_train == 0) / np.sum(y_train == 1))
    # scale_pos_weight = (y_sub_train.value_counts()[False] / y_sub_train.value_counts()[True]).item()
)

# Randomized search with efficient settings
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    # n_iter=20,                    # Start with a small number of iterations (e.g., 20-40)
    n_iter=2,
    scoring='roc_auc',
    cv=tscv,                      # Use TimeSeriesSplit
    verbose=1,
    random_state=42,
    return_train_score=True
)

# When you run the fit, ensure you pass early stopping parameters:
# X_train, y_train, X_val, y_val = ... define your data ...
# eval_set = [(X_train, y_train), (X_val, y_val)]
#
# random_search.fit(
#     X_train, y_train,
#     early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
#     eval_set=eval_set,
#     verbose=False # Set to True if you want to watch the early stopping logs
# )


In [81]:
# When you run the fit, ensure you pass early stopping parameters:
X_train_two, X_val, y_train_two, y_val = train_test_split(X_train, y_train, test_size = 0.2, shuffle=False)
eval_set = [(X_train_two, y_train_two), (X_val, y_val)]

print("Starting hyperparameter tuning...")
# random_search.fit(X_sub_train, y_sub_train)
# X_train_two = X_train_two.drop(i_to_drop, errors='ignore')
# y_train_two = y_train_two.drop(i_to_drop, errors='ignore')
# X_val = X_val.drop(i_to_drop, errors='ignore')
# y_val = y_val.drop(i_to_drop, errors = 'ignore')

Starting hyperparameter tuning...


In [82]:

# Check main training labels
if y_train.isnull().any() or np.isinf(y_train).any():
    print("ERROR: y_train contains NaN or Inf values!")

# Check validation labels (if you are using an eval_set)
# Assuming y_val is the label portion of your validation set
if y_val.isnull().any() or np.isinf(y_val).any():
    print("ERROR: y_val contains NaN or Inf values!")


In [83]:
random_search.fit(
    X_train_two, y_train_two,
    # early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
    eval_set=eval_set,
    verbose=False # Set to True if you want to watch the early stopping logs
)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': <scipy.stats....t 0x11c164b90>, 'gamma': [0, 0.1, ...], 'learning_rate': <scipy.stats....t 0x11bec3a10>, 'max_depth': <scipy.stats....t 0x11c164a50>, ...}"
,n_iter,2
,scoring,'roc_auc'
,n_jobs,
,refit,True
,cv,TimeSeriesSpl...est_size=None)
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.749816047538945)
,device,
,early_stopping_rounds,50
,enable_categorical,False


In [84]:
# # Saving result
# import joblib

# filename = 'random_search_results.joblib'
# joblib.dump(random_search, filename) 

In [97]:
# Save
joblib.dump(random_search, get_path('models/random_search_mini_spw.joblib', 'outputs'))

# Make a chime to indicate completion
_ = playsound(get_path('completed.mp3', 'code'), block=False)

In [86]:
# Print the results
print("Best hyperparameters found:")
print(random_search.best_params_)
print(f"Best F1 Score (averaged across CV folds): {random_search.best_score_:.4f}")

Best hyperparameters found:
{'colsample_bytree': np.float64(0.749816047538945), 'gamma': 0, 'learning_rate': np.float64(0.0631960890611875), 'max_depth': 6, 'n_estimators': 238, 'subsample': np.float64(0.8387400631785948)}
Best F1 Score (averaged across CV folds): 0.6509


## Feature selection

In [87]:
best_model = random_search.best_estimator_
feature_importances = best_model.feature_importances_
# map scores to feature names
# feature_importances
feature_names = X_train.columns.tolist() 

feature_importance_df = pd.DataFrame({
    'feat': feature_names,
    'importance': feature_importances
})

# sort importance
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# print(feature_importance_df)
feature_importance_df

# most important features
# print(feature_importance_df.head(25)) 

Unnamed: 0,feat,importance
108,2.0_deep,0.066555
100,4.0_shallow,0.037670
97,1.0_shallow,0.037144
106,10.0_shallow,0.036348
63,raw_ro_rollstd_288,0.035889
...,...,...
140,daysince_4.0_deep,0.000000
142,daysince_6.0_deep,0.000000
141,daysince_5.0_deep,0.000000
143,daysince_7.0_deep,0.000000


In [88]:
# calculate most important 90 percent of the importance
feature_importance_df['cumulative_imp'] = feature_importance_df['importance'].cumsum()
features_90_percent = feature_importance_df[feature_importance_df['cumulative_imp'] <= 0.90].shape[0] + 1
features_90_percent

71

In [89]:
# print(feature_importance_df.tail(30))
feature_importance_df.tail(147-69)

Unnamed: 0,feat,importance,cumulative_imp
127,daysince_1.0_shallow,0.004791,0.899593
11,raw_ro_lag5,0.004469,0.904062
118,minsince_weir_level_cal,0.004439,0.908501
44,raw_ro_rollslope_6,0.004341,0.912842
15,raw_ro_lag36,0.004328,0.917170
...,...,...,...
140,daysince_4.0_deep,0.000000,1.000000
142,daysince_6.0_deep,0.000000,1.000000
141,daysince_5.0_deep,0.000000,1.000000
143,daysince_7.0_deep,0.000000,1.000000


# draft

In [90]:
best_params = random_search.best_params_

# final model w optimized params
final_optimized_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    # use_label_encoder=False,
    tree_method='hist',
    random_state=42,
    **best_params # unpack best params
)

final_optimized_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.749816047538945)
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# Make a chime to indicate completion
_ = playsound(get_path('completed.mp3', 'code'), block=False)

## Metrics

In [92]:
print("Predicting...")
y_pred = final_optimized_model.predict(X_test)
y_pred_proba = final_optimized_model.predict_proba(X_test)[:, 1]

# current fold info
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# # Find the optimal threshold
# Calculate F1-score for every possible threshold
fscores = (2 * precision * recall) / (precision + recall)

# Handle potential division by zero warnings
# if no positive predictions were made
fscores[np.isnan(fscores)] = 0 

# index of highest F1
optimal_idx = np.argmax(fscores)
best_threshold = thresholds[optimal_idx] # Note: thresholds array is one element shorter than P/R arrays

print(f"Best threshold: {best_threshold:.4f}")

# print(f"Fold {fold} Optimal Threshold (Max F1): {best_threshold:.4f}")


print("Getting metrics...")
print("F1\tAcc\tPre\tRec")
print(
    f1_score(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep ="\t"
)

# all_f1.append(fold_f1)
# all_accuracy.append(fold_accuracy)
# all_precision.append(fold_precision)
# all_recall.append(fold_recall)

Predicting...
Best threshold: 0.1565
Getting metrics...
F1	Acc	Pre	Rec
0.4676848807690707	0.8872488309261357	0.8245047688921496	0.32642035552457305


In [93]:
# final_f1 = f1_score(y_val, y_test_pred)
# final_accuracy = accuracy_score(y_val, y_test_pred)
# final_precision = precision_score(y_val, y_test_pred)
# final_recall = recall_score(y_val, y_test_pred)

# print(f"{fold}\tF1: {final_f1:.4f}\tAcc{final_accuracy:.4f}\tPrec{final_precision:.4f}\tRec: {final_recall:.4f}")

In [94]:
# print("i\tF1\tAcc\tPre\tRec")
# for i in range(len(all_f1)):
#     print(i, round(all_f1[i], 4), round(all_accuracy[i], 4), round(all_precision[i], 4), round(all_recall[i],4), sep="\t")