# Data Splitting and Modelling

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 25 November 2025

In [1]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.\n'

## Setup

In [2]:
var_of_interest = "obstruction_ro"

### Packages

In [3]:
# GPU Setup
%load_ext cudf.pandas
import pandas as pd
import cudf
import cupy as cp

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

from google.colab import userdata
gh_pat = userdata.get('gh_pat')
gh_repo = userdata.get('gh_repo')
repo_url = f'https://{gh_pat}@github.com/{gh_repo}'
!git clone {repo_url}

fatal: destination path 'info-698-capstone' already exists and is not an empty directory.


In [5]:
import os
if os.getcwd() == '/content':
    print("Changing wd...")
    os.chdir('info-698-capstone/code')

# # Verify the current working directory
print(f"Current working directory is: {os.getcwd()}")

Changing wd...
Current working directory is: /content/info-698-capstone/code


In [6]:
# General packages
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from scipy.stats import randint, uniform
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV, TunedThresholdClassifierCV
import xgboost as xgb
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve, make_scorer

# For saving models
import joblib

# For data importing and exporting
from helper_utils import get_path, model_path

In [7]:
## (Optional chunk)
# Current session information

# From StackOverflow,
# https://stackoverflow.com/a/62128239/23486987
try:
    import session_info
except:
    !pip install session_info
    import session_info
# !pip install session_info
# import session_info
session_info.show(dependencies=False)

In [8]:
# To make it easier to tell when processes have completed -- can delete later
# From StackOverflow,
# https://stackoverflow.com/a/62128239/23486987
try:
    from playsound3 import playsound
except:
    !pip install playsound3
    from playsound3 import playsound

In [61]:
# Set seed
np.random.seed(42)
cp.random.seed(42)

In [10]:
# Make sure GPU active
# !nvidia-smi
import torch
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Warning: GPU not found!")

Using GPU: NVIDIA A100-SXM4-40GB


### Data

In [11]:
# united_water = pd.read_parquet('data/clean/water_nocal.parquet')
# data_cal = pd.read_parquet('data/clean/calibration.parquet')
# data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

# united_soil = pd.read_parquet('data/clean/soil.parquet')

united_water = pd.read_parquet(get_path('clean/water_nocal.parquet'))
united_soil = pd.read_parquet(get_path('clean/soil.parquet'))

# united_water = pd.read_parquet('data/clean/water_nocal.parquet')
data_cal = pd.read_parquet(get_path('clean/calibration.parquet'))
data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

# united_soil = pd.read_parquet('data/clean/soil.parquet')

In [12]:
data_cal.info()

<class 'cudf.core.dataframe.DataFrame'>
DatetimeIndex: 6136 entries, 1994-01-03 08:46:00 to 2025-08-01 09:10:00
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   weir_level_cal  6136 non-null   int8
dtypes: int8(1)
memory usage: 53.9 KB


### Cleanup

Small amount of data wrangling for memory improvements (some as a consequence of importing).

#### Memory improvements

In [13]:
# Select columns of interest
data_water = united_water.drop(columns=['raw_rain', 'chk_note_rain', 'chk_fail_rain', 'chk_note_ro', 'chk_fail_ro', 'comment_ro', 'source_ro'])

# Cleanup
del united_water

# Remove duplicate entries
data_water = data_water.reset_index().drop_duplicates(keep='first').set_index('datetime')

In [14]:
water_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']
water_drops.remove(var_of_interest)

data_water = data_water.drop(water_drops, axis=1)

del water_drops

data_water.info(memory_usage="deep")

<class 'cudf.core.dataframe.DataFrame'>
DatetimeIndex: 3581782 entries, 1989-06-21 13:00:00 to 2025-08-01 13:00:00
Data columns (total 3 columns):
 #   Column          Dtype
---  ------          -----
 0   ra_rain         float32
 1   raw_ro          float32
 2   obstruction_ro  bool
dtypes: bool(1), float32(2)
memory usage: 59.4 MB


In [15]:
united_soil['sample'] = united_soil['sample'].astype('category')

## Prepare

In [16]:
### Note ###
# REMOVE this later -- just a smaller subset for feature engineering testing!!!
# temp_subset_start = '2000-01-01 00:00:00'
temp_subset_start = '2001-02-01 00:00:00'
temp_subset_end = '2011-12-31 23:59:59'
# data_water = data_water['2015-01-01 00:00:00':'2016-12-31 23:59:59']
data_water = data_water[temp_subset_start:temp_subset_end]
######

## Feature Engineering

### Distance from Event

In [19]:
def timesince_feat(input_df, input_col, input_unit):
    # output_df = input_df.copy()#[input_col].to_frame()
    output_df = input_df
    instances = output_df[input_col].notna()
    # Create groupings based on most recent instance
    group_id = instances.cumsum()
    # Exclude the first grouping
    # otherwise it assumes there was an event just prior to the first entry
    group_id = group_id.replace(0, np.nan)
    # Create new column to count the distance in days since the point
    # which resets to 0 at each new point
    output_df['timestamp'] = pd.to_datetime(output_df.index)
    # Get start timestamp of the group
    output_df['ts_start'] = output_df.groupby(group_id)['timestamp'].transform('min')
    # Calculate the distance
    if input_unit == "minutes":
        # output_df[f"minsince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds().div(60).astype('Int32')
        output_df[f"minsince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.total_seconds().div(60).astype(np.float32)
        # output_df[f"minsince_{input_col}"] = output_df[f"minsince_{input_col}"].astype(np.float32)
    elif input_unit == "days":
        # output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.days.astype('Int32')
        output_df[f"daysince_{input_col}"] = (output_df['timestamp'] - output_df['ts_start']).dt.days.astype(np.float32)
        # output_df[f"minsince_{input_col}"] = output_df[f"minsince_{input_col}"].astype(np.float32)
        # output_df[f"daysince_{input_col}"] = output_df[f"daysince_{input_col}"].astype('Int32')
    # Remove extra cols
    output_df = output_df.drop(columns=['timestamp', 'ts_start'])
    return output_df

#### Rain
Create feature which tracks how recent a rain event occurred.

In [20]:
data_water = timesince_feat(data_water, 'ra_rain', "minutes")

### Rain event

Keep track of cumulative rainfall during a specific event.

In [21]:
# Create index of instances where there is a data point
# rain_event = data_water['ra_rain'].isnull()
# rain_event = (data_water['ra_rain'].isnull() & ((data_water['minsince_ra_rain'] >= 5.0) & (data_water['minsince_ra_rain'] != 0)))
rain_event = (data_water['ra_rain'].isnull() & ((data_water['minsince_ra_rain'] >= 5.0) & (data_water['minsince_ra_rain'] != 0)))
# Create groupings based on most recent instance
rain_event_id = rain_event.cumsum()
# Create new column to count number of records since the point
# which resets to 0 at each new point
# del group_id, instances
# water_mini
# group_id = group_id.replace(0, np.nan)
# water_mini['since_ra_rain2'] = water_mini.groupby(group_id).cumcount()
# water_mini
# water_mini.info()
data_water['eventsum_ra_rain'] = data_water.groupby(rain_event_id)['ra_rain'].cumsum()

del rain_event, rain_event_id

### Decay

In [22]:
def decay_feat(input_df, input_col, input_dec_rate = -0.1):
    output_df = input_df#.copy()
    # output_df = since_feat(input_df = output_df, input_col = input_col)
    if f"minsince_{input_col}" not in output_df.columns:
        # output_df = minsince_feat(input_df = output_df, input_col = input_col)
        output_df = timesince_feat(input_df = output_df, input_col = input_col, input_unit = "minutes")
    # Update for GPU for overflow fix
    output_df[f"minsince_{input_col}"] = output_df[f"minsince_{input_col}"].astype(np.float64)

    output_df[f"decayrate{input_dec_rate}_{input_col}"] = np.exp(input_dec_rate * output_df[f"minsince_{input_col}"]).astype(np.float32)
    output_df[f"ffill_{input_col}"] = output_df[input_col].ffill()
    output_df[f"decay{input_dec_rate}_{input_col}"] = (output_df[f"ffill_{input_col}"] * output_df[f"decayrate{input_dec_rate}_{input_col}"])

    return output_df

# water_m = united_water[['raw_ro', 'level_ro', 'ra_rain', 'obstruction_ro']]

# null_mask = water_m['ra_rain'].isnull()
# g_id_event = null_mask.cumsum()
# water_m['r_event_sum'] = water_m.groupby(g_id_event)['ra_rain'].cumsum()

# is_rain = water_m['ra_rain'].notna()
# g_id = is_rain.cumsum()
# # g_id
# water_m['since_rain'] = water_m.groupby(g_id).cumcount()
# water_m['dec'] = np.exp(-0.1*water_m['since_rain'])
# water_m['rain_fill'] = water_m['r_event_sum'].ffill()
# # data_u['1_shallow_f'] = data_u['1_shallow'].ffill()
# water_m['rain_dec'] = (water_m['rain_fill']*water_m['dec'])

In [23]:
# Replace NAs in rain with 0
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# Apply decay function
data_water = decay_feat(data_water, 'eventsum_ra_rain')

# Drop extra column
# minutes since rain event will be the same as minutes since most recent rain
data_water = data_water.drop('minsince_eventsum_ra_rain', axis=1)

### Lag features

#### Consistent cols

Modify the rows to prevent inappropriate data shifts

In [24]:
original_indices = data_water.index.copy()

new_index = pd.date_range(start = data_water.index.min(),
                          end = data_water.index.max(),
                          freq = '5min')

# Reindex
data_water = data_water.reindex(new_index)

# Cleanup
del new_index

# # Return
# data_water = data_water.loc[original_indices]
# del original_indices

Get values from other recent time stamps.

In [25]:
def lag_feats(input_df, input_cols, input_lags):
    output_df = input_df#.copy()
    for col in input_cols:
        for lag in input_lags:
            output_df[f"{col}_lag{lag}"] = output_df[col].shift(lag)
    return output_df

In [26]:
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag2']]
# lag_feats(data_water, ['raw_ro'], [1, 2, 3, 24]).dropna(subset='raw_ro')[['raw_ro', 'raw_ro_lag1', 'raw_ro_lag24']]

# Columns to get temporal stats on
cols_to_shift = ['raw_ro', 'ra_rain']
# # data at 5-min increments -- lag to record values at 5m, 10m, 15m, 30m, 1h, and 2h prior
# lags_of_interest = [1, 2, 3, 6, 12, 24]
# data at 5-min increments -- lag to record values at 5m, 10m, 15m, 20m, 25m, 30m, 1h, 2h, 3h prior
lags_of_interest = [1, 2, 3, 4, 5, 6, 12, 24, 36]

data_water = lag_feats(data_water, cols_to_shift, lags_of_interest)

# data_water.sample(10)

### Rolling stats

Get stat values from range of recent time stamps.

In [27]:
def rolling_feats(input_df, input_cols, input_windows, input_mtype = "mean"):
    output_df = input_df#.copy()

    # Create a dummy series of index values (0, 1, 2, ... N) once
    # 'x' represents the position within the dataframe for the regression calculation
    x_series = pd.Series(np.arange(len(output_df)), index=output_df.index)

    for col in input_cols:
        for window in input_windows:
            # 1. Calculate mean and std
            if input_mtype == "mean":
                output_df[f"{col}_rollmean_{window}"] = output_df[col].rolling(window).mean().astype(np.float32)
            elif input_mtype == "sum":
                output_df[f"{col}_rollsum_{window}"] = output_df[col].rolling(window).sum().astype(np.float32)
            elif input_mtype == "both":
                output_df[f"{col}_rollmean_{window}"] = output_df[col].rolling(window).mean().astype(np.float32)
                output_df[f"{col}_rollsum_{window}"] = output_df[col].rolling(window).sum().astype(np.float32)
            output_df[f"{col}_rollstd_{window}"] = output_df[col].rolling(window).std().astype(np.float32)

            # 2. Calculate Slope using vectorized operations
            # Calculate Covariance of Y (data) vs X (index position)
            rolling_cov = output_df[col].rolling(window).cov(x_series)
            # Calculate Variance of X (index position)
            rolling_var_x = x_series.rolling(window).var()
            # Slope = Cov(Y, X) / Var(X)
            output_df[f"{col}_rollslope_{window}"] = (rolling_cov / rolling_var_x).astype(np.float32)

            # Note on edge cases:
            # The initial 'window-1' values for rolling_var_x will be NaN/incorrect.
            # Pandas automatically handles alignment, so the division result will also be NaN where appropriate.
            # This method works very well for standard time series analysis.
    return output_df

In [28]:
# Inclusive of current point--
# 10m, 15m, 20m, 25m, 30m, 1h, 3h, 6h, 12h, 24h
windows_of_interest = [2, 3, 4, 5, 6, 12, 36, 72, 144, 288]

# data_water = rolling_feats(data_water, ['raw_ro'], windows_of_interest, "mean")
data_water = rolling_feats(data_water, ['raw_ro'], windows_of_interest, "both")
data_water = rolling_feats(data_water, ['ra_rain'], windows_of_interest, "sum")

Change since last value

In [29]:
data_water['raw_ro_change'] = data_water['raw_ro'].diff()

# cal_na_mask = data_water['weir_level_cal'].notna() & data_water['raw_ro'].notna()
# # cal_na_mask
# (data_water['weir_level_cal'] - data_water['raw_ro']).dropna()
# del cal_na_mask
# data_water['diff_ro_cal'] = (data_water['weir_level_cal'] - data_water['raw_ro'])
# data_water['rain_diff']

In [30]:
# Return
data_water = data_water.loc[original_indices]

del original_indices

## Soil

Pivot the soil data such that each sample has its own columns, and separated by depth.

In [31]:
# Drop irrelevant column
data_soil_shallow = united_soil.copy().drop('h2o_by_wet_deep', axis=1)
data_soil_shallow['sample'] = data_soil_shallow['sample'].astype('float32')
# Pivot wider
data_soil_shallow = data_soil_shallow.pivot(columns='sample', values='h2o_by_wet_shallow')

# Drop irrelevant column
data_soil_deep = united_soil.copy().drop('h2o_by_wet_shallow', axis=1)

data_soil_deep['sample'] = data_soil_deep['sample'].astype('float32')
# Pivot wider
data_soil_deep = data_soil_deep.pivot(columns='sample', values='h2o_by_wet_deep')

In [32]:
data_soil = pd.merge(
    data_soil_shallow,
    data_soil_deep,
    left_index = True,
    right_index = True,
    suffixes = ("_shallow", "_deep"),
    how = "outer"
)

del data_soil_shallow, data_soil_deep
del united_soil

## Unite

In [34]:
data_united = pd.merge(
    data_water,
    # REMOVE LATER
    data_cal[temp_subset_start:temp_subset_end],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

data_united = pd.merge(
    data_united,
    # REMOVE LATER
    data_soil[temp_subset_start:temp_subset_end],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

# data_united['diff_ro_cal'] = (data_united['weir_level_cal'] - data_united['raw_ro'])
# data_united = minsince_feat(data_united, 'weir_level_cal')

### United features

In [35]:
data_united['diff_ro_cal'] = (data_united['weir_level_cal'] - data_united['raw_ro'])
data_united['diff_ro_cal'] = data_united['diff_ro_cal'].astype(np.float32)
data_united = timesince_feat(data_united, 'weir_level_cal', "minutes")

### Temporal features
Modify temporal features to be based on sine and cosine transformations, which allows for the model to be based on the cyclical patterns of time rather than abrupt distances

(e.g., the raw values Day 365 of the year is 'far' from Day 001, but in reality they are very near)

In [36]:
def temporal_feat(input_df, input_unit):
    output_df = input_df
    if input_unit=='day':
        cycle_length = 365.25
        value = output_df.index.dayofyear
    elif input_unit=='month':
        cycle_length = 12
        value = output_df.index.month
    elif input_unit=='hour':
        cycle_length = 24
        value = output_df.index.hour
    elif input_unit=='minute':
        cycle_length = 60
        value = output_df.index.minute

    output_df[f'{input_unit}_sin'] = np.sin(2 * np.pi * value / cycle_length).astype(np.float32)
    output_df[f'{input_unit}_cos'] = np.cos(2 * np.pi * value / cycle_length).astype(np.float32)

    return output_df

In [37]:
data_united = temporal_feat(data_united, 'minute')
data_united = temporal_feat(data_united, 'hour')
data_united = temporal_feat(data_united, 'day')
data_united = temporal_feat(data_united, 'month')

In [None]:
# # Create feature to track soil value staleness
# cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]
# soil_instances = data_united[cols_soil].notna()
# soil_group_id = soil_instances.cumsum().max(axis=1)
# data_united["since_soil"] = data_united.groupby(soil_group_id).cumcount()

# del soil_instances, soil_group_id

In [38]:
# create features to track soil value staleness
cols_soil = [col for col in data_united.columns if (col.endswith('shallow') | col.endswith('deep'))]

for col in cols_soil:
# for col in data_united.columns:
    # if (col.endswith('shallow') | col.endswith('deep')):
    # data_united = minsince_feat(data_united, col)
    data_united = timesince_feat(data_united, col, "days")

# Extend soil vals
data_united[cols_soil] = data_united[cols_soil].ffill()

# Cutoff
# cols_soil_days = [col for col in data_united.columns if (col.startswith('daysince_') & (col.endswith('_shallow') | col.endswith('_deep')))]
# data_united['daysince_soil'] = data_united[cols_soil_days].min(axis=1)


del col, cols_soil
# data_united.sample(10)

## Train/Test split

80/20 initial split, with expanding sliding window for training/validation for hyperparameters, model stability, and feature selection.

In [39]:
# REMOVE NAs
data_united = data_united.dropna(subset=[var_of_interest])

X_all = data_united.drop(var_of_interest, axis=1).copy()
y_all = data_united[var_of_interest].copy()

In [42]:
# Fix for inferred later
# y_all = y_all.astype(bool)
y_all = y_all.astype(np.float32)

for col in X_all.columns:
  if str(X_all[col].dtype) == ('Int32'):
    X_all[col] = X_all[col].astype(np.float32)

y_all.info()
X_all.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 1134443 entries, 2001-02-01 00:00:00 to 2011-12-31 23:55:00
Series name: obstruction_ro
Non-Null Count    Dtype  
--------------    -----  
1134443 non-null  float32
dtypes: float32(1)
memory usage: 13.0 MB
<class 'cudf.core.dataframe.DataFrame'>
DatetimeIndex: 1134443 entries, 2001-02-01 00:00:00 to 2011-12-31 23:55:00
Columns: 147 entries, ra_rain to daysince_10.0_deep
dtypes: float32(146), float64(1)
memory usage: 649.1 MB


In [43]:
y_len = len(y_all)

print(
    y_len, "\n",
    (round(.2*y_len) + round(.8*y_len)),
    "\nTrain:\t80p of ", y_len, " is ", round(.8*y_len),
    "\nTest:\t20p of ", y_len, " is ", round(.2*y_len),
    sep=""
)

del y_len

1134443
1134443
Train:	80p of 1134443 is 907554
Test:	20p of 1134443 is 226889


Unlike the typical approach for train/test splits, temporal data in this context must _not_ be randomly split as it would lead to severe leakage.

In [45]:
# Conduct the split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, shuffle=False)
# Conduct an inner split for tuning
X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_split(X_train, y_train, test_size = 0.2, shuffle=False)

# Cleanup
del X_all, y_all

print(
    "Train:\t", len(X_train), "\t", X_train.index[0], "thru", X_train.index[-1],
    "\nTest:\t", len(X_test), "\t", X_test.index[0], "thru", X_test.index[-1]
    # len(x_train), len(x_test), "\n",
    # x_train.index[-1]
)

Train:	 907554 	 2001-02-01 00:00:00 thru 2009-10-26 18:05:00 
Test:	 226889 	 2009-10-26 18:10:00 thru 2011-12-31 23:55:00


### Sliding Window

In [46]:
# Initialize the split function
tscv = TimeSeriesSplit(n_splits=3)
# print(tscv)

for i, (train_index, val_index) in enumerate(tscv.split(X_train_inner)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={val_index}")
    # print("  Train: index=", mini_x.index[train_index])
    # print(f"  Test:  index={val_index}")
    print("------------------------------------------------------------")

del i, train_index, val_index

Fold 0:
  Train: index=[     0      1      2 ... 181510 181511 181512]
  Test:  index=[181513 181514 181515 ... 363020 363021 363022]
------------------------------------------------------------
Fold 1:
  Train: index=[     0      1      2 ... 363020 363021 363022]
  Test:  index=[363023 363024 363025 ... 544530 544531 544532]
------------------------------------------------------------
Fold 2:
  Train: index=[     0      1      2 ... 544530 544531 544532]
  Test:  index=[544533 544534 544535 ... 726040 726041 726042]
------------------------------------------------------------


## Hyperparameter tuning

As per XGBoosting documentation/tutorials, early stopping with random search for hyperparameter tuning must be iterated upon manually, as `RandomizedSearchCV` does not support using a separate validation set within each CV fold.

Source: https://xgboosting.com/xgboost-early-stopping-with-random-search/

In [68]:
# Code modified from
# https://xgboosting.com/xgboost-early-stopping-with-random-search/

# Define hyperparameter distributions for random search
param_distributions = {
    'learning_rate': ('uniform', 0.01, 0.3),
    'max_depth': ('choice', [3, 6, 9, 12]),
    'subsample': ('uniform', 0.5, 1.0),
    'colsample_bytree': ('uniform', 0.4, 1.0)
}

# # Function to sample parameters based on their distribution type
# def sample_param(distribution):
#     if distribution[0] == 'uniform':
#         return uniform(distribution[1], distribution[2] - distribution[1]).rvs()
#     elif distribution[0] == 'choice':
#         return np.random.choice(distribution[1])
#     else:
#         raise ValueError(f"Unsupported distribution type: {distribution[0]}")


# Define a global seed or a seeded generator object once at the start of your script
rng = np.random.default_rng(42)

# Function to sample parameters based on their distribution type
def sample_param(distribution):
    if distribution[0] == 'uniform':
        # Use the seeded generator to create the scipy distribution
        return uniform(loc=distribution[1], scale=distribution[2] - distribution[1]).rvs(random_state=rng)
    elif distribution[0] == 'choice':
        # Use the seeded generator's choice method
        return rng.choice(distribution[1])
    else:
        raise ValueError(f"Unsupported distribution type: {distribution[0]}")

# Configure cross-validation and early stopping
n_splits = 5
early_stopping_rounds = 10

# Perform random search with early stopping
n_iterations = 3
best_params = None
best_score = 0
best_avg_rounds = 0

for _ in range(n_iterations):
    test_scores = []
    best_rounds = []
    optimal_rounds_list = []
    params = {k: sample_param(v) for k, v in param_distributions.items()}

    for train_index, test_index in tscv.split(X_train_inner):
        X_train_fold, X_test_fold = X_train_inner.iloc[train_index], X_train_inner.iloc[test_index]
        y_train_fold, y_test_fold = y_train_inner.iloc[train_index], y_train_inner.iloc[test_index]

        # Split train set into train and validation
        X_train_fold, X_val, y_train_fold, y_val = train_test_split(X_train_fold, y_train_fold, test_size=0.2, shuffle=False)

        # Prep hyperparam
        neg_count_fold = (y_train_fold == 0).sum()
        pos_count_fold = (y_train_fold == 1).sum()

        # Prepare the model
        model = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=params['learning_rate'],
            max_depth=int(params['max_depth']),  # max_depth should be an int
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            objective='binary:logistic',
            random_state=42,
            ## SETTINGS FOR GPU
            seed_per_iteration = True,
            tree_method='hist',
            device='cuda',
            scale_pos_weight= neg_count_fold/pos_count_fold,
            ##
            n_jobs=-1,
            early_stopping_rounds=early_stopping_rounds # fixed early stopping
        )

        # Fit model on train fold and use validation for early stopping
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val, y_val)], verbose=False)

        # Find optimal number of iterations
        optimal_rounds_list.append(model.best_iteration)

        # Predict on test set
        y_pred_test = model.predict(X_test_fold)
        # test_score = accuracy_score(y_test_fold, y_pred_test)
        test_score = f1_score(y_test_fold, y_pred_test)
        test_scores.append(test_score)

    # Compute average score across all folds
    average_score = np.mean(test_scores)
    average_optimal_rounds = np.mean(optimal_rounds_list)

    if average_score > best_score:
        best_score = average_score
        best_params = params
        best_avg_rounds = int(round(average_optimal_rounds)) # Store the integer average
        ## Maybe??
        # best_model = model

del X_train_fold, X_val, y_train_fold, y_val
del y_pred_test, test_score
del n_splits, early_stopping_rounds, n_iterations, test_scores, best_rounds, optimal_rounds_list
del neg_count_fold, pos_count_fold
del average_score, average_optimal_rounds

print(f"Best Parameters: {best_params}")
# print(f"Best CV Average Accuracy: {best_score}")
print(f"Best CV Average F1: {best_score}")
print(f"Best Avg Rounds: {best_avg_rounds}")
#
# print(f"Best model: {best_model}")

Best Parameters: {'learning_rate': np.float64(0.03731143088741836), 'max_depth': np.int64(6), 'subsample': np.float64(0.987811175818378), 'colsample_bytree': np.float64(0.8566838211942118)}
Best CV Average F1: 0.053235295735828574
Best Avg Rounds: 86


In [49]:
_ = playsound(get_path('completed.mp3', 'code'), block=False)

In [None]:
# model_name = "random_search_mini_xgboosting"

# if os.path.exists(model_path(model_name)) == False:
#     print("Saving model")
#     # model.fit(
#     #     X_train_two, y_train_two,
#     #     # early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
#     #     eval_set=eval_set,
#     #     verbose=False # Set to True if you want to watch the early stopping logs
#     # )
#     # random_search.fit(X_train_inner, y_train_inner)

#     # Save
#     joblib.dump(best_model, model_path(model_name))
# else:
#     print("Importing model from saved files...")
#     best_model = joblib.load(model_path(model_name))

In [72]:
model_name = "rs_xgb_cpu"

if os.path.exists(model_path(model_name)) == False:
    print("Fitting final model...")

    # XGBoost requires int for certain params
    best_params['max_depth'] = int(best_params['max_depth'])

    # Prep hyperparam
    neg_count_it = (y_train_inner == 0).sum()
    pos_count_it = (y_train_inner == 1).sum()

    final_model = xgb.XGBClassifier(
        n_estimators=best_avg_rounds,
        # n_estimators=100, # avg optimal n_estimators if known, or reasonable default
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        objective='binary:logistic',
        random_state=42,
        ## SETTINGS FOR GPU
        seed_per_iteration = True,
        tree_method='hist',
        device='cuda',
        scale_pos_weight= neg_count_it/pos_count_it,
        ##
        n_jobs=-1
    )

    final_model.fit(X_train_inner, y_train_inner)
    joblib.dump(final_model, model_path(model_name))

    del neg_count_it, pos_count_it

    # Local download
    from google.colab import files
    files.download(model_path(model_name))

else:
    print("Importing model from saved files...")
    final_model = joblib.load(model_path(model_name))

# print("Final model has been fitted on the entire dataset.")

Importing model from saved files...


In [73]:
# Thresholds
X_itest_train, X_itest_test, y_itest_train, y_itest_test = train_test_split(X_test_inner, y_test_inner, test_size=0.2, shuffle=False)

threshold = 0.5
y_proba = final_model.predict_proba(X_itest_train)[:,1]
y_bin = (y_proba >= threshold).astype(np.int32)
best_f1 = f1_score(y_itest_train, y_bin)
print(f"F1 at default 0.5 threshold: {best_f1:.4f}")

best_threshold = 0
thresholds = np.linspace(0.01, 0.99, 100)

for threshold in thresholds:
    y_bin = (y_proba >= threshold).astype(np.int32)
    current_f1 = f1_score(y_itest_train, y_bin)

    if current_f1 > best_f1:
        best_f1 = current_f1
        best_threshold = threshold

print(f"Optimal Threshold Found: {best_threshold:.4f}")
print(f"Best F1 Score at this threshold: {best_f1:.4f}")

y_proba_test = final_model.predict_proba(X_itest_test)[:,1]
final_predictions = (y_proba_test >= best_threshold).astype(np.int32)

print("\n--- Final Metrics on Holdout Set (using optimal threshold) ---")
print(f"F1 Score: {f1_score(y_itest_test, final_predictions)}")
print(f"Accuracy Score: {accuracy_score(y_itest_test, final_predictions)}")
print(f"Precision Score: {precision_score(y_itest_test, final_predictions)}")
print(f"Recall Score: {recall_score(y_itest_test, final_predictions)}")

del y_proba, threshold, thresholds, best_f1
del X_itest_train, y_itest_train
# del X_itest_train, X_itest_test, y_itest_train, y_itest_test

F1 at default 0.5 threshold: 0.5923
Optimal Threshold Found: 0.5148
Best F1 Score at this threshold: 0.5932

--- Final Metrics on Holdout Set (using optimal threshold) ---
F1 Score: 0.5917732457656412
Accuracy Score: 0.8048095198743905
Precision Score: 0.5156626506024097
Recall Score: 0.6942416869424168


In [None]:
# Does not work with GPUs :<

# X_itest_mini, X_holdout, y_itest_mini, y_holdout = train_test_split(X_test_inner, y_test_inner, test_size=0.2, random_state=42, shuffle=False)

# t_tuner = TunedThresholdClassifierCV(
#     estimator=final_model,
#     scoring=make_scorer(f1_score),
#     cv="prefit",
#     thresholds=100,
#     refit=False,
#     n_jobs=-1
# )

# t_tuner.fit(X_itest_mini, y_itest_mini)

# print(
#     "Threshold:", t_tuner.best_threshold_,
#     "F1:", t_tuner.best_score_
# )

# t_x_pred = t_tuner.predict(X_holdout)

# print(
#     f1_score(y_holdout, t_x_pred),
#     accuracy_score(y_holdout, t_x_pred),
#     precision_score(y_holdout, t_x_pred),
#     recall_score(y_holdout, t_x_pred),
#     sep="\n"
# )

## Smoothing

This function will run on CPU.

In [74]:
def apply_smoothing(predictions_array, window_size=5):
    predictions_series = pd.Series(predictions_array)
    smoothed = predictions_series.rolling(
        window=window_size,
        center=True,
        min_periods=1
    ).apply(lambda x: np.bincount(x.astype(int)).argmax(), raw=False).astype(int)
    return smoothed.values

In [76]:
# Smallest window size
window_min = 3
# Largest window size
window_max = 13

# Start w no smoothing
best_window_size = 1
# best_smoothing_f1 = f1_score(y_itest_test, t_x_pred)
best_smoothing_f1 = f1_score(y_itest_test, final_predictions)
print("No window F1:\t", best_smoothing_f1)

# test range of odd window sizes
# for window_size in [3, 5, 7, 9, 11]:
for window_size in range(window_min, window_max+1, 2):
    # print("Testing window =", window_size)
    smoothed_preds = apply_smoothing(final_predictions, window_size=window_size)
    f1 = f1_score(y_itest_test, smoothed_preds)
    print("Window", window_size, "F1:\t", f1)

    if f1 > best_smoothing_f1:
        # print("F1 improved with smoothing!")
        best_smoothing_f1 = f1
        best_window_size = window_size

print(f"Optimal window size: {best_window_size}")
print(f"Best F1: {best_smoothing_f1}")

del window_min, window_max, f1, window_size

No window F1:	 0.5917732457656412
Window 3 F1:	 0.5917132484296663
Window 5 F1:	 0.5925285368384642
Window 7 F1:	 0.5923236514522822
Window 9 F1:	 0.5935498759591531
Window 11 F1:	 0.5938582313553452
Window 13 F1:	 0.5939141982793463
Optimal window size: 13
Best F1: 0.5939141982793463


In [77]:
# Final! 3 dec places
# t_x_pred_f = t_tuner.predict(X_test)
final_pred_y = final_model.predict_proba(X_test)[:,1]
final_pred_y = (final_pred_y >= 0.5).astype(np.int32)

print("Base model (0.5 threshold, no smoothing)")

print(
    f"F1:\t{f1_score(y_test, final_pred_y):.4f}",
    f"Acc:\t{accuracy_score(y_test, final_pred_y):.4f}",
    f"Pre:\t{precision_score(y_test, final_pred_y):.4f}",
    f"Rec:\t{recall_score(y_test, final_pred_y):.4f}",
    "-----------------------------------",
    sep="\n"
)

print(f"Optmized threshold of {best_threshold:.4f}")

final_pred_y = (final_pred_y >= best_threshold).astype(np.int32)

print(
    f"F1:\t{f1_score(y_test, final_pred_y):.4f}",
    f"Acc:\t{accuracy_score(y_test, final_pred_y):.4f}",
    f"Pre:\t{precision_score(y_test, final_pred_y):.4f}",
    f"Rec:\t{recall_score(y_test, final_pred_y):.4f}",
    "-----------------------------------",
    sep="\n"
)

# t_x_pred_f = t_tuner.predict(X_test)
print("Windowed with", best_window_size)

final_pred_y = apply_smoothing(final_pred_y, window_size=best_window_size)

print(
    f"F1:\t{f1_score(y_test, final_pred_y):.4f}",
    f"Acc:\t{accuracy_score(y_test, final_pred_y):.4f}",
    f"Pre:\t{precision_score(y_test, final_pred_y):.4f}",
    f"Rec:\t{recall_score(y_test, final_pred_y):.4f}",
    "-----------------------------------",
    sep="\n"
)

Base model (0.5 threshold, no smoothing)
F1:	0.5619
Acc:	0.8560
Pre:	0.5218
Rec:	0.6087
-----------------------------------
Optmized threshold of 0.5148
F1:	0.5619
Acc:	0.8560
Pre:	0.5218
Rec:	0.6087
-----------------------------------
Windowed with 13
F1:	0.5624
Acc:	0.8569
Pre:	0.5245
Rec:	0.6061
-----------------------------------


## drafting

In [None]:
param_dist = {
    'n_estimators': randint(50, 500), # early stopping control the actual number
    'learning_rate': uniform(0.01, 0.29),
    'max_depth': randint(3, 7),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': [0, 0.1, 0.2]
}

xgb_model = xgb.XGBClassifier(
    early_stopping_rounds=50,
    objective='binary:logistic',
    tree_method='hist',
    n_jobs=-1,
    eval_metric='logloss',
    scale_pos_weight = (np.sum(y_train_inner == 0) / np.sum(y_train_inner == 1))
    # scale_pos_weight = (y_sub_train.value_counts()[False] / y_sub_train.value_counts()[True]).item()
)

# Randomized search with efficient settings
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    # n_iter=20,                    # Start with a small number of iterations (e.g., 20-40)
    n_iter=2,
    scoring='roc_auc',
    cv=tscv,                      # Use TimeSeriesSplit
    verbose=1,
    random_state=42,
    return_train_score=True
    # refit=True
)

# When you run the fit, ensure you pass early stopping parameters:
# X_train, y_train, X_val, y_val = ... define your data ...
# eval_set = [(X_train, y_train), (X_val, y_val)]
#
# random_search.fit(
#     X_train, y_train,
#     early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
#     eval_set=eval_set,
#     verbose=False # Set to True if you want to watch the early stopping logs
# )


In [None]:
X_train_two, X_val, y_train_two, y_val = train_test_split(X_train_inner, y_train_inner, test_size = 0.2, shuffle=False)
eval_set = [(X_train_two, y_train_two), (X_val, y_val)]

# print("Starting hyperparameter tuning...")
# random_search.fit(X_sub_train, y_sub_train)
# X_train_two = X_train_two.drop(i_to_drop, errors='ignore')
# y_train_two = y_train_two.drop(i_to_drop, errors='ignore')
# X_val = X_val.drop(i_to_drop, errors='ignore')
# y_val = y_val.drop(i_to_drop, errors = 'ignore')

In [None]:

# Check main training labels
if y_train_two.isnull().any() or np.isinf(y_train_two).any():
    print("ERROR: y_train contains NaN or Inf values!")

# Check validation labels (if you are using an eval_set)
# Assuming y_val is the label portion of your validation set
if y_val.isnull().any() or np.isinf(y_val).any():
    print("ERROR: y_val contains NaN or Inf values!")


In [None]:
# try:
#     if data_weir.empty == False:
#         print("Data loaded, random sample shown below")
#         print(data_weir.sample(n=5))
# except NameError:
#     print("Data has not yet been read in, loading now...")
#     data_weir = pd.read_csv(
#         "data/bci_lutzweir_combined.csv",
#         usecols = ['datetime', 'level', 'raw', 'chk_note', 'chk_fail', 'comment', 'source'],
#         parse_dates=['datetime'],
#         dtype = {'source':'category', 'chk_note':'category', 'chk_fail':'str', 'comment':'str'},
#         date_format='%d/%m/%Y %H:%M:%S'
#     )
# import os
# if os.path.exists(get_path('models/random_search_mini_spw.joblib', 'outputs')):
#     continue
# else:
#     print("nope")
# os.path.exists(get_path('models/random_search_mini_spw2.joblib', 'outputs')) == False

In [None]:
# def model_path(input_name):
#     mod_loc = "models/"+input_name+".joblib"
#     return get_path(mod_loc, 'outputs')

In [None]:
# os.path.exists(model_path("random_search_mini_spw")) == False

In [None]:
# model_name = "random_search_mini_inner"
model_name = "random_search_mini_spw"


if os.path.exists(model_path(model_name)) == False:
    print("Starting hyperparameter tuning...")
    # random_search.fit(
    #     X_train_two, y_train_two,
    #     # early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
    #     eval_set=eval_set,
    #     verbose=False # Set to True if you want to watch the early stopping logs
    # )
    random_search.fit(X_train_inner, y_train_inner)

    # Save
    joblib.dump(random_search, model_path(model_name))
else:
    print("Importing model from saved files...")
    random_search = joblib.load(model_path(model_name))

In [None]:
# # Saving result
# import joblib

# filename = 'random_search_results.joblib'
# joblib.dump(random_search, filename)

In [None]:
# Save
# joblib.dump(random_search, get_path('models/random_search_mini_spw.joblib', 'outputs'))

# Make a chime to indicate completion
_ = playsound(get_path('completed.mp3', 'code'), block=False)

In [None]:
# Print the results
print("Best hyperparameters found:")
print(random_search.best_params_)
print(f"Best F1 Score (averaged across CV folds): {random_search.best_score_:.4f}")

### Saving model

In [None]:
best_model = random_search.best_estimator_

## Feature selection

In [None]:
feature_importances = best_model.feature_importances_
# map scores to feature names
# feature_importances
feature_names = X_train.columns.tolist()

feature_importance_df = pd.DataFrame({
    'feat': feature_names,
    'importance': feature_importances
})

# sort importance
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# print(feature_importance_df)
feature_importance_df

# most important features
# print(feature_importance_df.head(25))

In [None]:
threshold_importance = 0.95
# calculate most important 90 percent of the importance
feature_importance_df['cumulative_imp'] = feature_importance_df['importance'].cumsum()
features_percent = feature_importance_df[feature_importance_df['cumulative_imp'] <= threshold_importance].shape[0] + 1
features_percent

In [None]:
# Most important features:
print(round(threshold_importance*100), "% (most important features):", features_percent)
feature_importance_df.head(features_percent)

In [None]:
# feature_importance_df.tail(1)
# Least important features:
print("Remaining", round((1-threshold_importance)*100), "% (least important features):", len(feature_names)-features_percent)
feature_importance_df.tail(len(feature_names)-features_percent)

In [None]:
# Features with 0 importance:
print("Features with 0 importance:", len(feature_importance_df[feature_importance_df['importance']==0]))

feature_importance_df[feature_importance_df['importance']==0]

Feature importance by type

In [None]:
mapping_dict = {
    'soil': '_deep|_shallow',
    'runoff':'ro',
    'rain':'rain',
    'calibration':'_cal'
}

for col_name, pattern in mapping_dict.items():
    feature_importance_df[col_name] = feature_importance_df['feat'].str.contains(pattern, case=False, regex=True)

feature_importance_df

In [None]:
feature_importance_df['most'] = (feature_importance_df['cumulative_imp'] <= threshold_importance)
feature_importance_df['zero'] = (feature_importance_df['importance'] == 0)

cat_cols = list(mapping_dict.keys())

table_feature_cat_importance = pd.DataFrame({
    'Total features': feature_importance_df[cat_cols].sum(),
    'Above threshold': feature_importance_df[feature_importance_df['most']][cat_cols].sum(),
    'Below threshold': feature_importance_df[~feature_importance_df['most']][cat_cols].sum(),
    'Zero importance': feature_importance_df[feature_importance_df['zero']][cat_cols].sum()
}).fillna(0).astype(int)

del cat_cols, mapping_dict

table_feature_cat_importance.index.name = 'Category'

table_feature_cat_importance

## Classification Threshold

By default, a threshold of 0.5 will be selected for categorizing a point as `True` or `False`. However, in this context a model more sensitive to `True` may make final results more accurate.

This measure can be found by finding the threshold that maximizes the F1 score.

In [None]:
# classifier_tuned = TunedThresholdClassifierCV(best_model, scoring="balanced_accuracy").fit(X_train_two, y_train_two)
# print(f"Cut-off point found at {classifier_tuned.best_threshold_:.3f}")
t_tuner = TunedThresholdClassifierCV(
    estimator=best_model,
    scoring=make_scorer(f1_score),
    cv="prefit",
    thresholds=100,
    refit=False
)

t_tuner.fit(X_test_inner, y_test_inner)

print(
    "Threshold:", t_tuner.best_threshold_,
    "F1:", t_tuner.best_score_
)

In [None]:
t_x_pred = t_tuner.predict(X_test_inner)

print(
    f1_score(y_test_inner, t_x_pred),
    accuracy_score(y_test_inner, t_x_pred),
    precision_score(y_test_inner, t_x_pred),
    recall_score(y_test_inner, t_x_pred),
    sep="\n"
)

In [None]:
_ = playsound(get_path('completed.mp3', 'code'), block=False)
# X_train_two, X_val, y_train_two, y_val = train_test_split(X_train, y_train, test_size = 0.2, shuffle=False)

In [None]:
all_f1 = []
all_accuracy = []
all_precision = []
all_recall = []
optimal_thresholds = []
performance_df = {}
# performance_df = DataFrame()

for fold, (index_train, index_val) in enumerate(tscv.split(X_train)):
    print("Split", fold)
    X_sub_train, X_sub_val = X_train.iloc[index_train].copy(), X_train.iloc[index_val].copy()
    y_sub_train, y_sub_val = y_train.iloc[index_train].copy(), y_train.iloc[index_val].copy()

    # y_sub_train = y_sub_train.drop(i_to_drop, errors='ignore')

    # if len(y_sub_train.unique()) != 2:
    #     print("Skipping fold", fold)
    #     continue

    # model = xgb.XGBClassifier(
    #     objective='binary:logistic',
    #     eval_metric='logloss',
    #     tree_method = "hist",
    #     random_state = 42,
    #     scale_pos_weight = (y_sub_train.value_counts()[False] / y_sub_train.value_counts()[True]).item()
    # )

    print("Fitting model...")
    best_model.fit(
        X_sub_train, y_sub_train,
        # Evaluation set
        eval_set = [(X_sub_val, y_sub_val)],
        # Weight of False vs True
        # early_stopping_rounds = 50,
        # Silence messages
        verbose=False
    )

    print("Predicting probabilities...")
    # Get probabilities for the positive class (class 1)
    y_pred_proba = best_model.predict_proba(X_sub_val)[:, 1]

    # 1. Calculate precision, recall, and thresholds for the current fold
    precision, recall, thresholds = precision_recall_curve(y_sub_val, y_pred_proba)

    # 2. Find the optimal threshold based on F1-score (or whichever metric you prefer)
    fscores = (2 * precision * recall) / (precision + recall)
    fscores[np.isnan(fscores)] = 0
    optimal_idx = np.argmax(fscores)

    # Need to handle the fact that thresholds array is one element shorter than P/R/F1 arrays
    # Best practice is often to use the threshold just after the optimal index is found in P/R/F1 arrays
    best_threshold = thresholds[optimal_idx]
    optimal_thresholds.append(best_threshold)

    print(f"Fold {fold} Optimal Threshold (Max F1): {best_threshold:.4f}")

    # 3. Apply the *optimal* threshold to the validation predictions for THIS fold
    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    # 4. Calculate metrics using the *optimally thresholded* predictions
    print("Getting metrics using optimal threshold...")
    fold_f1 = f1_score(y_sub_val, y_pred_optimal)
    fold_accuracy = accuracy_score(y_sub_val, y_pred_optimal)
    fold_precision = precision_score(y_sub_val, y_pred_optimal)
    fold_recall = recall_score(y_sub_val, y_pred_optimal)

    all_f1.append(fold_f1)
    all_accuracy.append(fold_accuracy)
    all_precision.append(fold_precision)
    all_recall.append(fold_recall)

    # You might want to use a more structured DataFrame for performance_df construction
    performance_df[fold] = {
        'Threshold': best_threshold,
        'F1': fold_f1,
        'Accuracy': fold_accuracy,
        'Precision': fold_precision,
        'Recall': fold_recall
    }

    print(f"{fold}\tOptimal F1: {fold_f1:.4f}\tAcc: {fold_accuracy:.4f}\tPrec: {fold_precision:.4f}\tRec: {fold_recall:.4f}")
    # print("Predicting...")
    # y_pred = best_model.predict(X_sub_val)
    # y_pred_proba = best_model.predict_proba(X_sub_val)[:, 1]

    # # 2. Calculate precision, recall, and thresholds for the current fold
    # precision, recall, thresholds = precision_recall_curve(y_sub_val, y_pred_proba)

    # # 3. Find the optimal threshold based on F1-score (a common choice for balance)
    # # Calculate F1-score for every possible threshold
    # fscores = (2 * precision * recall) / (precision + recall)

    # # Handle potential division by zero warnings if no positive predictions were made
    # fscores[np.isnan(fscores)] = 0

    # # Locate the index of the highest F1-score
    # optimal_idx = np.argmax(fscores)
    # best_threshold = thresholds[optimal_idx] # Note: thresholds array is one element shorter than P/R arrays

    # optimal_thresholds.append(best_threshold)

    # print(f"Fold {fold} Optimal Threshold (Max F1): {best_threshold:.4f}")


    # print("Getting metrics...")
    # fold_f1 = f1_score(y_sub_val, y_pred)
    # fold_accuracy = accuracy_score(y_sub_val, y_pred)
    # fold_precision = precision_score(y_sub_val, y_pred)
    # fold_recall = recall_score(y_sub_val, y_pred)

    # all_f1.append(fold_f1)
    # all_accuracy.append(fold_accuracy)
    # all_precision.append(fold_precision)
    # all_recall.append(fold_recall)
    # performance_df[fold] = {best_threshold, fold_f1, fold_accuracy, fold_precision, fold_recall}

    # print(f"{fold}\tF1: {fold_f1:.4f}\tAcc{fold_accuracy:.4f}\tPrec{fold_precision:.4f}\tRec: {fold_recall:.4f}")

print(performance_df)

In [None]:
_ = playsound(get_path('completed.mp3', 'code'), block=False)

In [None]:
import pprint

pprint.pprint(performance_df)

# draft

In [None]:
# Initialize
optimal_thresholds = []

# Calculate precision-recall curve info from predicted values of validation set
precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)

fscores = (2 * precision * recall) / (precision + recall)

# fixies div by 0 errors
fscores[np.isnan(fscores)] = 0

# find index of highest F1
optimal_idx = np.argmax(fscores)
best_threshold = thresholds[optimal_idx] # thresholds array is one element shorter than P/R arrays

optimal_thresholds.append(best_threshold)

print(f"Fold {fold} Optimal Threshold (Max F1): {best_threshold:.4f}")

In [None]:
best_params = random_search.best_params_

# final model w optimized params
final_optimized_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    # use_label_encoder=False,
    tree_method='hist',
    random_state=42,
    **best_params # unpack best params
)

final_optimized_model.fit(X_train, y_train)

In [None]:
# Make a chime to indicate completion
_ = playsound(get_path('completed.mp3', 'code'), block=False)

## Metrics

In [None]:
print("Predicting...")
y_pred = final_optimized_model.predict(X_test)
y_pred_proba = final_optimized_model.predict_proba(X_test)[:, 1]

# current fold info
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# # Find the optimal threshold
# Calculate F1-score for every possible threshold
fscores = (2 * precision * recall) / (precision + recall)

# Handle potential division by zero warnings
# if no positive predictions were made
fscores[np.isnan(fscores)] = 0

# index of highest F1
optimal_idx = np.argmax(fscores)
best_threshold = thresholds[optimal_idx] # Note: thresholds array is one element shorter than P/R arrays

print(f"Best threshold: {best_threshold:.4f}")

# print(f"Fold {fold} Optimal Threshold (Max F1): {best_threshold:.4f}")


print("Getting metrics...")
print("F1\tAcc\tPre\tRec")
print(
    f1_score(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep ="\t"
)

# all_f1.append(fold_f1)
# all_accuracy.append(fold_accuracy)
# all_precision.append(fold_precision)
# all_recall.append(fold_recall)

In [None]:
# final_f1 = f1_score(y_val, y_test_pred)
# final_accuracy = accuracy_score(y_val, y_test_pred)
# final_precision = precision_score(y_val, y_test_pred)
# final_recall = recall_score(y_val, y_test_pred)

# print(f"{fold}\tF1: {final_f1:.4f}\tAcc{final_accuracy:.4f}\tPrec{final_precision:.4f}\tRec: {final_recall:.4f}")

In [None]:
# print("i\tF1\tAcc\tPre\tRec")
# for i in range(len(all_f1)):
#     print(i, round(all_f1[i], 4), round(all_accuracy[i], 4), round(all_precision[i], 4), round(all_recall[i],4), sep="\t")