# Data Splitting and Modelling

Author: Gillian A. McGinnis, final-semester M.S. Information Science - Machine Learning  
The University of Arizona College of Information  
INFO 698 - Capstone  
Start date: 21 October 2025  
Last updated: 25 November 2025

In [40]:
"""
Module providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.
"""

'\nModule providing code for test/train split and sliding window creation. Relies on 01_clean.ipynb completion.\n'

## Setup

### Packages

In [41]:
# GPU Setup
%load_ext cudf.pandas
import pandas as pd
import cudf
import cupy as cp

import cuml.accel
cuml.accel.install()

# General packages
import numpy as np
import xgboost as xgb
from scipy.stats import randint, uniform
# For getting medians for windowing/smoothing
from scipy.signal import medfilt

# For saving models
import joblib
import pickle

from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV, TunedThresholdClassifierCV
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, precision_recall_curve, make_scorer, roc_auc_score, auc
# For help with model tuning
from sklearn.base import clone

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

from google.colab import userdata
import os

# Bootstrap the working directory when the session starts in '/content'
# (presumably a fresh Colab runtime -- TODO confirm): clone the project
# repository if it is not already present, then move into its code/ folder.
if os.getcwd() == '/content':
  # Repository name, user and token are read from Colab's userdata store
  gh_repo = userdata.get('gh_repo')
  if not os.path.exists(f'{gh_repo}'):
    print("Cloning repo...")
    gh_pat = userdata.get('gh_pat')
    gh_user = userdata.get('gh_user')
    # NOTE(review): the access token is embedded in the clone URL; confirm it
    # is not persisted in the clone's remote configuration or echoed in output.
    repo_url = f'https://{gh_pat}@github.com/{gh_user}/{gh_repo}'
    !git clone {repo_url}
    # Drop the credentials from the interactive namespace as soon as possible
    del gh_pat, gh_user, repo_url
  print("Changing wd...")
  os.chdir(f'{gh_repo}/code')
  del gh_repo

# Verify the current working directory
print(f"Current working directory is: {os.getcwd()}")


Current working directory is: /content/info-698-capstone/code


In [43]:
# For data importing and exporting
from helper_utils import get_path, model_path, apply_model, report_scores

In [44]:
# # To make it easier to tell when processes have completed -- can delete later
# Google colab compatible:
# https://stackoverflow.com/a/68582785/23486987
from IPython.display import Audio, display

def play_chime():
  """Build an autoplaying audio widget for the 'completed.mp3' chime.

  Returns:
      IPython.display.Audio: Player for the file located via
      get_path('completed.mp3', 'code'), created with autoplay enabled.
  """
  chime_file = get_path('completed.mp3', 'code')
  return Audio(chime_file, autoplay=True)

In [45]:
## (Optional chunk)
# Current session information

# From StackOverflow,
# https://stackoverflow.com/a/62128239/23486987
try:
    import session_info
except:
    !pip install session_info
    import session_info
# !pip install session_info
# import session_info
session_info.show(dependencies=False)

In [46]:
# Make sure GPU active
# !nvidia-smi
import torch
# Report which CUDA device (if any) is visible to this session
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("Warning: GPU not found!")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Using GPU: Tesla T4


## Inputs

### Constants

In [47]:
## Variable of interest
# for the model to predict
VAR_OF_INTEREST = "obstruction_ro"

## Date subsets
# (for testing smaller runs)
# Set to None to use all data
# DATE_START = None
# DATE_END = None
DATE_START = '2001-02-01 00:00:00'
DATE_END = '2011-12-31 23:59:59'

## Cutoff for correlation for feature removal
# (i.e., columns that are at least this correlated will be removed)
CORR_CUTOFF = 0.97

## Number of splits
# for hyperparameter tuning via expanding window
N_SPLITS = 5

## Number of rounds after which to stop the build
# once the performance stops improving
EARLY_STOPPING_ROUNDS = 50

## Maximum number of boosting rounds per tuning fit
# (early stopping may end training sooner)
N_ESTIMATORS = 1000

## Number of times to iterate the tuning loop
# (set to a small number for testing)
N_ITERATIONS = 5
# N_ITERATIONS = 50


## Smoothing window sizes -- must be odd
# Smallest window size
WINDOW_MIN = 1
# Largest window size
WINDOW_MAX = 35

## Model names
MODEL_NAME = "xgb_testing"
FITTED_MODEL_NAME = "xgb_testing_fitted"

## Seed for reproducibility
SEED = 42

In [48]:
# Set seed
np.random.seed(SEED)
cp.random.seed(SEED)

In [49]:
# ### Note ###
# # Set to None later -- just a smaller subset for feature engineering testing
# # DATE_START = '2000-01-01 00:00:00'
# DATE_START = '2001-02-01 00:00:00'
# DATE_END = '2011-12-31 23:59:59'
# DATE_START= None
# DATE_END = None
# ######

### Data

In [50]:
united_water = pd.read_parquet(get_path('clean/water_nocal.parquet'))
united_soil = pd.read_parquet(get_path('clean/soil.parquet'))

data_cal = pd.read_parquet(get_path('clean/calibration.parquet'))
data_cal = data_cal.rename(columns={'weir_level':'weir_level_cal'})

In [51]:
data_cal.info()

<class 'cudf.core.dataframe.DataFrame'>
DatetimeIndex: 6136 entries, 1994-01-03 08:46:00 to 2025-08-01 09:10:00
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   weir_level_cal  6136 non-null   int8
dtypes: int8(1)
memory usage: 53.9 KB


#### Memory improvements

Small amount of data wrangling for memory improvements (some as a consequence of importing).

In [52]:
# Columns that are not used as model inputs
# (judging by their names: raw values, check notes/flags, comments, provenance)
unused_water_cols = [
    'raw_rain', 'chk_note_rain', 'chk_fail_rain',
    'chk_note_ro', 'chk_fail_ro', 'comment_ro', 'source_ro',
]
data_water = united_water.drop(columns=unused_water_cols)

# Release the full import now that the trimmed frame exists
del united_water, unused_water_cols

# De-duplicate rows. The index is moved into a regular column first so the
# timestamp takes part in the comparison, then restored as the index.
water_flat = data_water.reset_index()
water_flat = water_flat.drop_duplicates(keep='first')
data_water = water_flat.set_index('datetime')
del water_flat

In [53]:
# Candidate columns to discard: the runoff level plus every runoff flag.
# The variable being predicted is taken back out of this list so it survives.
water_drops = ['level_ro', 'obstruction_ro', 'gap_fill_ro', 'weir_cleaning_ro', 'spike_ro', 'calibration_ro']
# list.remove raises ValueError if VAR_OF_INTEREST is not one of the names above
water_drops.remove(VAR_OF_INTEREST)

data_water = data_water.drop(water_drops, axis=1)

del water_drops

# Confirm the remaining columns, dtypes and memory footprint
data_water.info(memory_usage="deep")

<class 'cudf.core.dataframe.DataFrame'>
DatetimeIndex: 3581782 entries, 1989-06-21 13:00:00 to 2025-08-01 13:00:00
Data columns (total 3 columns):
 #   Column          Dtype
---  ------          -----
 0   ra_rain         float32
 1   raw_ro          float32
 2   obstruction_ro  bool
dtypes: bool(1), float32(2)
memory usage: 59.4 MB


In [54]:
united_soil['sample'] = united_soil['sample'].astype('category')

In [55]:
# Subset data (if applicable)
data_water = data_water[DATE_START:DATE_END]

## Feature Engineering

### Distance from Event

In [None]:
def timesince_feat(input_df, input_col, input_unit):
    """Add a feature holding the time elapsed since the last non-null value of a column.

    Each non-null entry of ``input_col`` counts as an event. Every row receives
    the distance between its own index timestamp and the timestamp of the most
    recent event at or before it. Rows that precede the first event get NaN,
    because nothing is known about when an earlier event happened.

    Args:
        input_df (DataFrame): Frame whose index can be converted with ``pd.to_datetime``.
        input_col (str): Column whose non-null entries mark events.
        input_unit (str): ``"minutes"`` creates ``minsince_<input_col>``;
            ``"days"`` creates ``daysince_<input_col>`` (whole days).

    Returns:
        DataFrame: A new frame with the extra float32 column appended.
        ``input_df`` itself is not modified.

    Raises:
        ValueError: If ``input_unit`` is neither ``"minutes"`` nor ``"days"``.
    """
    if input_unit not in ("minutes", "days"):
        raise ValueError(
            f"Unsupported input_unit: {input_unit!r} (expected 'minutes' or 'days')"
        )

    instances = input_df[input_col].notna()
    # Running count of events: rows sharing a count belong to the same event
    group_id = instances.cumsum()
    # Exclude the first grouping,
    # otherwise it assumes there was an event just prior to the first entry
    group_id = group_id.replace(0, np.nan)

    # Work on standalone Series so no scratch columns are written to the frame
    timestamps = pd.Series(pd.to_datetime(input_df.index), index=input_df.index)
    # Timestamp of the event that opened each group (missing for excluded rows)
    group_start = timestamps.groupby(group_id).transform('min')
    elapsed = timestamps - group_start

    if input_unit == "minutes":
        new_col = f"minsince_{input_col}"
        values = elapsed.dt.total_seconds().div(60).astype(np.float32)
    else:
        new_col = f"daysince_{input_col}"
        values = elapsed.dt.days.astype(np.float32)

    # assign returns a new frame, leaving the caller's object untouched
    return input_df.assign(**{new_col: values})

#### Rain
Create feature which tracks how recent a rain event occurred.

In [None]:
data_water = timesince_feat(data_water, 'ra_rain', "minutes")

### Rain event

Keep track of cumulative rainfall during a specific event.

In [None]:
# Flag rows that separate rain events: there is no rainfall value and at least
# 5 minutes have passed since the last recorded rainfall.
# (A separate "!= 0" test is unnecessary: any value >= 5.0 is already non-zero.)
no_rain_value = data_water['ra_rain'].isnull()
minutes_since_rain = data_water['minsince_ra_rain']
event_break = no_rain_value & (minutes_since_rain >= 5.0)

# Each flagged row starts a new group id, so rows between breaks share an id
event_id = event_break.cumsum()

# Running rainfall total within each group, restarting at every break
data_water['eventsum_ra_rain'] = data_water.groupby(event_id)['ra_rain'].cumsum()

del no_rain_value, minutes_since_rain, event_break, event_id

### Decay

In [None]:
def decay_feat(input_df, input_col, input_dec_rate = -0.1):
    """Add exponentially decayed copies of a column based on minutes since its last value.

    Three columns are written: ``decayrate<rate>_<col>`` (exp(rate * minutes)),
    ``ffill_<col>`` (the column forward-filled) and ``decay<rate>_<col>``
    (their product). ``minsince_<col>`` is created with ``timesince_feat`` when
    it is missing, and is stored as float64 afterwards.

    Args:
        input_df (DataFrame): Frame containing ``input_col``.
        input_col (str): Column to decay.
        input_dec_rate (float): Exponent applied per minute elapsed.

    Returns:
        DataFrame: The frame carrying the new columns. When ``minsince_<col>``
        already exists this is ``input_df`` itself, modified in place.
    """
    since_col = f"minsince_{input_col}"
    rate_col = f"decayrate{input_dec_rate}_{input_col}"
    filled_col = f"ffill_{input_col}"
    decay_col = f"decay{input_dec_rate}_{input_col}"

    output_df = input_df
    if since_col not in output_df.columns:
        output_df = timesince_feat(input_df = output_df, input_col = input_col, input_unit = "minutes")

    # Widen to float64 before exponentiating (overflow fix for GPU)
    minutes = output_df[since_col].astype(np.float64)
    output_df[since_col] = minutes

    # Decay multiplier for every row
    output_df[rate_col] = np.exp(input_dec_rate * minutes).astype(np.float32)
    # Last known value carried forward, then scaled by the multiplier
    output_df[filled_col] = output_df[input_col].ffill()
    output_df[decay_col] = output_df[filled_col] * output_df[rate_col]

    return output_df

In [None]:
# Replace NAs in rain with 0
data_water['ra_rain'] = data_water['ra_rain'].fillna(0)

# Apply decay function
data_water = decay_feat(data_water, 'eventsum_ra_rain')

# Drop extra column
# minutes since rain event will be the same as minutes since most recent rain
data_water = data_water.drop('minsince_eventsum_ra_rain', axis=1)

### Lag features

Because XGBoost predicts on one data point at a time, time series data must include "lag" features (i.e., values from data points prior to the current one) in order to better represent recent history.

#### Consistent cols

Modify the rows to prevent inappropriate data shifts.
This expands the index to include _all_ possible 5 minute time stamps, so that lagging by index difference guarantees the temporal consistency of lagged features.
This is important because there are some gaps in the data, and it would be incorrect if a feature meant to measure "rainfall 5 minutes ago" actually held the value of the immediately preceding record from a few hours (or, in some cases, _years_) earlier.

In [None]:
# Remember which timestamps were actually observed so the frame can be
# cut back to them once the lag/rolling features have been built
original_indices = data_water.index.copy()

# Regular 5-minute grid from the first to the last observation.
# Grid points without an observation become all-NaN rows after reindexing.
first_stamp = data_water.index.min()
last_stamp = data_water.index.max()
full_grid = pd.date_range(start=first_stamp, end=last_stamp, freq='5min')

data_water = data_water.reindex(full_grid)

# Cleanup
del first_stamp, last_stamp, full_grid

# # Return
# data_water = data_water.loc[original_indices]
# del original_indices

Get values from other recent time stamps.

In [None]:
def lag_feats(input_df, input_cols, input_lags):
    """Add row-shifted (lagged) copies of the given columns.

    For every column/lag pair a column ``<col>_lag<lag>`` is written holding the
    value found ``lag`` rows earlier. The shift is positional, so the result
    only represents a fixed time offset when rows are evenly spaced.

    Args:
        input_df (DataFrame): Frame to extend; it is modified in place.
        input_cols (iterable of str): Columns to lag.
        input_lags (iterable of int): Row offsets passed to ``shift``.

    Returns:
        DataFrame: ``input_df`` with the lag columns appended, ordered by
        column first and lag second.
    """
    output_df = input_df
    lag_plan = [(name, steps) for name in input_cols for steps in input_lags]
    for name, steps in lag_plan:
        output_df[f"{name}_lag{steps}"] = output_df[name].shift(steps)
    return output_df

In [None]:
# Columns to get temporal stats on
cols_to_shift = ['raw_ro', 'ra_rain']

# data at 5-min increments -- lag to record values at 5m, 10m, 15m, 20m, 25m, 30m, 1h, 2h, 3h prior
lags_of_interest = [1, 2, 3, 4, 5, 6, 12, 24, 36]

data_water = lag_feats(data_water, cols_to_shift, lags_of_interest)

# data_water.sample(10)

### Rolling stats

Similarly to lag features, rolling statistics can help XGBoost determine how typical recent behavior is, or any patterns that may emerge with "abnormal" or extreme behavior (such as a sharp slope).

In [None]:
def rolling_feats(input_df, input_cols, input_windows, input_mtype = "mean"):
    """Add rolling-window statistics for the given columns.

    For every column/window pair the rolling standard deviation
    (``<col>_rollstd_<w>``) and rolling linear slope (``<col>_rollslope_<w>``)
    are always written. ``input_mtype`` selects which aggregate precedes them:
    ``"mean"`` -> ``<col>_rollmean_<w>``, ``"sum"`` -> ``<col>_rollsum_<w>``,
    ``"both"`` -> both; any other value adds neither.

    Args:
        input_df (DataFrame): Frame to extend; it is modified in place.
        input_cols (iterable of str): Columns to summarise.
        input_windows (iterable of int): Window sizes in rows (current row included).
        input_mtype (str): ``"mean"``, ``"sum"`` or ``"both"``.

    Returns:
        DataFrame: ``input_df`` with the float32 feature columns appended.
    """
    output_df = input_df
    want_mean = input_mtype in ("mean", "both")
    want_sum = input_mtype in ("sum", "both")

    # Row positions (0, 1, 2, ... N) act as the x-axis of the slope regression
    x_series = pd.Series(np.arange(len(output_df)), index=output_df.index)

    for col in input_cols:
        for window in input_windows:
            roller = output_df[col].rolling(window)

            # General stats
            if want_mean:
                output_df[f"{col}_rollmean_{window}"] = roller.mean().astype(np.float32)
            if want_sum:
                output_df[f"{col}_rollsum_{window}"] = roller.sum().astype(np.float32)
            output_df[f"{col}_rollstd_{window}"] = roller.std().astype(np.float32)

            # Vectorized least-squares slope: Cov(Y, X) / Var(X),
            # with Y the data and X the row position
            cov_yx = roller.cov(x_series)
            var_x = x_series.rolling(window).var()
            output_df[f"{col}_rollslope_{window}"] = (cov_yx / var_x).astype(np.float32)
    return output_df

In [None]:
# Inclusive of current point--
# 10m, 15m, 20m, 25m, 30m, 1h, 3h, 6h, 12h, 24h
windows_of_interest = [2, 3, 4, 5, 6, 12, 36, 72, 144, 288]

data_water = rolling_feats(data_water, ['raw_ro'], windows_of_interest, "both")
data_water = rolling_feats(data_water, ['ra_rain'], windows_of_interest, "sum")

Change since last value

In [None]:
data_water['raw_ro_change'] = data_water['raw_ro'].diff()
data_water['ra_rain_change'] = data_water['ra_rain'].diff()

data_water['raw_ro_rollmean_2_change'] = data_water['raw_ro_rollmean_2'].diff()

In [None]:
# Revert index
# (adjusted for GPU)
# Cut the frame back to the timestamps saved in original_indices before the
# 5-minute reindex, using an inner merge in place of label-based selection.

# Move the index into a regular column so it can serve as the merge key;
# after reset_index it is the first column of the frame
data_water_reset = data_water.reset_index()
index_col_name = data_water_reset.columns[0]
# One-column frame of the original timestamps, named to match the merge key
indices_df = original_indices.to_frame(name=index_col_name)

# Inner merge keeps only rows whose timestamp appears in the original index.
# NOTE(review): downstream splits are unshuffled and rely on chronological
# row order -- confirm the merged result is still sorted by time.
filtered_data_water = cudf.merge(
    data_water_reset,
    indices_df,
    on=index_col_name,
    how='inner'
)
data_water = filtered_data_water.set_index(index_col_name)

del original_indices, filtered_data_water, index_col_name, data_water_reset, indices_df

In [None]:
# # Return
# data_water = data_water.loc[original_indices]

# del original_indices

## Soil

Pivot the soil data such that each sample has its own columns, and separated by depth.

In [None]:
def _soil_wide(measure_col, other_col):
    """Pivot one soil measurement so that each sample gets its own column.

    Args:
        measure_col (str): Measurement column to spread across samples.
        other_col (str): The other depth's column, dropped before pivoting.

    Returns:
        DataFrame: Wide frame on the existing index, one column per sample value.
    """
    narrow = united_soil.copy().drop(other_col, axis=1)
    # Sample labels are cast to float32 before they become column labels
    narrow['sample'] = narrow['sample'].astype('float32')
    return narrow.pivot(columns='sample', values=measure_col)


# Shallow and deep measurements are pivoted separately
data_soil_shallow = _soil_wide('h2o_by_wet_shallow', 'h2o_by_wet_deep')
data_soil_deep = _soil_wide('h2o_by_wet_deep', 'h2o_by_wet_shallow')

In [None]:
data_soil = pd.merge(
    data_soil_shallow,
    data_soil_deep,
    left_index = True,
    right_index = True,
    suffixes = ("_shallow", "_deep"),
    how = "outer"
)

del data_soil_shallow, data_soil_deep
del united_soil

## Unite

Join the data frames to prep for the model.

In [None]:
data_united = pd.merge(
    data_water,
    #
    data_cal[DATE_START:DATE_END],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

data_united = pd.merge(
    data_united,
    #
    data_soil[DATE_START:DATE_END],
    #
    left_index = True,
    right_index = True,
    how = 'outer'
)

### United features

Add a few final features on the united data set.

In [None]:
# Difference compared to calibration point (infrequent)
data_united['diff_ro_cal'] = (data_united['weir_level_cal'] - data_united['raw_ro'])

# Convert to float
data_united['diff_ro_cal'] = data_united['diff_ro_cal'].astype(np.float32)

# Time since last calibration point
data_united = timesince_feat(data_united, 'weir_level_cal', "minutes")

In [None]:
# Create features to track soil value staleness.
# Soil columns are recognised by their 'shallow' / 'deep' name endings.
soil_suffixes = ('shallow', 'deep')
cols_soil = [name for name in data_united.columns if name.endswith(soil_suffixes)]

# Days since each soil column last had a value
# (computed before the forward fill below removes the gaps)
for col in cols_soil:
    data_united = timesince_feat(data_united, col, "days")

# Extend soil vals: carry the last known reading forward
data_united[cols_soil] = data_united[cols_soil].ffill()

del col, cols_soil, soil_suffixes

### Temporal features
Modify temporal features to be based on sine and cosine transformations, which lets the model see time as cyclical rather than as a linear scale with abrupt jumps.

(e.g., as raw values, Day 365 of the year is 'far' from Day 001, but in reality they are adjacent)

In [None]:
def temporal_feat(input_df, input_unit):
    """Add sine/cosine encodings of one calendar component of the index.

    The component is mapped onto a circle so that values at the two ends of
    the cycle (e.g. hour 23 and hour 0) end up close together.

    Args:
        input_df (DataFrame): Frame with a DatetimeIndex; it is modified in place.
        input_unit (str): One of ``'day'`` (day of year, cycle 365.25),
            ``'month'`` (12), ``'hour'`` (24) or ``'minute'`` (60).

    Returns:
        DataFrame: ``input_df`` with float32 columns ``<input_unit>_sin`` and
        ``<input_unit>_cos`` added.

    Raises:
        ValueError: If ``input_unit`` is not one of the supported units.
    """
    # unit -> (cycle length, DatetimeIndex attribute supplying the value)
    cycles = {
        'day': (365.25, 'dayofyear'),
        'month': (12, 'month'),
        'hour': (24, 'hour'),
        'minute': (60, 'minute'),
    }
    if input_unit not in cycles:
        raise ValueError(
            f"Unsupported input_unit: {input_unit!r} (expected one of {sorted(cycles)})"
        )

    output_df = input_df
    cycle_length, index_attr = cycles[input_unit]
    value = getattr(output_df.index, index_attr)

    # Position within the cycle expressed as an angle in radians
    angle = 2 * np.pi * value / cycle_length
    output_df[f'{input_unit}_sin'] = np.sin(angle).astype(np.float32)
    output_df[f'{input_unit}_cos'] = np.cos(angle).astype(np.float32)

    return output_df

In [None]:
data_united = temporal_feat(data_united, 'minute')
data_united = temporal_feat(data_united, 'hour')
data_united = temporal_feat(data_united, 'day')
data_united = temporal_feat(data_united, 'month')

## Train/Test split

80/20 initial split, with expanding sliding window for training/validation for hyperparameters, model stability, and feature selection.

In [None]:
# REMOVE NAs
data_united = data_united.dropna(subset=[VAR_OF_INTEREST])

X_all = data_united.drop(VAR_OF_INTEREST, axis=1).copy()
y_all = data_united[VAR_OF_INTEREST].copy()

In [None]:
## Random data type fixes for GPU usage

# Fix for inferred later
# y_all = y_all.astype(bool)
y_all = y_all.astype(np.float32)
y_all = y_all.as_gpu_object()

for col in X_all.columns:
  if str(X_all[col].dtype) == ('Int32'):
    X_all[col] = X_all[col].astype(np.float32)

# print(y_all.__class__)
# y_all = cudf.Series.from_pandas(y_all)
print(y_all.__class__)
X_all.info()

In [None]:
y_len = len(y_all)

print(
    y_len, "\n",
    (round(.2*y_len) + round(.8*y_len)),
    "\nTrain:\t80p of ", y_len, " is ", round(.8*y_len),
    "\nTest:\t20p of ", y_len, " is ", round(.2*y_len),
    sep=""
)

del y_len

Unlike the typical approach for train/test splits, temporal data in this context must _not_ be randomly split as it would lead to severe leakage.

In [None]:
# Conduct the split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, shuffle=False)
# Conduct an inner split for tuning
# X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_split(X_train, y_train, test_size = 0.2, shuffle=False)

# Cleanup
del X_all, y_all

print(
    "Train:\t", len(X_train), "\t", X_train.index[0], "thru", X_train.index[-1],
    "\nTest:\t", len(X_test), "\t", X_test.index[0], "thru", X_test.index[-1]
    # len(x_train), len(x_test), "\n",
    # x_train.index[-1]
)

### Expanding Window

For tuning, an expanding window approach will be used. This is similar to how a model would act once deployed, as it will only gain more data over time.

In [None]:
# Initialize the expanding-window splitter.
# The fold count comes from the N_SPLITS constant so that the configuration
# cell is the single place where it is set.
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

# Show the positional row indices of each fold as a sanity check
for i, (train_index, val_index) in enumerate(tscv.split(X_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={val_index}")
    print("------------------------------------------------------------")

del i, train_index, val_index

## Parameter selection

Highly correlated features can be removed prior to model selection.
So long as the _same_ features are dropped in the test set as well, this will not result in data leakage.

This takes a bit of time, so will load any cached list from a prior run.

In [None]:
if os.path.exists('to_drop.pkl'):
  print("Vars to drop previously cached...")
  # NOTE(review): the cache is identified by file name only. It is not
  # refreshed when CORR_CUTOFF, the date range or the feature set changes;
  # delete 'to_drop.pkl' to force a recomputation.
  # pickle must only be used on files written by this notebook itself.
  with open('to_drop.pkl', 'rb') as file:
    to_drop = pickle.load(file)
else:
  print("Finding corr vars...")

  # corr() does not modify X_train, so no working copy of the frame is needed
  corr = X_train.corr().abs()
  # Keep only the strict upper triangle so each feature pair is inspected once
  upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

  # A column is dropped when it is too correlated with any earlier column
  to_drop = [col for col in upper.columns if (upper[col] > CORR_CUTOFF).any()]

  with open('to_drop.pkl', "wb") as fp:
    pickle.dump(to_drop, fp)

print(to_drop)

In [None]:
X_train_backup = X_train.copy()
X_train = X_train.drop(columns=to_drop)

X_test_backup = X_test.copy()
X_test = X_test.drop(columns=to_drop)

## Hyperparameter tuning

As per XGBoosting documentation/tutorials, early stopping with random search for hyperparameter tuning must be iterated upon manually, as `RandomizedSearchCV` does not support using a separate validation set within each CV fold.

Source: https://xgboosting.com/xgboost-early-stopping-with-random-search/


The area under the precision-recall curve (AUC-PR) can be used to evaluate the performance, since it considers a range of classification thresholds.
This is better than an ROC AUC metric since there is greater class imbalance (i.e., `True` is more rare).

Source: https://xgboosting.com/evaluate-xgboost-performance-with-precision-recall-curve/


In [None]:
# Code modified from
# https://xgboosting.com/xgboost-early-stopping-with-random-search/

# Define hyperparameter distributions for random search
param_distributions = {
    'learning_rate': ('uniform', 0.01, 0.3),
    'max_depth': ('choice', [2, 3, 4, 5, 6]),
    'subsample': ('uniform', 0.5, 1.0),
    'colsample_bytree': ('uniform', 0.4, 1.0),
    'scale_pos_weight':('choice', [5, 7, 9, 10, 11]),
    'gamma': ('uniform', 0, 0.5),
    'reg_alpha': ('uniform', 0, 1.0)
}

# Define seed again
rng = np.random.default_rng(SEED)

# Function to sample parameters based on their distribution type
def sample_param(distribution):
    """Draw one hyperparameter value from a distribution specification.

    Args:
        distribution (tuple): Either ``('uniform', low, high)`` for a
            continuous draw between ``low`` and ``high``, or
            ``('choice', options)`` for a draw from ``options``.

    Returns:
        A single sampled value, drawn with the seeded generator ``rng``.

    Raises:
        ValueError: If the first element is neither 'uniform' nor 'choice'.
    """
    kind = distribution[0]
    if kind == 'uniform':
        # scipy parameterises uniform as (loc, scale) = (low, high - low)
        low, high = distribution[1], distribution[2]
        return uniform(loc=low, scale=high - low).rvs(random_state=rng)
    if kind == 'choice':
        return rng.choice(distribution[1])
    raise ValueError(f"Unsupported distribution type: {distribution[0]}")

# Manual random search with early stopping.
# Each iteration samples one parameter set and scores it on every expanding
# window fold; the set with the highest mean AUC-PR is kept.
best_params = None
# Start below any attainable score so the first valid candidate is accepted
best_score = -np.inf
best_avg_rounds = 0

for _ in range(N_ITERATIONS):
    test_scores = []
    # Unused in this loop; kept because a later cleanup cell deletes it by name
    best_rounds = []
    optimal_rounds_list = []
    params = {k: sample_param(v) for k, v in param_distributions.items()}

    for train_index, test_index in tscv.split(X_train):
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Hold out the most recent 20% of the fold's training rows as the
        # early-stopping validation set (unshuffled, so it stays chronological)
        X_train_fold, X_val, y_train_fold, y_val = train_test_split(X_train_fold, y_train_fold, test_size=0.2, shuffle=False)

        # Prepare the model
        model = xgb.XGBClassifier(
            n_estimators=N_ESTIMATORS,
            learning_rate=params['learning_rate'],
            max_depth=int(params['max_depth']),  # max_depth should be an int
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            objective='binary:logistic',
            gamma=params['gamma'],
            reg_alpha=params['reg_alpha'],
            scale_pos_weight=params['scale_pos_weight'],
            ## SETTINGS FOR GPU
            seed_per_iteration = True,
            tree_method='hist',
            device='cuda',
            ##
            # Eval metric used for early stopping: AUC-PR
            eval_metric='aucpr',
            random_state=SEED,
            n_jobs=-1,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS # fixed early stopping
        )

        # Fit model on train fold and use validation for early stopping
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_val, y_val)], verbose=False)

        # best_iteration is a 0-based index, so the number of boosting rounds
        # to reuse as n_estimators is one more than it
        optimal_rounds_list.append(model.best_iteration + 1)

        ## Using AUC PR
        # Score the held-out fold with the area under the precision-recall curve
        y_pred_test = model.predict_proba(X_test_fold)[:,1]
        # Fold labels are moved from GPU to host memory for scikit-learn
        y_prec, y_rec, _ = precision_recall_curve(y_test_fold.to_cupy().get(), y_pred_test)
        test_score = auc(y_rec, y_prec)
        test_scores.append(test_score)

    # Compute average score across all folds
    average_score = np.mean(test_scores)
    average_optimal_rounds = np.mean(optimal_rounds_list)

    if average_score > best_score:
        best_score = average_score
        best_params = params
        best_avg_rounds = int(round(average_optimal_rounds)) # Store the integer average

if best_params is None:
    raise RuntimeError(
        "Hyperparameter search produced no usable candidate "
        "(check N_ITERATIONS and the fold scores)."
    )

print("Best Parameters:")
# aligned printing code from stack overflow,
# https://stackoverflow.com/a/54573735/23486987
for key, value in best_params.items():
    print(f'{key:20}{value}')

# Report the score of the winning parameter set, not of the last one sampled
print(f"CV Average: {best_score:.4f}")
print(f"Best Avg Rounds: {best_avg_rounds}")

In [None]:
play_chime()

In [None]:
del X_train_fold, X_val, y_train_fold, y_val
del y_pred_test
del test_scores, best_rounds, optimal_rounds_list
# del n_splits, early_stopping_rounds, n_iterations, test_scores, best_rounds, optimal_rounds_list
del average_score, average_optimal_rounds
del y_prec, y_rec

## Save model

In [None]:
# Reuse a previously saved (unfitted) model definition when one exists;
# otherwise build it from the tuned hyperparameters and save it.
# NOTE(review): when a saved file is found, the parameters tuned in this run
# are not applied -- remove the file to rebuild from the latest search.
if os.path.exists(model_path(MODEL_NAME)):
    print("Importing model from saved files...")
    final_model = joblib.load(model_path(MODEL_NAME))

else:
    print("Creating final model...")

    tuned_settings = dict(
        # Boosting rounds taken from the early-stopping average of the search
        n_estimators=best_avg_rounds,
        random_state=SEED,
        learning_rate=best_params['learning_rate'],
        max_depth=int(best_params['max_depth']),
        subsample=best_params['subsample'],
        colsample_bytree=best_params['colsample_bytree'],
        objective='binary:logistic',
        scale_pos_weight=best_params['scale_pos_weight'],
        gamma=best_params['gamma'],
        reg_alpha=best_params['reg_alpha'],
        ## SETTINGS FOR GPU
        seed_per_iteration=True,
        tree_method='hist',
        device='cuda',
        ##
        n_jobs=-1,
    )
    final_model = xgb.XGBClassifier(**tuned_settings)
    del tuned_settings

    # Do not fit until OOF tuning has been conducted
    joblib.dump(final_model, model_path(MODEL_NAME))

    # Local download
    from google.colab import files
    files.download(model_path(MODEL_NAME))

## OOF Predictions

Out-of-fold predictions will be used to tune smoothing and thresholding parameters. To do this, the model with tuned hyperparameters will predict each sliding window set from before. This reflects the real-world performance of the model as it can fit to more data.

In [None]:
# Out-of-fold probability for every training row, NaN until a fold predicts it
oof_pred = np.full(len(y_train), np.nan)

# Re-use the expanding-window folds: fit on each training window and predict
# the window that follows it
for train_index, test_index in tscv.split(X_train):
  X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
  y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

  # Prepare the model
  # (an unfitted copy with the same hyperparameters, so folds stay independent)
  model = clone(final_model)

  model.fit(X_train_fold, y_train_fold)
  # Probability of the positive class, stored at the fold's row positions
  y_pred_test = model.predict_proba(X_test_fold)[:,1]
  oof_pred[test_index] = y_pred_test

# Isolate the OOF truth values
# because the first training set is not included
# (rows never used as a test window are still NaN and are filtered out)
valid_mask = ~np.isnan(oof_pred)
# NOTE(review): y_train.values may be a GPU array while valid_mask is a NumPy
# array -- confirm this indexing behaves as intended for both array types
y_train_oof = y_train.values[valid_mask]
oof_pred = oof_pred[valid_mask]

del X_train_fold, y_train_fold, X_test_fold, y_test_fold, y_pred_test
del model, valid_mask

### Smoothing & Thresholding

Windowing can help smooth predictions by preventing standalone points that differ from their neighbors (e.g., having a sequence of `True` interrupted by one `False`, or vice-versa, both of which are unlikely in this context due to how weir blockages occur).

By default, a threshold of 0.5 will be selected for categorizing a point as `True` or `False`. However, in this context a model more sensitive to `True` may make final results more accurate.

These measures can be found by finding the window and corresponding threshold that maximizes the F1 score.

In [None]:
# def smooth_window(probs, window_size=3):
#     return pd.Series(probs).rolling(window=window_size, min_periods=1, center=True).mean().values

In [None]:
def f1_cust(input_true, input_pred):
  """Compute the F1 score for label arrays that may be of mixed array types.

  Each input is copied; any copy that is not a NumPy array is converted by
  calling its ``.get()`` method before being passed to ``f1_score``.

    Args:
        input_true (np.ndarray or cp.ndarray): Array of true y values.
        input_pred (np.ndarray or cp.ndarray): Array of predicted y values.

    Returns:
        float: F1 score.
    """
  host_arrays = []
  for labels in (input_true, input_pred):
    labels = labels.copy()
    # Non-NumPy inputs are expected to expose .get() to hand back a NumPy array
    host_arrays.append(labels if isinstance(labels, np.ndarray) else labels.get())

  return f1_score(*host_arrays)

In [None]:
# Grid-search the median-filter window size and decision threshold that
# maximise F1 on the out-of-fold predictions.

# Baseline: no smoothing (window of 1) and the default 0.5 threshold.
# best_window has to be initialised here: it is read after the search, and
# would otherwise be undefined whenever no candidate beats the baseline F1.
best_window = 1
threshold = 0.5

y_bin = (oof_pred >= threshold).astype(np.int32)
best_f1 = f1_cust(y_train_oof, y_bin)

best_threshold = threshold
thresholds = np.linspace(0.01, 0.99, 100)

print(f"Window 1\tCurr best->\tW:1\tT:{best_threshold}\t\tF1:{best_f1:.4}")

# Test the range of window sizes in steps of 2.
# NOTE: scipy.signal.medfilt only accepts odd kernel sizes, so WINDOW_MIN
# (defined elsewhere in the notebook) is assumed to be odd.
for current_window in range(WINDOW_MIN, WINDOW_MAX+1, 2):
    smoothed_preds = medfilt(oof_pred, kernel_size=current_window)

    for threshold in thresholds:
        y_bin = (smoothed_preds >= threshold).astype(np.int32)
        current_f1 = f1_cust(y_train_oof, y_bin)

        # Strict ">" keeps the earliest (smallest window, lowest threshold)
        # combination when several reach the same F1
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_window = current_window
            best_threshold = threshold

    print(f"Window {current_window}\tCurr best->\tW:{best_window}\tT:{best_threshold:.4}\tF1:{best_f1:.4}")

# Re-apply the winning settings to confirm the reported F1
oof_pred_adj = medfilt(oof_pred, kernel_size=best_window)
oof_pred_adj = (oof_pred_adj >= best_threshold).astype(int)

f1_rez = f1_cust(y_train_oof, oof_pred_adj)
print(f"Final results----->\t\tW:{best_window}\tT:{best_threshold:.4f}\tF1:{f1_rez:.4}")

# Remove search temporaries; best_window and best_threshold are kept for later cells
del current_f1, current_window, threshold, f1_rez, y_bin, best_f1

In [None]:
# Signal that the window/threshold search has finished.
# play_chime is defined outside this section -- presumably an audio alert; confirm.
play_chime()

In [None]:
# @title
# # Start w no smoothing
# best_window_size = 1
# # Start w default threshold
# threshold = 0.5

# y_bin = (oof_pred >= threshold).astype(np.int32)
# # y_bin = cp.array(y_bin)
# best_f1 = f1_cust(y_train_oof, y_bin)
# # print(f"No smoothing, 0.5 threshold:\t{best_f1:.4f}")

# best_threshold = threshold
# thresholds = np.linspace(0.01, 0.99, 100)
# # thresholds = np.linspace(0.01, 0.99, 10)

# # Smallest window size
# window_min = 1
# # Largest window size
# window_max = 35
# # window_max = 10

# print(f"Window 1\tCurr best: W:1\tT:{best_threshold}\t\tF1:{best_f1:.4}")

# # Test range of odd window sizes
# for current_window in range(window_min, window_max+1, 2):
#     # print("Testing window =", current_window)
#     smoothed_preds = smooth_window(oof_pred, window_size=current_window)

#     for threshold in thresholds:
#         y_bin = (smoothed_preds >= threshold).astype(np.int32)
#         current_f1 = f1_cust(y_train_oof, y_bin)
#         # print(f"{threshold:.4}\t{current_f1:.5}")

#         if current_f1 > best_f1:
#           best_f1 = current_f1
#           # print(f"New best F1:{best_f1:.4}")
#           best_window = current_window
#           best_threshold = threshold
#     # print(f"Current best F1:{best_f1:.4}")
#     print(f"Window {current_window}\tCurr best: W:{best_window}\tT:{best_threshold:.4}\tF1:{best_f1:.4}")

# oof_pred_adj = smooth_window(oof_pred, window_size=best_window)
# oof_pred_adj = (oof_pred_adj >= best_threshold).astype(int)

# f1_rez = f1_cust(y_train_oof, oof_pred_adj)
# print(f"Window {best_window}, threshold {best_threshold:.4f}:\t{f1_rez:.4f}")

# del current_f1, current_window, threshold, f1_rez, window_min, window_max, y_bin, best_f1

## Applying to test set

In [None]:
# Refit the tuned model on the complete training set, then score the test set
final_model.fit(X_train, y_train)

# Positive-class probabilities for the test rows
final_pred_y = final_model.predict_proba(X_test)[:,1]

# Ground-truth labels pulled back through CuPy for the sklearn metric functions
y_test_conv = y_test.copy().to_cupy().get()


def _print_test_scores(pred_labels):
    """Print F1 / accuracy / precision / recall against y_test_conv, then a rule."""
    score_fns = (
        ("F1", f1_score),
        ("Acc", accuracy_score),
        ("Pre", precision_score),
        ("Rec", recall_score),
    )
    score_lines = [f"{tag}:\t{fn(y_test_conv, pred_labels):.4f}" for tag, fn in score_fns]
    print(*score_lines, "-----------------------------------", sep="\n")


# 1) Raw probabilities, default threshold
print("Base model (0.5 threshold, no smoothing)")
final_pred_y_base = (final_pred_y >= 0.5).astype(np.int32)
_print_test_scores(final_pred_y_base)

# 2) Median-filtered probabilities, default threshold
print("Windowed with", best_window)
final_pred_y_win = (medfilt(final_pred_y, kernel_size=best_window) >= 0.5).astype(np.int32)
_print_test_scores(final_pred_y_win)

# 3) Median-filtered probabilities, tuned threshold
print(f"Windowed & optmized threshold of {best_threshold:.4f}")
final_pred_y_opt = (medfilt(final_pred_y, kernel_size=best_window) >= best_threshold).astype(np.int32)
_print_test_scores(final_pred_y_opt)

# The helper is only needed inside this cell
del _print_test_scores

In [None]:
# Signal that the test-set evaluation has finished.
# play_chime is defined outside this section -- presumably an audio alert; confirm.
play_chime()

## Save results

In [None]:
# p_proba, p_none, p_w, p_t, p_wt = apply_model(X_test, final_model, best_window, best_threshold, return_all=True)

# print(
#     "F1\tAcc\tPre\tRec",
#     report_scores(y_test_conv, p_none, 4),
#     report_scores(y_test_conv, p_w, 4),
#     report_scores(y_test_conv, p_t, 4),
#     report_scores(y_test_conv, p_wt, 4),
#     sep="\n"
# )

# Print one (F1, accuracy, precision, recall) tuple, rounded to 4 places, per
# post-processing variant. With "all", apply_model returns, in order:
# no window / 0.5 threshold, window / 0.5, no window / tuned threshold, window / tuned threshold.
# NOTE: apply_model and report_scores are defined in a later cell; run that cell first.
for item in apply_model(X_test, final_model, best_window, best_threshold, "all"):
  print(report_scores(y_test_conv, item, 4))

In [None]:
# fitted_model_name = "xgb_full_fitted"

fitted_model_file = model_path(FITTED_MODEL_NAME)

# Only write (and download) the fitted model when no saved copy exists yet
if not os.path.exists(fitted_model_file):
    print("Saving final model...")

    joblib.dump(final_model, fitted_model_file)

    # Local download
    from google.colab import files
    files.download(fitted_model_file)

del fitted_model_file

In [None]:
# Display (F1, accuracy, precision, recall), rounded to 4 places, for the
# median-filtered + tuned-threshold predictions ("adj") on the test set.
# NOTE: apply_model and report_scores are defined in the next cell; run that cell first.
report_scores(y_test_conv, apply_model(X_test, final_model, best_window, best_threshold, "adj"), 4)

In [None]:
def apply_model(input_set, input_model, input_window, input_threshold, input_return="adj"):
  """Predict with a fitted classifier and optionally post-process the probabilities.

    Args:
        input_set: Feature rows passed to ``input_model.predict_proba``.
        input_model: Fitted classifier exposing ``predict_proba``; column 1 of
            its output is used as the positive-class probability.
        input_window (int): Kernel size for ``scipy.signal.medfilt`` (must be odd).
        input_threshold (float): Cut-off applied to turn probabilities into 0/1 labels.
        input_return (str): What to return:
            "proba" - the raw positive-class probabilities;
            "adj"   - labels from median-filtered probabilities at ``input_threshold``;
            "all"   - a 4-tuple of label arrays: (no window / 0.5 threshold,
                      window / 0.5, no window / ``input_threshold``,
                      window / ``input_threshold``).

    Returns:
        np.ndarray or tuple of np.ndarray: See ``input_return``.

    Raises:
        ValueError: If ``input_return`` is not "proba", "adj" or "all".
    """
  # Fail before predicting instead of silently returning None
  valid_returns = ("proba", "adj", "all")
  if input_return not in valid_returns:
    raise ValueError(
        f"input_return must be one of {valid_returns}, got {input_return!r}"
    )

  pred_y_proba = input_model.predict_proba(input_set)[:,1]

  if input_return == "proba":
    return pred_y_proba

  # Smooth once; the result is reused for both windowed variants
  pred_y_smooth = medfilt(pred_y_proba, kernel_size=input_window)

  # Best window, best threshold
  pred_y_wt = (pred_y_smooth >= input_threshold).astype(np.int32)

  if input_return == "adj":
    return pred_y_wt

  # No window, default threshold
  pred_y_none = (pred_y_proba >= 0.5).astype(np.int32)

  # Best window, default threshold
  pred_y_w = (pred_y_smooth >= 0.5).astype(np.int32)

  # No window, best threshold
  pred_y_t = (pred_y_proba >= input_threshold).astype(np.int32)

  return pred_y_none, pred_y_w, pred_y_t, pred_y_wt


def report_scores(input_true, input_pred, input_round=None):
  """Return (F1, accuracy, precision, recall) for a set of predictions.

    Args:
        input_true: True labels.
        input_pred: Predicted labels.
        input_round (int, optional): Number of decimal places to round each
            score to. When None, the scores are returned unrounded.

    Returns:
        tuple: ``(f1, accuracy, precision, recall)``.
    """
  metric_fns = (f1_score, accuracy_score, precision_score, recall_score)
  scores = tuple(metric(input_true, input_pred) for metric in metric_fns)

  if input_round is None:
    return scores

  return tuple(round(score, input_round) for score in scores)

In [None]:
## Save to wd

# Indices of test dates
test_dates = y_test.index.to_frame()
test_dates.to_parquet(get_path('results/test_index.parquet', 'outputs'))

# Final binary predictions (median-filtered, tuned threshold)
np.save(get_path('results/y_test_pred.npy', 'outputs'), final_pred_y_opt)
# Raw positive-class probabilities. This previously saved final_pred_y_opt,
# which made the "proba" file a duplicate of the binary predictions.
np.save(get_path('results/y_test_pred_proba.npy', 'outputs'), final_pred_y)

# actual test vals
np.save(get_path('results/y_test_true.npy', 'outputs'), y_test_conv)
# np.save(get_path('results/X_test_true.npy', 'outputs'), X_test)

In [None]:
# Save locally
# Import here: elsewhere in the notebook `files` is only imported inside the
# conditional model-saving branches, so it may be undefined when those are skipped.
from google.colab import files

files.download(get_path('results/test_index.parquet', 'outputs'))

files.download(get_path('results/y_test_pred.npy', 'outputs'))
files.download(get_path('results/y_test_pred_proba.npy', 'outputs'))

files.download(get_path('results/y_test_true.npy', 'outputs'))
# files.download(get_path('results/X_test_true.npy', 'outputs'))

## (draft code)

### Feature importance

In [None]:
# Pair every training column with the importance score the fitted model assigned it
feature_names = X_train.columns.tolist()
feature_importances = final_model.feature_importances_

# Most important features first
feature_importance_df = (
    pd.DataFrame({'feat': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
)

feature_importance_df

# most important features
# print(feature_importance_df.head(25))

In [None]:
# Share of cumulative importance that the "most important" feature set should cover
threshold_importance = 0.95
# Running total of importance down the table; meaningful because
# feature_importance_df was sorted by descending importance when it was built
feature_importance_df['cumulative_imp'] = feature_importance_df['importance'].cumsum()
# Count the rows at or below the threshold, plus one for the feature that crosses it
features_percent = feature_importance_df[feature_importance_df['cumulative_imp'] <= threshold_importance].shape[0] + 1
features_percent

In [None]:
# Most important features: report how many features make up the top
# threshold_importance share, then display those rows (top of the sorted table)
print(round(threshold_importance*100), "% (most important features):", features_percent)
feature_importance_df.head(features_percent)

In [None]:
# feature_importance_df.tail(1)
# Least important features: everything after the first features_percent rows
# of the sorted table, i.e. the remaining share of cumulative importance
print("Remaining", round((1-threshold_importance)*100), "% (least important features):", len(feature_names)-features_percent)
feature_importance_df.tail(len(feature_names)-features_percent)

In [None]:
# Features with 0 importance: print how many rows have an importance of exactly 0,
# then display those rows
print("Features with 0 importance:", len(feature_importance_df[feature_importance_df['importance']==0]))

feature_importance_df[feature_importance_df['importance']==0]

Feature importance by type

In [None]:
# Category name -> regex searched for (case-insensitively) in each feature name
# NOTE(review): 'ro' matches any feature name containing "ro"; confirm it does not
# also tag features that are not runoff-related.
mapping_dict = {
    'soil': '_deep|_shallow',
    'runoff':'ro',
    'rain':'rain',
    'calibration':'_cal'
}

# Add one boolean column per category: True where the feature name matches its pattern
for col_name, pattern in mapping_dict.items():
    feature_importance_df[col_name] = feature_importance_df['feat'].str.contains(pattern, case=False, regex=True)

feature_importance_df

In [None]:
# Flag rows inside the cumulative-importance threshold and rows with no importance at all
feature_importance_df['most'] = feature_importance_df['cumulative_imp'] <= threshold_importance
feature_importance_df['zero'] = feature_importance_df['importance'] == 0

cat_cols = list(mapping_dict)
is_most = feature_importance_df['most']
is_zero = feature_importance_df['zero']

# Row subsets to count, keyed by the column label they get in the summary table
subsets_by_label = {
    'Total features': feature_importance_df,
    'Above threshold': feature_importance_df[is_most],
    'Below threshold': feature_importance_df[~is_most],
    'Zero importance': feature_importance_df[is_zero],
}

# Summing the boolean category columns counts the features in each category
table_feature_cat_importance = pd.DataFrame(
    {label: subset[cat_cols].sum() for label, subset in subsets_by_label.items()}
).fillna(0).astype(int)

table_feature_cat_importance.index.name = 'Category'

del cat_cols, mapping_dict, is_most, is_zero, subsets_by_label

table_feature_cat_importance

### Threshold

In [None]:
# threshold = 0.5
# y_bin = (oof_pred >= threshold).astype(np.int32)
# best_f1 = f1_score(y_train_oof, y_bin)
# print(f"F1 at default 0.5 threshold: {best_f1:.4f}")

# best_threshold = 0
# thresholds = np.linspace(0.01, 0.99, 100)

# for threshold in thresholds:
#     y_bin = (y_proba >= threshold).astype(np.int32)
#     current_f1 = f1_score(y_train_oof, y_bin)

#     if current_f1 > best_f1:
#         best_f1 = current_f1
#         best_threshold = threshold

# print(f"Optimal threshold:\t{best_threshold:.4f}")
# print(f"Threshold F1:\t{best_f1:.4f}")

In [None]:
# @title

# X_itest_mini, X_holdout, y_itest_mini, y_holdout = train_test_split(X_test_inner, y_test_inner, test_size=0.2, random_state=SEED, shuffle=False)

# t_tuner = TunedThresholdClassifierCV(
#     estimator=final_model,
#     scoring=make_scorer(f1_score),
#     cv="prefit",
#     thresholds=100,
#     refit=False,
#     n_jobs=-1
# )

# t_tuner.fit(X_itest_mini, y_itest_mini)

# print(
#     "Threshold:", t_tuner.best_threshold_,
#     "F1:", t_tuner.best_score_
# )

# t_x_pred = t_tuner.predict(X_holdout)

# print(
#     f1_score(y_holdout, t_x_pred),
#     accuracy_score(y_holdout, t_x_pred),
#     precision_score(y_holdout, t_x_pred),
#     recall_score(y_holdout, t_x_pred),
#     sep="\n"
# )

In [None]:
# @title
# # Thresholds
# X_itest_train, X_itest_test, y_itest_train, y_itest_test = train_test_split(X_test_inner, y_test_inner, test_size=0.2, shuffle=False)

# threshold = 0.5
# y_proba = final_model.predict_proba(X_itest_train)[:,1]
# y_bin = (y_proba >= threshold).astype(np.int32)
# best_f1 = f1_score(y_itest_train, y_bin)
# print(f"F1 at default 0.5 threshold: {best_f1:.4f}")

# best_threshold = 0
# thresholds = np.linspace(0.01, 0.99, 100)

# for threshold in thresholds:
#     y_bin = (y_proba >= threshold).astype(np.int32)
#     current_f1 = f1_score(y_itest_train, y_bin)

#     if current_f1 > best_f1:
#         best_f1 = current_f1
#         best_threshold = threshold

# print(f"Optimal threshold:\t{best_threshold:.4f}")
# print(f"Threshold F1:\t{best_f1:.4f}")

# y_proba_test = final_model.predict_proba(X_itest_test)[:,1]
# final_predictions = (y_proba_test >= best_threshold).astype(np.int32)

# print(
#     "\n--- Metrics on Holdout Set (using optimal threshold) ---",
#     f"F1:\t{f1_score(y_itest_test, final_predictions):.4f}",
#     f"Acc:\t{accuracy_score(y_itest_test, final_predictions):.4f}",
#     f"Pre:\t{precision_score(y_itest_test, final_predictions):.4f}",
#     f"Rec:\t{recall_score(y_itest_test, final_predictions):.4f}",
#     sep="\n"
# )
# # print(f"F1:\t{f1_score(y_itest_test, final_predictions):.4f}")
# # print(f"Acc:\t{accuracy_score(y_itest_test, final_predictions):.4f}")
# # print(f"Pre:\t{precision_score(y_itest_test, final_predictions):.4f}")
# # print(f"Rec:\t{recall_score(y_itest_test, final_predictions):.4f}")

# del y_proba, threshold, thresholds, best_f1
# del X_itest_train, y_itest_train

### Smoothing

To smooth the post-processing results, a range of window sizes can be tested to determine the optimal majority-vote window.

This function will run on CPU.

In [None]:
# def apply_smoothing(predictions_array, window_size=5):
#     predictions_series = pd.Series(predictions_array)
#     smoothed = predictions_series.rolling(
#         window=window_size,
#         center=True,
#         min_periods=1
#     ).apply(lambda x: np.bincount(x.astype(int)).argmax(), raw=False).astype(int)
#     return smoothed.values


# # Smallest window size
# window_min = 3
# # Largest window size
# window_max = 25

# # Start w no smoothing
# best_window_size = 1
# best_smoothing_f1 = f1_score(y_itest_test, final_predictions)
# # print("No window F1:\t", best_smoothing_f1)
# print(f"No window F1:\t{best_smoothing_f1:.4f}")

# # Test range of odd window sizes
# for window_size in range(window_min, window_max+1, 2):
#     # print("Testing window =", window_size)
#     smoothed_preds = apply_smoothing(final_predictions, window_size=window_size)
#     f1 = f1_score(y_itest_test, smoothed_preds)
#     # print("Window", window_size, "F1:\t", f1)
#     print(f"Window {window_size} F1:\t{f1:.4f}")

#     if f1 > best_smoothing_f1:
#         # print("F1 improved with smoothing!")
#         best_smoothing_f1 = f1
#         best_window_size = window_size

# print(f"Optimal window size: {best_window_size}")
# print(f"Best F1: {best_smoothing_f1:.4f}")

# del window_min, window_max, f1, window_size

In [None]:
# def apply_smoothing(predictions_array, window_size=5):
#     predictions_series = pd.Series(predictions_array)
#     smoothed = predictions_series.rolling(
#         window=window_size,
#         center=True,
#         min_periods=1
#     ).apply(lambda x: np.bincount(x.astype(int)).argmax(), raw=False).astype(int)
#     return smoothed.values

# # def apply_smoothing(predictions_array, window_size=5):
# #     # 1. Convert NumPy array to CuPy array (if it isn't already on the GPU)
# #     predictions_cp = cp.asarray(predictions_array)

# #     # 2. Convert CuPy array to cuDF Series
# #     predictions_series_gpu = cudf.Series(predictions_cp)

# #     # Define the rolling operation
# #     smoothed_gpu = predictions_series_gpu.rolling(
# #         window=window_size,
# #         center=True,
# #         min_periods=1
# #     )

# #     # 3. Apply the majority vote function using CuPy operations within a custom apply
# #     # We define a custom function using cupy's bincount
# #     def majority_vote_gpu(x):
# #         # Convert to int, count occurrences, and find the index of the max count (argmax)
# #         return cp.bincount(x.astype(cp.int32)).argmax()

# #     # Apply the function
# #     # Note: cuDF's rolling apply works best with simple cupy aggregations.
# #     # For complex custom lambdas like this, performance may vary,
# #     # but it keeps the data on the GPU.
# #     smoothed_gpu = smoothed_gpu.apply(majority_vote_gpu)

# #     # 4. Convert back to CPU NumPy array for final use if needed (optional)
# #     return smoothed_gpu.values.get() # .get() moves data from GPU (CuPy array) back to CPU (NumPy array)
# #     # return smoothed_gpu.values # Keep as CuPy array on GPU

In [None]:
# # Smallest window size
# window_min = 3
# # Largest window size
# window_max = 25

# # Start w no smoothing
# best_window_size = 1
# best_smoothing_f1 = f1_score(y_itest_test, final_predictions)
# # print("No window F1:\t", best_smoothing_f1)
# print(f"No window F1:\t{best_smoothing_f1:.4f}")

# # Test range of odd window sizes
# for window_size in range(window_min, window_max+1, 2):
#     # print("Testing window =", window_size)
#     smoothed_preds = apply_smoothing(final_predictions, window_size=window_size)
#     f1 = f1_score(y_itest_test, smoothed_preds)
#     # print("Window", window_size, "F1:\t", f1)
#     print(f"Window {window_size} F1:\t{f1:.4f}")

#     if f1 > best_smoothing_f1:
#         # print("F1 improved with smoothing!")
#         best_smoothing_f1 = f1
#         best_window_size = window_size

# print(f"Optimal window size: {best_window_size}")
# print(f"Best F1: {best_smoothing_f1:.4f}")

# del window_min, window_max, f1, window_size