# Stage II length-binned gradient boosting classifier

With the stage two dataset complete, all that remains is to construct a second set of classifiers.

## 1. Run setup

In [1]:
# Change working directory to the parent of the notebook's directory so we
# can import as we would from main.py. The directory the kernel started in is
# remembered the first time this cell runs, so re-running the cell in the same
# kernel lands in the same place instead of climbing one more level each time.
import os

if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.dirname(NOTEBOOK_START_DIR))
print(f'Working directory: {os.getcwd()}')

import h5py
import pickle
import pandas as pd

from scipy.stats import uniform, randint
from sklearn.metrics import make_scorer

import configuration as config
import functions.notebook_helper as helper_funcs
import functions.notebook_plotting as plot_funcs
import functions.length_binned_xgboost as xgb_funcs

Working directory: /mnt/arkk/llm_detector/classifier


Set some run parameters and filepaths:

In [2]:
# Name of the dataset this notebook works with
dataset_name = 'falcon-7b_scores_v2_10-300_words_stage_I'

# Path to the HDF5 input data.
# NOTE(review): dataset_name already ends in '_stage_I', so this resolves to
# '<DATA_PATH>/..._stage_I_stage_II.h5' - confirm that this is the file name
# the stage II dataset was actually written under.
input_file = f'{config.DATA_PATH}/{dataset_name}_stage_II.h5'

# Fold count for cross-validation
cv_folds = 7

# How many parameter sets to sample during hyperparameter optimization
hyperparameter_iterations = 100

# Pickle file locations for saving and loading results; the fold and
# iteration counts are part of the names
cross_validation_results_filename = (
    f'{config.DATA_PATH}/stage_two_cross_validation_results_{cv_folds}_folds.pkl'
)
hyperparameter_optimization_results_filename = (
    f'{config.DATA_PATH}/stage_two_hyperparameter_optimization_results_{hyperparameter_iterations}_iterations.pkl'
)

# Metrics to draw plots for; each one is also a column of the results dictionary
plots = ['Accuracy (%)', 'False positive rate', 'False negative rate', 'Binary cross-entropy']

# Dictionary to collect testing/experimentation results: the bookkeeping
# columns followed by the metric columns, each starting as an empty list
parsed_results = {
    column: [] for column in ['Fold', 'Condition', 'Optimized', *plots]
}

# Wrap each evaluation metric helper in a scikit-learn scorer, keyed by name
scoring_funcs = {
    scorer_name: make_scorer(metric_function)
    for scorer_name, metric_function in (
        ('negated_binary_cross_entropy', helper_funcs.negated_binary_cross_entropy),
        ('binary_cross_entropy', helper_funcs.binary_cross_entropy),
        ('accuracy', helper_funcs.percent_accuracy),
        ('false_positive_rate', helper_funcs.false_positive_rate),
        ('false_negative_rate', helper_funcs.false_negative_rate),
    )
}

OK, I think we are ready. Let's do the cross-validation.

## 2. Stage II classifiers: baseline performance
### 2.1. Cross-validation

In [3]:
%%time

# Do the cross-validation. The function is handed a fresh, empty results
# dictionary with the same columns as parsed_results, rather than
# parsed_results itself, so that re-running this cell starts from a clean
# slate instead of feeding the previous run's results back in.
# NOTE(review): assumes cross_validate_bins accumulates into the dictionary
# it is given and returns a dictionary with the same columns - confirm.
parsed_results = xgb_funcs.cross_validate_bins(
    input_file = input_file,
    parsed_results = {column: [] for column in parsed_results},
    scoring_funcs = scoring_funcs,
    cv_folds = cv_folds,
    shuffle_control = False
)

# Save the result
with open(cross_validation_results_filename, 'wb') as result_output_file:
    pickle.dump(parsed_results, result_output_file, protocol = pickle.HIGHEST_PROTOCOL)

# Plot the results
plot_funcs.plot_two_factor_cross_validation(plots, parsed_results).show()

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/mnt/arkk/llm_detector/classifier/data/falcon-7b_scores_v2_10-300_words_stage_I_stage_II.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

### 2.2. Cross-validation: shuffled control

In [4]:
%%time

# Build a separate, empty results dictionary for the control run so that the
# baseline results are not over-written
parsed_control_results = {
    column: []
    for column in (
        'Fold',
        'Condition',
        'Optimized',
        'Accuracy (%)',
        'False positive rate',
        'False negative rate',
        'Binary cross-entropy',
    )
}

# Repeat the cross-validation, this time with the shuffle control switched on
parsed_control_results = xgb_funcs.cross_validate_bins(
    shuffle_control = True,
    cv_folds = cv_folds,
    scoring_funcs = scoring_funcs,
    parsed_results = parsed_control_results,
    input_file = input_file
)

# Draw and show the control results
plot_funcs.plot_two_factor_cross_validation(
    plots,
    parsed_control_results
).show()

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/mnt/arkk/llm_detector/classifier/data/falcon-7b_scores_v2_10-300_words_stage_I_stage_II.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

## 3. Stage II classifiers: hyperparameter tuning
### 3.1. Random search with cross-validation

In [5]:
# Sampling distributions for the random hyperparameter search.
# scipy's uniform(loc, scale) is uniform on [loc, loc + scale] and
# randint(low, high) draws integers from low up to, but not including, high.
parameter_distributions = dict(
    learning_rate = uniform(loc = 0.0001, scale = 0.9999),
    gamma = uniform(loc = 0.0, scale = 100.0),
    max_depth = randint(1, 100),
    min_child_weight = uniform(loc = 0.0001, scale = 0.9999),
    subsample = uniform(loc = 0.5, scale = 0.5),
    reg_alpha = uniform(loc = 0.0, scale = 1.0),
    reg_lambda = uniform(loc = 0.0, scale = 1.0),
    n_estimators = randint(1, 100),
    num_parallel_tree = randint(1, 50),
)

In [6]:
%%time

# Run the hyperparameter optimization: sample hyperparameter_iterations
# parameter sets from parameter_distributions, using cv_folds-fold
# cross-validation and the scorers defined during setup
results = xgb_funcs.hyperparameter_optimize_bins(
    hyperparameter_iterations = hyperparameter_iterations,
    cv_folds = cv_folds,
    scoring_funcs = scoring_funcs,
    parameter_distributions = parameter_distributions,
    input_file = input_file
)

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/mnt/arkk/llm_detector/classifier/data/falcon-7b_scores_v2_10-300_words_stage_I_stage_II.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [7]:
# Pickle the raw optimization result to disk with the highest available protocol
with open(hyperparameter_optimization_results_filename, 'wb') as pickle_file:
    pickle.dump(
        results,
        pickle_file,
        protocol = pickle.HIGHEST_PROTOCOL
    )

NameError: name 'results' is not defined

### 3.2. Hyperparameter optimization results

In [None]:
# Parse the raw optimization results into two objects: 'winners', which the
# confusion matrix plot uses, and 'cv_results', which feeds the tuning plot
# and the comparison to baseline. Their exact structure is defined by
# parse_hyperparameter_tuning_results and is not visible from this notebook.
winners, cv_results = xgb_funcs.parse_hyperparameter_tuning_results(results)

In [None]:
# Plot the parsed hyperparameter tuning cross-validation results and show
# the returned figure
plot_funcs.plot_hyperparameter_tuning(cv_results).show()

### 3.3. Winning models comparison to baseline

In [None]:
# Go through the hyperparameter optimization results, format and
# add them to the results from the baseline cross-validation.
# NOTE(review): parsed_results is both an input and the output here, so
# running this cell more than once presumably adds the winners again -
# re-run the baseline cross-validation cell first if this needs repeating.
parsed_results = xgb_funcs.add_winners_to_parsed_results(
    cv_results, 
    parsed_results, 
    cv_folds
)

# Re-draw the cross-validation plots, now with the winners included
plot_funcs.plot_two_factor_cross_validation(plots, parsed_results).show()

In [None]:
# Plot confusion matrix for hold-out test data in each bin, using the
# winners from the hyperparameter optimization and the data in input_file
plot_funcs.plot_testing_confusion_matrices(winners, input_file).show()

### 3.4. Winning models evaluation on hold-out test data