# Predicting Performing Arts Attendance with Machine Learning

## - ML Trained and Tuned with Five Seeds

August 8, 2025

---

In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_curve 
from sklearn.metrics import auc, roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

import optuna
from functools import partial

import shap
shap.initjs()  # load JS for visualization in Jupyter

## Load & clean the data

In [None]:
# Execute the data-cleaning notebook in this kernel; the cells below expect it to leave a DataFrame named `df`.
%run 'code_01_data_cleaning.ipynb'

In [None]:
# Quick size check of the cleaned dataset: (rows, columns).
df.shape

In [None]:
# Separate the target column(s) from the predictors.
# df_Y stays a DataFrame (list indexing); individual targets are picked later as df_Y[<name>].
y_vars = ['ATTEND']
df_Y = df[y_vars]
df_X = df.drop(columns=y_vars)

In [None]:
# Preview the first rows of the target frame.
df_Y.head()

In [None]:
# Preview the first rows of the predictor frame.
df_X.head()

## Seeds

In [None]:
import random
import hashlib

def md5_hash(input_string):
    """Return the MD5 digest of a string as hexadecimal text.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    utf8_bytes = input_string.encode('utf-8')
    return hashlib.md5(utf8_bytes).hexdigest()

In [None]:
# Derive a reproducible master seed from a fixed project string, then draw
# the per-run seeds from it.
input_string = "performingartsattendance"
hashed_value = md5_hash(input_string)
print(f"The MD5 hash of '{input_string}' is: {hashed_value}")

# Convert the hexadecimal hash to an integer. A hex digest always parses as
# base 16, so no ValueError handling is needed (and swallowing one would only
# have left `number` undefined for the lines below).
number = int(hashed_value, 16)
print(f"The integer representation of the hash is: {number}")

print(f"Initial seed number: {number}")

# Seed Python's RNG once, immediately before drawing, so the list of seeds
# depends only on `number`.
n_seeds = 5
random.seed(number)
a = 0          # inclusive lower bound for a drawn seed
b = 2**31-1    # inclusive upper bound for a drawn seed
seeds = [random.randint(a, b) for _ in range(n_seeds)]

# Print the list
print("Seed", seeds)

## Set split

In [None]:
def set_split(seed, df_X, df_y, test_size=0.2, stratify=False):
    """Split predictors and target into train and test parts for one seed.

    Args:
        seed: Random state passed to ``train_test_split`` so the split is
            reproducible.
        df_X: Predictor data.
        df_y: Target values aligned with ``df_X``.
        test_size: Fraction of rows held out for testing.
        stratify: When True, stratify the split on ``df_y`` so both parts keep
            the same class proportions. Defaults to False, which reproduces
            the plain random split used so far.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` sequence produced by
        ``train_test_split``.
    """
    return train_test_split(
        df_X,
        df_y,
        test_size=test_size,
        random_state=seed,
        stratify=df_y if stratify else None,
    )

## Preprocessing

In [None]:
# Columns treated as categorical: mode-imputed, then one-hot encoded.
categorical_vars = [
    'REGION', 'STATEFIP', 'METRO',
    'SEX', 'RACE', 'HISPAN', 'VETSTAT', 'YRIMMIG', 'MARST',
    'EMPSTAT', 'CLASSWKR',
    'EDUC99',
    'SCHLCOLL', 'PROFCERT',
    'DIFFHEAR', 'DIFFEYE', 'DIFFREM',
    'DIFFPHYS', 'DIFFMOB', 'DIFFANY',
]
# Every remaining predictor column is treated as numeric (original column order kept).
numerical_vars = df_X.columns[~df_X.columns.isin(categorical_vars)].tolist()

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode with the first level dropped; unseen levels are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Numeric branch: median imputation only, no scaling.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Shared preprocessing object; it is the default `preprocessor` argument of the model functions.
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, categorical_vars),
    ('num', num_pipeline, numerical_vars),
])

## Evaluation

In [None]:
def evaluate_model(seed, clf, X_test_transformed, y_test):
    """Score an already-fitted classifier on preprocessed test data.

    Args:
        seed: Seed identifying the run; copied into the result unchanged.
        clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test_transformed: Test features after preprocessing.
        y_test: True test labels.

    Returns:
        A dict holding the seed, scalar metrics (f1, auc, accuracy, precision,
        recall), the ROC curve arrays, the text classification report, the
        true labels, predicted labels, predicted probabilities, and the
        classifier itself.
    """
    predicted_labels = clf.predict(X_test_transformed)
    # Column 1 of predict_proba is used as the score of the positive class.
    positive_scores = clf.predict_proba(X_test_transformed)[:, 1]

    # Metrics computed from hard label predictions
    f1_value = f1_score(y_test, predicted_labels)
    accuracy_value = accuracy_score(y_test, predicted_labels)
    precision_value = precision_score(y_test, predicted_labels)
    recall_value = recall_score(y_test, predicted_labels)
    text_report = classification_report(y_test, predicted_labels)

    # Metrics computed from the probability scores
    auc_value = roc_auc_score(y_test, positive_scores)
    roc_points = roc_curve(y_test, positive_scores)

    return {
        'seed': seed,
        'f1': f1_value,
        'auc': auc_value,
        'accuracy': accuracy_value,
        'precision': precision_value,
        'recall': recall_value,
        'roc_curve': roc_points,
        'report': text_report,
        'y_test': y_test,
        'y_pred': predicted_labels,
        'y_prob': positive_scores,
        'clf': clf
    }

## CVs

In [None]:
# Fold counts to use for the cross-validated hyperparameter search (one tuning sweep per value).
cvs = [5, 10]

## Trials

In [None]:
# Number of Optuna trials per tuning study; used as the default of the model_*_tuned functions.
n_trials = 200

In [None]:
def compute_ci(x):
    """Summarise a numeric sample with its mean, median and a 95% t-interval.

    Missing values are dropped first. The interval is the two-sided 95%
    Student-t confidence interval for the mean, using the sample standard
    deviation (ddof=1).

    Args:
        x: pandas Series of numeric values.

    Returns:
        A pandas Series with entries ``mean``, ``median``, ``n``,
        ``ci_lower`` and ``ci_upper``. The bounds are NaN when fewer than two
        values remain, and both equal the mean when all values are identical.
    """
    x = x.dropna()
    n = len(x)
    mean = x.mean()
    median = x.median()

    if n > 1:
        se = x.std(ddof=1) / np.sqrt(n)
        if se == 0:
            # Identical values: scipy returns NaN bounds for scale=0, but the
            # interval has simply collapsed onto the mean.
            ci = (mean, mean)
        else:
            ci = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)
    else:
        # A single observation (or none) has no estimable spread.
        ci = (np.nan, np.nan)

    return pd.Series({
        'mean': mean,
        'median': median,
        'n': n,
        'ci_lower': ci[0],
        'ci_upper': ci[1]
    })

<p style="padding: 15px; background-color: skyblue; color: black; font-weight: bold;
          text-align: center; font-size: 170%">Machine Learning</p>

In [None]:
# Accumulates one result dict per (model, cv setting, seed) run across all model sections.
results = []

In [None]:
# Target column of df_Y that the models below predict.
ycol = 'ATTEND'

## Random Forest

In [None]:
# Short key stored in the results and display name used in plot titles for this section.
model_name = 'RF'
model_fullname = 'Random Forest'

In [None]:
def model_rf_default(seed, df_X, df_y, preprocessor=preprocessor):
    """Fit a random forest with default hyperparameters for one seed and score it.

    Args:
        seed: Controls both the train/test split and the forest's random state.
        df_X: Predictor data.
        df_y: Target labels.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # Learn the preprocessing from the training rows only, then reuse it on the test rows.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Balanced sample weights compensate for unequal class frequencies.
    forest = RandomForestClassifier(random_state=seed)
    forest.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, forest, test_features, test_y)

In [None]:
# Default (untuned) random forest: one run per seed, tagged with cv=0.
results += [
    {
        'y': ycol,
        'model': model_name,
        'cv': 0,
        **model_rf_default(run_seed, df_X, df_Y[ycol]),
    }
    for run_seed in seeds
]

In [None]:
def rf_objective(trial, seed, cv, X_train_transformed, y_train):
    """Optuna objective: mean cross-validated F1 of a random forest.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        seed: Random state for both the forest and the fold shuffling.
        cv: Number of stratified folds.
        X_train_transformed: Preprocessed training features (row-indexable).
        y_train: Training labels as a pandas Series.

    Returns:
        The mean F1 score over the validation folds.
    """
    # Keep the suggestion order fixed: it determines how the sampler's random stream is consumed.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 64, log=True),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 50),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None, 0.2, 0.5, 0.8]),
    }
    model = RandomForestClassifier(n_jobs=-1, random_state=seed, **sampled)

    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train_transformed, y_train):
        fit_labels = y_train.iloc[fit_rows]

        # Class-balanced weights are recomputed from each training fold's labels.
        model.fit(
            X_train_transformed[fit_rows],
            fit_labels,
            sample_weight=compute_sample_weight(class_weight='balanced', y=fit_labels),
        )

        holdout_pred = model.predict(X_train_transformed[holdout_rows])
        fold_scores.append(f1_score(y_train.iloc[holdout_rows], holdout_pred))

    return np.mean(fold_scores)

In [None]:
def model_rf_tuned(seed, cv, df_X, df_y, n_trials=n_trials, preprocessor=preprocessor):
    """Tune a random forest with Optuna for one seed, refit it, and score it.

    Args:
        seed: Controls the split, the TPE sampler, the CV folds and the forest.
        cv: Number of folds used inside the tuning objective.
        df_X: Predictor data.
        df_y: Target labels.
        n_trials: Number of Optuna trials to run.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # NOTE(review): preprocessing is fitted on the whole training split before the
    # CV folds are drawn, so imputation/encoding statistics are shared across folds.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Search hyperparameters by maximising the cross-validated F1.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    objective = partial(
        rf_objective,
        seed=seed,
        cv=cv,
        X_train_transformed=train_features,
        y_train=train_y,
    )
    study.optimize(objective, n_trials=n_trials)

    # Refit on the full training split using only the sampled best parameters.
    tuned_forest = RandomForestClassifier(**study.best_params, random_state=seed)
    tuned_forest.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, tuned_forest, test_features, test_y)

In [None]:
# Tuned random forest: repeat the seed sweep for every fold count in `cvs`.
for cv in cvs:
    results += [
        {
            'y': ycol,
            'model': model_name,
            'cv': cv,
            **model_rf_tuned(run_seed, cv, df_X, df_Y[ycol]),
        }
        for run_seed in seeds
    ]

### Results

In [None]:
# Tabulate every run collected so far.
results_df = pd.DataFrame(results)
# Keep only the current model's rows; copied because a label column is added to it for plotting.
plotdf = results_df.loc[results_df['model'] == model_name].copy()
# Show the per-run metrics without the per-sample and ROC columns (plotdf itself is unchanged).
plotdf.drop(['y_test', 'y_pred', 'y_prob', 'roc_curve'], axis=1)

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test F1 across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'RF'].groupby('cv')['f1'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='f1').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test AUC across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'RF'].groupby('cv')['auc'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='auc').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test accuracy across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'RF'].groupby('cv')['accuracy'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='accuracy').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test precision across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'RF'].groupby('cv')['precision'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='precision').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test recall across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'RF'].groupby('cv')['recall'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='recall').reset_index()

In [None]:
# Readable x-axis label for each cv setting (cv == 0 marks the untuned model)
plotdf['cv_label'] = plotdf['cv'].apply(
    lambda folds: 'Default' if folds == 0 else f'Tuned cv={int(folds)}'
)

# Black-on-white box styling; the later boxplot cells reuse this dict
PROPS = dict(
    boxprops=dict(facecolor='white', edgecolor='black'),
    medianprops=dict(color='black'),
    whiskerprops=dict(color='black'),
    capprops=dict(color='black'),
)

# Side-by-side boxplots: F1 on the left, AUC on the right, both on a fixed 0-1 scale
fig, axs = plt.subplots(1, 2, figsize=(8, 5))
for ax, metric, axis_label in zip(axs, ('f1', 'auc'), ('F1 Score', 'AUC')):
    sns.boxplot(x='cv_label', y=metric, data=plotdf, linewidth=1, ax=ax, **PROPS)
    ax.set_ylabel(axis_label)
    ax.set_xlabel("Model Type")
    ax.set_ylim(0, 1)
    ax.grid(True, axis='y')

plt.suptitle(f"{ycol} - {model_fullname}: Evaluation metrics with {n_seeds} seeds")
plt.tight_layout()
plt.show()

## Gradient Boosting

In [None]:
# Short key stored in the results and display name used in plot titles for this section.
model_name = 'GB'
model_fullname = 'Gradient Boosting'

In [None]:
def model_gb_default(seed, df_X, df_y, preprocessor=preprocessor):
    """Fit a gradient boosting classifier with default hyperparameters for one seed and score it.

    Args:
        seed: Controls both the train/test split and the model's random state.
        df_X: Predictor data.
        df_y: Target labels.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # Learn the preprocessing from the training rows only, then reuse it on the test rows.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Balanced sample weights compensate for unequal class frequencies.
    booster = GradientBoostingClassifier(random_state=seed)
    booster.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, booster, test_features, test_y)

In [None]:
# Default (untuned) gradient boosting: one run per seed, tagged with cv=0.
results += [
    {
        'y': ycol,
        'model': model_name,
        'cv': 0,
        **model_gb_default(run_seed, df_X, df_Y[ycol]),
    }
    for run_seed in seeds
]

In [None]:
def gb_objective(trial, seed, cv, X_train_transformed, y_train):
    """Optuna objective: mean cross-validated F1 of a gradient boosting classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        seed: Random state for both the model and the fold shuffling.
        cv: Number of stratified folds.
        X_train_transformed: Preprocessed training features (row-indexable).
        y_train: Training labels as a pandas Series.

    Returns:
        The mean F1 score over the validation folds.
    """
    # Keep the suggestion order fixed: it determines how the sampler's random stream is consumed.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 50),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0, step=0.05),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None, 0.2, 0.5, 0.8]),
    }
    model = GradientBoostingClassifier(random_state=seed, **sampled)

    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train_transformed, y_train):
        fit_labels = y_train.iloc[fit_rows]

        # Class-balanced weights are recomputed from each training fold's labels.
        model.fit(
            X_train_transformed[fit_rows],
            fit_labels,
            sample_weight=compute_sample_weight(class_weight='balanced', y=fit_labels),
        )

        holdout_pred = model.predict(X_train_transformed[holdout_rows])
        fold_scores.append(f1_score(y_train.iloc[holdout_rows], holdout_pred))

    return np.mean(fold_scores)

In [None]:
def model_gb_tuned(seed, cv, df_X, df_y, n_trials=n_trials, preprocessor=preprocessor):
    """Tune a gradient boosting classifier with Optuna for one seed, refit it, and score it.

    Args:
        seed: Controls the split, the TPE sampler, the CV folds and the model.
        cv: Number of folds used inside the tuning objective.
        df_X: Predictor data.
        df_y: Target labels.
        n_trials: Number of Optuna trials to run.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # NOTE(review): preprocessing is fitted on the whole training split before the
    # CV folds are drawn, so imputation/encoding statistics are shared across folds.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Search hyperparameters by maximising the cross-validated F1.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    objective = partial(
        gb_objective,
        seed=seed,
        cv=cv,
        X_train_transformed=train_features,
        y_train=train_y,
    )
    study.optimize(objective, n_trials=n_trials)

    # Refit on the full training split using only the sampled best parameters.
    tuned_booster = GradientBoostingClassifier(**study.best_params, random_state=seed)
    tuned_booster.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, tuned_booster, test_features, test_y)

In [None]:
# Tuned gradient boosting: repeat the seed sweep for every fold count in `cvs`.
for cv in cvs:
    results += [
        {
            'y': ycol,
            'model': model_name,
            'cv': cv,
            **model_gb_tuned(run_seed, cv, df_X, df_Y[ycol]),
        }
        for run_seed in seeds
    ]

### Results

In [None]:
# Tabulate every run collected so far.
results_df = pd.DataFrame(results)
# Keep only the current model's rows; copied because a label column is added to it for plotting.
plotdf = results_df.loc[results_df['model'] == model_name].copy()
# Show the per-run metrics without the per-sample and ROC columns (plotdf itself is unchanged).
plotdf.drop(['y_test', 'y_pred', 'y_prob', 'roc_curve'], axis=1)

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test F1 across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'GB'].groupby('cv')['f1'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='f1').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test AUC across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'GB'].groupby('cv')['auc'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='auc').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test accuracy across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'GB'].groupby('cv')['accuracy'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='accuracy').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test precision across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'GB'].groupby('cv')['precision'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='precision').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test recall across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'GB'].groupby('cv')['recall'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='recall').reset_index()

In [None]:
# Readable x-axis label for each cv setting (cv == 0 marks the untuned model)
plotdf['cv_label'] = plotdf['cv'].apply(
    lambda folds: 'Default' if folds == 0 else f'Tuned cv={int(folds)}'
)

# Side-by-side boxplots: F1 on the left, AUC on the right, both on a fixed 0-1 scale
fig, axs = plt.subplots(1, 2, figsize=(8, 5))
for ax, metric, axis_label in zip(axs, ('f1', 'auc'), ('F1 Score', 'AUC')):
    sns.boxplot(x='cv_label', y=metric, data=plotdf, linewidth=1, ax=ax, **PROPS)
    ax.set_ylabel(axis_label)
    ax.set_xlabel("Model Type")
    ax.set_ylim(0, 1)
    ax.grid(True, axis='y')

plt.suptitle(f"{ycol} - {model_fullname}: Evaluation metrics with {n_seeds} seeds")
plt.tight_layout()
plt.show()

## XGBoost

In [None]:
# Short key stored in the results and display name used in plot titles for this section.
model_name = 'XGB'
model_fullname = 'XGBoost'

In [None]:
def model_xgb_default(seed, df_X, df_y, preprocessor=preprocessor):
    """Fit an XGBoost classifier with default hyperparameters for one seed and score it.

    Args:
        seed: Controls both the train/test split and the model's random state.
        df_X: Predictor data.
        df_y: Target labels.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # Learn the preprocessing from the training rows only, then reuse it on the test rows.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Balanced sample weights compensate for unequal class frequencies.
    booster = XGBClassifier(random_state=seed)
    booster.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, booster, test_features, test_y)

In [None]:
# Default (untuned) XGBoost: one run per seed, tagged with cv=0.
results += [
    {
        'y': ycol,
        'model': model_name,
        'cv': 0,
        **model_xgb_default(run_seed, df_X, df_Y[ycol]),
    }
    for run_seed in seeds
]

In [None]:
def xgb_objective(trial, seed, cv, X_train_transformed, y_train):
    """Optuna objective: mean cross-validated F1 of an XGBoost classifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        seed: Random state for both the model and the fold shuffling.
        cv: Number of stratified folds.
        X_train_transformed: Preprocessed training features (row-indexable).
        y_train: Training labels as a pandas Series.

    Returns:
        The mean F1 score over the validation folds.
    """
    # Keep the suggestion order fixed: it determines how the sampler's random stream is consumed.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 2, 16),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
    }
    # Settings that are fixed for every trial rather than searched.
    model = XGBClassifier(eval_metric="logloss", n_jobs=-1, random_state=seed, **sampled)

    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X_train_transformed, y_train):
        fit_labels = y_train.iloc[fit_rows]

        # Class-balanced weights are recomputed from each training fold's labels.
        model.fit(
            X_train_transformed[fit_rows],
            fit_labels,
            sample_weight=compute_sample_weight(class_weight='balanced', y=fit_labels),
        )

        holdout_pred = model.predict(X_train_transformed[holdout_rows])
        fold_scores.append(f1_score(y_train.iloc[holdout_rows], holdout_pred))

    return np.mean(fold_scores)

In [None]:
def model_xgb_tuned(seed, cv, df_X, df_y, n_trials=n_trials, preprocessor=preprocessor):
    """Tune an XGBoost classifier with Optuna for one seed, refit it, and score it.

    Args:
        seed: Controls the split, the TPE sampler, the CV folds and the model.
        cv: Number of folds used inside the tuning objective.
        df_X: Predictor data.
        df_y: Target labels.
        n_trials: Number of Optuna trials to run.
        preprocessor: Transformer fitted on the training split; the object
            passed in is fitted in place.

    Returns:
        The result dict produced by ``evaluate_model`` on the test split.
    """
    train_X, test_X, train_y, test_y = set_split(seed, df_X, df_y)

    # NOTE(review): preprocessing is fitted on the whole training split before the
    # CV folds are drawn, so imputation/encoding statistics are shared across folds.
    train_features = preprocessor.fit_transform(train_X)
    test_features = preprocessor.transform(test_X)

    # Search hyperparameters by maximising the cross-validated F1.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    objective = partial(
        xgb_objective,
        seed=seed,
        cv=cv,
        X_train_transformed=train_features,
        y_train=train_y,
    )
    study.optimize(objective, n_trials=n_trials)

    # Refit on the full training split using only the sampled best parameters.
    tuned_booster = XGBClassifier(**study.best_params, random_state=seed)
    tuned_booster.fit(
        train_features,
        train_y,
        sample_weight=compute_sample_weight(class_weight='balanced', y=train_y),
    )

    return evaluate_model(seed, tuned_booster, test_features, test_y)

In [None]:
# Tuned XGBoost: repeat the seed sweep for every fold count in `cvs`.
for cv in cvs:
    results += [
        {
            'y': ycol,
            'model': model_name,
            'cv': cv,
            **model_xgb_tuned(run_seed, cv, df_X, df_Y[ycol]),
        }
        for run_seed in seeds
    ]

### Results

In [None]:
# Tabulate every run collected so far.
results_df = pd.DataFrame(results)
# Keep only the current model's rows; copied because a label column is added to it for plotting.
plotdf = results_df.loc[results_df['model'] == model_name].copy()
# Show the per-run metrics without the per-sample and ROC columns (plotdf itself is unchanged).
plotdf.drop(['y_test', 'y_pred', 'y_prob', 'roc_curve'], axis=1)

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test F1 across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'XGB'].groupby('cv')['f1'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='f1').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test AUC across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'XGB'].groupby('cv')['auc'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='auc').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test accuracy across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'XGB'].groupby('cv')['accuracy'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='accuracy').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test precision across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'XGB'].groupby('cv')['precision'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='precision').reset_index()

In [None]:
# Mean, median, n and 95% CI (via compute_ci) of test recall across seeds, one row per cv setting (0 = default model).
results_df[results_df['model'] == 'XGB'].groupby('cv')['recall'].apply(compute_ci).reset_index().pivot(
    index='cv', columns='level_1', values='recall').reset_index()

In [None]:
# Readable x-axis label for each cv setting (cv == 0 marks the untuned model)
plotdf['cv_label'] = plotdf['cv'].apply(
    lambda folds: 'Default' if folds == 0 else f'Tuned cv={int(folds)}'
)

# Side-by-side boxplots: F1 on the left, AUC on the right, both on a fixed 0-1 scale
fig, axs = plt.subplots(1, 2, figsize=(8, 5))
for ax, metric, axis_label in zip(axs, ('f1', 'auc'), ('F1 Score', 'AUC')):
    sns.boxplot(x='cv_label', y=metric, data=plotdf, linewidth=1, ax=ax, **PROPS)
    ax.set_ylabel(axis_label)
    ax.set_xlabel("Model Type")
    ax.set_ylim(0, 1)
    ax.grid(True, axis='y')

plt.suptitle(f"{ycol} - {model_fullname}: Evaluation metrics with {n_seeds} seeds")
plt.tight_layout()
plt.show()

## Summary

In [None]:
# Working copy of the collected results for the summary plots below.
# NOTE(review): this rebinds `df`, the name the earlier cells use for the cleaned dataset;
# re-running those cells after this one would operate on the results table instead.
df = results_df.copy()

In [None]:
# Combined "model | cv=<k>" label that serves as the x-axis category
df['model_cv'] = df['model'] + " | cv=" + df['cv'].astype(str)

# Fix the category order: models in the preferred order, cv settings ascending within each
model_order = ['RF', 'GB', 'XGB']
cv_order = sorted(df['cv'].unique(), key=int)
ordered_model_cv = [
    f"{name} | cv={folds}"
    for name in model_order
    for folds in cv_order
]

# F1 boxplot across all model/cv combinations on a fixed 0-1 scale
plt.figure(figsize=(10, 6))
sns.boxplot(x='model_cv', y='f1', data=df, linewidth=1, **PROPS, order=ordered_model_cv)

plt.title(f"{ycol} - F1 Score by Model Type")
plt.xlabel("Model and CV Setting")
plt.ylabel("F1 Score")
plt.ylim(0, 1)
plt.xticks(rotation=0, ha='center')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# AUC boxplot across all model/cv combinations, same ordering and 0-1 scale as the F1 plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='model_cv', y='auc', data=df, linewidth=1, **PROPS, order=ordered_model_cv)

plt.title(f"{ycol} - AUC by Model Type")
plt.xlabel("Model and CV Setting")
plt.ylabel("AUC")
plt.ylim(0, 1)
plt.xticks(rotation=0, ha='center')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

## Save models

In [None]:
# for mdl in results:
#     joblib.dump(mdl['clf'], f"models/{mdl['y']}_{mdl['model']}_cv{mdl['cv']:.0f}_seed{mdl['seed']}.pkl")