In [1]:
import sys
import numpy as np
import pandas as pd
import json
import os

In [2]:
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
# Make the parent directory importable so the project-local `utils` module can be found
# (presumably that is where `utils` lives — confirm against the repository layout).
# The membership check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from utils import (
    load_best_params, read_features, read_targets, metrics_c, save_best_params, print_info_features,
    print_info_targets, scale_all, reduce_all, add_all_feature_combos, encode_labels, run_hp_all,
    get_column_widths, print_best_params, get_execution_time,
)

## Read Features and Targets

In [4]:
# Absolute location of the chronology-prediction data folder. The relative part is resolved
# against the current working directory, which os.path.abspath does on its own.
path = os.path.abspath("../../data/chronology_prediction")

In [5]:
# Load the train/test feature sets (tfidf, bert, cannyhog, resnet, vit per the log below)
# and the "HistoricalPeriod" target column from the data folder.
X = read_features(path)
y = read_targets(path, ["HistoricalPeriod"])

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit
Loaded y_train
Loaded y_test


In [6]:
# Sanity check: type and shape of every loaded feature set, for both splits.
print_info_features(X)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [7]:
# Sanity check: type, shape and column names of the train/test target frames.
print_info_targets(y)

y = {
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape (1719, 1)
		columns ['HistoricalPeriod'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape (191, 1)
		columns ['HistoricalPeriod'],
}


In [8]:
# Class distribution of the training labels. The counts below are strongly imbalanced
# (1025 Classical vs. 11 Orientalizing), which is worth keeping in mind when reading accuracy.
y["train"]["HistoricalPeriod"].value_counts()

HistoricalPeriod
Classical        1025
Archaic           620
Hellenistic        63
Orientalizing      11
Name: count, dtype: int64

## Extra Preprocessing

### Scale Features

In [9]:
# Scale every feature set with the project helper.
# NOTE(review): `X` is overwritten by its own transform, so the unscaled features can only be
# recovered by re-running the loading cell; re-run from there rather than this cell in isolation.
X = scale_all(X)

### Reduce Features

Use `n_components=0.95` to keep just enough components to preserve 95% of the variance in the data.

In [10]:
# Reduced counterpart of the scaled feature sets, kept under its own name so `X` stays full-width.
# NOTE(review): in the shapes printed further down, tfidf still has 300 columns in `X_reduced`
# while the other sets shrink — confirm that tfidf is intentionally left unreduced.
X_reduced = reduce_all(X, n_components=0.95)

### Combine & Re-scale Text & Image Feature Sets

In [11]:
# Extend both dictionaries with the text + image pairings (e.g. "tfidf + resnet") that show up
# in the shape listings below; `scale=True` presumably re-scales each combined set — confirm in utils.
# NOTE(review): both names are overwritten by their own transform; whether re-running this cell
# is safe depends on the helper being idempotent — TODO confirm.
X = add_all_feature_combos(X, scale=True)

X_reduced = add_all_feature_combos(X_reduced, scale=True)

In [12]:
# Models that are tuned on the reduced feature sets (`X_reduced`); the tuning loop uses the
# full-width `X` for every model not listed here.
models_reduced_features = ["KNN", "XGBoost", "LightGBM"]

### Final Feature Sets

In [13]:
# Final full-width feature sets, including the combined text + image entries.
print_info_features(X)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		tfidf + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3217), 
		tfidf + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2348), 
		tfidf + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1068), 
		bert + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3685), 
		bert + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2816), 
		bert + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1536), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert

In [14]:
# Final reduced feature sets, including the combined text + image entries.
print_info_features(X_reduced)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 225), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 190), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 262), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 177), 
		tfidf + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 490), 
		tfidf + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 562), 
		tfidf + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 477), 
		bert + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 415), 
		bert + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 487), 
		bert + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 402), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<c

### Encode Target Labels

In [15]:
# Encode the period names as integers (mapping printed below). Later cells read the result
# from the "HistoricalPeriod_encoded" column of `y`.
# NOTE(review): `target_enc` and `le` are returned by the helper but not referenced in the
# visible cells — presumably kept for decoding predictions later; confirm.
y, target_enc, le = encode_labels(y, "HistoricalPeriod")

0 --> Archaic
1 --> Classical
2 --> Hellenistic
3 --> Orientalizing


## Initialize 10-Fold Cross Validation

*Stratified K-Fold:*
- preserves label distribution
- maintains proportion of classes in each fold


In [16]:
# Materialise the ten stratified splits once so every model and feature set is scored on
# exactly the same train/validation indices. Only row positions are needed for splitting,
# so a plain index range stands in for the feature matrix.
encoded_labels = y["train"]["HistoricalPeriod_encoded"]
row_positions = np.arange(len(y["train"]))

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
folds = [(train_idx, val_idx) for train_idx, val_idx in skf.split(row_positions, encoded_labels)]

## Define Parameter Grid per Model

In [17]:
# Search space per model: model name -> (estimator class, {parameter: candidate values}).
# Parameters with a single candidate are fixed settings rather than tuned dimensions.
model_param_grids = {
    "LogisticRegression": (
        LogisticRegression, {
            "C": [0.01, 0.1, 1],  # inverse regularisation strength
            # NOTE(review): with scikit-learn's default multiclass handling, liblinear fits
            # one-vs-rest while lbfgs fits a multinomial model, so this axis also switches
            # the multiclass scheme — confirm that is intended.
            "solver": ["liblinear", "lbfgs"],
            "max_iter": [5000],
            "class_weight": [None, "balanced"],  # handle class imbalance
            "random_state": [42]
        }
    ),
    "KNN": (
        KNeighborsClassifier, {
            "n_neighbors": [3, 5, 7, 9, 11],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan", "cosine"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier, {
            "n_estimators": [100],  # number of trees
            "max_depth": [None, 10],  # limit depth to control overfitting
            "min_samples_split": [2],  # minimum samples to split an internal node
            "min_samples_leaf": [1, 2, 4],  # minimum samples at a leaf node
            "max_features": ["sqrt"],  # number of features to consider at split
            "class_weight": [None, "balanced"],  # handle class imbalance
            "n_jobs": [-1],
            "random_state": [42]
        }
    ),
    "XGBoost": (
        XGBClassifier,
        {
            "n_estimators": [100, 300],  # Number of boosting rounds
            "max_depth": [3, 6],  # Tree depth: controls model complexity
            "learning_rate": [0.05, 0.1],  # Smaller steps usually need more boosting rounds
            "subsample": [0.8, 1.0],  # Row sampling
            "colsample_bytree": [0.8, 1.0],  # Column sampling per tree
            "n_jobs": [-1],
            "random_state": [42],
        }
    ),
    "LightGBM": (
        LGBMClassifier,
        {
            "n_estimators": [100, 300],  # Number of boosting rounds
            "max_depth": [-1, 10],  # Maximum depth of tree (-1 = no limit)
            "learning_rate": [0.05, 0.1],  # Smaller steps usually need more boosting rounds
            "num_leaves": [31, 63],  # Number of leaves in one tree
            "subsample": [0.6, 0.8],  # Row sampling ratio (bagging)
            # LightGBM only applies `subsample` when the bagging frequency is > 0, and the
            # LGBMClassifier default is 0. Without this entry both `subsample` candidates
            # train identical models and the grid is twice as large as it needs to be.
            "subsample_freq": [1],
            "class_weight": ["balanced"],
            "n_jobs": [-1],
            "random_state": [42],
            "verbose": [-1],

            "device_type": ["gpu"],
            # NOTE(review): `n_gpu` is not among LightGBM's documented parameters (the
            # documented GPU options are `gpu_device_id` / `num_gpu`) — confirm which one was
            # meant; left untouched here so the setting is not silently reinterpreted.
            "n_gpu": [0],
            "boosting_type": ["gbdt"],
        }
    )
}

## Load or Initialize Best Params Dictionary

In [18]:
# File that caches tuning results between runs, relative to the working directory.
path_params = "best_params.json"

# One entry per model that has already been tuned; the tuning loop skips those models.
best_params = load_best_params(path_params)
tuned_model_count = len(best_params)
print(f"Best Params Available for {tuned_model_count} Models")

Best Params Available for 2 Models


## Run HP Tuning for New Models

Run tuning only for models that are missing from the saved parameters; models that were already tuned just have their stored results printed.

In [19]:
verbose = True
flag_new_model = False  # flips to True as soon as any model is tuned in this run
deciding_metric = "accuracy"

# NOTE(review): in the visible cells the results are only written to disk by the save cell
# after this loop, so an interruption mid-run discards whatever was tuned so far.
for model_name, (model_class, param_grid) in model_param_grids.items():
    # Models listed in `models_reduced_features` train on the reduced feature sets,
    # all others on the full-width ones.
    feature_source = X_reduced if model_name in models_reduced_features else X
    _X = feature_source["train"]
    _y = y["train"][["HistoricalPeriod_encoded"]]

    # Table layout is needed both for live tuning output and for re-printing stored results.
    column_widths = get_column_widths(list(_y.keys()), list(_X.keys()), deciding_metric, param_grid)

    if model_name in best_params:
        # Stored results exist: report them and move on without re-tuning.
        if verbose:
            print(
                f"\n✅ '{model_name}' Model Already Tuned. Execution Time: {get_execution_time(best_params[model_name])}")
            print_best_params(column_widths, best_params[model_name], deciding_metric)
        continue

    flag_new_model = True

    if verbose:
        print(f"\n🔄 Hyperparameter Tuning '{model_name}' Model")

    best_params[model_name] = run_hp_all(
        model_class,
        param_grid,
        folds,
        metrics_c,
        _X,
        _y,
        deciding_metric,
        column_widths,
        verbose=verbose,
    )


✅ 'LogisticRegression' Model Already Tuned. Execution Time: 41m 50s
+--------------------------+------------------+----------+------+-----------+--------------+
|                   target |      feature_set | accuracy |    C |    solver | class_weight |
+--------------------------+------------------+----------+------+-----------+--------------+
| HistoricalPeriod_encoded |            tfidf |   0.7836 |    1 |     lbfgs |         None |
| HistoricalPeriod_encoded |             bert |   0.7178 | 0.01 |     lbfgs |         None |
| HistoricalPeriod_encoded |         cannyhog |   0.6766 | 0.01 |     lbfgs |         None |
| HistoricalPeriod_encoded |           resnet |   0.6835 | 0.01 |     lbfgs |         None |
| HistoricalPeriod_encoded |              vit |   0.6928 |  0.1 |     lbfgs |         None |
| HistoricalPeriod_encoded | tfidf + cannyhog |   0.7958 | 0.01 |     lbfgs |         None |
| HistoricalPeriod_encoded |   tfidf + resnet |   0.8011 | 0.01 |     lbfgs |         None |
|

## Save Best Parameters per Model

In [20]:
# Persist the tuning results to `path_params`.
# NOTE(review): `flag_new_model` is True only when the loop above tuned at least one model;
# presumably the helper uses it to decide whether a write is needed — confirm in utils.
save_best_params(path_params, best_params, flag_new_model)

✅ Saved best parameters to best_params.json
