In [1]:
import sys
import numpy as np
import pandas as pd
import json
import os

In [2]:
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
# Make the parent directory importable so the project-local `utils` module
# resolves. The membership check keeps repeated runs of this cell from
# appending duplicate entries to sys.path.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Project helpers used throughout the notebook (one name per line so that
# additions and removals show up cleanly in diffs).
from utils import (
    load_best_params,
    read_features,
    read_targets,
    run_hp_all,
    metrics_c,
    save_best_params,
    combine_features_all_txt_img,
    scale_feature_set,
    print_info_features,
    print_info_targets,
    reduce_components,
    d_types_methods,
    combine_features,
)

## Read Features and Targets

In [4]:
# Absolute location of the chronology-prediction data directory.
# os.path.abspath resolves a relative path against the current working
# directory, so no explicit join with os.getcwd() is needed.
path = os.path.abspath("../../data/chronology_prediction")

In [5]:
# Load the feature sets found under `path`; the log printed below lists the
# train/test variants of tfidf, bert, cannyhog, resnet and vit.
X = read_features(path)
# Load the train/test targets, requesting only the "HistoricalPeriod" column.
y = read_targets(path, ["HistoricalPeriod"])

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit
Loaded y_train
Loaded y_test


In [6]:
# Summarise the container type and shape of every feature set in each subset.
print_info_features(X)

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [7]:
# Summarise the type, shape and column names of the train and test targets.
print_info_targets(y)

y = {
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape (1719, 1)
		columns ['HistoricalPeriod'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape (191, 1)
		columns ['HistoricalPeriod'],
}


In [8]:
# Class distribution of the training target. The counts shown below are
# heavily imbalanced, which matters for stratification and metric choice.
y["train"]["HistoricalPeriod"].value_counts()

HistoricalPeriod
Classical        1025
Archaic           620
Hellenistic        63
Orientalizing      11
Name: count, dtype: int64

## Scale Features

In [9]:
def _scale_all_but_tfidf(feature_sets):
    """Return a new dict with every feature set scaled except "tfidf".

    `feature_sets` maps a method name to its feature matrix. The "tfidf"
    entry is passed through untouched; every other entry goes through
    `scale_feature_set`.

    NOTE(review): each subset is handed to `scale_feature_set` on its own.
    If that helper fits its scaler on the data it receives, the test split
    is scaled with its own statistics rather than the training ones --
    confirm in utils.
    """
    scaled = {}
    for method, features in feature_sets.items():
        scaled[method] = features if method == "tfidf" else scale_feature_set(features)
    return scaled


# X is rebound to the scaled version, so running this cell a second time
# without reloading the data scales the already-scaled features again.
X = {subset: _scale_all_but_tfidf(feature_sets) for subset, feature_sets in X.items()}

## Reduce Features

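Use `n_components=0.95` to keep enough components to preserve 95% of the variance in the data. The reducer is fitted on the training split and then reused for the test split; TF-IDF features are passed through without reduction.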

In [10]:
# Fitted reducer per feature method, kept so the same transformation can be
# reused later.
reducers = {}
# Reduced counterpart of X with the same subset -> method layout.
X_reduced = {subset: {} for subset in X}

for method in d_types_methods["text"] + d_types_methods["image"]:
    if method != "tfidf":
        # Fit the reducer on the training split only ...
        X_reduced["train"][method], fitted_reducer = reduce_components(
            X["train"][method], n_components=0.95
        )
        reducers[method] = fitted_reducer

        # ... and apply that already-fitted reducer to the test split.
        X_reduced["test"][method], _ = reduce_components(
            X["test"][method], reducer_fitted=fitted_reducer
        )
    else:
        # TF-IDF is not reduced: carry it over unchanged for every subset.
        for subset in X:
            X_reduced[subset][method] = X[subset][method]

In [11]:
# NOTE(review): this prints X, which the reduction cell above does not modify
# (it only fills X_reduced and reducers), so the shapes shown are the
# unreduced ones. X_reduced may have been intended here -- confirm.
print_info_features(X)

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


## Combine & Re-scale Text & Image Feature Sets

In [12]:
# Add the combined text + image feature sets to both the full and the reduced
# collections, in place, one subset at a time.
# NOTE(review): scale=True is requested separately for each subset; whether
# the test split is then re-scaled with its own statistics depends on the
# helper -- confirm in utils.
for subset, feature_sets in X.items():
    feature_sets.update(combine_features_all_txt_img(feature_sets, scale=True))

    reduced_sets = X_reduced[subset]
    reduced_sets.update(combine_features_all_txt_img(reduced_sets, scale=True))

In [13]:
# Inspect X again; the output now also lists the "<text> + <image>" feature
# sets added to each subset.
print_info_features(X)

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		tfidf + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3217), 
		tfidf + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2348), 
		tfidf + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1068), 
		bert + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3685), 
		bert + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2816), 
		bert + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1536), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		

## Encode Target Labels

In [14]:
le = LabelEncoder()

train_targets, test_targets = y["train"], y["test"]

# The encoder learns the classes from the training labels only; the test
# labels are mapped with that same fitted encoder.
train_targets["HistoricalPeriod_encoded"] = le.fit_transform(train_targets["HistoricalPeriod"])
test_targets["HistoricalPeriod_encoded"] = le.transform(test_targets["HistoricalPeriod"])

# Original label -> integer code, printed as "<code> --> <label>".
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
for label in label_mapping:
    encoding = label_mapping[label]
    print(f"{encoding} --> {label}")

0 --> Archaic
1 --> Classical
2 --> Hellenistic
3 --> Orientalizing


## Initialize 10-Fold Cross Validation

*Stratified K-Fold:*
- preserves label distribution
- maintains proportion of classes in each fold


In [15]:
# Ten shuffled, stratified folds with a fixed seed so the splits are
# reproducible across runs.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Only positional indices are needed to define the splits; the encoded
# target drives the stratification.
sample_positions = np.arange(len(y["train"]))
stratify_labels = y["train"]["HistoricalPeriod_encoded"]

# Materialise the generator so the same folds can be reused for every model.
folds = list(skf.split(sample_positions, stratify_labels))

## Define Parameter Grid per Model

In [16]:
# Search space per model. Every value is a list of candidates, including the
# settings that are held fixed (a single-element list).

# NOTE(review): the target has four classes; recent scikit-learn releases
# deprecate the "liblinear" solver for multiclass problems -- check against
# the installed version.
logreg_grid = {
    "C": [0.01, 0.1, 1],
    "solver": ["liblinear", "lbfgs"],
    "max_iter": [5000],
    "class_weight": [None, "balanced"],
    "random_state": [42],
}

knn_grid = {
    "n_neighbors": [3, 5, 7, 9, 11],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "cosine"],
}

random_forest_grid = {
    "n_estimators": [100],               # number of trees
    "max_depth": [None, 10],             # cap depth to limit overfitting
    "min_samples_split": [2],            # samples required to split a node
    "min_samples_leaf": [1, 2, 4],       # samples required at a leaf
    "max_features": ["sqrt"],            # features considered per split
    "class_weight": [None, "balanced"],  # optional class re-weighting
    "n_jobs": [-1],
    "random_state": [42],                # reproducibility
}

# model name -> (estimator class, parameter grid)
# TODO: XGBClassifier and LGBMClassifier are imported but have no grid yet.
model_param_grids = {
    "LogisticRegression": (LogisticRegression, logreg_grid),
    "KNN": (KNeighborsClassifier, knn_grid),
    "RandomForest": (RandomForestClassifier, random_forest_grid),
}

## Load or Initialize Best Params Dictionary

In [17]:
# File (relative to the notebook's working directory) that stores the tuned
# parameters between runs.
path_params = "best_params.json"

# Previously saved results; the output below reports 0 models, i.e. nothing
# had been tuned yet when this was run.
best_params = load_best_params(path_params)

n_tuned_models = len(best_params)
print(f"Best Params Available for {n_tuned_models} Models")

Best Params Available for 0 Models


## Run HP Tuning for New Models

Run tuning only for models that are missing from the saved parameters; models already present in `best_params` are skipped.

In [18]:
verbose = True
# Becomes True as soon as at least one model had to be tuned in this run;
# it is handed to save_best_params afterwards.
flag_new_model = False

# NOTE(review): accuracy is the deciding metric although the training classes
# are heavily imbalanced -- confirm this is intended.
for model_name, (model_class, param_grid) in model_param_grids.items():

    # Models with stored parameters are skipped.
    if model_name in best_params:
        if verbose:
            print(f"\n✅ '{model_name}' Model Already Tuned")
        continue

    flag_new_model = True
    if verbose:
        print(f"\n🎚️ Hyperparameter Tuning '{model_name}' Model")

    # KNN is tuned on the dimensionality-reduced features; every other model
    # uses the full feature sets.
    train_features = X_reduced["train"] if model_name == "KNN" else X["train"]

    best_params[model_name] = run_hp_all(
        model_class,
        param_grid,
        folds,
        metrics_c,
        train_features,
        y["train"][["HistoricalPeriod_encoded"]],
        deciding_metric="accuracy",
        verbose=verbose
    )


🎚️ Hyperparameter Tuning 'LogisticRegression' Model

Features: tfidf | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 1, 'class_weight': None, 'max_iter': 5000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7836

Features: bert | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 5000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7178

Features: cannyhog | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 5000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6766

Features: resnet | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 5000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6835

Features: vit | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.1, 'class_weight': None, 'max_iter': 5000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6928

Features: tfidf + cannyhog | Target: Hi

## Save Best Parameters per Model

In [19]:
# Display the tuned parameters: one entry per model, each keyed by a
# (feature set, target) tuple.
best_params

{'LogisticRegression': {('tfidf', 'HistoricalPeriod_encoded'): {'C': 1,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('bert', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('cannyhog', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('resnet', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('vit', 'HistoricalPeriod_encoded'): {'C': 0.1,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('tfidf + cannyhog', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 5000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('tfidf + resnet', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter'

In [20]:
# Persist best_params to path_params. flag_new_model is forwarded to the
# helper -- presumably so it can skip writing when nothing new was tuned;
# confirm in utils.
save_best_params(path_params, best_params, flag_new_model)

✅ Saved best parameters to best_params.json
