In [1]:
import sys
import numpy as np
import pandas as pd
import json
import os

In [2]:
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [3]:
# Put the parent directory on the import path so the project-local `utils`
# module below can be resolved from this notebook's location.
sys.path.append(os.path.abspath(".."))
from utils import load_best_params, read_features, read_targets, run_hp_all, metrics_c, save_best_params

## Read Features and Targets

In [4]:
# Absolute path of the chronology-prediction data directory, resolved relative
# to the current working directory (two levels up, then data/chronology_prediction).
path = os.path.abspath(os.path.join(os.getcwd(), "../../data/chronology_prediction"))

In [5]:
# Load the feature sets stored under `path`; the result is indexed as
# X[subset][method] (see the structure printed in the next cell).
X = read_features(path)

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit


In [6]:
# Pretty-print the nested feature dictionary: for every subset and every
# feature-extraction method show the container type and its shape.
print("X = {")
for subset_name, methods in X.items():
    print(f"\t{subset_name}: " + "{")
    for method_name, features in methods.items():
        print(f"\t\t{method_name}: ")
        print(f"\t\t\t{type(features)}")
        print(f"\t\t\tshape = {features.shape}, ")
    print("\t},")
print("}")

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [7]:
# Load the train/test targets from `path`, requesting only the
# "HistoricalPeriod" column; the result is indexed as y[subset].
y = read_targets(path, ["HistoricalPeriod"])

Loaded y_train
Loaded y_test


In [8]:
# Pretty-print the target dictionary: for every subset show the container
# type, its shape and the available target columns.
print("y = {")
for subset_name, targets in y.items():
    print(f"\t{subset_name}: ")
    print(f"\t\t{type(targets)}")
    print(f"\t\tshape {targets.shape}")
    print(f"\t\tcolumns {list(targets.columns)},")
print("}")

y = {
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape (1719, 1)
		columns ['HistoricalPeriod'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape (191, 1)
		columns ['HistoricalPeriod'],
}


In [9]:
# Number of training samples per historical period (shows how balanced the classes are).
y["train"]["HistoricalPeriod"].value_counts()

HistoricalPeriod
Classical        1025
Archaic           620
Hellenistic        63
Orientalizing      11
Name: count, dtype: int64

## Scale Data

In [10]:
# Standardise every feature set except TF-IDF.
#
# Two things matter here:
#   * `X` is a dict of dicts, so a plain `X.copy()` would share the inner
#     dictionaries and assigning into `X_scaled[subset]` would overwrite the
#     raw features in `X` as well. Copy one level deeper so `X` stays raw and
#     this cell can be re-run without scaling already-scaled data.
#   * The scaler must learn its mean/variance from the training subset only
#     and then be applied unchanged to every subset; fitting it again on the
#     test subset would scale train and test with different statistics.
X_scaled = {subset: dict(methods) for subset, methods in X.items()}

for method, train_features in X["train"].items():
    if method == "tfidf":
        # TF-IDF features are deliberately left unscaled.
        continue

    # One scaler per feature set, fitted on the training data only.
    scaler = StandardScaler().fit(train_features)

    for subset, methods in X.items():
        if method not in methods:
            continue
        features = methods[method]
        # Wrap the ndarray result so column names and the index are preserved.
        X_scaled[subset][method] = pd.DataFrame(
            scaler.transform(features),
            columns=features.columns,
            index=features.index
        )

## Encode Target Labels

In [11]:
# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to both the training and the test targets.
le = LabelEncoder()
le.fit(y["train"]["HistoricalPeriod"])
y["train"]["HistoricalPeriod_encoded"] = le.transform(y["train"]["HistoricalPeriod"])
y["test"]["HistoricalPeriod_encoded"] = le.transform(y["test"]["HistoricalPeriod"])

# Show which integer code stands for which historical period.
label_mapping = {
    label: encoding
    for label, encoding in zip(le.classes_, le.transform(le.classes_))
}
for label in label_mapping:
    encoding = label_mapping[label]
    print(f"{encoding} --> {label}")

0 --> Archaic
1 --> Classical
2 --> Hellenistic
3 --> Orientalizing


In [12]:
# Display the training targets to check the encoded column next to the original labels.
y["train"]

Unnamed: 0,HistoricalPeriod,HistoricalPeriod_encoded
0,Classical,1
1,Classical,1
2,Classical,1
3,Archaic,0
4,Classical,1
...,...,...
1714,Classical,1
1715,Archaic,0
1716,Classical,1
1717,Archaic,0


## Initialize 10-Fold Cross Validation

*Stratified K-Fold:*
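- preserves the label distribution: every fold keeps roughly the same class proportions as the full training set
- this matters here because the classes are heavily imbalanced (e.g. only 11 `Orientalizing` samples)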


In [13]:
# Build the 10 stratified (train_idx, val_idx) splits once, with a fixed seed,
# so every model and feature set is tuned on exactly the same folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
sample_positions = np.arange(y["train"].shape[0])
encoded_labels = y["train"]["HistoricalPeriod_encoded"]
folds = list(skf.split(sample_positions, encoded_labels))

## Define Parameter Grid per Model

In [14]:
# Hyperparameter grid searched for logistic regression.
logreg_param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "solver": ["liblinear", "lbfgs"],
    "max_iter": [1000],
    "class_weight": [None, "balanced"],
    "random_state": [42],
}

# Maps a model name to a (estimator class, parameter grid) pair.
# TODO: grids for KNN (KNeighborsClassifier), RandomForest
# (RandomForestClassifier), XGBoost (XGBClassifier) and LightGBM
# (LGBMClassifier) are not defined yet; add them here as
# "<name>": (<class>, {<grid>}) entries.
model_param_grids = {
    "LogisticRegression": (LogisticRegression, logreg_param_grid),
}

## Load or Initialize Best Params Dictionary

In [15]:
# File in which tuned parameters are cached between runs.
path_params = "best_params.json"

# Load whatever has been tuned so far and report how many models are covered.
best_params = load_best_params(path_params)
n_tuned_models = len(best_params)
print(f"Best Params Available for {n_tuned_models} Models")

Best Params Available for 0 Models


## Run HP Tuning for New Models

Run tuning only for models missing from the saved params

In [16]:
verbose = True
# Becomes True as soon as at least one model had to be tuned in this run.
flag_new_model = False

for model_name, (model_class, param_grid) in model_param_grids.items():
    # Models that already have saved parameters are not tuned again.
    if model_name in best_params:
        if verbose:
            print(f"\n✅ '{model_name}' Model Already Tuned")
        continue

    flag_new_model = True
    if verbose:
        print(f"\n🎚️ Hyperparameter Tuning '{model_name}' Model")

    # Search the grid on the shared folds; the best setting is chosen by accuracy.
    best_params[model_name] = run_hp_all(
        model_class,
        param_grid,
        folds,
        metrics_c,
        X_scaled["train"],
        y["train"][["HistoricalPeriod_encoded"]],
        deciding_metric="accuracy",
        verbose=verbose
    )


🎚️ Hyperparameter Tuning 'LogisticRegression' Model

Features: tfidf | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 1, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7836

Features: bert | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7178

Features: cannyhog | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6766

Features: resnet | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6835

Features: vit | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.1, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.6928

Features: tfidf + cannyhog | Target: Hi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Best params: {'C': 1, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7755

Features: tfidf + resnet | Target: HistoricalPeriod_encoded


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Best params: {'C': 1, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7772

Features: tfidf + vit | Target: HistoricalPeriod_encoded


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Best params: {'C': 10, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'liblinear'}
🎯 Best ACCURACY: 0.7819

Features: bert + cannyhog | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7510

Features: bert + resnet | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7481

Features: bert + vit | Target: HistoricalPeriod_encoded
✅ Best params: {'C': 0.01, 'class_weight': None, 'max_iter': 1000, 'random_state': 42, 'solver': 'lbfgs'}
🎯 Best ACCURACY: 0.7533


## Save Best Parameters per Model

In [18]:
# Inspect the tuned parameters collected so far, keyed by model name.
best_params

{'LogisticRegression': {('tfidf', 'HistoricalPeriod_encoded'): {'C': 1,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('bert', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('cannyhog', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('resnet', 'HistoricalPeriod_encoded'): {'C': 0.01,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('vit', 'HistoricalPeriod_encoded'): {'C': 0.1,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('tfidf + cannyhog', 'HistoricalPeriod_encoded'): {'C': 1,
   'class_weight': None,
   'max_iter': 1000,
   'random_state': 42,
   'solver': 'lbfgs'},
  ('tfidf + resnet', 'HistoricalPeriod_encoded'): {'C': 1,
   'class_weight': None,
   'max_iter': 1000

In [19]:
# Write the tuned parameters to `path_params`. `flag_new_model` tells the helper
# whether any model was tuned in this run (presumably it only writes when
# something is new; confirm in utils).
save_best_params(path_params, best_params, flag_new_model)

✅ Saved best parameters to best_params.json
