In [1]:
import ast
import json
import os
import sys

import numpy as np

In [2]:
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [3]:
sys.path.append(os.path.abspath(".."))
import utils

## Read Features and Targets

In [4]:
# Data directory, given relative to the notebook's working directory;
# os.path.abspath resolves it against os.getcwd() and normalises the "..".
path = os.path.abspath("../../data/chronology_prediction")

In [5]:
# Load the feature sets from `path`. The result is indexed as
# X[subset][method] by the cells below (e.g. X["train"][<method>]).
X = utils.read_features(path)

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit


In [6]:
# Pretty-print the nested layout of X: subset -> method -> (type, shape).
print("X = {")
for subset in X.keys():
    methods = X[subset]
    print(f"\t{subset}: {{")
    for method, features in methods.items():
        print(f"\t\t{method}: ")
        print(f"\t\t\t{type(features)}")
        print(f"\t\t\tshape = {features.shape}, ")
    print("\t},")
print("}")

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [7]:
# Load the targets from `path`, requesting the "StartYear" and "YearRange"
# columns. The result is indexed as y[subset] by the cells below.
y = utils.read_targets(path, ["StartYear", "YearRange"])

Loaded y_train
Loaded y_test


In [8]:
# Pretty-print the layout of y: subset -> (type, shape, column names).
print("y = {")
for subset in y.keys():
    targets = y[subset]
    print(f"\t{subset}: ")
    summary_lines = (
        f"{type(targets)}",
        f"shape {targets.shape}",
        f"columns {list(targets.columns)},",
    )
    for line in summary_lines:
        print(f"\t\t{line}")
print("}")

y = {
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape (1719, 2)
		columns ['StartYear', 'YearRange'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape (191, 2)
		columns ['StartYear', 'YearRange'],
}


## Initialize 10-Fold Cross Validation


In [9]:
# Shuffled 10-fold splitter with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Materialise the splits once, over the row positions of the training targets,
# so the same list of folds can be reused for every cross-validation run.
folds = list(kf.split(np.arange(y["train"].shape[0])))

## Hyperparameter Tuning Process


In [10]:
def hyperparameter_tuning(model_class, param_grid, X, y, verbose=0, cv_folds=None):
    """Grid-search ``param_grid`` for ``model_class`` and return the lowest-MAE parameters.

    Every combination produced by ``ParameterGrid(param_grid)`` is used to
    instantiate ``model_class`` and is scored with ``utils.cross_validation``
    using ``utils.metrics_r``. Combinations are ranked by ``scores["mae"][0]``.

    Parameters
    ----------
    model_class : callable
        Estimator class (or factory) called as ``model_class(**params)``.
    param_grid : dict or list of dict
        Grid specification accepted by ``ParameterGrid``.
    X, y
        Features and target, forwarded unchanged to ``utils.cross_validation``.
    verbose : int or bool, default 0
        When truthy, print the winning parameters and their MAE.
    cv_folds : optional
        Cross-validation splits forwarded to ``utils.cross_validation``.
        When omitted, the notebook-level ``folds`` list is used, which keeps
        existing calls working unchanged.

    Returns
    -------
    dict
        The parameter combination with the smallest MAE (the first one
        encountered wins on ties).

    Raises
    ------
    ValueError
        If ``param_grid`` yields no parameter combinations.
    """
    if cv_folds is None:
        # Fall back to the notebook-level splits (looked up at call time).
        cv_folds = folds

    param_mae = []
    for params in ParameterGrid(param_grid):
        model = model_class(**params)
        scores = utils.cross_validation(model, cv_folds, utils.metrics_r, X, y)
        # NOTE(review): presumably scores["mae"][0] is the aggregate MAE over
        # the folds -- confirm against utils.cross_validation.
        param_mae.append((params, scores["mae"][0]))

    if not param_mae:
        raise ValueError("param_grid produced no parameter combinations to evaluate")

    best_params, best_mae = min(param_mae, key=lambda item: item[1])
    if verbose:
        print(f"Best params: {best_params}")
        print(f"Best MAE: {best_mae:.2f}")
    return best_params

## Run HP Tuning for All Feature Sets

In [11]:
def run_hp_all(model_class, param_grid, X, y, verbose=0):
    """Tune ``model_class`` for every (feature set, target) pair of the training data.

    For each ``(target, values)`` pair from ``y["train"].items()`` the search
    is run first on every individual feature set in ``X["train"]`` and then on
    every text/image pairing listed in ``utils.d_types_methods``, combined with
    ``utils.combine_features``.

    Parameters
    ----------
    model_class, param_grid
        Forwarded to ``hyperparameter_tuning``.
    X, y
        Containers indexed by ``"train"``; see above for how they are iterated.
    verbose : int or bool, default 0
        When truthy, print a header before each search; also forwarded to
        ``hyperparameter_tuning``.

    Returns
    -------
    dict
        Maps ``(method, target)`` to the value returned by
        ``hyperparameter_tuning``; combined feature sets use the method name
        ``"<text_method> + <image_method>"``.
    """

    def _training_feature_sets():
        # Individual feature sets first, in the order stored in X["train"].
        for name, features in X["train"].items():
            yield name, features
        # Then each text/image pairing, combined lazily right before it is tuned.
        for text_method in utils.d_types_methods["text"]:
            for image_method in utils.d_types_methods["image"]:
                combined = utils.combine_features(
                    [X["train"][text_method], X["train"][image_method]]
                )
                yield f"{text_method} + {image_method}", combined

    tuned = {}
    for target, target_values in y["train"].items():
        for name, features in _training_feature_sets():
            if verbose:
                print(f"\nFeatures: {name} | Target: {target}")
            tuned[(name, target)] = hyperparameter_tuning(
                model_class, param_grid, features, target_values, verbose
            )
    return tuned

## Define Parameter Grid per Model

In [12]:
# Maps a model name to (estimator class, parameter grid). Every grid value is a
# list of candidates; single-element lists pin that setting to one value.
model_param_grids = {
    # Linear models: only the regularisation strength `alpha` is searched.
    "Ridge": (
        Ridge,
        dict(
            alpha=[0.01, 0.1, 1, 10, 100, 1000],
            random_state=[42],
        ),
    ),
    "Lasso": (
        Lasso,
        dict(
            alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10],
            random_state=[42],
            max_iter=[10000],
        ),
    ),
    # Tree ensembles: small grids over depth / leaf size / learning rate.
    "RandomForest": (
        RandomForestRegressor,
        dict(
            n_estimators=[100],
            max_depth=[None, 20],
            min_samples_split=[2],
            min_samples_leaf=[1, 2],
            random_state=[42],
            n_jobs=[-1],
        ),
    ),
    # NOTE(review): device is pinned to "cuda" here and "gpu" for LightGBM --
    # presumably this requires GPU-enabled builds and hardware; confirm before
    # running on a CPU-only machine.
    "XGBoost": (
        XGBRegressor,
        dict(
            n_estimators=[100],
            max_depth=[3, 6],
            learning_rate=[0.05, 0.1],
            subsample=[0.8],
            colsample_bytree=[0.8],
            tree_method=["hist"],
            device=["cuda"],
            random_state=[42],
        ),
    ),
    "LightGBM": (
        LGBMRegressor,
        dict(
            n_estimators=[100],
            max_depth=[-1, 20],
            learning_rate=[0.05, 0.1],
            num_leaves=[31, 64],
            device=["gpu"],
            random_state=[42],
            verbose=[-1],
        ),
    ),
}

## Load or Initialize Best Parameter Dictionary

Load the `best_params` dictionary from the JSON file if it exists; otherwise initialize an empty dictionary.

In [13]:
path_params = "best_params.json"


def load_best_params(path):
    """Load previously saved tuning results from ``path``.

    The JSON file stores, per model name, a mapping whose keys are
    stringified tuples (JSON object keys must be strings). The keys are
    parsed back with ``ast.literal_eval``, which only accepts Python literals,
    so a tampered or corrupted file cannot execute arbitrary code the way
    ``eval`` would.

    Parameters
    ----------
    path : str
        Location of the JSON file.

    Returns
    -------
    dict
        ``{model_name: {parsed_key: params}}``, or an empty dict when ``path``
        does not exist.

    Raises
    ------
    ValueError, SyntaxError
        If a stored key is not a valid Python literal.
    """
    if not os.path.exists(path):
        return {}

    with open(path, "r", encoding="utf-8") as f:
        stored = json.load(f)

    # Convert stringified tuples back to tuple keys.
    return {
        model: {ast.literal_eval(key): params for key, params in param_dict.items()}
        for model, param_dict in stored.items()
    }


# Load or initialize
best_params = load_best_params(path_params)

In [14]:
# Number of models that have stored tuning results after loading.
len(best_params)

5

## Run HP Tuning for New Models

Run tuning only for models missing from the saved params

In [15]:
verbose = True
# Becomes True as soon as at least one model has to be tuned in this run.
flag_new_model = False
for model_name, (model_class, param_grid) in model_param_grids.items():
    if model_name in best_params:
        # Results for this model are already stored; skip the search.
        if verbose:
            print(f"\n✅ '{model_name}' Model Already Tuned")
        continue

    flag_new_model = True
    if verbose:
        print(f"\n🎚️ Hyperparameter Tuning '{model_name}' Model")
    best_params[model_name] = run_hp_all(model_class, param_grid, X, y, verbose)



✅ 'Ridge' Model Already Tuned

✅ 'Lasso' Model Already Tuned

✅ 'RandomForest' Model Already Tuned

✅ 'XGBoost' Model Already Tuned

✅ 'LightGBM' Model Already Tuned


In [16]:
# Number of models with tuning results after the tuning loop has run.
len(best_params)

5

## Save Best Parameters per Model

Save the best parameters only if new models were tuned in this run.

In [17]:
if not flag_new_model:
    print("✅ No new tuning needed — using existing parameters.")
else:
    # JSON object keys must be strings, so every tuple key is stringified
    # before the results are written out.
    serializable_params = {
        name: {str(key): value for key, value in per_feature.items()}
        for name, per_feature in best_params.items()
    }

    with open(path_params, "w") as f:
        json.dump(serializable_params, f, indent=2)

    print(f"✅ Saved best parameters to {path_params}")

✅ No new tuning needed — using existing parameters.
