In [1]:
import sys
import numpy as np
import json
import os

In [2]:
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [3]:
# Put the directory two levels above the working directory on the import path
# so the `utils` import below can resolve (presumably the project root — confirm).
# NOTE(review): re-running this cell appends the same entry to sys.path again.
sys.path.append(os.path.abspath("../.."))
from utils import read_features, read_targets, print_info_features, print_info_targets, metrics_r, cross_validation, \
    d_types_methods, combine_features, load_best_params, add_all_feature_combos, save_best_params, get_column_widths, \
    run_hp_all, get_execution_time, print_best_params, scale_all, reduce_all

## Read Features and Targets

In [4]:
# Absolute location of the dataset directory, three levels above the
# notebook's working directory. `abspath` resolves a relative path against
# the current working directory on its own.
path = os.path.abspath("../../../data/chronology_prediction")

In [5]:
# Feature sets (train and test) read from the dataset directory.
X = read_features(path)

# Regression targets, restricted to the two columns predicted in this notebook.
y = read_targets(
    path,
    ["StartYear", "YearRange"],
)

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit
Loaded y_train
Loaded y_test


In [6]:
# Sanity check: show the container type and shape of every loaded feature set, per split.
print_info_features(X)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [7]:
# Sanity check: show the container type, shape and column names of the targets, per split.
print_info_targets(y)

y = {
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape (1719, 2)
		columns ['StartYear', 'YearRange'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape (191, 2)
		columns ['StartYear', 'YearRange'],
}


## Extra Preprocessing


### Scale Features

In [8]:
# Scale every feature set; the result is kept under a new name so `X` itself
# is left as loaded. The scaling method is defined in utils.scale_all
# (presumably fitted on the train split only — TODO confirm).
X_scaled = scale_all(X)

### Reduce Features

Use n_components=0.95 to keep enough components to preserve 95% of the variance in the data.

In [9]:
# Reduce the scaled feature sets. A fractional n_components is passed through
# to utils.reduce_all (presumably PCA keeping 95% of the variance, as the
# text above states — confirm in utils).
X_reduced = reduce_all(X_scaled, n_components=0.95)

### Combine & Re-scale Text & Image Feature Sets

In [10]:
# Add the combined "text + image" feature sets to both collections.
# The loaded features are combined without re-scaling; the reduced features
# are re-scaled after combination (scale=True).
# NOTE(review): both names are rebound to the helper's output, so this cell is
# not idempotent — running it a second time feeds the already-combined
# collections back into add_all_feature_combos. Restart the kernel and run all
# cells in order rather than re-running this cell in isolation.
X = add_all_feature_combos(X, scale=False)
X_reduced = add_all_feature_combos(X_reduced, scale=True)

In [11]:
# Inspect X again now that the combined feature sets have been added.
print_info_features(X)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		tfidf + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3217), 
		tfidf + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2348), 
		tfidf + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1068), 
		bert + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 3685), 
		bert + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2816), 
		bert + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 1536), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert

## Initialize 10-Fold Cross Validation


In [12]:
# Shuffled 10-fold splitter with a fixed seed.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Materialise the (train_indices, validation_indices) pairs once, as positional
# indices over the training rows, so the same list can be passed around.
folds = [split for split in kf.split(np.arange(len(y["train"])))]

## Define Parameter Grid per Model

In [13]:
# LightGBM compute device. Defaults to "gpu", the value this notebook was
# written with; set the LGBM_DEVICE environment variable (e.g. to "cpu")
# before starting the kernel to tune on a machine whose LightGBM build has
# no GPU support, instead of editing the grid below.
LGBM_DEVICE = os.environ.get("LGBM_DEVICE", "gpu")

# Maps a model's display name to (estimator class, hyperparameter grid).
# Each grid maps a constructor argument to the list of values to try;
# single-element lists are fixed settings rather than searched dimensions.
model_param_grids = {
    "Ridge": (
        Ridge, {
            "alpha": [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
            "random_state": [42]
        }
    ),
    "Lasso": (
        Lasso, {
            "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
            "max_iter": [10000],
            "random_state": [42]
        }
    ),
    "RandomForest": (
        RandomForestRegressor, {
            "n_estimators": [100],
            "max_depth": [None, 20],
            "min_samples_split": [2],
            "min_samples_leaf": [1, 2],
            "random_state": [42],
            "n_jobs": [-1]
        }
    ),
    "XGBoost": (
        XGBRegressor,
        {
            "n_estimators": [100],
            "max_depth": [3, 6],
            "learning_rate": [0.05, 0.1],
            "subsample": [0.8],
            "colsample_bytree": [0.8],
            "random_state": [42]
        }
    ),
    "LightGBM": (
        LGBMRegressor,
        {
            "n_estimators": [100],
            "max_depth": [-1, 20],
            "learning_rate": [0.05, 0.1],
            "num_leaves": [31, 64],
            # Configurable so the notebook still runs without a GPU build.
            "device": [LGBM_DEVICE],
            "random_state": [42],
            "verbose": [-1]
        }
    )
}

In [14]:
# Models named here are tuned on the reduced feature sets (X_reduced);
# every other model is tuned on X.
models_reduced_features = ["Ridge", "Lasso"]

## Load or Initialize Best Parameter Dictionary

Load the `best_params` dictionary from the JSON file if it exists; otherwise, initialize a new dictionary.

In [15]:
# File holding the tuning results, relative to the notebook's working directory.
path_params = "best_params.json"

# Pick up results saved by earlier runs so those models are not tuned again.
best_params = load_best_params(path_params)
print(f"Best Params Available for {len(best_params)} Models")

Best Params Available for 3 Models


## Run HP Tuning for New Models

Run tuning only for models missing from the saved params

In [16]:
# Run-level switches.
verbose = True             # print progress and result tables
flag_new_model = False     # becomes True once any model is tuned in this run
deciding_metric = "mae"    # metric passed to the tuning and reporting helpers

for model_name, (model_class, param_grid) in model_param_grids.items():
    # Pick the training features: the reduced sets for the models listed in
    # `models_reduced_features`, the full sets for everything else.
    if model_name in models_reduced_features:
        _X = X_reduced["train"]
    else:
        _X = X["train"]
    _y = y["train"]

    # Table layout shared by the tuning log and the summary of saved results.
    column_widths = get_column_widths(list(_y.keys()), list(_X.keys()), deciding_metric, param_grid)

    # Already tuned in a previous run: report the stored results and move on.
    if model_name in best_params:
        if verbose:
            print(f"\n✅ '{model_name}' Model Already Tuned. Execution Time: {get_execution_time(best_params[model_name])}")
            print_best_params(column_widths, best_params[model_name], deciding_metric)
        continue

    # No saved results for this model: tune it now and remember that the
    # parameter dictionary has changed.
    flag_new_model = True

    if verbose:
        print(f"\n🔄 Hyperparameter Tuning '{model_name}' Model")

    best_params[model_name] = run_hp_all(
        model_class,
        param_grid,
        folds,
        metrics_r,
        _X,
        _y,
        deciding_metric,
        column_widths,
        verbose=verbose
    )


✅ 'Ridge' Model Already Tuned. Execution Time: 14s
+-----------+------------------+-----------+--------+
|    target |      feature_set |       mae |  alpha |
+-----------+------------------+-----------+--------+
| StartYear |            tfidf |   38.2985 |      1 |
| StartYear |             bert |   45.6488 |   1000 |
| StartYear |         cannyhog |   53.7347 |   1000 |
| StartYear |           resnet |   51.5910 |   1000 |
| StartYear |              vit |   51.6154 |   1000 |
| StartYear | tfidf + cannyhog |   37.9784 |   1000 |
| StartYear |   tfidf + resnet |   37.3392 |   1000 |
| StartYear |      tfidf + vit |   37.7850 |   1000 |
| StartYear |  bert + cannyhog |   45.8785 |   1000 |
| StartYear |    bert + resnet |   44.8105 |   1000 |
| StartYear |       bert + vit |   45.3821 |   1000 |
+-----------+------------------+-----------+--------+
| YearRange |            tfidf |   10.9017 |      1 |
| YearRange |             bert |   11.1748 |   1000 |
| YearRange |         cannyhog

## Save Best Parameters per Model

Save the best parameters only if at least one new model was tuned in this run.

In [17]:
# Write the parameter dictionary to `path_params`. `flag_new_model` is passed
# so the helper can tell whether anything was tuned in this run (presumably it
# skips the write when the flag is False — confirm in utils.save_best_params).
save_best_params(path_params, best_params, flag_new_model)

✅ Saved best parameters to best_params.json
