In [14]:
import os
import json
import pandas as pd
import numpy as np

In [10]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [25]:
from utils import read_features, read_targets, metrics

## Read Features and Targets

In [3]:
path = os.path.abspath(os.path.join(os.getcwd(), "../../data/chronology_prediction"))

In [4]:
X = read_features(path)

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit


In [5]:
y = read_targets(path, ["StartYear", "YearRange"])

Loaded y_train
Loaded y_test


## Load Best Parameters from HP Tuning

In [6]:
path_params = "best_params.json"
with open(path_params, "r") as f:
    best_params = json.load(f)
    best_params = {
        model: {
            eval(k): v for k, v in param_dict.items()
        } for model, param_dict in best_params.items()
    }

## SMTH

In [9]:
# best_models = pd.read_csv("best_models.csv")
# for _, row in best_models.iterrows():
#     if row["data_type"] == "combo":
#         feature_sets = []
#         for part in row["features"].split(" + "):
#             feature_sets.append(X)

In [18]:
def rf_predict_with_std(model, X_test):
    # Get predictions from all trees
    all_preds = np.stack([tree.predict(X_test) for tree in model.estimators_])

    y_pred = np.mean(all_preds, axis=0)
    y_std = np.std(all_preds, axis=0)  # This is the uncertainty

    return y_pred, y_std


## Text --> StartYear

In [19]:
model = RandomForestRegressor(**best_params["RandomForest"][("tfidf", "StartYear")])
model.fit(X["train"]["tfidf"], y["train"]["StartYear"])
y_pred, y_std = rf_predict_with_std(model, X["test"]["tfidf"])



In [22]:
y_std

array([56.84633585, 44.51862981, 59.03203452, 34.21735671, 34.5888349 ,
       61.20792432, 51.5136875 , 28.14598373, 50.36117056,  8.17679644,
       28.5229732 , 31.4354895 , 60.00606219,  5.45985348, 48.48430158,
       57.11945028, 70.25779387, 31.21405292, 74.25932601, 55.32799924,
       43.03193582, 50.56864147, 52.95892276, 27.02313823, 20.44131845,
       25.34087607, 38.85469727, 59.11133901, 13.42841763, 82.77757849,
       10.5541461 , 51.73825954, 67.25592911, 73.29520789, 32.735913  ,
       51.09743144, 62.21864672, 44.65097983, 22.37275128, 69.00614103,
       39.07249672, 41.01755722, 43.29503436, 26.84264331, 85.31852085,
       34.09087121, 39.50443519, 49.31579362, 59.49789471, 73.13772966,
       39.39860404, 26.64000751, 47.51641822, 31.37176278, 21.45990447,
       33.67891922, 48.19157084, 65.75855458, 61.71879778, 39.60893334,
       37.075059  , 43.87949407, 24.69635601, 47.98986872, 41.90879979,
       53.05683274, 43.49629295, 67.43634035, 16.83775519, 51.00

In [26]:
for metric, get_metric_score in metrics.items():
    metric_score = get_metric_score(y["test"]["StartYear"], y_pred)
    if metric == "rmse": metric_score = np.sqrt(metric_score)
    print(f"{metric} = {metric_score}")

mae = 29.72931064572426
rmse = 40.47218366658402
r2 = 0.6095885256997997
medae = 22.149999999999977
maxerror: = 135.325


In [29]:
results = pd.DataFrame({"y_true": y["test"]["StartYear"], "y_pred": y_pred, "y_std": y_std})
results

Unnamed: 0,y_true,y_pred,y_std
0,-525.0,-433.290000,56.846336
1,-425.0,-460.460000,44.518630
2,-375.0,-439.670000,59.032035
3,-600.0,-508.150000,34.217357
4,-500.0,-495.750000,34.588835
...,...,...,...
186,-500.0,-501.300000,19.073804
187,-500.0,-448.686667,58.947374
188,-500.0,-482.100000,60.754341
189,-375.0,-464.700000,62.300963


In [30]:
from scipy.stats import norm

# For 95% confidence interval
z = norm.ppf(0.975)  # ≈ 1.96

results["CI_lower"] = results["y_pred"] - z * results["y_std"]
results["CI_upper"] = results["y_pred"] + z * results["y_std"]

In [42]:
def confidence_interval_around_prediction(y_std, N=10):
    z = N / y_std
    confidence = norm.cdf(z) * 2
    return min(confidence, 1.0)

N = 100  # ± years window

results["confidence_±10"] = results["y_std"].apply(
    lambda std: confidence_interval_around_prediction(std, N)
)

In [43]:
results

Unnamed: 0,y_true,y_pred,y_std,CI_lower,CI_upper,confidence_±10
0,-525.0,-433.290000,56.846336,-544.706771,-321.873229,1.0
1,-425.0,-460.460000,44.518630,-547.714911,-373.205089,1.0
2,-375.0,-439.670000,59.032035,-555.370662,-323.969338,1.0
3,-600.0,-508.150000,34.217357,-575.214787,-441.085213,1.0
4,-500.0,-495.750000,34.588835,-563.542871,-427.957129,1.0
...,...,...,...,...,...,...
186,-500.0,-501.300000,19.073804,-538.683969,-463.916031,1.0
187,-500.0,-448.686667,58.947374,-564.221397,-333.151936,1.0
188,-500.0,-482.100000,60.754341,-601.176321,-363.023679,1.0
189,-375.0,-464.700000,62.300963,-586.807644,-342.592356,1.0
