In [1]:
import ast
import json
import os
import sys

import numpy as np
import pandas as pd


In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Put the parent directory on the import path so that the project-local
# `utils` module (expected one level up - TODO confirm) can be imported.
sys.path.append(os.path.abspath(os.pardir))
from utils import (
    read_features,
    read_targets,
    metrics_c,
    run_cv_all,
    update_scoreboard,
    plot_compare_feature_scores,
    d_types_methods,
    reduce_components,
    print_info_features,
    combine_features_all_txt_img,
    scale_feature_set,
    run_cv_all_2,
)

## Read Features and Targets

In [4]:
# Absolute location of the dataset directory, two levels above the working directory.
# os.path.abspath() already resolves relative paths against os.getcwd().
path = os.path.abspath("../../data/chronology_prediction")

In [5]:
# Load the feature sets and the "HistoricalPeriod" target from `path`.
# Both results are indexed by split below, e.g. X["train"][method] and y["train"][column].
X = read_features(path)
y = read_targets(path, ["HistoricalPeriod"])

Loaded X_train_tfidf
Loaded X_train_bert
Loaded X_train_cannyhog
Loaded X_train_resnet
Loaded X_train_vit
Loaded X_test_tfidf
Loaded X_test_bert
Loaded X_test_cannyhog
Loaded X_test_resnet
Loaded X_test_vit
Loaded y_train
Loaded y_test


## Scale and Reduce Features

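Use `n_components=0.95` to keep just enough components to preserve 95% of the variance in the data. Each reducer is fitted on the training split and then reused, already fitted, for the test split.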

In [6]:
# Fraction of the variance that each fitted reducer must retain.
VARIANCE_TO_KEEP = 0.95

# For "bert" and every image feature set: fit a reducer on the training split,
# keep it in `reducers`, and pass the same fitted reducer when reducing the test
# split (presumably so both splits share one projection - confirm in utils).
# NOTE: this cell overwrites X in place, so it is not idempotent; re-run the
# loading cell before re-running this one.
reducers = {}
for method in ("bert",) + d_types_methods["image"]:
    X["train"][method], reducers[method] = reduce_components(
        X["train"][method], n_components=VARIANCE_TO_KEEP
    )

    # Take element 0 instead of unpacking into `_`: assigning `_` at the top
    # level of a notebook shadows IPython's "last output" variable.
    X["test"][method] = reduce_components(
        X["test"][method], reducer_fitted=reducers[method]
    )[0]

In [7]:
# for subset in X.keys():
#     for method in ("bert",) + d_types_methods["image"]:
#         X[subset][method] = scale_feature_set(X[subset][method])

## Combine & Re-scale Text & Image Feature Sets

In [8]:
# Build every text + image combination for each subset, scale each combined
# set, and add the results next to the existing feature sets.
# NOTE(review): scale_feature_set is called separately per subset, so the test
# split may be scaled with its own statistics rather than the training ones -
# confirm against utils.scale_feature_set.
for subset in X:
    X_combos = {
        combo: scale_feature_set(combined)
        for combo, combined in combine_features_all_txt_img(X[subset]).items()
    }
    X[subset].update(X_combos)

In [9]:
# Print an overview of X; the captured output lists the type and shape of every feature set per subset.
print_info_features(X)

X = {
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 445), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 332), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 368), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 293), 
		tfidf + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 632), 
		tfidf + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 668), 
		tfidf + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 593), 
		bert + cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 777), 
		bert + resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 813), 
		bert + vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 738), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
	

## Encode Target Labels

In [10]:
le = LabelEncoder()

# Fit the encoder on the training labels only, then reuse the fitted encoder
# for the test labels so that both splits share the same integer codes.
train_labels, test_labels = y["train"]["HistoricalPeriod"], y["test"]["HistoricalPeriod"]
y["train"]["HistoricalPeriod_encoded"] = le.fit_transform(train_labels)
y["test"]["HistoricalPeriod_encoded"] = le.transform(test_labels)

# Report which integer code was assigned to each class label.
label_mapping = {
    label: encoding
    for label, encoding in zip(le.classes_, le.transform(le.classes_))
}
for label, encoding in label_mapping.items():
    print(f"{encoding} --> {label}")

0 --> Archaic
1 --> Classical
2 --> Hellenistic
3 --> Orientalizing


## Load Best Parameters from HP Tuning

In [11]:
# Load the tuned hyper-parameters. The file maps model name -> {key: params};
# JSON object keys are always strings, so each key is parsed back into a
# Python value after loading.
path_params = "best_params.json"
with open(path_params, "r", encoding="utf-8") as f:
    best_params_raw = json.load(f)

# SECURITY: the keys were previously passed to eval(), which executes any
# expression stored in the file. ast.literal_eval only accepts Python literals
# (strings, numbers, tuples, lists, dicts, ...) and raises ValueError or
# SyntaxError otherwise.
# NOTE(review): assumes the keys were written as literals (e.g. str/tuple repr) -
# confirm against the HP-tuning notebook that produces this file.
best_params = {
    model: {ast.literal_eval(k): v for k, v in param_dict.items()}
    for model, param_dict in best_params_raw.items()
}

## Initialize 10-Fold Cross Validation


In [12]:
# Stratified, shuffled 10-fold split with a fixed seed. The folds are
# materialised once so that every model is evaluated on the same index pairs.
skf = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)
folds = [
    (train_idx, val_idx)
    for train_idx, val_idx in skf.split(
        np.arange(len(y["train"])),
        y["train"]["HistoricalPeriod_encoded"],
    )
]

## Initialize Scoreboard

In [13]:
# Empty scoreboard: three identifying columns followed by one column per metric in metrics_c.
scoreboard_columns = ["model", "target", "features", *metrics_c]
scoreboard = pd.DataFrame(columns=scoreboard_columns)
scoreboard

Unnamed: 0,model,target,features,accuracy,precision,recall,f1


## Logistic Regression

In [14]:
# Cross-validate LogisticRegression with its tuned parameters on the training
# feature sets, using the precomputed folds, then add the scores to the scoreboard.
model_name, model_class = 'LogisticRegression', LogisticRegression
y_train_encoded = y["train"][["HistoricalPeriod_encoded"]]

model_scoreboard = run_cv_all_2(
    model_name, model_class, best_params[model_name],
    folds, metrics_c,
    X["train"], y_train_encoded,
    enable_plots=False,
)
# plot_compare_feature_scores(model_scoreboard)
scoreboard = update_scoreboard(scoreboard, model_scoreboard)

In [15]:
# Show the scoreboard after the LogisticRegression run.
scoreboard

Unnamed: 0,model,target,features,accuracy,precision,recall,f1
0,LogisticRegression,HistoricalPeriod_encoded,tfidf,0.783602,0.451795,0.408849,0.411579
1,LogisticRegression,HistoricalPeriod_encoded,bert,0.718428,0.482238,0.404619,0.418639
2,LogisticRegression,HistoricalPeriod_encoded,cannyhog,0.668421,0.459434,0.400448,0.40619
3,LogisticRegression,HistoricalPeriod_encoded,resnet,0.67655,0.499124,0.362483,0.38018
4,LogisticRegression,HistoricalPeriod_encoded,vit,0.691089,0.580092,0.410162,0.43384
5,LogisticRegression,HistoricalPeriod_encoded,tfidf + cannyhog,0.788838,0.536953,0.5203,0.522944
6,LogisticRegression,HistoricalPeriod_encoded,tfidf + resnet,0.79407,0.537302,0.441594,0.458275
7,LogisticRegression,HistoricalPeriod_encoded,tfidf + vit,0.798722,0.573944,0.544372,0.551165
8,LogisticRegression,HistoricalPeriod_encoded,bert + cannyhog,0.713199,0.491724,0.490473,0.488236
9,LogisticRegression,HistoricalPeriod_encoded,bert + resnet,0.723072,0.513036,0.395828,0.410765


In [33]:
# NOTE(review): second display of the same scoreboard. Its execution count is
# out of sequence and the captured metrics differ from the previous output, so
# upstream cells were re-run with different results - restart the kernel and
# run all cells to confirm which numbers are current.
scoreboard

Unnamed: 0,model,target,features,accuracy,precision,recall,f1
0,LogisticRegression,HistoricalPeriod_encoded,tfidf,0.783602,0.451795,0.408849,0.411579
1,LogisticRegression,HistoricalPeriod_encoded,bert,0.71785,0.481629,0.405132,0.418756
2,LogisticRegression,HistoricalPeriod_encoded,cannyhog,0.653869,0.419239,0.422191,0.386692
3,LogisticRegression,HistoricalPeriod_encoded,resnet,0.683527,0.554487,0.399265,0.420627
4,LogisticRegression,HistoricalPeriod_encoded,vit,0.692826,0.589661,0.407722,0.43204
5,LogisticRegression,HistoricalPeriod_encoded,tfidf + cannyhog,0.77314,0.500951,0.525119,0.50648
6,LogisticRegression,HistoricalPeriod_encoded,tfidf + resnet,0.801061,0.623,0.501225,0.528703
7,LogisticRegression,HistoricalPeriod_encoded,tfidf + vit,0.806287,0.5986,0.559071,0.569892
8,LogisticRegression,HistoricalPeriod_encoded,bert + cannyhog,0.719611,0.448758,0.477517,0.456813
9,LogisticRegression,HistoricalPeriod_encoded,bert + resnet,0.747521,0.585085,0.470387,0.495277


## KNN

In [35]:
# Cross-validate KNeighborsClassifier with the parameters stored under "KNN"
# on the training feature sets, using the precomputed folds, then add the
# scores to the scoreboard.
model_name, model_class = 'KNN', KNeighborsClassifier
y_train_encoded = y["train"][["HistoricalPeriod_encoded"]]

model_scoreboard = run_cv_all_2(
    model_name, model_class, best_params[model_name],
    folds, metrics_c,
    X["train"], y_train_encoded,
    enable_plots=False,
)
# plot_compare_feature_scores(model_scoreboard)
scoreboard = update_scoreboard(scoreboard, model_scoreboard)

In [36]:
# Show the scoreboard after the KNN run.
# NOTE(review): the captured output below lists only LogisticRegression rows;
# it looks truncated or stale - confirm that the KNN rows were appended.
scoreboard

Unnamed: 0,model,target,features,accuracy,precision,recall,f1
0,LogisticRegression,HistoricalPeriod_encoded,tfidf,0.783602,0.451795,0.408849,0.411579
1,LogisticRegression,HistoricalPeriod_encoded,bert,0.71785,0.481629,0.405132,0.418756
2,LogisticRegression,HistoricalPeriod_encoded,cannyhog,0.653869,0.419239,0.422191,0.386692
3,LogisticRegression,HistoricalPeriod_encoded,resnet,0.683527,0.554487,0.399265,0.420627
4,LogisticRegression,HistoricalPeriod_encoded,vit,0.692826,0.589661,0.407722,0.43204
5,LogisticRegression,HistoricalPeriod_encoded,tfidf + cannyhog,0.77314,0.500951,0.525119,0.50648
6,LogisticRegression,HistoricalPeriod_encoded,tfidf + resnet,0.801061,0.623,0.501225,0.528703
7,LogisticRegression,HistoricalPeriod_encoded,tfidf + vit,0.806287,0.5986,0.559071,0.569892
8,LogisticRegression,HistoricalPeriod_encoded,bert + cannyhog,0.719611,0.448758,0.477517,0.456813
9,LogisticRegression,HistoricalPeriod_encoded,bert + resnet,0.747521,0.585085,0.470387,0.495277
