## Basics

### Imports

In [47]:
import joblib
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from feature_extraction import get_tfidf_vectorizer, get_bert_embedding, extract_canny_features, extract_hog_features, \
    extract_resnet_features, extract_vit_features
from utils import get_device, d_types_methods, print_info_features, scale, encode, print_info_targets

In [2]:
import torch

### Settings

In [3]:
# Set the working device. get_device (imported from utils) reports the
# PyTorch/CUDA status and returns the device on which the target tensors
# further down are created.
device = get_device()

PyTorch Version: 2.5.1
CUDA is available
GPU: NVIDIA GeForce RTX 4080
Using Device: cuda


In [4]:
# Root of the data directory, relative to the current working directory
path = "../../data/"

# Image folder plus the two output folders (feature tensors and target tensors)
path_images = f"{path}images/"
path_features = f"{path}features2/tensors/"
path_targets = f"{path}targets/"

# Create the output folders up front so the save cells never hit a missing directory
for output_dir in (path_features, path_targets):
    os.makedirs(output_dir, exist_ok=True)

In [5]:
# Fraction of the records assigned to each subset
TRAIN_SPLIT = 0.80
VAL_SPLIT = 0.10
TEST_SPLIT = 0.10
# Seed passed to both train_test_split calls so the partition is reproducible
RANDOM_STATE = 42
# Column whose class proportions are preserved when splitting
STRATIFY_COLUMN = "HistoricalPeriod"

# The split cells derive the hold-out size as 1 - TRAIN_SPLIT and then divide it
# by TEST_SPLIT / (TEST_SPLIT + VAL_SPLIT); that is only consistent when the
# three fractions add up to one, so fail early if someone edits just one of them.
assert abs(TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT - 1.0) < 1e-9, \
    "TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT must sum to 1"

In [6]:
# Column holding the textual description fed to the text feature extractors (TF-IDF, BERT)
TEXT_COLUMN = "FullText"
# Column holding the image file name fed to the image feature extractors (Canny/HOG, ResNet, ViT)
IMAGE_COLUMN = "ImageFilename"

## Data

### Read Data

In [7]:
# Load the preprocessed dataset
data = pd.read_csv(path + 'agora12_data_pp.csv')
# Records without an image have a missing file name; normalise those to an empty
# string so the column contains only strings. fillna states that intent directly
# (the previous replace(np.nan, '', regex=True) asked for regex matching on a
# NaN target, which has no meaning).
data[IMAGE_COLUMN] = data[IMAGE_COLUMN].fillna('')
data

Unnamed: 0,Id,FullText,ImageFilename,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod,ValidChronology
0,Agora:Object:Agora XII:1,foot missing. offset neck and echinoid mouth; ...,Agora_Image_2012.54.1450.jpg,-575.0,-550.0,-562.5,25.0,Archaic,True
1,Agora:Object:Agora XII:2,spreading ring foot with torus outer and conve...,,-525.0,-500.0,-512.5,25.0,Archaic,True
2,Agora:Object:Agora XII:3,flaring ring foot. torus mouth. ridge at junct...,,-500.0,-500.0,-500.0,0.0,Archaic,True
3,Agora:Object:Agora XII:4,ring foot. echinoid mouth inset from neck and ...,,-500.0,-500.0,-500.0,0.0,Archaic,True
4,Agora:Object:Agora XII:5,echinus ring foot. torus mouth; strap handles....,Agora_Image_2012.27.0009.jpg,-525.0,-500.0,-512.5,25.0,Archaic,True
...,...,...,...,...,...,...,...,...,...
1995,Agora:Object:Agora XII:2036,fragment of rim and floor with handle. deep ba...,,-350.0,-320.0,-335.0,30.0,Classical,True
1996,Agora:Object:Agora XII:2037,handles missing. basin and lid; handles probab...,Agora_Image_2012.55.1261.jpg,-350.0,-301.0,-325.5,49.0,Classical,True
1997,Agora:Object:Agora XII:2038,small series. plain flat-topped rim; flaring b...,Agora_Image_2012.25.0184.jpg,-435.0,-425.0,-430.0,10.0,Classical,True
1998,Agora:Object:Agora XII:2039,"small series. rim flat on top, roughly moulded...",Agora_Image_2012.55.1268.jpg,-375.0,-325.0,-350.0,50.0,Classical,True


### Drop Non-Valid Chronology Records
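 Records whose chronology is null or whose `YearRange` is greater than 50 are flagged as invalid (`ValidChronology == False`) and removed.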


In [8]:
# Keep only the rows flagged as having a valid chronology. The index is rebuilt
# as 0..n-1 because the split cells below work with row numbers produced by
# np.arange and later use them to select rows.
data = data[data["ValidChronology"]].reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                1910 non-null   object 
 1   FullText          1910 non-null   object 
 2   ImageFilename     1910 non-null   object 
 3   StartYear         1910 non-null   float64
 4   EndYear           1910 non-null   float64
 5   MidpointYear      1910 non-null   float64
 6   YearRange         1910 non-null   float64
 7   HistoricalPeriod  1910 non-null   object 
 8   ValidChronology   1910 non-null   bool   
dtypes: bool(1), float64(4), object(4)
memory usage: 121.4+ KB


### Train, Validation, Test Split

In [9]:
# Names of the data subsets, used as dictionary keys throughout the notebook.
# "train" must stay first: the TF-IDF cell fits the vectorizer when it meets
# "train" and only transforms the other sets.
set_names = ["train", "val", "test"]

In [10]:
# Row numbers (0..n-1) of the filtered frame; the row numbers, not the rows, are what gets split
indices_full = np.arange(data.shape[0])
# Filled by the split cells: set name -> array of row numbers belonging to that set
indices = {}

In [11]:
# First split: training rows versus everything held out for validation + test.
# Stratifying on STRATIFY_COLUMN keeps the class proportions of the full data set.
train_and_holdout = train_test_split(
    indices_full,
    stratify=data[STRATIFY_COLUMN],
    test_size=(1 - TRAIN_SPLIT),
    random_state=RANDOM_STATE,
)
indices["train"], indices_val_test = train_and_holdout

In [12]:
# Second split: divide the held-out rows into validation and test sets.
# The first split is stratified on STRATIFY_COLUMN; do the same here so that
# validation and test keep comparable class proportions (important for the
# rarer historical periods).
labels_val_test = data[STRATIFY_COLUMN].iloc[indices_val_test]

# train_test_split refuses to stratify when a class has fewer than two members,
# so fall back to the plain random split in that case instead of crashing.
stratify_val_test = labels_val_test if labels_val_test.value_counts().min() >= 2 else None

indices["val"], indices["test"] = train_test_split(
    indices_val_test,
    test_size=TEST_SPLIT / (TEST_SPLIT + VAL_SPLIT),
    stratify=stratify_val_test,
    random_state=RANDOM_STATE,
)

In [13]:
# Report how many rows ended up in each subset
for set_name in set_names:
    n_samples = len(indices[set_name])
    print(f"{set_name} set: {n_samples} samples.")

train set: 1528 samples.
val set: 191 samples.
test set: 191 samples.


In [14]:
# From here on `data` is a dict {set name -> DataFrame}; the unsplit frame stays
# available as `data_full`. The isinstance guard makes the cell safe to re-run:
# on a second run `data` is already the dict and `data_full` is left untouched.
if isinstance(data, pd.DataFrame):
    data_full = data.copy()

data = {}
for set_name in set_names:
    # `indices` holds row positions (built from np.arange), so select by position
    # with .iloc rather than by label; this stays correct even if the index is
    # not the default 0..n-1 range.
    data[set_name] = data_full.iloc[indices[set_name]]

## Feature Extraction

### Initialize Feature Holder

In [15]:
# One empty holder per feature type listed in d_types_methods (imported from utils);
# each holder is later filled with {set name -> tensor}.
X = {}
for feature_types in d_types_methods.values():
    for feature_type in feature_types:
        X[feature_type] = {}

### TF-IDF

In [16]:
tfidf_vectorizer = get_tfidf_vectorizer()

for set_name in set_names:
    texts = data[set_name][TEXT_COLUMN]
    # The vectorizer is fitted on the training texts only; validation and test
    # are transformed with the already fitted vectorizer to avoid leakage.
    vectorize = tfidf_vectorizer.fit_transform if set_name == "train" else tfidf_vectorizer.transform
    # Densify the sparse result before converting it to a float32 tensor
    X["tfidf"][set_name] = torch.tensor(vectorize(texts).toarray(), dtype=torch.float32)

In [17]:
# Sanity check: report type and shape of the TF-IDF tensor of every set
print_info_features({"tfidf": X["tfidf"]})

{
	tfidf: {
		train: 
			<class 'torch.Tensor'>
			shape = torch.Size([1528, 300]), 
		val: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 300]), 
		test: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 300]), 
	},
}


### BERT

In [18]:
for set_name in set_names:
    # One embedding per record, computed from its text, then stacked into a single tensor
    embeddings = [get_bert_embedding(text) for text in data[set_name][TEXT_COLUMN]]
    X["bert"][set_name] = torch.stack(embeddings)

In [19]:
# Sanity check: report type and shape of the BERT tensor of every set
print_info_features({"bert": X["bert"]})

{
	bert: {
		train: 
			<class 'torch.Tensor'>
			shape = torch.Size([1528, 768]), 
		val: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
		test: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
	},
}


### CANNY + HOG

In [20]:
for set_name in set_names:
    image_files = data[set_name][IMAGE_COLUMN]

    # Canny-based features, one entry per image (transposed as in the original layout)
    canny_features = np.array([extract_canny_features(image_file) for image_file in image_files]).T

    # HOG descriptors, one row per image
    hog_features = np.array([extract_hog_features(image_file) for image_file in image_files])

    # Concatenate both feature groups column-wise and store them as a float32 tensor
    combined_features = np.column_stack((canny_features, hog_features))
    X["cannyhog"][set_name] = torch.tensor(combined_features, dtype=torch.float32)

In [21]:
# Sanity check: report type and shape of the Canny + HOG tensor of every set
print_info_features({"cannyhog": X["cannyhog"]})

{
	cannyhog: {
		train: 
			<class 'torch.Tensor'>
			shape = torch.Size([1528, 2917]), 
		val: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2917]), 
		test: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2917]), 
	},
}


### RESNET

In [22]:
for set_name in set_names:
    # One ResNet feature vector per image file, stacked into a single tensor
    resnet_vectors = [extract_resnet_features(image_file) for image_file in data[set_name][IMAGE_COLUMN]]
    X["resnet"][set_name] = torch.stack(resnet_vectors)

In [23]:
# Sanity check: report type and shape of the ResNet tensor of every set
print_info_features({"resnet": X["resnet"]})

{
	resnet: {
		train: 
			<class 'torch.Tensor'>
			shape = torch.Size([1528, 2048]), 
		val: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2048]), 
		test: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2048]), 
	},
}


### VIT

In [24]:
for set_name in set_names:
    # One ViT feature vector per image file, stacked into a single tensor
    vit_vectors = [extract_vit_features(image_file) for image_file in data[set_name][IMAGE_COLUMN]]
    X["vit"][set_name] = torch.stack(vit_vectors)

In [25]:
# Sanity check: report type and shape of the ViT tensor of every set
print_info_features({"vit": X["vit"]})

{
	vit: {
		train: 
			<class 'torch.Tensor'>
			shape = torch.Size([1528, 768]), 
		val: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
		test: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
	},
}


### Save Feature Sets

In [26]:
# Write every feature tensor to <path_features>/X_<set>_<feature type>.pt
for feature_type, feature_sets in X.items():
    for set_name, _X in feature_sets.items():
        torch.save(_X, f"{path_features}X_{set_name}_{feature_type}.pt")

## Targets

In [35]:
# Target column(s) per task. The regression entry is a list, so selecting it
# yields a two-column frame; the classification entry is a single column name,
# so selecting it yields one column.
targets = {
    "regression": ["StartYear", "YearRange"],
    "classification": "HistoricalPeriod",
}

In [36]:
# One output folder per task under path_targets
for task in targets:
    os.makedirs(f"{path_targets}{task}/", exist_ok=True)

In [37]:
# y[task][set name] -> target column(s) of that task, taken from the matching subset
y = {
    task: {set_name: data[set_name][target_set] for set_name in set_names}
    for task, target_set in targets.items()
}

### Regression: Scale Targets

In [38]:
# Convert the regression targets of every set from pandas objects to numpy arrays
regression_arrays = {}
for set_name, _y in y["regression"].items():
    regression_arrays[set_name] = _y.to_numpy()
y["regression"] = regression_arrays

In [39]:
# Scale the regression targets with the `scale` helper from utils and keep the
# returned scaler, which is exported further down.
# NOTE(review): presumably the scaler is fitted on the train set only — confirm in utils.scale.
y["regression"], y_scaler = scale(y["regression"])

In [40]:
# Convert the scaled regression targets to float32 tensors placed on the working device
regression_tensors = {}
for subset, _y in y["regression"].items():
    regression_tensors[subset] = torch.tensor(_y, dtype=torch.float32, device=device)
y["regression"] = regression_tensors

print_info_targets(y["regression"])

{
	train: 
		<class 'torch.Tensor'>
		shape   = torch.Size([1528, 2])
	val: 
		<class 'torch.Tensor'>
		shape   = torch.Size([191, 2])
	test: 
		<class 'torch.Tensor'>
		shape   = torch.Size([191, 2])
}


### Classification: Encode Target

In [41]:
# Encode the period labels as integer class ids with the `encode` helper from
# utils (the mapping it prints is shown below) and keep the returned encoder,
# which is exported further down.
y["classification"], y_encoder = encode(y["classification"])

0 --> Archaic
1 --> Classical
2 --> Hellenistic
3 --> Orientalizing


In [43]:
# Convert the encoded class ids to long tensors placed on the working device
classification_tensors = {}
for subset, _y in y["classification"].items():
    classification_tensors[subset] = torch.tensor(_y, dtype=torch.long, device=device)
y["classification"] = classification_tensors

print_info_targets(y["classification"])

{
	train: 
		<class 'torch.Tensor'>
		shape   = torch.Size([1528])
	val: 
		<class 'torch.Tensor'>
		shape   = torch.Size([191])
	test: 
		<class 'torch.Tensor'>
		shape   = torch.Size([191])
}


### Save Target Tensors

In [45]:
# Write every target tensor to <path_targets>/<task>/y_<set>.pt
# NOTE(review): the target tensors were created on `device`, so on a GPU run they
# are serialized as CUDA tensors (the feature tensors are saved from CPU).
# Loading them on a CPU-only machine will need torch.load(..., map_location=...).
for task, task_sets in y.items():
    for set_name, _y in task_sets.items():
        target_file = f"{path_targets}{task}/y_{set_name}.pt"
        torch.save(_y, target_file)

In [48]:
# Export the regression target scaler next to the regression targets
# (needed to map scaled values back to the original units)
joblib.dump(y_scaler, f"{path_targets}regression/y_scaler.pkl")

['../../data/targets/regression/y_scaler.pkl']

In [49]:
# Export the label encoder next to the classification targets
# (needed to map class ids back to period names)
joblib.dump(y_encoder, f"{path_targets}classification/y_encoder.pkl")

['../../data/targets/classification/y_encoder.pkl']