In [1]:
import pandas as pd
import numpy as np
import os

import torch

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from utils import f_types, d_types_methods, print_info_features, print_info_targets

### Working Directory Path

In [4]:
# Data locations, given relative to the notebook's working directory.
path = "../data/"
path_features = f"{path}features/"

# Everything this notebook exports goes under this folder; create it if needed.
path_export = f"{path}chronology_prediction/"
os.makedirs(path_export, exist_ok=True)

### Read Data & Features

In [5]:
# Load the Agora XII object table and display it.
data_file = path + 'agora12_data_pp.csv'
data = pd.read_csv(data_file)
data

Unnamed: 0,Id,FullText,ImageFilename,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod,ValidChronology
0,Agora:Object:Agora XII:1,foot missing. offset neck and echinoid mouth; ...,Agora_Image_2012.54.1450.jpg,-575.0,-550.0,-562.5,25.0,Archaic,True
1,Agora:Object:Agora XII:2,spreading ring foot with torus outer and conve...,,-525.0,-500.0,-512.5,25.0,Archaic,True
2,Agora:Object:Agora XII:3,flaring ring foot. torus mouth. ridge at junct...,,-500.0,-500.0,-500.0,0.0,Archaic,True
3,Agora:Object:Agora XII:4,ring foot. echinoid mouth inset from neck and ...,,-500.0,-500.0,-500.0,0.0,Archaic,True
4,Agora:Object:Agora XII:5,echinus ring foot. torus mouth; strap handles....,Agora_Image_2012.27.0009.jpg,-525.0,-500.0,-512.5,25.0,Archaic,True
...,...,...,...,...,...,...,...,...,...
1995,Agora:Object:Agora XII:2036,fragment of rim and floor with handle. deep ba...,,-350.0,-320.0,-335.0,30.0,Classical,True
1996,Agora:Object:Agora XII:2037,handles missing. basin and lid; handles probab...,Agora_Image_2012.55.1261.jpg,-350.0,-301.0,-325.5,49.0,Classical,True
1997,Agora:Object:Agora XII:2038,small series. plain flat-topped rim; flaring b...,Agora_Image_2012.25.0184.jpg,-435.0,-425.0,-430.0,10.0,Classical,True
1998,Agora:Object:Agora XII:2039,"small series. rim flat on top, roughly moulded...",Agora_Image_2012.55.1268.jpg,-375.0,-325.0,-350.0,50.0,Classical,True


In [6]:
# Load every feature file that exists on disk into
# features[feature_type][method].
# `f_types` maps a feature type to (file extension, loader, loader kwargs) and
# `d_types_methods` maps a data type to its methods; both come from utils.
# Combinations without a matching file are skipped without any message.
features = {feature_type: {} for feature_type in f_types}
for feature_type, (ext, loader, params) in f_types.items():
    loaded_for_type = features[feature_type]
    for data_type, methods in d_types_methods.items():
        for method in methods:
            filename = f"{data_type}_{method}_{feature_type}.{ext}"
            full_path = os.path.join(path_features, filename)
            if not os.path.exists(full_path):
                continue
            loaded_for_type[method] = loader(full_path, **params)
            print(f"Loaded {filename}")

Loaded text_tfidf_vectors.csv
Loaded text_bert_vectors.csv
Loaded image_cannyhog_vectors.csv
Loaded image_resnet_vectors.csv
Loaded image_vit_vectors.csv
Loaded text_bert_tensors.pt
Loaded image_resnet_tensors.pt
Loaded image_vit_tensors.pt


In [7]:
def print_features_info(features: dict) -> None:
    """Print a short summary of every feature set in ``features``.

    Args:
        features: Nested mapping ``{feature_type: {method: feature_set}}``.
            Feature sets stored under ``"vectors"`` are summarised through
            their ``.info()`` method (as pandas DataFrames provide); those
            stored under ``"tensors"`` are described by number of dimensions,
            shape, dtype and memory footprint. Any other feature type only
            gets its header line.

    Returns:
        None. All information is written to standard output.
    """
    for ft, by_method in features.items():
        for method, feature_set in by_method.items():
            print(f"\n** {method.upper()} {ft.upper()} **")
            if ft == "vectors":
                # info() writes its report itself and returns None, so it must
                # not be wrapped in print() (that emitted a stray "None" line).
                feature_set.info()
            elif ft == "tensors":
                n_bytes = feature_set.element_size() * feature_set.numel()
                print(f"Dimensions: {feature_set.ndim}")
                print(f"Shape: {feature_set.shape}")
                print(f"dtype: {feature_set.dtype}")
                print(f"Memory usage: {n_bytes / (1024 * 1024):.2f} MB")


# Summarise the feature sets as they were loaded from disk.
print_features_info(features)


** TFIDF VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 300 entries, F0 to F299
dtypes: float64(300)
memory usage: 4.6 MB
None

** BERT VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.7 MB
None

** CANNYHOG VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 2917 entries, F0 to F2916
dtypes: float64(2917)
memory usage: 44.5 MB
None

** RESNET VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 2048 entries, F0 to F2047
dtypes: float64(2048)
memory usage: 31.3 MB
None

** VIT VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.7 MB
None

** BERT TENSORS **
Dimensions: 2
Shape: torch.Size([2000, 768])
dtype: torch.float32
Memory usage: 5.86 MB

** RE

### Create Missing Tensors from Vectors

In [8]:
# Methods that only have a vector table get a float32 tensor built from that
# table; methods that already have a tensor are left untouched.
tensor_store = features["tensors"]
for method, vectors in features["vectors"].items():
    if method in tensor_store:
        continue
    tensor_store[method] = torch.tensor(vectors.values, dtype=torch.float32)

### Drop Records with Invalid Chronology (Null or Year Range > 50)

In [9]:
# Row mask taken from the "ValidChronology" column of `data`.
valid_mask = data["ValidChronology"]
# Convert through NumPy explicitly instead of handing the Series itself to
# torch.tensor, so the result does not depend on how torch treats pandas
# objects (or on the Series index).
valid_mask_torch = torch.tensor(valid_mask.to_numpy(), dtype=torch.bool)

In [10]:
# Keep only the rows flagged by `valid_mask`, then renumber them from 0.
# NOTE: this overwrites `data`; reload it before re-running this cell.
valid_rows = data.loc[valid_mask]
data = valid_rows.reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                1910 non-null   object 
 1   FullText          1910 non-null   object 
 2   ImageFilename     923 non-null    object 
 3   StartYear         1910 non-null   float64
 4   EndYear           1910 non-null   float64
 5   MidpointYear      1910 non-null   float64
 6   YearRange         1910 non-null   float64
 7   HistoricalPeriod  1910 non-null   object 
 8   ValidChronology   1910 non-null   bool   
dtypes: bool(1), float64(4), object(4)
memory usage: 121.4+ KB


In [11]:
# Apply the same row mask to every vector table and renumber the rows from 0.
features["vectors"].update({
    method: vectors.loc[valid_mask].reset_index(drop=True)
    for method, vectors in features["vectors"].items()
})

In [12]:
# Keep only the masked rows of every tensor, using the torch boolean mask.
features["tensors"].update({
    method: tensor[valid_mask_torch]
    for method, tensor in features["tensors"].items()
})

In [13]:
# Summarise the feature sets again, now that the row mask has been applied.
print_features_info(features)


** TFIDF VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 300 entries, F0 to F299
dtypes: float64(300)
memory usage: 4.4 MB
None

** BERT VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.2 MB
None

** CANNYHOG VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 2917 entries, F0 to F2916
dtypes: float64(2917)
memory usage: 42.5 MB
None

** RESNET VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 2048 entries, F0 to F2047
dtypes: float64(2048)
memory usage: 29.8 MB
None

** VIT VECTORS **
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.2 MB
None

** BERT TENSORS **
Dimensions: 2
Shape: torch.Size([1910, 768])
dtype: torch.float32
Memory usage: 5.60 MB

** RE

## Target

In [14]:
# Columns of `data` that are kept as prediction targets.
target_cols = [
    "StartYear",
    "EndYear",
    "MidpointYear",
    "YearRange",
    "HistoricalPeriod",
]
target = data.loc[:, target_cols]
target

Unnamed: 0,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod
0,-575.0,-550.0,-562.5,25.0,Archaic
1,-525.0,-500.0,-512.5,25.0,Archaic
2,-500.0,-500.0,-500.0,0.0,Archaic
3,-500.0,-500.0,-500.0,0.0,Archaic
4,-525.0,-500.0,-512.5,25.0,Archaic
...,...,...,...,...,...
1905,-375.0,-325.0,-350.0,50.0,Classical
1906,-350.0,-320.0,-335.0,30.0,Classical
1907,-350.0,-301.0,-325.5,49.0,Classical
1908,-435.0,-425.0,-430.0,10.0,Classical


## Split Train & Test Sets

In [15]:
# Split row positions rather than the rows themselves, so that one partition
# can be applied to both the DataFrames and the tensors.
indices = np.arange(len(data))
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.1,
    random_state=42,
)

# int64 tensor copies of the same positions, for indexing torch tensors.
train_idx_torch, test_idx_torch = (
    torch.tensor(idx, dtype=torch.int64) for idx in (train_idx, test_idx)
)

In [16]:
# train_idx / test_idx hold row *positions* (they come from np.arange), so the
# vector tables are sliced with .iloc. Label-based .loc gave the same rows only
# because every table had been re-indexed to 0..n-1 beforehand.
X_vectors = {
    subset: {
        method: vectors.iloc[positions]
        for method, vectors in features["vectors"].items()
    }
    for subset, positions in (("train", train_idx), ("test", test_idx))
}

print_info_features(X_vectors)

{
	train: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (1719, 768), 
	},
	test: {
		tfidf: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 300), 
		bert: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
		cannyhog: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2917), 
		resnet: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 2048), 
		vit: 
			<class 'pandas.core.frame.DataFrame'>
			shape = (191, 768), 
	},
}


In [17]:
# Same partition as for the vector tables, applied to the tensors through the
# int64 index tensors.
X_tensors = {
    subset: {
        method: tensors[positions]
        for method, tensors in features["tensors"].items()
    }
    for subset, positions in (("train", train_idx_torch), ("test", test_idx_torch))
}

print_info_features(X_tensors)

{
	train: {
		bert: 
			<class 'torch.Tensor'>
			shape = torch.Size([1719, 768]), 
		resnet: 
			<class 'torch.Tensor'>
			shape = torch.Size([1719, 2048]), 
		vit: 
			<class 'torch.Tensor'>
			shape = torch.Size([1719, 768]), 
		tfidf: 
			<class 'torch.Tensor'>
			shape = torch.Size([1719, 300]), 
		cannyhog: 
			<class 'torch.Tensor'>
			shape = torch.Size([1719, 2917]), 
	},
	test: {
		bert: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
		resnet: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2048]), 
		vit: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 768]), 
		tfidf: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 300]), 
		cannyhog: 
			<class 'torch.Tensor'>
			shape = torch.Size([191, 2917]), 
	},
}


In [18]:
# train_idx / test_idx are row positions, so the targets are sliced with .iloc
# (label-based .loc matched only because `data` had been re-indexed to 0..n-1).
y = {
    "train": target.iloc[train_idx],
    "test": target.iloc[test_idx],
}

print_info_targets(y)

{
	train: 
		<class 'pandas.core.frame.DataFrame'>
		shape   = (1719, 5)
		columns = ['StartYear', 'EndYear', 'MidpointYear', 'YearRange', 'HistoricalPeriod'],
	test: 
		<class 'pandas.core.frame.DataFrame'>
		shape   = (191, 5)
		columns = ['StartYear', 'EndYear', 'MidpointYear', 'YearRange', 'HistoricalPeriod'],
}


### Save Data

In [19]:
# Write every vector split to <path_export>vectors/X_<subset>_<method>.csv.
vectors_dir = path_export + "vectors"
os.makedirs(vectors_dir, exist_ok=True)

for subset, by_method in X_vectors.items():
    for method, vectors in by_method.items():
        csv_path = f"{vectors_dir}/X_{subset}_{method}.csv"
        vectors.to_csv(csv_path, index=False, encoding='utf-8', sep=',', header=True)

In [20]:
# Write every tensor split to <path_export>tensors/X_<subset>_<method>.pt.
tensors_dir = path_export + "tensors"
os.makedirs(tensors_dir, exist_ok=True)

for subset, by_method in X_tensors.items():
    for method, tensors in by_method.items():
        pt_path = f"{tensors_dir}/X_{subset}_{method}.pt"
        torch.save(tensors, pt_path)

In [21]:
# Write every target split to <path_export>targets/y_<subset>.csv.
os.makedirs(path_export + "targets", exist_ok=True)

# The loop variable is deliberately not named `target`: that name holds the
# full target DataFrame the train/test split is built from, and rebinding it
# here would break any later re-run of the split cell.
for subset, y_subset in y.items():
    y_subset.to_csv(
        path_export + "targets/" + "y_" + subset + ".csv",
        index=False,
        encoding='utf-8',
        sep=',',
        header=True
    )