In [1]:
import pandas as pd
import numpy as np
import os

import torch

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from utils import f_types, d_types_methods

### Working Directory Path

In [4]:
# Local Path for Running Locally
path = "../data/"
path_features = path + "features/"

path_export = path + "chronology_prediction/"
os.makedirs(path_export, exist_ok=True)

### Read Data & Features

In [5]:
data = pd.read_csv(path + 'agora12_data_pp.csv')
data

Unnamed: 0,Id,FullText,ImageFilename,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod,ValidChronology
0,Agora:Object:Agora XII:1,foot missing. offset neck and echinoid mouth; ...,Agora_Image_2012.54.1450.jpg,-575.0,-550.0,-562.5,25.0,Archaic,True
1,Agora:Object:Agora XII:2,spreading ring foot with torus outer and conve...,,-525.0,-500.0,-512.5,25.0,Archaic,True
2,Agora:Object:Agora XII:3,flaring ring foot. torus mouth. ridge at junct...,,-500.0,-500.0,-500.0,0.0,Archaic,True
3,Agora:Object:Agora XII:4,ring foot. echinoid mouth inset from neck and ...,,-500.0,-500.0,-500.0,0.0,Archaic,True
4,Agora:Object:Agora XII:5,echinus ring foot. torus mouth; strap handles....,Agora_Image_2012.27.0009.jpg,-525.0,-500.0,-512.5,25.0,Archaic,True
...,...,...,...,...,...,...,...,...,...
1995,Agora:Object:Agora XII:2036,fragment of rim and floor with handle. deep ba...,,-350.0,-320.0,-335.0,30.0,Classical,True
1996,Agora:Object:Agora XII:2037,handles missing. basin and lid; handles probab...,Agora_Image_2012.55.1261.jpg,-350.0,-301.0,-325.5,49.0,Classical,True
1997,Agora:Object:Agora XII:2038,small series. plain flat-topped rim; flaring b...,Agora_Image_2012.25.0184.jpg,-435.0,-425.0,-430.0,10.0,Classical,True
1998,Agora:Object:Agora XII:2039,"small series. rim flat on top, roughly moulded...",Agora_Image_2012.55.1268.jpg,-375.0,-325.0,-350.0,50.0,Classical,True


In [6]:
features = {ft: {} for ft in f_types}
for feature_type, (ext, loader, params) in f_types.items():
    for data_type, methods in d_types_methods.items():
        for method in methods:
            filename = f"{data_type}_{method}_{feature_type}.{ext}"
            file_path = os.path.join(path_features, filename)
            if os.path.exists(file_path):
                features[feature_type][method] = loader(file_path, **params)
                print(f"Loaded {filename}")

Loaded text_tfidf_vectors.csv
Loaded text_bert_vectors.csv
Loaded image_cannyhog_vectors.csv
Loaded image_resnet_vectors.csv
Loaded image_vit_vectors.csv
Loaded text_bert_tensors.pt
Loaded image_resnet_tensors.pt
Loaded image_vit_tensors.pt


In [7]:
for method in features["vectors"]:
    print(f"\nfeatures[\"vectors\"][\"{method}\"] = ")
    print(features["vectors"][method].info())


features["vectors"]["tfidf"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 300 entries, F0 to F299
dtypes: float64(300)
memory usage: 4.6 MB
None

features["vectors"]["bert"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.7 MB
None

features["vectors"]["cannyhog"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 2917 entries, F0 to F2916
dtypes: float64(2917)
memory usage: 44.5 MB
None

features["vectors"]["resnet"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 2048 entries, F0 to F2047
dtypes: float64(2048)
memory usage: 31.3 MB
None

features["vectors"]["vit"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.7 MB
None


In [8]:
for method in features["tensors"]:
    if method in features["tensors"]:
        print(f"\nfeatures[\"tensors\"][\"{method}\"] = ")
        tensor = features["tensors"][method]
        print(
            f"Dimensions: {tensor.ndim}\nShape: {tensor.shape}\ndtype: {tensor.dtype}\nMemory usage: {(tensor.element_size() * tensor.numel()) / (1024 * 1024)} MB")


features["tensors"]["bert"] = 
Dimensions: 2
Shape: torch.Size([2000, 768])
dtype: torch.float32
Memory usage: 5.859375 MB

features["tensors"]["resnet"] = 
Dimensions: 2
Shape: torch.Size([2000, 2048])
dtype: torch.float32
Memory usage: 15.625 MB

features["tensors"]["vit"] = 
Dimensions: 2
Shape: torch.Size([2000, 768])
dtype: torch.float32
Memory usage: 5.859375 MB


### Drop Records with Non-Valid Chronology (Null or Range > 50)

In [9]:
valid_mask = data["ValidChronology"]

In [10]:
data = data.loc[valid_mask].reset_index(drop=True)
data

Unnamed: 0,Id,FullText,ImageFilename,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod,ValidChronology
0,Agora:Object:Agora XII:1,foot missing. offset neck and echinoid mouth; ...,Agora_Image_2012.54.1450.jpg,-575.0,-550.0,-562.5,25.0,Archaic,True
1,Agora:Object:Agora XII:2,spreading ring foot with torus outer and conve...,,-525.0,-500.0,-512.5,25.0,Archaic,True
2,Agora:Object:Agora XII:3,flaring ring foot. torus mouth. ridge at junct...,,-500.0,-500.0,-500.0,0.0,Archaic,True
3,Agora:Object:Agora XII:4,ring foot. echinoid mouth inset from neck and ...,,-500.0,-500.0,-500.0,0.0,Archaic,True
4,Agora:Object:Agora XII:5,echinus ring foot. torus mouth; strap handles....,Agora_Image_2012.27.0009.jpg,-525.0,-500.0,-512.5,25.0,Archaic,True
...,...,...,...,...,...,...,...,...,...
1905,Agora:Object:Agora XII:2035,fragments from two closely similar examples; c...,Agora_Image_2012.56.0490.jpg,-375.0,-325.0,-350.0,50.0,Classical,True
1906,Agora:Object:Agora XII:2036,fragment of rim and floor with handle. deep ba...,,-350.0,-320.0,-335.0,30.0,Classical,True
1907,Agora:Object:Agora XII:2037,handles missing. basin and lid; handles probab...,Agora_Image_2012.55.1261.jpg,-350.0,-301.0,-325.5,49.0,Classical,True
1908,Agora:Object:Agora XII:2038,small series. plain flat-topped rim; flaring b...,Agora_Image_2012.25.0184.jpg,-435.0,-425.0,-430.0,10.0,Classical,True


In [11]:
for method in features["vectors"]:
    features["vectors"][method] = features["vectors"][method].loc[valid_mask].reset_index(drop=True)

    print(f"\nfeatures[\"vectors\"][\"{method}\"] = ")
    print(features["vectors"][method].info())


features["vectors"]["tfidf"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 300 entries, F0 to F299
dtypes: float64(300)
memory usage: 4.4 MB
None

features["vectors"]["bert"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.2 MB
None

features["vectors"]["cannyhog"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 2917 entries, F0 to F2916
dtypes: float64(2917)
memory usage: 42.5 MB
None

features["vectors"]["resnet"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 2048 entries, F0 to F2047
dtypes: float64(2048)
memory usage: 29.8 MB
None

features["vectors"]["vit"] = 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1910 entries, 0 to 1909
Columns: 768 entries, F0 to F767
dtypes: float64(768)
memory usage: 11.2 MB
None


In [12]:
for method, tensor in features["tensors"].items():
    features["tensors"][method] = tensor[valid_mask.values]  # convert mask to numpy

    print(f"\nfeatures[\"tensors\"][\"{method}\"] = ")
    tensor = features["tensors"][method]
    print(
        f"Dimensions: {tensor.ndim}\nShape: {tensor.shape}\ndtype: {tensor.dtype}\nMemory usage: {(tensor.element_size() * tensor.numel()) / (1024 * 1024)} MB")


features["tensors"]["bert"] = 
Dimensions: 2
Shape: torch.Size([1910, 768])
dtype: torch.float32
Memory usage: 5.595703125 MB

features["tensors"]["resnet"] = 
Dimensions: 2
Shape: torch.Size([1910, 2048])
dtype: torch.float32
Memory usage: 14.921875 MB

features["tensors"]["vit"] = 
Dimensions: 2
Shape: torch.Size([1910, 768])
dtype: torch.float32
Memory usage: 5.595703125 MB


## Target

In [13]:
target_cols = ["StartYear", "EndYear", "MidpointYear", "YearRange", "HistoricalPeriod"]
target = data[target_cols]
target

Unnamed: 0,StartYear,EndYear,MidpointYear,YearRange,HistoricalPeriod
0,-575.0,-550.0,-562.5,25.0,Archaic
1,-525.0,-500.0,-512.5,25.0,Archaic
2,-500.0,-500.0,-500.0,0.0,Archaic
3,-500.0,-500.0,-500.0,0.0,Archaic
4,-525.0,-500.0,-512.5,25.0,Archaic
...,...,...,...,...,...
1905,-375.0,-325.0,-350.0,50.0,Classical
1906,-350.0,-320.0,-335.0,30.0,Classical
1907,-350.0,-301.0,-325.5,49.0,Classical
1908,-435.0,-425.0,-430.0,10.0,Classical


## Split Train & Test Sets

In [14]:
indices = np.arange(data.shape[0])
train_idx, test_idx = train_test_split(indices, test_size=0.1, random_state=42)

In [15]:
X = {
    "vectors": {
        "train": {method: vectors.loc[train_idx] for method, vectors in features["vectors"].items()},
        "test": {method: vectors.loc[test_idx] for method, vectors in features["vectors"].items()}
    },
    "tensors": {
        "train": {method: tensors[train_idx] for method, tensors in features["tensors"].items()},
        "test": {method: tensors[test_idx] for method, tensors in features["tensors"].items()}
    }
}
X

{'vectors': {'train': {'tfidf':        F0        F1        F2   F3        F4        F5   F6   F7   F8  \
   1455  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   482   0.0  0.139391  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   772   0.0  0.000000  0.000000  0.0  0.000000  0.133552  0.0  0.0  0.0   
   1192  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   1007  0.0  0.000000  0.202137  0.0  0.000000  0.000000  0.0  0.0  0.0   
   ...   ...       ...       ...  ...       ...       ...  ...  ...  ...   
   1130  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   1294  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   860   0.0  0.160136  0.000000  0.0  0.170923  0.143958  0.0  0.0  0.0   
   1459  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   1126  0.0  0.000000  0.000000  0.0  0.000000  0.000000  0.0  0.0  0.0   
   
               F9  ...  F290  F291  F292  F293  F294  F

In [16]:
y = {
    "train": target.loc[train_idx],
    "test": target.loc[test_idx]
}
y

{'train':       StartYear  EndYear  MidpointYear  YearRange HistoricalPeriod
 1455     -425.0   -400.0        -412.5       25.0        Classical
 482      -430.0   -430.0        -430.0        0.0        Classical
 772      -410.0   -410.0        -410.0        0.0        Classical
 1192     -550.0   -550.0        -550.0        0.0          Archaic
 1007     -425.0   -400.0        -412.5       25.0        Classical
 ...         ...      ...           ...        ...              ...
 1130     -400.0   -400.0        -400.0        0.0        Classical
 1294     -500.0   -500.0        -500.0        0.0          Archaic
 860      -375.0   -350.0        -362.5       25.0        Classical
 1459     -520.0   -480.0        -500.0       40.0          Archaic
 1126     -420.0   -400.0        -410.0       20.0        Classical
 
 [1719 rows x 5 columns],
 'test':       StartYear  EndYear  MidpointYear  YearRange HistoricalPeriod
 1228     -525.0   -525.0        -525.0        0.0          Archaic
 11

In [17]:
os.makedirs(path_export + "vectors", exist_ok=True)

for subset in X["vectors"]:
    for method in X["vectors"][subset]:
        X["vectors"][subset][method].to_csv(
            path_export + "vectors/" + "X_" + subset + "_" + method + ".csv",
            index=False,
            encoding='utf-8',
            sep=',',
            header=True
        )

In [18]:
os.makedirs(path_export + "targets", exist_ok=True)

for subset in y:
    y[subset].to_csv(
        path_export + "targets/" + "y_" + subset + ".csv",
        index=False,
        encoding='utf-8',
        sep=',',
        header=True
    )