In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# Read the training table from parquet; the path is relative to the notebook's
# working directory.
df_train = pd.read_parquet(path="../data/df_train_clean.parquet")

# Plain-text preview of the first five rows (5 is also head()'s default).
print(df_train.head(n=5))


   objectBeginDate classification  accessionYear  \
0             1815        Daggers         1881.0   
1             1360      Paintings         1975.0   
2             1700          Glass         1975.0   
3             1750          Glass         1975.0   
4             1575          Glass         1975.0   

                                         objectDate  \
0                                       ca. 1840–50   
1                                          ca. 1365   
2  18th century, reassembled last half 19th century   
3                          second half 18th century   
4                                 late 16th century   

                     objectName                                culture  \
0  Dagger (Kindjal) with sheath  Caucasian, possibly northern Dagestan   
1                      Painting                                          
2                      Figurine               probably French (Nevers)   
3                         Panel      probably Bohemia or L

In [2]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model = SentenceTransformer("intfloat/e5-large-v2", device=device)

# E5 checkpoints are trained with an instruction prefix on every input; the
# model card states that omitting it degrades embedding quality. "query: " is
# the documented prefix when embeddings are used as features (classification,
# clustering). NOTE(review): the save cell stores these vectors next to a
# label column, so feature use is assumed here - switch to "passage: " only if
# they are meant to be the document side of a retrieval index.
E5_PREFIX = "query: "

# `texts` stays the raw column so any later cell that reads it is unaffected.
texts = df_train["text_all"].tolist()

# Plain concatenation (not an f-string) so a non-string value such as None/NaN
# raises immediately instead of being embedded as the literal text "None".
model_inputs = [E5_PREFIX + text for text in texts]

# One row per input text, returned as a NumPy array in the same order.
embeddings = model.encode(
    model_inputs,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

print("Embeddings shape:", embeddings.shape)


Using device: cuda


Batches:   0%|          | 0/15169 [00:00<?, ?it/s]

Embeddings shape: (485385, 1024)


In [3]:
# Persist the embedding matrix and a slim metadata table as two separate files
# under ../data (created if it does not exist yet).
out_dir = Path("../data")
out_dir.mkdir(parents=True, exist_ok=True)

# The two files are linked only by row position (row i of the array belongs to
# row i of the metadata). Check the row counts before writing anything, so that
# stale kernel state (e.g. df_train reloaded or filtered after encoding) cannot
# silently produce misaligned artifacts.
if len(embeddings) != len(df_train):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df_train has {len(df_train)}; "
        "re-run the encoding cell before saving."
    )

# 1) Save embeddings only
np.save(out_dir / "embeddings_text_all_e5large.npy", embeddings)

# 2) Save metadata (IDs + label + numeric + boolean)
NUMERIC = ["objectBeginDate", "objectEndDate", "accessionYear"]
BOOLEAN = ["isTimelineWork", "isPublicDomain"]

meta_cols = ["objectID", "label_isOnView"] + NUMERIC + BOOLEAN
# reset_index(drop=True) gives the metadata a plain 0..n-1 index, matching the
# positional row numbers of the saved array.
df_meta = df_train[meta_cols].reset_index(drop=True)

df_meta.to_parquet(out_dir / "meta_for_model_e5large.parquet", index=False)

print("Saved embeddings and meta.")


Saved embeddings and meta.
