In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# All input artifacts are read from this directory (relative to the notebook).
data_dir = Path("../data")

# Embedding matrix from the .npy file and the metadata table from the parquet file.
df_meta = pd.read_parquet(data_dir.joinpath("meta_for_model_e5large.parquet"))
embeddings = np.load(data_dir.joinpath("embeddings_text_all_e5large.npy"))

# Quick sanity check on what was loaded: (rows, columns) of each artifact.
print("Embeddings:", embeddings.shape)
print("Meta:", df_meta.shape)


Embeddings: (485385, 1024)
Meta: (485385, 7)


In [2]:
# The embedding matrix has no index of its own, so it can only be matched to
# the metadata by row position. Fail early if the row counts disagree.
assert len(df_meta) == len(embeddings), (
    f"Row count mismatch: {len(df_meta)} metadata rows vs {len(embeddings)} embedding rows"
)

# embedding columns: one named column (emb_0 ... emb_{dim-1}) per embedding dimension
emb_dim = embeddings.shape[1]
emb_cols = [f"emb_{i}" for i in range(emb_dim)]
df_emb = pd.DataFrame(embeddings, columns=emb_cols)

# combine meta + embeddings
# pd.concat(axis=1) aligns rows on index labels, not on position. df_emb gets a
# default 0..n-1 index, so df_meta's index is reset to the same range; otherwise
# a non-default metadata index would silently misalign or NaN-pad the rows.
df_features = pd.concat([df_meta.reset_index(drop=True), df_emb], axis=1)

print(df_features.shape)
df_features.head()


(485385, 1031)


Unnamed: 0,objectID,label_isOnView,objectBeginDate,objectEndDate,accessionYear,isTimelineWork,isPublicDomain,emb_0,emb_1,emb_2,...,emb_1014,emb_1015,emb_1016,emb_1017,emb_1018,emb_1019,emb_1020,emb_1021,emb_1022,emb_1023
0,24991,True,1815,1875,1881.0,True,True,0.019975,-0.057065,0.029697,...,0.020226,-0.00276,0.019967,0.035933,-0.011048,-0.024076,0.048695,-0.050089,0.027541,0.063055
1,459006,True,1360,1370,1975.0,False,True,0.007651,-0.010273,0.028847,...,0.025933,-0.030876,0.017105,0.029881,0.011504,-0.047783,0.036071,-0.028943,0.027055,0.025288
2,460813,True,1700,1900,1975.0,False,True,0.009857,-0.025266,0.026218,...,0.02831,-0.004571,0.016151,0.012445,0.012345,-0.04044,0.051912,-0.023447,-0.013325,0.004064
3,461279,True,1750,1800,1975.0,False,True,0.01029,-0.024361,0.034141,...,0.024727,0.012411,0.013989,0.023237,0.010939,-0.034853,0.059533,-0.014477,0.009337,0.023878
4,461472,True,1575,1600,1975.0,False,True,-0.006981,-0.021878,0.05956,...,0.030812,-0.00195,0.026431,0.045762,0.01406,-0.033036,0.023307,-0.018407,-0.001888,0.035842


In [3]:
# target: the on-view label as a 0/1 integer vector
y = df_features["label_isOnView"].astype(int).values

# all feature columns (with names): everything except the identifier and the label.
# Dropping from the column Index avoids copying the whole wide frame just to
# read the remaining names.
feature_columns = df_features.columns.drop(["objectID", "label_isOnView"]).tolist()

# feature matrix as ndarray
# The selected columns mix dtypes (the displayed frame shows True/False flags,
# integer years, a float year and float embeddings). With such a mix, `.values`
# falls back to dtype=object in pandas, which is very memory-hungry and forces
# np.savez to pickle. Request one numeric dtype explicitly instead.
# NOTE(review): assumes every feature column is numeric/bool and that float32
# precision is sufficient for the year columns -- confirm.
X = df_features[feature_columns].to_numpy(dtype=np.float32)

print("X:", X.shape)
print("y:", y.shape)
print("First 5 feature names:", feature_columns[:5])


X: (485385, 1029)
y: (485385,)
First 5 feature names: ['objectBeginDate', 'objectEndDate', 'accessionYear', 'isTimelineWork', 'isPublicDomain']


In [4]:
from sklearn.model_selection import train_test_split

# Split row positions instead of the feature matrix itself.
indices = np.arange(X.shape[0])

# test split
# 15% of all rows are held out as the test set, stratified on the label;
# the remaining rows (idx_temp) are split again in the next step.
idx_temp, idx_test = train_test_split(indices, test_size=0.15, random_state=42, stratify=y)



In [5]:
# train/val split
# test_size is relative to idx_temp here, so the validation set is 15% of the
# rows left after the test split, not 15% of the full dataset.
labels_temp = y[idx_temp]
idx_train, idx_val = train_test_split(idx_temp, test_size=0.15, random_state=42, stratify=labels_temp)

# Build the feature and label arrays for each split from its row positions.
X_train = X[idx_train]
y_train = y[idx_train]

X_val = X[idx_val]
y_val = y[idx_val]

X_test = X[idx_test]
y_test = y[idx_test]

# Report the shape of every split.
print("Train:", X_train.shape, y_train.shape)
print("Val:  ", X_val.shape,   y_val.shape)
print("Test: ", X_test.shape,  y_test.shape)


Train: (350690, 1029) (350690,)
Val:   (61887, 1029) (61887,)
Test:  (72808, 1029) (72808,)


In [None]:
# Output directory for the saved splits; created (with parents) if it is missing.
splits_dir = data_dir / "splits_e5large"
splits_dir.mkdir(parents=True, exist_ok=True)

# One .npz archive per split, each storing its arrays under the keys "X" and "y".
for split_file, split_X, split_y in (
    ("train.npz", X_train, y_train),
    ("val.npz", X_val, y_val),
    ("test.npz", X_test, y_test),
):
    np.savez(splits_dir / split_file, X=split_X, y=split_y)

# save feature column names
feature_name_array = np.array(feature_columns)
np.save(splits_dir / "feature_columns.npy", feature_name_array)

print("Saved splits and feature names.")
