In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Folder holding the notebook's input files (relative to the working directory).
data_dir = Path("../data")

# Load the embedding matrix (.npy) and the metadata table (.parquet).
embeddings = np.load(data_dir.joinpath("embeddings_text_all_mpnet.npy"))
df_meta = pd.read_parquet(data_dir.joinpath("meta_for_model.parquet"))

# Show both shapes side by side so a row-count mismatch is visible immediately.
print("Embeddings:", embeddings.shape)
print("Meta:", df_meta.shape)


Embeddings: (485385, 768)
Meta: (485385, 7)


In [2]:
# embedding columns: one named column per embedding dimension (emb_0, emb_1, ...)
emb_dim = embeddings.shape[1]
emb_cols = [f"emb_{i}" for i in range(emb_dim)]
df_emb = pd.DataFrame(embeddings, columns=emb_cols)

# The join below is positional (row i of the meta table <-> row i of the
# embedding matrix), so the two inputs must have the same number of rows.
# NOTE(review): this also assumes both files were written in the same row
# order -- nothing in this notebook verifies that; confirm upstream.
if len(df_meta) != len(df_emb):
    raise ValueError(
        f"Row count mismatch: meta has {len(df_meta)} rows, "
        f"embeddings have {len(df_emb)} rows"
    )

# combine meta + embeddings
# pd.concat(axis=1) aligns rows on index *labels*, not on position. df_emb has
# a fresh 0..n-1 RangeIndex, so give df_meta the same one; otherwise a
# non-default index stored in the parquet file would mispair rows or pad the
# result with NaN rows instead of raising an error.
df_features = pd.concat([df_meta.reset_index(drop=True), df_emb], axis=1)

print(df_features.shape)
df_features.head()


(485385, 775)


Unnamed: 0,objectID,label_isOnView,objectBeginDate,objectEndDate,accessionYear,isTimelineWork,isPublicDomain,emb_0,emb_1,emb_2,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,24991,True,1815,1875,1881.0,True,True,0.066564,-0.097619,-0.006768,...,-0.022872,0.015826,0.025907,0.0109,0.000501,0.002534,-0.02279,0.035805,-0.01799,-0.022144
1,459006,True,1360,1370,1975.0,False,True,0.050238,0.030997,-0.015001,...,-0.064712,0.025086,0.049387,-0.00717,-0.026329,-0.050298,-0.024853,0.030199,-0.013203,0.007662
2,460813,True,1700,1900,1975.0,False,True,0.088138,0.039915,-0.016473,...,-0.041216,0.007579,0.051266,0.010833,-0.004339,-0.067634,-0.022012,-0.004035,-0.000299,-0.020221
3,461279,True,1750,1800,1975.0,False,True,0.061603,0.055202,0.006606,...,-0.052357,-0.012084,0.039223,-0.024551,-0.04316,-0.025256,-0.031689,0.008007,0.028851,-0.001046
4,461472,True,1575,1600,1975.0,False,True,0.082625,0.004931,-0.022013,...,-0.081222,0.003913,0.009686,-0.040473,-0.019271,-0.032337,-0.020384,0.004691,-0.019685,-0.003255


In [3]:
# target: the label column (displayed as True/False above) cast to 0/1 integers
y = df_features["label_isOnView"].astype(int).values

# all feature columns (with names): every column except the identifier and the target
feature_columns = df_features.drop(columns=["objectID", "label_isOnView"]).columns.tolist()

# feature matrix as ndarray
# Ask for a float dtype explicitly instead of relying on `.values`. When a
# frame mixes bool columns with int/float columns, pandas falls back to an
# object-dtype array: every cell becomes a boxed Python object, np.savez has to
# pickle it, and np.load refuses to read it back with the default
# allow_pickle=False. float64 is the common dtype pandas would pick for the
# int/float columns alone; booleans become 0.0/1.0 and missing values stay NaN.
# (Switch to np.float32 to halve memory if the reduced precision is acceptable.)
X = df_features[feature_columns].to_numpy(dtype=np.float64)

print("X:", X.shape)
print("y:", y.shape)
print("First 5 feature names:", feature_columns[:5])


X: (485385, 773)
y: (485385,)
First 5 feature names: ['objectBeginDate', 'objectEndDate', 'accessionYear', 'isTimelineWork', 'isPublicDomain']


In [4]:
from sklearn.model_selection import train_test_split

# Row positions of the full dataset. Splitting positions rather than X itself
# means the same index arrays can slice both X and y afterwards.
indices = np.arange(X.shape[0])

# test split: hold out 15% of the rows, stratified on y so the class ratio is
# preserved; the remaining rows go to idx_temp for the train/val split.
idx_temp, idx_test = train_test_split(
    indices, stratify=y, test_size=0.15, random_state=42
)



In [6]:
# train/val split
# test_size is relative to idx_temp, so validation receives 15% of the rows
# left after the test split, not 15% of the full dataset.
idx_train, idx_val = train_test_split(
    idx_temp, stratify=y[idx_temp], test_size=0.15, random_state=42
)

# Materialise each split by indexing X and y with its position array.
X_train, X_val, X_test = X[idx_train], X[idx_val], X[idx_test]
y_train, y_val, y_test = y[idx_train], y[idx_val], y[idx_test]

# Report the shapes of every split as a sanity check.
for _tag, _feats, _labels in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(_tag, _feats.shape, _labels.shape)


Train: (350690, 773) (350690,)
Val:   (61887, 773) (61887,)
Test:  (72808, 773) (72808,)


In [7]:
# Output folder for the split archives; created (including parents) if missing.
splits_dir = data_dir.joinpath("splits")
splits_dir.mkdir(parents=True, exist_ok=True)

# One .npz archive per split, each storing its arrays under the keys "X" and "y".
for _fname, _feats, _labels in (
    ("train.npz", X_train, y_train),
    ("val.npz", X_val, y_val),
    ("test.npz", X_test, y_test),
):
    np.savez(splits_dir / _fname, X=_feats, y=_labels)

# save feature column names
np.save(splits_dir / "feature_columns.npy", np.asarray(feature_columns))

print("Saved splits and feature names.")


Saved splits and feature names.
