In [21]:
import pandas as pd
import numpy as np

# Read the curated Tg dataset from the working directory and report its
# (rows, columns) shape as a quick sanity check.
DATASET_CSV = 'LAMALAB_CURATED_Tg_structured.csv'
data = pd.read_csv(DATASET_CSV)
print(f"数据集形状: {data.shape}")

数据集形状: (7367, 110)


In [22]:
# Load the polyBERT model through sentence-transformers.
from sentence_transformers import SentenceTransformer

# Model identifier handed to SentenceTransformer.
POLYBERT_MODEL_ID = 'kuelumbus/polyBERT'
polyBERT = SentenceTransformer(POLYBERT_MODEL_ID)

In [23]:
# Pull every PSMILES string out of the frame as a plain Python list.
psmiles_column = data['PSMILES']
psmiles_list = psmiles_column.tolist()

# Sanity checks: count missing entries, then show the first five strings
# with 1-based numbering.
print(f"空值数量: {data['PSMILES'].isna().sum()}")
print(f"前5个PSMILES示例:")
preview = psmiles_list[:5]
for i, psmiles in enumerate(preview):
    print(f"{i+1}: {psmiles}")

空值数量: 0
前5个PSMILES示例:
1: [*]#C[SiH2]C#Cc1cccc(C#[*])c1
2: [*]#Cc1cccc(C#C[SiH](C#[*])c2ccccc2)c1
3: [*]#Cc1ccccc1C#C[SiH](C#[*])c1ccccc1
4: [*]/C(=C(/[*])c1ccc(C(C)(C)C)cc1)c1ccccc1
5: [*]/C(=C(/[*])c1ccc(CCCC)cc1)c1ccccc1


In [24]:
# Encode the whole PSMILES list with polyBERT in a single encode() call, with
# the progress bar enabled.
# NOTE(review): later cells index this array by row, which assumes row i
# corresponds to psmiles_list[i] - confirm encode() preserves input order.
psmiles_embeddings = polyBERT.encode(psmiles_list, show_progress_bar=True)

# Report the full array shape and the size of its second axis (the
# per-string embedding dimension).
print(f"嵌入向量形状: {psmiles_embeddings.shape}")
print(f"每个 PSMILES 的嵌入向量维度: {psmiles_embeddings.shape[1]}")

Batches:   0%|          | 0/231 [00:00<?, ?it/s]

嵌入向量形状: (7367, 600)
每个 PSMILES 的嵌入向量维度: 600


In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [45]:
# Load the dataset only when no earlier cell has already bound `data`:
# probing the name raises NameError exactly when it is still undefined.
try:
    data
except NameError:
    data = pd.read_csv('LAMALAB_CURATED_Tg_structured.csv')

In [46]:
# Split the rows by reliability tier: 'black' rows form the training set,
# while 'yellow' and 'gold' rows are pooled for validation/testing.
# Rows whose reliability is anything else (or missing) end up in no split.
train_data = data[data['meta.reliability'] == 'black'].copy()
val_test_data = data[data['meta.reliability'].isin(['yellow', 'gold'])].copy()

# Shuffle the pooled yellow/gold rows and cut them 50/50 into validation and
# test sets; the fixed random_state keeps the split reproducible.
if len(val_test_data) > 0:
    val_data, test_data = train_test_split(
        val_test_data,
        test_size=0.5,
        random_state=42,
        shuffle=True
    )
else:
    # No yellow/gold rows at all: still bind both names (as empty frames with
    # the same columns) so that later cells, which check len(val_data) and
    # len(test_data), see empty splits instead of failing with NameError.
    val_data = val_test_data.iloc[0:0].copy()
    test_data = val_test_data.iloc[0:0].copy()

In [47]:
# Record each split's row labels (its DataFrame index values) so the embedding
# array can be partitioned the same way afterwards. An empty split simply
# produces an empty list.
train_indices, val_indices, test_indices = (
    split_frame.index.tolist()
    for split_frame in (train_data, val_data, test_data)
)

In [None]:
# psmiles_embeddings was built from data['PSMILES'].tolist(), so it is meant to
# hold one row per row of `data`; check that before slicing it.
assert len(psmiles_embeddings) == len(data), "embeddings and data are misaligned"

# train/val/test_indices hold DataFrame index *labels*, but a NumPy array can
# only be indexed by *position*. Translate labels to positions so the selection
# stays correct even if `data` no longer has a default 0..n-1 index (for
# example after filtering rows or loading with an index column).
train_positions = data.index.get_indexer(train_indices)
val_positions = data.index.get_indexer(val_indices)
test_positions = data.index.get_indexer(test_indices)

# get_indexer reports a label it cannot find as -1, which NumPy would silently
# interpret as "the last row"; fail loudly instead.
assert all(
    (positions >= 0).all()
    for positions in (train_positions, val_positions, test_positions)
), "a split index label was not found in data.index"

train_embeddings = psmiles_embeddings[train_positions]
val_embeddings = psmiles_embeddings[val_positions]
test_embeddings = psmiles_embeddings[test_positions]
# Alias (not a copy) of the full embedding array.
embeddings_all = psmiles_embeddings

print(f"训练集嵌入向量: {train_embeddings.shape}")
print(f"验证集嵌入向量: {val_embeddings.shape}")
print(f"测试集嵌入向量: {test_embeddings.shape}")

训练集嵌入向量: (7088, 600)
验证集嵌入向量: (137, 600)
测试集嵌入向量: (138, 600)


In [39]:
# Extract the experimental Tg target column from each split. An empty
# validation or test split falls back to an empty NumPy array.
TARGET_COLUMN = 'labels.Exp_Tg(K)'

train_y = train_data[TARGET_COLUMN].values
val_y = np.array([]) if len(val_data) == 0 else val_data[TARGET_COLUMN].values
test_y = np.array([]) if len(test_data) == 0 else test_data[TARGET_COLUMN].values

# One shape per line, in train / validation / test order.
print(train_y.shape, val_y.shape, test_y.shape, sep="\n")


(7088,)
(137,)
(138,)
