In [2]:
# Load the dataset whose canonicalized PSMILES strings are later embedded with polyBERT
import pandas as pd
import numpy as np

# Read the CSV into a DataFrame, then preview its shape and the PSMILES column
dataset_path = 'LAMALAB_CURATED_Tg_structured.csv'
data = pd.read_csv(dataset_path)
print(f"数据集形状: {data.shape}")
print("前5行PSMILES:")
psmiles_preview = data['PSMILES'].head()
print(psmiles_preview)

数据集形状: (7367, 110)
前5行PSMILES:
0                [*]#C[SiH2]C#Cc1cccc(C#[*])c1
1       [*]#Cc1cccc(C#C[SiH](C#[*])c2ccccc2)c1
2         [*]#Cc1ccccc1C#C[SiH](C#[*])c1ccccc1
3    [*]/C(=C(/[*])c1ccc(C(C)(C)C)cc1)c1ccccc1
4        [*]/C(=C(/[*])c1ccc(CCCC)cc1)c1ccccc1
Name: PSMILES, dtype: object


In [3]:
# Load the polyBERT model through the sentence-transformers wrapper
from sentence_transformers import SentenceTransformer

polybert_model_id = 'kuelumbus/polyBERT'
print("正在加载 polyBERT 模型...")
polyBERT = SentenceTransformer(polybert_model_id)
print("模型加载完成!")

  from tqdm.autonotebook import tqdm, trange


正在加载 polyBERT 模型...
模型加载完成!


In [4]:
# Collect every PSMILES string into a plain Python list
psmiles_column = data['PSMILES']
psmiles_list = psmiles_column.tolist()
print(f"总共有 {len(psmiles_list)} 个 PSMILES")

# Report how many entries are missing, then show the first five strings (1-based)
print(f"空值数量: {psmiles_column.isna().sum()}")
print("前5个PSMILES示例:")
for position, psmiles in enumerate(psmiles_list[:5], start=1):
    print(f"{position}: {psmiles}")

总共有 7367 个 PSMILES
空值数量: 0
前5个PSMILES示例:
1: [*]#C[SiH2]C#Cc1cccc(C#[*])c1
2: [*]#Cc1cccc(C#C[SiH](C#[*])c2ccccc2)c1
3: [*]#Cc1ccccc1C#C[SiH](C#[*])c1ccccc1
4: [*]/C(=C(/[*])c1ccc(C(C)(C)C)cc1)c1ccccc1
5: [*]/C(=C(/[*])c1ccc(CCCC)cc1)c1ccccc1


In [5]:
# Encode every PSMILES string into an embedding vector with polyBERT
print("开始对 PSMILES 进行编码转换...")
print("这可能需要几分钟时间，请耐心等待...")

# One encode() call over the whole list; the progress bar reports batch progress
psmiles_embeddings = polyBERT.encode(psmiles_list, show_progress_bar=True)
embedding_shape = psmiles_embeddings.shape

print("编码完成!")
print(f"嵌入向量形状: {embedding_shape}")
print(f"每个 PSMILES 的嵌入向量维度: {embedding_shape[1]}")


开始对 PSMILES 进行编码转换...
这可能需要几分钟时间，请耐心等待...


Batches: 100%|██████████| 231/231 [00:30<00:00,  7.68it/s]

编码完成!
嵌入向量形状: (7367, 600)
每个 PSMILES 的嵌入向量维度: 600





In [6]:
# Keep the embedding matrix under its main working name and run a basic inspection
polymer_embeddings = psmiles_embeddings

print("=== 嵌入向量存储完成 ===")
print("变量名: polymer_embeddings")
print(f"数据类型: {type(polymer_embeddings)}")
print(f"形状: {polymer_embeddings.shape}")
print(f"数据类型: {polymer_embeddings.dtype}")

# Global summary statistics over all entries of the matrix
print("\n=== 嵌入向量统计信息 ===")
for stat_label, stat_value in (
    ("最小值", polymer_embeddings.min()),
    ("最大值", polymer_embeddings.max()),
    ("均值", polymer_embeddings.mean()),
    ("标准差", polymer_embeddings.std()),
):
    print(f"{stat_label}: {stat_value:.6f}")

# Show the first three polymers together with the first ten embedding dimensions
print("\n=== 示例：前3个聚合物的嵌入向量（仅显示前10个维度）===")
for sample_number in (1, 2, 3):
    row = sample_number - 1
    print(f"PSMILES {sample_number}: {psmiles_list[row]}")
    print(f"嵌入向量前10维: {polymer_embeddings[row, :10]}")
    print("---")

=== 嵌入向量存储完成 ===
变量名: polymer_embeddings
数据类型: <class 'numpy.ndarray'>
形状: (7367, 600)
数据类型: float32

=== 嵌入向量统计信息 ===
最小值: -3.132480


最大值: 4.910603
均值: -0.001582
标准差: 0.624954

=== 示例：前3个聚合物的嵌入向量（仅显示前10个维度）===
PSMILES 1: [*]#C[SiH2]C#Cc1cccc(C#[*])c1
嵌入向量前10维: [ 0.55221206  0.9560781   0.03296211 -0.34050205 -0.31080493  0.42567027
  0.3735829  -0.25116748 -0.57380605  1.4445828 ]
---
PSMILES 2: [*]#Cc1cccc(C#C[SiH](C#[*])c2ccccc2)c1
嵌入向量前10维: [ 0.4551718   0.6557985   0.31338024 -0.11561695 -0.5141235   0.17613572
  0.14291468  0.21427798  0.12901457  0.6660925 ]
---
PSMILES 3: [*]#Cc1ccccc1C#C[SiH](C#[*])c1ccccc1
嵌入向量前10维: [ 0.34316123  0.45972577  0.58509946  0.04770828 -0.59174454  0.02948072
 -0.03604525  0.00871426  0.36277086  0.32263896]
---


In [7]:
# Normalize the embedding vectors
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler
import numpy as np

print("=== 嵌入向量归一化 ===")
print("原始嵌入向量统计:")
print(f"形状: {polymer_embeddings.shape}")
for stat_label, stat_value in (
    ("最小值", polymer_embeddings.min()),
    ("最大值", polymer_embeddings.max()),
    ("均值", polymer_embeddings.mean()),
    ("标准差", polymer_embeddings.std()),
):
    print(f"{stat_label}: {stat_value:.6f}")

# L2 normalization: every row vector is scaled to unit L2 norm, which preserves cosine similarity
polymer_embeddings_l2 = normalize(polymer_embeddings, norm='l2', axis=1)

# Sanity check: the norms of the first five rows should all be ~1
first_five_norms = np.linalg.norm(polymer_embeddings_l2[:5], axis=1)
print(f"L2归一化后的向量范数（前5个）: {first_five_norms}")

l2_min, l2_max = polymer_embeddings_l2.min(), polymer_embeddings_l2.max()
l2_mean, l2_std = polymer_embeddings_l2.mean(), polymer_embeddings_l2.std()
print(f"L2归一化后统计 - 最小值: {l2_min:.6f}, 最大值: {l2_max:.6f}")
print(f"L2归一化后统计 - 均值: {l2_mean:.6f}, 标准差: {l2_std:.6f}")

# Expose the L2-normalized matrix under the name used by the later cells
normalized_embeddings = polymer_embeddings_l2
print("\n主要归一化变量: normalized_embeddings (L2归一化)")
print(f"形状: {normalized_embeddings.shape}")

=== 嵌入向量归一化 ===
原始嵌入向量统计:
形状: (7367, 600)
最小值: -3.132480
最大值: 4.910603
均值: -0.001582
标准差: 0.624954
L2归一化后的向量范数（前5个）: [0.99999994 1.         1.         0.99999994 1.        ]
L2归一化后统计 - 最小值: -0.188396, 最大值: 0.305970
L2归一化后统计 - 均值: -0.000104, 标准差: 0.040825

主要归一化变量: normalized_embeddings (L2归一化)
形状: (7367, 600)


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Split the dataset 8:1:1 — hold out 20% first, then cut that hold-out in half
SPLIT_SEED = 42
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=SPLIT_SEED)

# Report the size of each split
for split_label, split_frame in (
    ("训练集", train_data),
    ("验证集", val_data),
    ("测试集", test_data),
):
    print(f"{split_label}数量: {len(split_frame)}")

训练集数量: 5893
验证集数量: 737
测试集数量: 737


In [10]:
# Row positions of each split inside `data`, used later to slice the embedding matrix.
# The embeddings are a NumPy array whose rows follow the row order of `data`, so they
# must be addressed by position, not by index label. With the default RangeIndex the
# two coincide and the values are unchanged; get_indexer keeps the lookup correct if
# `data` is ever filtered or re-indexed before splitting.
# NOTE: get_indexer requires `data.index` to be unique and raises otherwise.
train_indices = data.index.get_indexer(train_data.index).tolist()
val_indices = data.index.get_indexer(val_data.index).tolist()    # [] for an empty split
test_indices = data.index.get_indexer(test_data.index).tolist()  # [] for an empty split

In [11]:
# Slice the normalized embeddings per split, but only when they have already been computed
if 'normalized_embeddings' in globals():
    train_embeddings = normalized_embeddings[train_indices]
    # An empty split falls back to an empty array instead of being indexed
    val_embeddings = np.array([]) if len(val_indices) == 0 else normalized_embeddings[val_indices]
    test_embeddings = np.array([]) if len(test_indices) == 0 else normalized_embeddings[test_indices]

    print("\n=== 嵌入向量划分 ===")
    for split_label, split_matrix in (
        ("训练集", train_embeddings),
        ("验证集", val_embeddings),
        ("测试集", test_embeddings),
    ):
        print(f"{split_label}嵌入向量: {split_matrix.shape}")


=== 嵌入向量划分 ===
训练集嵌入向量: (5893, 600)
验证集嵌入向量: (737, 600)
测试集嵌入向量: (737, 600)


In [15]:
# Pull the Tg target column out of each split (an empty split yields an empty array)
train_y = train_data['Tg'].values
val_y = np.array([]) if len(val_data) == 0 else val_data['Tg'].values
test_y = np.array([]) if len(test_data) == 0 else test_data['Tg'].values

print("\n=== 目标变量 Tg 值 ===")
print(f"训练集 Tg 范围: {train_y.min():.2f} - {train_y.max():.2f} K")
# Validation / test ranges are reported only for non-empty splits
for split_label, split_targets in (("验证集", val_y), ("测试集", test_y)):
    if len(split_targets) > 0:
        print(f"{split_label} Tg 范围: {split_targets.min():.2f} - {split_targets.max():.2f} K")


=== 目标变量 Tg 值 ===
训练集 Tg 范围: 134.15 - 763.15 K
验证集 Tg 范围: 153.05 - 768.15 K
测试集 Tg 范围: 144.15 - 677.15 K


In [16]:
# 使用机器学习模型训练Tg预测
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any convergence warnings emitted by the estimators trained below.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

print("=== Tg预测机器学习模型训练 ===")

# Fail fast when the data-split cells have not been run. Raising stops only this
# cell; the previous exit() call ends the whole session, which in a notebook means
# the kernel shuts down and every computed variable (embeddings included) is lost.
required_names = ['train_embeddings', 'val_embeddings', 'test_embeddings',
                  'train_y', 'val_y', 'test_y']
missing_names = [name for name in required_names if name not in globals()]
if missing_names:
    raise RuntimeError(f"错误: 请先运行数据划分代码 (缺少变量: {', '.join(missing_names)})")

print(f"训练集: {train_embeddings.shape[0]} 样本, {train_embeddings.shape[1]} 特征")
print(f"验证集: {val_embeddings.shape[0]} 样本" if len(val_embeddings) > 0 else "验证集: 0 样本")
print(f"测试集: {test_embeddings.shape[0]} 样本" if len(test_embeddings) > 0 else "测试集: 0 样本")

# Helper that bundles the regression metrics reported for every model/split
def evaluate_model(y_true, y_pred, model_name="Model"):
    """Compute regression metrics for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label stored under the ``'Model'`` key of the result.

    Returns:
        dict with the keys ``'Model'``, ``'MSE'``, ``'RMSE'``, ``'MAE'`` and ``'R²'``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    metrics = {'Model': model_name, 'MSE': squared_error}
    # RMSE is derived from the MSE rather than recomputed from the data
    metrics['RMSE'] = np.sqrt(squared_error)
    metrics['MAE'] = mean_absolute_error(y_true, y_pred)
    metrics['R²'] = r2_score(y_true, y_pred)
    return metrics

# Candidate regressors, keyed by the display name used in the result tables
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0, random_state=42),
    # NOTE(review): the recorded output shows R² ≈ 0.105 on the training split for
    # this model; alpha=1.0 may be too strong for unit-norm features — worth tuning.
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42, max_iter=2000),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR(kernel='rbf', C=1.0, gamma='scale'),
    'Neural Network': MLPRegressor(hidden_layer_sizes=(256, 128, 64), 
                                   max_iter=500, 
                                   random_state=42, 
                                   early_stopping=True,
                                   validation_fraction=0.1),
}

# Add XGBoost on top of the scikit-learn models.
# NOTE(review): `import xgboost as xgb` earlier in this cell is unconditional, so a
# missing package already fails there, before this guard is reached.
try:
    models['XGBoost'] = xgb.XGBRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    )
except Exception as exc:
    # Catch ordinary errors only, so KeyboardInterrupt / SystemExit still propagate,
    # and report the reason instead of hiding it.
    print(f"XGBoost不可用，跳过XGBoost模型: {exc}")

print(f"\n=== 开始训练 {len(models)} 个模型 ===")

# Result containers:
#   results        - list of metric dicts returned by evaluate_model, one per model/split
#   trained_models - fitted estimator per model name
#   predictions    - predicted values keyed by '<model name>_<train|val|test>'
results = []
trained_models = {}
predictions = {}

# Fit and evaluate every model in turn
for model_name, model in models.items():
    print(f"\n训练 {model_name}...")
    
    try:
        # Fit on the training split
        model.fit(train_embeddings, train_y)
        trained_models[model_name] = model
        
        # Predict on the training split
        train_pred = model.predict(train_embeddings)
        predictions[f'{model_name}_train'] = train_pred
        
        # Training-split metrics
        train_metrics = evaluate_model(train_y, train_pred, f"{model_name}_train")
        results.append(train_metrics)
        
        # Evaluate on the validation split when it is non-empty
        if len(val_embeddings) > 0:
            val_pred = model.predict(val_embeddings)
            predictions[f'{model_name}_val'] = val_pred
            val_metrics = evaluate_model(val_y, val_pred, f"{model_name}_val")
            results.append(val_metrics)
        
        # Predict and evaluate on the test split when it is non-empty
        if len(test_embeddings) > 0:
            test_pred = model.predict(test_embeddings)
            predictions[f'{model_name}_test'] = test_pred
            test_metrics = evaluate_model(test_y, test_pred, f"{model_name}_test")
            results.append(test_metrics)
            
        print(f"{model_name} 训练完成 ✓")
        
    except Exception as e:
        # Best effort: report the failure and move on to the next model. Entries
        # already stored for this model before the error remain in the containers.
        print(f"{model_name} 训练失败: {e}")
        continue

# Gather every metrics record into one table for display and ranking
results_df = pd.DataFrame(results)
print(f"\n=== 模型性能评估结果 ===")
print(results_df.round(4))

# With no successful model the table has no 'Model' column; fail with a clear
# message instead of a KeyError further down.
if results_df.empty:
    raise RuntimeError("没有任何模型训练成功，无法选择最佳模型")

# Pick the best model by R²: rank on the validation rows when there are any,
# otherwise fall back to the training rows. Labels are '<model name>_<split>', so
# match on the suffix rather than on a substring anywhere in the label.
model_labels = results_df['Model']
train_results = results_df[model_labels.str.endswith('_train')]
val_results = results_df[model_labels.str.endswith('_val')]

if not val_results.empty:
    ranked_results, split_suffix = val_results, '_val'
else:
    ranked_results, split_suffix = train_results, '_train'

if ranked_results.empty:
    raise RuntimeError("没有可用于选择最佳模型的评估结果")

best_model_row = ranked_results.loc[ranked_results['R²'].idxmax()]
# Strip only the trailing split suffix to recover the model's display name
best_model_name = best_model_row['Model'][:-len(split_suffix)]

print(f"\n=== 最佳模型: {best_model_name} ===")
print(f"最佳性能: R² = {best_model_row['R²']:.4f}, RMSE = {best_model_row['RMSE']:.4f}")


=== Tg预测机器学习模型训练 ===
训练集: 5893 样本, 600 特征
验证集: 737 样本
测试集: 737 样本

=== 开始训练 8 个模型 ===

训练 Linear Regression...
Linear Regression 训练完成 ✓

训练 Ridge Regression...
Ridge Regression 训练完成 ✓

训练 ElasticNet...
ElasticNet 训练完成 ✓

训练 Random Forest...
Random Forest 训练完成 ✓

训练 Gradient Boosting...
Gradient Boosting 训练完成 ✓

训练 SVR...
SVR 训练完成 ✓

训练 Neural Network...
Neural Network 训练完成 ✓

训练 XGBoost...
XGBoost 训练完成 ✓

=== 模型性能评估结果 ===
                      Model         MSE      RMSE      MAE      R²
0   Linear Regression_train   1576.8364   39.7094  29.6730  0.8759
1     Linear Regression_val   2018.5120   44.9279  33.4028  0.8461
2    Linear Regression_test   2328.7365   48.2570  34.8917  0.8091
3    Ridge Regression_train   2130.0215   46.1522  34.4824  0.8324
4      Ridge Regression_val   2294.8087   47.9042  35.2852  0.8250
5     Ridge Regression_test   2480.9304   49.8089  36.7014  0.7966
6          ElasticNet_train  11370.6777  106.6334  89.4251  0.1051
7            ElasticNet_val  11743.251