In [4]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Lipinski, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [41]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv, GATConv
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

从CSV文件中加载分子数据，并将这些数据转换为图（graph）表示，以便于后续使用图神经网络（GNNs）进行处理

In [34]:
def load_data(csv_file):
    data = pd.read_csv(csv_file, encoding='ANSI')
    molecules = []
    max_atoms = 0 #最大原子数
    max_edges = 0 # 最大边数
    #按照id分组处理
    for  _,group in data.groupby('ID'):
        for _,row in group.iterrows():
            features = row.drop(['SMILES', 'ID']).values
            smiles = row['SMILES']
            molecule_id = row['ID']
            print(smiles)
            mol = Chem.MolFromSmiles(smiles)
            # SMILE 无效 → 跳过
            if mol is None:
                continue
            num_atoms = len(mol.GetAtoms()) # 原子数获取
            max_atoms = max(max_atoms, num_atoms) # 最大原子数更新
            bonds = mol.GetBonds()
            num_edges = len(bonds) #边数获取
#             adj_matrix = np.zeros((num_atoms, num_atoms)) # 邻接矩阵初始化
#             # 填充邻接矩阵，建立连接关系(无向图)
#             for bond in bonds:
#                 i = bond.GetBeginAtomIdx()
#                 j = bond.GetEndAtomIdx()
#                 adj_matrix[i,j] = 1
#                 adj_matrix[j,i] = 1 #无向图
            adj_matrix = Chem.GetAdjacencyMatrix(mol)  # 使用RDKit直接获取邻接矩阵
            # 将原子特征转换为(num_atoms, 148)形状
            features = np.array(features, dtype=np.float32) #强制转换为float32
            atom_features = torch.tensor(features, dtype=torch.float).repeat(num_atoms, 148) # (num_atoms, 148)
            # 填充atom_features 到 max_atom
            if num_atoms < max_atoms:
                padding = torch.zeros((max_atoms-num_atoms, 148), dtype=torch.float)
                atom_features = torch.cat([atom_features, padding], dim=0)
            # 处理邻接矩阵，转化为edge_index
            edge_index = torch.tensor(np.array(np.nonzero(adj_matrix)), dtype=torch.long) # (2, num_edges)
            # 更新最大边数的计算，基于实际边数
            max_edges = max(max_edges, edge_index.shape[1])
#             # 如果edge_index边数小于max_edges,需要填充到max_edges
#             if edge_index.shape[i] < max_edges:
#                 padding_edges = torch.zeros((2, int(max_edges - edge_index.shipe[i])), dtype=torch.long)
#                 edge_index = torch.cat([edge_index, padding_edges], dim=1)

            print(f"edge_index.shape:{edge_index.shape}")
            print(f"atom_feature.shape:{atom_features.shape}")

            #保存分子信息
            molecules.append ({
                'x':atom_features,
                'edge_index': edge_index,
                'smiles': smiles,
                'id': molecule_id
            })
                
    print(f"最大原子数，{max_atoms}， 最大边数，{max_edges}")
    print(type(data))
                
    return molecules
                

加载额外的数据，并对这些数据进行标准化处理，同时保留ID列以供后续匹配或参考。

In [36]:
def load_extra_data(csv_file):
    df = pd.read_csv(csv_file)
    id_list = df['ID'].values
    extra_data = df.drop(columns=[ 'ID']).values # 删除id列，提取特征数据
    scaler = StandardScaler()
    extra_data_scale = scaler.fit_transform(extra_data)
    extra_data_with_id = pd.DataFrame(extra_data_scale, columns=df.columns[1:])
    extra_data_with_id.insert(0, 'ID', id_list) # 将id插入到第一列位置
    return id_list, extra_data_with_id

图神经网络定义与训练

In [12]:
class GraphNN(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=64, dropout=0.3, num_layers=2):
        super().__init__()
        #定义图神经网络编码器层
        self.encoder = nn.ModuleList([
            GATConv(input_dim if i == 0 else hidden_dim, hidden_dim)
            for i in range(num_layers)
        ])
        #定义解码器层
        self.decoder = GATConv(hidden_dim, output_dim)
        #Dropout层
        self.dropout = dropout
        #归一化层
        self.layer_norm = nn.LayerNorm(hidden_dim)
        #初始化网络参数
        self._initialize_weights()
        
    def _initialize_weights(self):
        # 对图卷积层权重初始化
        for layer in self.encoder:
            if isinstance(layer, GATConv):
                torch.nn.init.xavier_uniform_(layer.lin_src.weight)
                torch.nn.init.xavier_uniform_(layer.lin_dst.weight)

        # 解码器层初始化
        if isinstance(self.decoder, GATConv):
            torch.nn.init.xavier_uniform_(layer.lin_src.weight)
            torch.nn.init.xavier_uniform_(layer.lin_dst.weight)

    def forward(self, batch):
        x, edge_index = batch.x, batch.edge_index
        edge_attr = batch.edge_attr if 'edge_attr' in batch else None

        #前向传播过程，编码器+激活函数+层归一化
        for layer in self.encoder:
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = layer(x, edge_index, edge_attr=edge_attr)
            x = F.relu(x)
            x = self.layer_norm(x) #归一化

        #解码层输出
        output = self.decoder(x, edge_index, edge_attr=edge_attr)
        return output

In [13]:
# 训练图神经网络模型
def train_graph_nn(model, data_loader, optimizer, clip_val=1.0):
    model.train()
    total_loss = 0
    loss_fun = torch.nn.MSELoss()
    num_samples = 0
    for data in data_loader:
        optimizer.zero_grad() #梯度清空
        
        output = model(data)#传递数据到前向计算
        y = data.y
        loss = loss_fun(output, y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * len(y)
        num_samples += len(y)
    
    avg_loss = total_loss / num_samples if num_samples > 0 else float('nan')
    
    return avg_loss

数据批量合并

In [14]:
def collate_fn(batch):
    # 获取所有图的原子特征x和边索引edgeindex
    x_list = [data['x'] for data in batch] #每个data是torch_geometric.data.Data对象
    edge_index_list = [data['edge_index'] for data in batch]
    smile_list = [data['smiles'] for data in batch]
    id_list = [data['id'] for data in batch]
    
    #拼接原子特征，确保图特征维度一致
    x = torch.cat(x_list, dim=0)
    print(f"x shape after concatenation:{x.shape}")
    #拼接所有边索引，注意我们要对每个图边索引进行偏移，确保他们不会冲突
    edge_index = []
    num_nodes_accum = 0 # 累计节点数
    for edge_idx in edge_index_list:
        edge_idx_offset = edge_idx + num_nodes_accum
        edge_index.append(edge_idx_offset)
        num_nodes_accum += edge_idx.max().item() + 1
        
    #拼接所有边索引，确保形状是(2, num_edges)
    edge_index = torch.cat(edge_idx, dim=1)
    print(f"edge_index shape after concatenation:{edge_index.shape}")
        
    #创建一个批量数据对象，确保返回batch对象
    batch_data = Batch(x=x, edge_index=edge_index, smiles=smile_list, id=id_list)
    return batch_data

功能预测器

In [15]:
class Fun_Predictor(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate=0.5):
        super(Fun_Predictor, self).__init__()
        
        # 定义各层
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, 1)  # 假设最终预测是一个值
        
        self._initialize_weights()
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    
    def forward(self, combined_features):
            x = torch.relu(self.fc1(combined_features))
            x = self.bn1(x)
            x = self.dropout(x)
            x = torch.relu(self.fc2(x))
            x = self.bn2(x)
            x = self.dropout(x)
            x = torch.relu(self.fc3(x))
            predict_IE = self.fc4(x)
            return predict_IE

In [16]:
def train_fun_predictor(model_fun, graph_nn, molecule_data, extra_data_with_id, criterion):
    model_fun.train()
    graph_nn.eval()
    total_loss = 0
    graph_embedding = []
    target_vals = []
    data_list = []
    
    # 创建一个字典，将额外特征与id匹配，去除id列
    extra_data_dict = {row['ID']:torch.tensor(row[1:],values,dtype=torch.float32) for _,row in extra_data_with_id.iterrows()}
    
    # 初始化一个列表保存额外特征值
    extra_feature = []
    
    # 遍历所有额外特征数据，并为每个分子获取对应的图嵌入和特征
    for _,row in extra_data_with_id.iterrows():
        id_ = row['ID']
        extra_feature_tensor = extra_data_dict[id_].unsqueeze(0)
        molecule_data_for_id = next((data for data in molecule_data if data['id'] == id_), None)
        if molecule_data_for_id is None:
            raise ValueError(f"No mol data found for ID:{id_}")
        x = molecule_data_for_id['x'] # 原子特征
        edge_index = molecule_data_for_id['edge_index'] # 邻接矩阵
        graph_data = Data(x=x, edge_index = edge_index)
        
        # 获取图嵌入
        with torch.no_grad():
            embedding = graph_nn(graph_data) #使用graphNN获取图嵌入
            pooled_embedding = torch.mean(embedding, dim=0).unsqueeze(0) #平均池化
            
        # 合并图嵌入和额外特征
        combined_embedding = torch.cat((pooled_embedding, extra_feature_tensor), dim=1)
        graph_embeddings.append(combined_embedding)
        
        # 设置目标值，假设目标值是额外特征的最后一列
        target_val = extra_feature_tensor[:, -1]  # 取最后一列作为目标值
        target_vals.append(target_val)
        
    # 将所有图嵌入和目标值组合成批次
    batch_embeddings = torch.cat(graph_embeddings, dim=0)
    batch_targets = torch.cat(target_vals, dim=0).unsqueeze(-1)  # 确保目标值形状正确

    # 清空梯度
    optimizer.zero_grad()

    # 前向传播
    predictions = model_fun(batch_embeddings)

    # 计算损失
    loss = criterion(predictions, batch_targets)

    # 反向传播
    loss.backward()

    # 更新模型参数
    optimizer.step()

    # 累积损失
    total_loss += loss.item()

    return total_loss

In [17]:
# 主训练函数
def main():
    # 加载数据
    csv_file_molecules = 'molecules.csv'  # 替换为您的分子数据文件路径
    csv_file_extra = 'extra_data.csv'  # 替换为您的额外特征数据文件路径
    
    molecule_data = load_data(csv_file_molecules)
    id_list, extra_data_with_id = load_extra_data(csv_file_extra)
    
    # 创建 Data 对象列表
    data_list = [Data(x=data['x'], edge_index=data['edge_index'], y=torch.tensor([0.0])) for data in molecule_data]  # y 是占位符
    
    # 创建 DataLoader
    data_loader = DataLoader(data_list, batch_size=32, shuffle=True, collate_fn=collate_fn)
    
    # 定义模型参数
    input_dim = 148  # 根据您的原子特征数量调整
    output_dim = 64  # 图嵌入的维度
    hidden_size = 128  # 全连接层的隐藏单元数量
    
    # 实例化模型
    graph_nn = GraphNN(input_dim=input_dim, output_dim=output_dim)
    fun_predictor = Fun_Predictor(input_size=output_dim + extra_data_with_id.shape[1] - 1, hidden_size=hidden_size)
    
    # 定义优化器和损失函数
    optimizer_gnn = torch.optim.Adam(graph_nn.parameters(), lr=0.001)
    optimizer_fp = torch.optim.Adam(fun_predictor.parameters(), lr=0.001)
    criterion = torch.nn.MSELoss()
    
    # 训练循环
    num_epochs = 50
    for epoch in range(num_epochs):
        # 训练图神经网络
        loss_gnn = train_graph_nn(graph_nn, data_loader, optimizer_gnn)
        
        # 训练功能预测模型
        loss_fp = train_fun_predictor(fun_predictor, graph_nn, molecule_data, extra_data_with_id, criterion, optimizer_fp)
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss_GNN: {loss_gnn:.4f}, Loss_FP: {loss_fp:.4f}')

In [42]:
main()

S=c1nccc[nH]1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([7, 148])
S=c1nccc[nH]1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([7, 148])
S=c1nccc[nH]1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([7, 148])
S=c1nccc[nH]1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([7, 148])
S=c1nccc[nH]1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([7, 148])
c3ccc(CSc2nc1ccccc1[nH]2)cc3
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([17, 148])
CCCCCCCCC(O)C(CCCCCCCC(=O)OCC(COC(=O)CCCCCCCC(O)C(CC(O)C(CCCCC)[NH+]1CCCC1)[NH+]2CCCC2)OC(=O)CCCCCCCC(C(O)CCCCCCCC)[NH+]3CCCC3)[NH+]4CCCC4.[Cl-].[Cl-].[Cl-].[Cl-]
edge_index.shape:torch.Size([2, 180])
atom_feature.shape:torch.Size([91, 148])
CCCCCCCCC(O)C(CCCCCCCC(=O)OCC(COC(=O)CCCCCCCC(O)C(CC(O)C(CCCCC)[NH+]1CCCC1)[NH+]2CCCC2)OC(=O)CCCCCCCC(C(O)CCCCCCCC)[NH+]3CCCC3)[NH+]4CCCC4.[Cl-].[Cl-].[Cl-].[Cl-]
edge_index.shape:torch.Size([2, 1

edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([158, 148])
Sc1ncccn1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([158, 148])
Sc1ncccn1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([158, 148])
Sc1ncccn1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([158, 148])
Sc1ncccn1
edge_index.shape:torch.Size([2, 14])
atom_feature.shape:torch.Size([158, 148])
Cc1ccnc(S)n1
edge_index.shape:torch.Size([2, 16])
atom_feature.shape:torch.Size([158, 148])
Cc1ccnc(S)n1
edge_index.shape:torch.Size([2, 16])
atom_feature.shape:torch.Size([158, 148])
Cc1ccnc(S)n1
edge_index.shape:torch.Size([2, 16])
atom_feature.shape:torch.Size([158, 148])
Cc1ccnc(S)n1
edge_index.shape:torch.Size([2, 16])
atom_feature.shape:torch.Size([158, 148])
Cc1ccnc(S)n1
edge_index.shape:torch.Size([2, 16])
atom_feature.shape:torch.Size([158, 148])
CC(C)/C4=C/C3=C/CC2C(C)(/C1=N/CCN1CCNCCNCCN)CCCC2(C)C3CC4
edge_index.shape:torch.Size([2, 72])
atom_featu

edge_index.shape:torch.Size([2, 110])
atom_feature.shape:torch.Size([158, 148])
O=C(CCN(CC/N=C/c1ccccc1O)CC/N=C/c2ccccc2O)CCN(CC/N=C/c3ccccc3O)CC/N=C/c4ccccc4O
edge_index.shape:torch.Size([2, 110])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape:torch.Size([158, 148])
O=C1CC(=O)NC(=S)N1
edge_index.shape:torch.Size([2, 18])
atom_feature.shape

edge_index.shape:torch.Size([2, 120])
atom_feature.shape:torch.Size([158, 148])
Oc9ccc(c7c1ccc(n1)c(c2ccc(O)cc2)c3ccc([nH]3)c(c4ccc(O)cc4)c5ccc(n5)c(c6ccc(O)cc6)c8ccc7[nH]8)cc9
edge_index.shape:torch.Size([2, 120])
atom_feature.shape:torch.Size([158, 148])
c9ccc(c7c1ccc(n1)c(c2ccccc2)c3ccc([nH]3)c(c4ccccc4)c5ccc(n5)c(c6ccccc6)c8ccc7[nH]8)cc9
edge_index.shape:torch.Size([2, 112])
atom_feature.shape:torch.Size([158, 148])
c9ccc(c7c1ccc(n1)c(c2ccccc2)c3ccc([nH]3)c(c4ccccc4)c5ccc(n5)c(c6ccccc6)c8ccc7[nH]8)cc9
edge_index.shape:torch.Size([2, 112])
atom_feature.shape:torch.Size([158, 148])
c9ccc(c7c1ccc(n1)c(c2ccccc2)c3ccc([nH]3)c(c4ccccc4)c5ccc(n5)c(c6ccccc6)c8ccc7[nH]8)cc9
edge_index.shape:torch.Size([2, 112])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCC/C=C/CCCCCCC/C1=N/CCN1CCNC(=O)CCN(CCC(=O)NCCN2CC/N=C2/CCCCCCC/C=C/CCCCCCCC)CCN(CCC(=O)NCCN3CC/N=C3/CCCCCCC/C=C/CCCCCCCC)CCC(=O)NCCN4CC/N=C4/CCCCCCC/C=C/CCCCCCCC
edge_index.shape:torch.Size([2, 246])
atom_feature.shape:torch.Size([158,

edge_index.shape:torch.Size([2, 44])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1C(C)N
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.

edge_index.shape:torch.Size([2, 264])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([2, 22])
atom_feature.shape:torch.Size([158, 148])
Sc2nc1ccccc1[nH]2
edge_index.shape:torch.Size([

edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
CCCCCCCCCCC/C1=N/CCN1CCN
edge_index.shape:torch.Size([2, 38])
atom_feature.shape:torch.Size([158, 148])
最大原子数，158， 最大边数，334
<class 'pandas.core.frame.DataFrame'>


AttributeError: 'NoneType' object has no attribute 'weight'