In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from pathlib import Path
from rdkit import Chem
from rdkit import RDLogger
from scipy.interpolate import interp1d
from torch.utils.data import DataLoader, TensorDataset

# Disable RDKit log output for every log channel matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')
import os

# NOTE(review): TF_ENABLE_ONEDNN_OPTS looks like a TensorFlow setting; no
# TensorFlow import is visible in this notebook, so this is possibly a
# leftover from an earlier version — confirm before relying on it.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Functional-group name -> RDKit query molecule compiled from a SMARTS pattern.
# get_functional_groups() iterates over this dict's values in insertion order,
# so the order of the 37 entries below defines the order of the label vector.
functional_groups = {
    'Acid anhydride': Chem.MolFromSmarts('[CX3](=[OX1])[OX2][CX3](=[OX1])'),
    'Acyl halide': Chem.MolFromSmarts('[CX3](=[OX1])[F,Cl,Br,I]'),
    'Alcohol': Chem.MolFromSmarts('[#6][OX2H]'),
    'Aldehyde': Chem.MolFromSmarts('[CX3H1](=O)[#6,H]'),
    'Alkane': Chem.MolFromSmarts('[CX4;H3,H2]'),
    'Alkene': Chem.MolFromSmarts('[CX3]=[CX3]'),
    'Alkyne': Chem.MolFromSmarts('[CX2]#[CX2]'),
    'Amide': Chem.MolFromSmarts('[NX3][CX3](=[OX1])[#6]'),
    'Amine': Chem.MolFromSmarts('[NX3;H2,H1,H0;!$(NC=O)]'),
    'Arene': Chem.MolFromSmarts('[cX3]1[cX3][cX3][cX3][cX3][cX3]1'),
    'Azo compound': Chem.MolFromSmarts('[#6][NX2]=[NX2][#6]'),
    'Carbamate': Chem.MolFromSmarts('[NX3][CX3](=[OX1])[OX2H0]'),
    'Carboxylic acid': Chem.MolFromSmarts('[CX3](=O)[OX2H]'),
    'Enamine': Chem.MolFromSmarts('[NX3][CX3]=[CX3]'),
    'Enol': Chem.MolFromSmarts('[OX2H][#6X3]=[#6]'),
    'Ester': Chem.MolFromSmarts('[#6][CX3](=O)[OX2H0][#6]'),
    'Ether': Chem.MolFromSmarts('[OD2]([#6])[#6]'),
    'Haloalkane': Chem.MolFromSmarts('[#6][F,Cl,Br,I]'),
    'Hydrazine': Chem.MolFromSmarts('[NX3][NX3]'),
    'Hydrazone': Chem.MolFromSmarts('[NX3][NX2]=[#6]'),
    'Imide': Chem.MolFromSmarts('[CX3](=[OX1])[NX3][CX3](=[OX1])'),
    'Imine': Chem.MolFromSmarts('[$([CX3]([#6])[#6]),$([CX3H][#6])]=[$([NX2][#6]),$([NX2H])]'),
    'Isocyanate': Chem.MolFromSmarts('[NX2]=[C]=[O]'),
    'Isothiocyanate': Chem.MolFromSmarts('[NX2]=[C]=[S]'),
    'Ketone': Chem.MolFromSmarts('[#6][CX3](=O)[#6]'),
    'Nitrile': Chem.MolFromSmarts('[NX1]#[CX2]'),
    'Phenol': Chem.MolFromSmarts('[OX2H][cX3]:[c]'),
    'Phosphine': Chem.MolFromSmarts('[PX3]'),
    'Sulfide': Chem.MolFromSmarts('[#16X2H0]'),
    'Sulfonamide': Chem.MolFromSmarts('[#16X4]([NX3])(=[OX1])(=[OX1])[#6]'),
    'Sulfonate': Chem.MolFromSmarts('[#16X4](=[OX1])(=[OX1])([#6])[OX2H0]'),
    'Sulfone': Chem.MolFromSmarts('[#16X4](=[OX1])(=[OX1])([#6])[#6]'),
    'Sulfonic acid': Chem.MolFromSmarts('[#16X4](=[OX1])(=[OX1])([#6])[OX2H]'),
    'Sulfoxide': Chem.MolFromSmarts('[#16X3]=[OX1]'),
    'Thial': Chem.MolFromSmarts('[CX3H1](=S)[#6,H]'),
    'Thioamide': Chem.MolFromSmarts('[NX3][CX3]=[SX1]'),
    'Thiol': Chem.MolFromSmarts('[#16X2H]')
}
def match_group(mol: Chem.Mol, func_group) -> int:
    """Return 1 if ``func_group`` is present in ``mol``, otherwise 0.

    Args:
        mol: RDKit molecule to inspect.
        func_group: Either an RDKit query molecule (for example the result of
            ``Chem.MolFromSmarts``) or a callable that takes ``mol`` and
            returns a match count.

    Returns:
        0 when nothing matches, 1 otherwise.
    """
    # isinstance (instead of an exact type comparison) also accepts Mol
    # subclasses, which would otherwise fall through and be called like a
    # function.
    if isinstance(func_group, Chem.Mol):
        # Only presence matters, so stop at the first hit instead of
        # enumerating every substructure match.
        return int(mol.HasSubstructMatch(func_group))
    return 0 if func_group(mol) == 0 else 1
# Map a SMILES string to its binary functional-group label vector.
def get_functional_groups(smiles: str) -> "list[int] | None":
    """Build the 0/1 functional-group vector for one SMILES string.

    Surrounding whitespace and embedded spaces are removed before parsing.

    Args:
        smiles: SMILES representation of the molecule.

    Returns:
        A list with one 0/1 entry per item of ``functional_groups`` (in that
        dict's iteration order), or ``None`` when ``smiles`` is not a string
        or RDKit cannot parse it.
    """
    if not isinstance(smiles, str):
        # Missing values (None / NaN coming out of a DataFrame) cannot be
        # parsed; report them the same way as an invalid SMILES.
        return None
    mol = Chem.MolFromSmiles(smiles.strip().replace(' ', ''))
    if mol is None:
        return None
    return [match_group(mol, pattern) for pattern in functional_groups.values()]

def interpolate_to_600(spec, n_points=600):
    """Linearly resample a 1-D spectrum onto ``n_points`` evenly spaced positions.

    The input samples are treated as lying at x = 0, 1, ..., len(spec) - 1 and
    the output is evaluated on ``n_points`` positions spanning that same range.

    Args:
        spec: 1-D sequence of intensities.
        n_points: Number of output samples (defaults to 600).

    Returns:
        ``numpy.ndarray`` of shape ``(n_points,)``.

    Raises:
        ValueError: If ``spec`` is empty.
    """
    values = np.asarray(spec, dtype=float)
    if values.size == 0:
        raise ValueError("cannot interpolate an empty spectrum")
    last_index = values.shape[0] - 1
    old_x = np.arange(values.shape[0])
    new_x = np.linspace(0, last_index, n_points)
    # np.interp performs the same piecewise-linear interpolation without
    # building an interpolator object per spectrum, and it also copes with a
    # single-point spectrum (which is returned as a constant signal).
    return np.interp(new_x, old_x, values)

def make_msms_spectrum(spectrum):
    """Rasterise a peak list into a fixed-length intensity vector.

    Each peak is indexed as ``peak[0]`` (position) and ``peak[1]`` (intensity).
    The position is multiplied by 10 and truncated to an integer bin; bins
    above 9999 are clamped to 9999. A later peak falling into an already
    filled bin overwrites the earlier value.

    Returns:
        ``numpy.ndarray`` of length 10000.
    """
    n_bins = 10000
    top_bin = n_bins - 1
    binned = np.zeros(n_bins)
    for entry in spectrum:
        bin_index = int(entry[0] * 10)
        if bin_index > top_bin:
            bin_index = top_bin
        binned[bin_index] = entry[1]
    return binned

# Define CNN Model in PyTorch
class CNNModel(nn.Module):
    """1-D CNN that maps a single-channel spectrum to per-label probabilities.

    Two Conv1d -> BatchNorm1d -> ReLU -> max-pool(2) stages are followed by
    three ReLU + dropout fully connected layers and a sigmoid output layer.
    ``fc1`` expects 62 * 150 flattened features, which corresponds to an input
    of length 600 after the two poolings by 2.

    Args:
        num_fgs: Size of the output layer (number of labels).
    """

    def __init__(self, num_fgs):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation and state_dict ordering are unaffected.
        conv_options = dict(kernel_size=11, padding='same')
        self.conv1 = nn.Conv1d(1, 31, **conv_options)
        self.conv2 = nn.Conv1d(31, 62, **conv_options)
        self.fc1 = nn.Linear(62 * 150, 4927)
        self.fc2 = nn.Linear(4927, 2785)
        self.fc3 = nn.Linear(2785, 1574)
        self.fc4 = nn.Linear(1574, num_fgs)
        self.dropout = nn.Dropout(0.48599073736368)
        self.batch_norm1 = nn.BatchNorm1d(31)
        self.batch_norm2 = nn.BatchNorm1d(62)

    def forward(self, x):
        """Run the network on ``x`` and return sigmoid activations."""
        conv_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
        )
        for conv, norm in conv_stages:
            x = F.max_pool1d(F.relu(norm(conv(x))), 2)

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)

        for dense in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(dense(x)))

        return torch.sigmoid(self.fc4(x))




In [2]:
# Training function in PyTorch
def train_model(X_train, y_train, X_test, num_fgs, weighted=False, batch_size=41, epochs=41, device=None):
    """Train a ``CNNModel`` and return thresholded predictions for ``X_test``.

    Args:
        X_train: Training inputs; each batch gets a channel dimension added at
            axis 1 before it is passed to the model.
        y_train: Training targets, one column per label.
        X_test: Inputs to predict after training.
        num_fgs: Number of output labels of the model.
        weighted: If True, train with ``WeightedBinaryCrossEntropyLoss`` using
            weights from ``calculate_class_weights(y_train)``; otherwise use
            ``nn.BCELoss``.
        batch_size: Mini-batch size for both loaders.
        epochs: Number of passes over the training data.
        device: Optional torch device (or device string). When omitted,
            ``cuda:2`` is used if that GPU exists, else ``cuda:0`` if any GPU
            is available, else the CPU.

    Returns:
        Integer ``numpy.ndarray`` with 1 where the predicted probability
        exceeds 0.5 and 0 elsewhere, one row per test sample.
    """
    if device is None:
        if torch.cuda.is_available():
            # GPU index 2 was the previous hard-coded choice; it only exists
            # on hosts with at least three GPUs.
            device = 'cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0'
        else:
            device = 'cpu'
    device = torch.device(device)
    model = CNNModel(num_fgs).to(device)

    # Define optimizer and loss
    optimizer = optim.Adam(model.parameters())

    if weighted:
        # calculate_class_weights returns one row per label with columns
        # (negative, positive); the loss indexes row 0 / row 1 as the
        # negative / positive weights, hence the transpose. The tensor is
        # moved explicitly because the loss may hold it as a plain attribute
        # that Module.to() would not relocate.
        class_weights = calculate_class_weights(y_train).T.contiguous().to(device)
        criterion = WeightedBinaryCrossEntropyLoss(class_weights).to(device)
    else:
        criterion = nn.BCELoss().to(device)

    # Create DataLoaders. The test set carries inputs only: prediction needs
    # no targets, so nothing outside this function's arguments is read.
    train_data = TensorDataset(
        torch.as_tensor(X_train, dtype=torch.float32),
        torch.as_tensor(y_train, dtype=torch.float32),
    )
    test_data = TensorDataset(torch.as_tensor(X_test, dtype=torch.float32))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    # Train the model
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs.unsqueeze(1))  # Add channel dimension
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError when there are no batches.
        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss / max(len(train_loader), 1)}')

    # Evaluate the model
    model.eval()
    predictions = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            outputs = model(inputs.to(device).unsqueeze(1))
            predictions.append(outputs.cpu().numpy())

    if not predictions:
        # No test samples: return an empty prediction matrix instead of
        # letting np.concatenate fail on an empty list.
        return np.empty((0, num_fgs), dtype=int)

    predictions = np.concatenate(predictions)
    return (predictions > 0.5).astype(int)

# Custom loss function with class weights
class WeightedBinaryCrossEntropyLoss(nn.Module):
    """Binary cross-entropy with separate weights for negative and positive targets.

    Args:
        class_weights: Weights in one of two layouts:

            * index 0 = negative weight(s), index 1 = positive weight(s)
              (two scalars, or shape ``(2, num_labels)``);
            * shape ``(num_labels, 2)`` with columns (negative, positive),
              which is what ``calculate_class_weights`` returns.

            A ``(2, 2)`` input is ambiguous and is read using the first
            layout.
    """

    def __init__(self, class_weights):
        super().__init__()
        # Registered as a buffer so that .to(device) moves the weights along
        # with the module; non-persistent so the state_dict stays unchanged.
        self.register_buffer(
            'class_weights',
            torch.as_tensor(class_weights, dtype=torch.float32),
            persistent=False,
        )

    def forward(self, y_pred, y_true):
        """Return the mean weighted BCE of probabilities ``y_pred`` against ``y_true``."""
        weights = self.class_weights
        num_labels = y_true.shape[-1] if y_true.dim() > 0 else 1
        per_label_rows = (
            weights.dim() == 2
            and weights.shape[0] == num_labels
            and weights.shape[1] == 2
            and num_labels != 2
        )
        if per_label_rows:
            # (num_labels, 2): one row per label, columns (negative, positive).
            w_neg, w_pos = weights[:, 0], weights[:, 1]
        else:
            w_neg, w_pos = weights[0], weights[1]

        # The small epsilon keeps log() finite for predictions of exactly 0 or 1.
        loss = w_neg * (1 - y_true) * torch.log(1 - y_pred + 1e-15) + \
               w_pos * y_true * torch.log(y_pred + 1e-15)
        return -loss.mean()

# Calculate class weights
def calculate_class_weights(y_true):
    """Compute balanced negative/positive weights for every label column.

    For each label the weight of a class is ``num_samples / (2 * count)``,
    where ``count`` is the number of samples whose label equals 0 (negative)
    or 1 (positive). A class that never occurs gets weight 0 instead of
    ``inf``: it contributes no samples to the loss anyway, and an infinite
    weight multiplied by a zero target would turn the loss into NaN.

    Args:
        y_true: 2-D array of 0/1 labels, shape ``(num_samples, num_labels)``.

    Returns:
        ``torch.float32`` tensor of shape ``(num_labels, 2)`` whose columns
        are (negative weight, positive weight).
    """
    y_true = np.asarray(y_true)
    num_samples = y_true.shape[0]
    neg_counts = (y_true == 0).sum(axis=0)
    pos_counts = (y_true == 1).sum(axis=0)

    class_weights = np.zeros((y_true.shape[1], 2))
    # `where` skips empty classes, leaving their pre-filled weight of 0.
    np.divide(num_samples, 2 * neg_counts, out=class_weights[:, 0], where=neg_counts > 0)
    np.divide(num_samples, 2 * pos_counts, out=class_weights[:, 1], where=pos_counts > 0)
    return torch.tensor(class_weights, dtype=torch.float32)



In [3]:
# Load every parquet shard, resample the spectra to 600 points and attach the
# functional-group labels.
# NOTE(review): both paths are absolute and machine-specific; adjust them for
# the local environment.
analytical_data = Path("/data/zjh2/multimodal-spectroscopic-dataset-main/data/multimodal_spectroscopic_dataset")
out_path = Path("/home/dwj/icml_guangpu/multimodal-spectroscopic-dataset-main/runs/runs_f_groups/all")
columns = ["h_nmr_spectra", "c_nmr_spectra", "ir_spectra"]
seed = 3245

# Collects one processed DataFrame per parquet file.
all_data = []
# Read each file once and process all spectrum columns. The file list is
# sorted because Path.glob yields entries in arbitrary filesystem order, and
# the row order of training_data determines the seeded train/test split.
for i, parquet_file in enumerate(sorted(analytical_data.glob("*.parquet")), start=1):
    # Read only the columns that are needed.
    data = pd.read_parquet(parquet_file, columns=columns + ['smiles'])

    # Interpolate every spectrum column. The loop variable keeps the name
    # `column` because a later cell reads it after this loop has finished.
    for column in columns:
        data[column] = data[column].map(interpolate_to_600)

    # Add the functional-group labels.
    data['func_group'] = data.smiles.map(get_functional_groups)

    all_data.append(data)
    print("Loaded Data from: ", i)

if not all_data:
    raise FileNotFoundError(f"No parquet files found in {analytical_data}")

# Merge all shards into a single frame with a fresh 0..n-1 index.
training_data = pd.concat(all_data, ignore_index=True)



Loaded Data from:  1
Loaded Data from:  2
Loaded Data from:  3
Loaded Data from:  4
Loaded Data from:  5
Loaded Data from:  6
Loaded Data from:  7
Loaded Data from:  8
Loaded Data from:  9
Loaded Data from:  10
Loaded Data from:  11
Loaded Data from:  12
Loaded Data from:  13
Loaded Data from:  14
Loaded Data from:  15
Loaded Data from:  16
Loaded Data from:  17
Loaded Data from:  18
Loaded Data from:  19
Loaded Data from:  20
Loaded Data from:  21
Loaded Data from:  22
Loaded Data from:  23
Loaded Data from:  24
Loaded Data from:  25
Loaded Data from:  26
Loaded Data from:  27
Loaded Data from:  28
Loaded Data from:  29
Loaded Data from:  30
Loaded Data from:  31
Loaded Data from:  32
Loaded Data from:  33
Loaded Data from:  34
Loaded Data from:  35
Loaded Data from:  36
Loaded Data from:  37
Loaded Data from:  38
Loaded Data from:  39
Loaded Data from:  40
Loaded Data from:  41
Loaded Data from:  42
Loaded Data from:  43
Loaded Data from:  44
Loaded Data from:  45
Loaded Data from:  

In [7]:


def batch_process_samples(column_data, pca, batch_size=10000):
    """Reduce the last axis of ``column_data`` batch by batch with one fitted reducer.

    The reducer is fitted once, on the rows of the first batch, and every
    batch (including the first) is then projected with ``pca.transform``.
    Re-fitting on each batch would place every batch in its own coordinate
    system and make the concatenated features mutually inconsistent.

    Args:
        column_data: Array whose first axis indexes samples and whose last
            axis is the feature axis, e.g. shape ``(n_samples, 3, 600)``.
        pca: Object providing ``fit(X)`` and ``transform(X)``. Any previous
            fit is overwritten.
        batch_size: Number of samples transformed per step.

    Returns:
        2-D ``numpy.ndarray`` with one row per sample containing that sample's
        reduced values, flattened.

    Raises:
        ValueError: If ``column_data`` contains no samples.
    """
    num_samples = column_data.shape[0]
    if num_samples == 0:
        raise ValueError("column_data contains no samples")
    feature_dim = column_data.shape[-1]

    pca.fit(column_data[:batch_size].reshape(-1, feature_dim))

    reduced_data = []
    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        batch = column_data[start_idx:end_idx]
        # Flatten the batch to (rows, feature_dim) for the reducer, then
        # regroup the reduced rows per sample.
        batch_reshaped = batch.reshape(-1, feature_dim)
        reduced_batch = pca.transform(batch_reshaped).reshape(batch.shape[0], -1)
        reduced_data.append(reduced_batch)
        print(f"Processed {end_idx}/{num_samples} samples.")

    return np.vstack(reduced_data)

# Run the batched dimensionality reduction (see process_and_reduce below)




from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
transformed_columns=[]
def process_and_reduce(training_data):
    """Replace the first three columns of ``training_data`` by a KernelPCA reduction.

    Each of the first three columns is expected to hold one equal-length
    vector per row. The three columns are stacked per sample and reduced with
    ``batch_process_samples`` using an RBF ``KernelPCA`` with one component.

    Args:
        training_data: DataFrame whose first three columns hold the vectors to
            reduce; all remaining columns are carried over unchanged.

    Returns:
        DataFrame with a single ``reduced_data`` column (one reduced vector
        per row) followed by the remaining columns of ``training_data``.
    """
    # Built locally on every call: a module-level list would keep growing
    # each time the function is re-run in the same kernel.
    per_column = []
    for name in training_data.columns[:3]:
        # Turn the column of vectors into a 2-D array (n_samples, vector_length).
        column_data = np.vstack(training_data[name].values)
        print(column_data.shape)
        per_column.append(column_data)

    # Stack the three arrays per sample: (n_samples, 3, vector_length).
    stacked = np.stack(per_column, axis=1)
    print("Original shape of selected data:", stacked.shape)

    # RBF kernel PCA with a single component; gamma is tunable.
    pca = KernelPCA(kernel='rbf', n_components=1, gamma=0.1)

    reduced_data = batch_process_samples(stacked, pca, batch_size=10000)
    print("Shape of reduced data:", reduced_data.shape)
    print("Shape after PCA reduction:", reduced_data.shape)

    # Drop the original three columns and prepend the reduced vectors. They
    # are stored as one object column so the column count does not depend on
    # the number of reduced values per sample; the index is reused so the
    # concat below aligns row by row.
    reduced_df = pd.DataFrame({'reduced_data': list(reduced_data)}, index=training_data.index)
    result = pd.concat([reduced_df, training_data.iloc[:, 3:]], axis=1)

    return result


# Example usage: reduce the loaded data and preview the first rows.
processed_data = process_and_reduce(training_data)
print(processed_data.head())


(794403, 600)
(794403, 600)
(794403, 600)
Original shape of selected data: (794403, 3, 600)


: 

In [5]:
# Spectrum column used as model input. Previously this name was only defined
# as a leftover of the `for column in columns` loop in the loading cell, whose
# last value is columns[-1]; it is now set explicitly.
# NOTE(review): confirm that the last entry of `columns` is the intended input.
column = columns[-1]

# get_functional_groups returns None for SMILES that cannot be parsed; such
# rows have no label vector and would make np.stack fail, so they are dropped.
labelled_data = training_data[training_data['func_group'].notna()]

train, test = train_test_split(labelled_data, test_size=0.1, random_state=seed)

X_train = np.stack(train[column].to_list())
y_train = np.stack(train['func_group'].to_list())
X_test = np.stack(test[column].to_list())
y_test = np.stack(test['func_group'].to_list())

In [None]:
# Create the results directory up front so that a missing path fails here
# rather than after the full training run.
out_path.mkdir(parents=True, exist_ok=True)

# Train extended model; the number of outputs follows the label matrix width
# instead of a hard-coded constant.
predictions = train_model(X_train, y_train, X_test, num_fgs=y_train.shape[1], weighted=False)

# Evaluate the model
f1 = f1_score(y_test, predictions, average='micro')
print(f'F1 Score: {f1}')

# Save results
with open(out_path / "results.pickle", "wb") as file:
    pickle.dump({'pred': predictions, 'tgt': y_test}, file)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Epoch 1/41, Loss: 0.21697327263758215
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Epoch 2/41, Loss: 0.1795719624824927
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Epoch 3/41, Loss: 0.17160328781940568
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Epoch 4/41, Loss: 0.1658416707331026
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
Epoch 5/41, Loss: 0.16268213743894872
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [None]:
# Backup: earlier variant of process_and_reduce (running this cell replaces the definition above)




from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
transformed_columns=[]
def process_and_reduce(training_data):
    """Concatenate the first three vector columns and reduce them with KernelPCA.

    Each of the first three columns is expected to hold one equal-length
    vector per row. The vectors are concatenated per sample and projected
    with an RBF ``KernelPCA`` to 600 components.

    Args:
        training_data: DataFrame whose first three columns hold the vectors to
            reduce; all remaining columns are carried over unchanged.

    Returns:
        DataFrame with a ``feature_list`` column followed by the remaining
        columns of ``training_data``. Each ``feature_list`` entry is the
        reduced vector of one sample as a nested list of shape (1, 600).
    """
    # Built locally on every call: a module-level list would keep growing
    # each time the function is re-run in the same kernel.
    per_column = [
        # Each column of vectors becomes a 2-D array (n_samples, vector_length).
        np.vstack(training_data[name].values)
        for name in training_data.columns[:3]
    ]

    # Concatenate the three arrays into one long feature vector per sample.
    column_data = np.concatenate(per_column, axis=1)
    print("Original shape of selected data:", column_data.shape)

    # RBF kernel PCA; gamma is tunable.
    # NOTE(review): KernelPCA is fitted on all rows at once here, which may be
    # infeasible for very large inputs — confirm before running on full data.
    pca = KernelPCA(kernel='rbf', n_components=600, gamma=0.1)

    reduced_data = pca.fit_transform(column_data)
    print("Shape after PCA reduction:", reduced_data.shape)
    expanded_data = np.expand_dims(reduced_data, axis=1)

    # The original referenced an undefined name (`reshaped_data`) here.
    # NOTE(review): `expanded_data`, computed just above and otherwise unused,
    # is assumed to be the intended array — confirm.
    df = pd.DataFrame(
        {'feature_list': [row.tolist() for row in expanded_data]},
        index=training_data.index,
    )
    # Drop the original three columns and prepend the reduced features.
    result = pd.concat([df, training_data.iloc[:, 3:]], axis=1)

    return result


# Example usage: reduce the loaded data and preview the first rows.
processed_data = process_and_reduce(training_data)
print(processed_data.head())
