In [2]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from rdkit import *
from rdkit.Chem import Draw
from sklearn.metrics import r2_score

# Генератор синтетических молекул
def generate_molecules(num_molecules=100, max_atoms=20):
    mols = []
    for _ in range(num_molecules):
        # Создаем случайную молекулу (для демонстрации используем циклогексаны)
        mol = Chem.Ring(6)  # В реальности нужно использовать реальные молекулы
        mols.append(mol)
    return mols

# Класс для преобразования молекул в графы
class MolGraph:
    def __init__(self):
        self.atom_types = ['C', 'N', 'O']  # Пример типов атомов
        self.num_atom_types = len(self.atom_types)

    def mol_to_graph(self, mol):
        G = nx.Graph()

        # Добавляем атомы
        for atom in mol.GetAtoms():
            atom_type = self.atom_types.index(atom.GetSymbol())
            G.add_node(atom.GetIdx(), feature=atom_type)

        # Добавляем связи
        for bond in mol.GetBonds():
            G.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

        return G

# Слой Neural FP
class NeuralFPLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NeuralFPLayer, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Преобразование для атомов
        self.atom_transform = nn.Linear(input_dim, output_dim)

        # Преобразование для соседей
        self.neighbor_transform = nn.Linear(input_dim, output_dim)

    def forward(self, atom_features, neighbor_features):
        # atom_features: [num_atoms, input_dim]
        # neighbor_features: [num_atoms, max_neighbors, input_dim]

        # Среднее по соседям
        mean_neighbors = torch.mean(neighbor_features, dim=1)

        # Комбинация собственных и соседних признаков
        new_features = self.atom_transform(atom_features) + \
                      self.neighbor_transform(mean_neighbors)

        return F.relu(new_features)

# Модель Neural FP
class NeuralFP(nn.Module):
    def __init__(self, num_atom_types, hidden_dim, output_dim, radius=3):
        super(NeuralFP, self).__init__()
        self.radius = radius
        self.atom_embedding = nn.Embedding(num_atom_types, hidden_dim)

        # Слои для каждого радиуса
        self.layers = nn.ModuleList()
        for _ in range(radius):
            self.layers.append(NeuralFPLayer(hidden_dim, hidden_dim))

        # Регрессор
        self.regressor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim))

    def forward(self, graphs):
        # graphs: список графов (молекул)
        mol_vectors = []

        for graph in graphs:
            # Получаем признаки атомов
            atom_features = []
            neighbor_indices = []

            for node in graph.nodes():
                atom_type = graph.nodes[node]['feature']
                atom_features.append(atom_type)

                # Собираем соседей
                neighbors = list(graph.neighbors(node))
                neighbor_indices.append(neighbors)

            # Преобразуем в тензоры
            atom_features = torch.tensor(atom_features, dtype=torch.long)
            atom_features = self.atom_embedding(atom_features)

            # Итеративное обновление
            for layer in self.layers:
                # Собираем признаки соседей для каждого атома
                neighbor_features = []
                max_neighbors = max(len(n) for n in neighbor_indices) if neighbor_indices else 0

                for i, neighbors in enumerate(neighbor_indices):
                    if neighbors:
                        features = atom_features[neighbors]
                        # Дополняем нулями, если соседей меньше максимального количества
                        if len(neighbors) < max_neighbors:
                            padding = torch.zeros(max_neighbors - len(neighbors),
                                                 atom_features.size(1))
                            features = torch.cat([features, padding], dim=0)
                    else:
                        features = torch.zeros(max_neighbors, atom_features.size(1))

                    neighbor_features.append(features)

                neighbor_features = torch.stack(neighbor_features)

                # Обновляем признаки атомов
                atom_features = layer(atom_features, neighbor_features)

            # Суммируем признаки атомов для получения молекулярного вектора
            mol_vector = torch.sum(atom_features, dim=0)
            mol_vectors.append(mol_vector)

        # Преобразуем в тензор
        mol_vectors = torch.stack(mol_vectors)

        # Прогнозируем свойства
        out = self.regressor(mol_vectors)
        return out

# Функция для визуализации молекул
def visualize_molecules(mols, properties=None, mols_per_row=4):
    img = Draw.MolsToGridImage(mols[:mols_per_row**2],
                              legends=[str(p) for p in properties[:mols_per_row**2]] if properties else None,
                              molsPerRow=mols_per_row,
                              subImgSize=(200,200))
    plt.figure(figsize=(10,10))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

# Функция для визуализации обучения
def plot_training(epochs, train_losses, val_losses):
    plt.figure(figsize=(10, 5))
    plt.plot(range(epochs), train_losses, label='Training Loss')
    plt.plot(range(epochs), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()

# Основной код
def main():
    # Параметры
    num_molecules = 100
    hidden_dim = 64
    output_dim = 1  # Прогнозируем одно свойство
    radius = 3  # Количество итераций/радиус
    batch_size = 16
    epochs = 50
    lr = 0.001

    # Генерация данных (в реальности нужно использовать реальные молекулы)
    mols = generate_molecules(num_molecules)

    # Преобразование в графы
    converter = MolGraph()
    graphs = [converter.mol_to_graph(mol) for mol in mols]

    # Случайные свойства для демонстрации
    properties = np.random.randn(num_molecules)

    # Разделение данных
    indices = np.random.permutation(num_molecules)
    train_idx = indices[:int(0.7*num_molecules)]
    val_idx = indices[int(0.7*num_molecules):int(0.85*num_molecules)]
    test_idx = indices[int(0.85*num_molecules):]

    # Модель и оптимизатор
    model = NeuralFP(converter.num_atom_types, hidden_dim, output_dim, radius)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Обучение
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        # Мини-батчи
        for i in range(0, len(train_idx), batch_size):
            batch_indices = train_idx[i:i+batch_size]
            batch_graphs = [graphs[idx] for idx in batch_indices]
            batch_props = torch.FloatTensor(properties[batch_indices])

            optimizer.zero_grad()

            # Forward pass
            output = model(batch_graphs).squeeze()
            loss = criterion(output, batch_props)

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        train_losses.append(epoch_loss / (len(train_idx)/batch_size))

        # Валидация
        model.eval()
        with torch.no_grad():
            val_graphs = [graphs[idx] for idx in val_idx]
            val_props = torch.FloatTensor(properties[val_idx])
            val_output = model(val_graphs).squeeze()
            val_loss = criterion(val_output, val_props)
            val_losses.append(val_loss.item())

        if epoch % 5 == 0:
            print(f'Epoch {epoch:03d}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    # Визуализация обучения
    plot_training(epochs, train_losses, val_losses)

    # Тестирование
    model.eval()
    with torch.no_grad():
        test_graphs = [graphs[idx] for idx in test_idx]
        test_props = properties[test_idx]
        test_output = model(test_graphs).squeeze().numpy()

        r2 = r2_score(test_props, test_output)
        print(f'Test R^2 Score: {r2:.4f}')

    # Визуализация молекул с предсказанными свойствами
    sample_indices = test_idx[:16]
    sample_mols = [mols[i] for i in sample_indices]
    sample_props = properties[sample_indices]
    sample_preds = test_output[:len(sample_indices)]

    # Создаем подписи с истинными и предсказанными значениями
    legends = [f"True: {p:.2f}, Pred: {pred:.2f}"
               for p, pred in zip(sample_props, sample_preds)]

    img = Draw.MolsToGridImage(sample_mols, legends=legends,
                             molsPerRow=4, subImgSize=(200,200))
    plt.figure(figsize=(10,10))
    plt.imshow(img)
    plt.axis('off')
    plt.title("True vs Predicted Properties")
    plt.show()

if __name__ == '__main__':
    main()

AttributeError: module 'rdkit.Chem' has no attribute 'Ring'

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from sklearn.metrics import r2_score
from torch.utils.data import Dataset, DataLoader

# Класс для загрузки молекулярных данных
class MoleculeDataset(Dataset):
    def __init__(self, smiles_list, properties):
        self.smiles_list = smiles_list
        self.properties = properties
        self.atom_types = ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I', 'H']  # Расширенный список атомов

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        mol = Chem.MolFromSmiles(self.smiles_list[idx])
        if mol is None:
            return None

        # Преобразуем молекулу в граф
        graph = nx.Graph()

        # Добавляем атомы с их признаками
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            try:
                atom_type = self.atom_types.index(symbol)
            except ValueError:
                atom_type = len(self.atom_types)  # Для неизвестных атомов

            graph.add_node(atom.GetIdx(), feature=atom_type)

        # Добавляем связи
        for bond in mol.GetBonds():
            graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())

        # Получаем свойство
        prop = torch.FloatTensor([self.properties[idx]])

        return graph, prop

# Слой Neural FP
class NeuralFPLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NeuralFPLayer, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Преобразование для атомов
        self.atom_transform = nn.Linear(input_dim, output_dim)

        # Преобразование для соседей
        self.neighbor_transform = nn.Linear(input_dim, output_dim)

    def forward(self, atom_features, neighbor_features):
        # atom_features: [num_atoms, input_dim]
        # neighbor_features: [num_atoms, max_neighbors, input_dim]

        # Среднее по соседям
        mean_neighbors = torch.mean(neighbor_features, dim=1)

        # Комбинация собственных и соседних признаков
        new_features = self.atom_transform(atom_features) + \
                      self.neighbor_transform(mean_neighbors)

        return F.relu(new_features)

# Модель Neural FP
class NeuralFP(nn.Module):
    def __init__(self, num_atom_types, hidden_dim, output_dim, radius=3):
        super(NeuralFP, self).__init__()
        self.radius = radius
        self.atom_embedding = nn.Embedding(num_atom_types + 1, hidden_dim)  # +1 для неизвестных атомов

        # Слои для каждого радиуса
        self.layers = nn.ModuleList()
        for _ in range(radius):
            self.layers.append(NeuralFPLayer(hidden_dim, hidden_dim))

        # Регрессор
        self.regressor = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim))

    def forward(self, graphs):
        # graphs: список графов (молекул)
        mol_vectors = []

        for graph in graphs:
            # Получаем признаки атомов
            atom_features = []
            neighbor_indices = []

            for node in graph.nodes():
                atom_type = graph.nodes[node]['feature']
                atom_features.append(atom_type)

                # Собираем соседей
                neighbors = list(graph.neighbors(node))
                neighbor_indices.append(neighbors)

            # Преобразуем в тензоры
            atom_features = torch.tensor(atom_features, dtype=torch.long)
            atom_features = self.atom_embedding(atom_features)

            # Итеративное обновление
            for layer in self.layers:
                # Собираем признаки соседей для каждого атома
                neighbor_features = []
                max_neighbors = max(len(n) for n in neighbor_indices) if neighbor_indices else 0

                for i, neighbors in enumerate(neighbor_indices):
                    if neighbors:
                        features = atom_features[neighbors]
                        # Дополняем нулями, если соседей меньше максимального количества
                        if len(neighbors) < max_neighbors:
                            padding = torch.zeros(max_neighbors - len(neighbors),
                                                 atom_features.size(1))
                            features = torch.cat([features, padding], dim=0)
                    else:
                        features = torch.zeros(max_neighbors, atom_features.size(1))

                    neighbor_features.append(features)

                neighbor_features = torch.stack(neighbor_features)

                # Обновляем признаки атомов
                atom_features = layer(atom_features, neighbor_features)

            # Суммируем признаки атомов для получения молекулярного вектора
            mol_vector = torch.sum(atom_features, dim=0)
            mol_vectors.append(mol_vector)

        # Преобразуем в тензор
        mol_vectors = torch.stack(mol_vectors)

        # Прогнозируем свойства
        out = self.regressor(mol_vectors)
        return out

# Функция для визуализации молекул
def visualize_molecules(smiles_list, properties=None, preds=None, mols_per_row=4):
    mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
    legends = []

    for i in range(len(mols)):
        if properties is not None and preds is not None:
            legends.append(f"True: {properties[i]:.2f}, Pred: {preds[i]:.2f}")
        elif properties is not None:
            legends.append(f"True: {properties[i]:.2f}")
        elif preds is not None:
            legends.append(f"Pred: {preds[i]:.2f}")

    img = Draw.MolsToGridImage(mols[:mols_per_row**2],
                             legends=legends[:mols_per_row**2],
                             molsPerRow=mols_per_row,
                             subImgSize=(200,200))
    plt.figure(figsize=(10,10))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

# Функция для визуализации обучения
def plot_training(epochs, train_losses, val_losses):
    plt.figure(figsize=(10, 5))
    plt.plot(range(epochs), train_losses, label='Training Loss')
    plt.plot(range(epochs), val_losses, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()
    plt.show()

# Основной код
def main():
    # Параметры
    hidden_dim = 64
    output_dim = 1  # Прогнозируем одно свойство
    radius = 3  # Количество итераций/радиус
    batch_size = 32
    epochs = 100
    lr = 0.001

    # Пример данных (в реальности нужно использовать реальный датасет)
    smiles_list = [
        'CCO',  # Этанол
        'CC(=O)O',  # Уксусная кислота
        'C1CCCCC1',  # Циклогексан
        'c1ccccc1',  # Бензол
        'CCN(CC)CC',  # Триэтиламин
        'O=C(O)C1=CC=CC=C1',  # Бензойная кислота
        'C1CCC1',  # Циклобутан
        'CCOC(=O)C',  # Этилацетат
        'CC(C)C',  # Изобутан
        'C1COCCO1'  # 1,4-Диоксан
    ]

    # Вычисляем логарифм коэффициента распределения октанол-вода как пример свойства
    properties = [np.log(Descriptors.MolLogP(Chem.MolFromSmiles(smiles)) for smiles in smiles_list)]

    # Создаем датасет
    dataset = MoleculeDataset(smiles_list, properties)

    # Разделение данных
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size

    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size])

    # Модель и оптимизатор
    model = NeuralFP(len(dataset.atom_types), hidden_dim, output_dim, radius)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Обучение
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        # Мини-батчи (упрощенная версия без DataLoader)
        for i in range(0, len(train_dataset), batch_size):
            batch_graphs = []
            batch_props = []

            for j in range(i, min(i + batch_size, len(train_dataset))):
                graph, prop = train_dataset[j]
                if graph is not None:
                    batch_graphs.append(graph)
                    batch_props.append(prop)

            if not batch_graphs:
                continue

            batch_props = torch.cat(batch_props)

            optimizer.zero_grad()

            # Forward pass
            output = model(batch_graphs)
            loss = criterion(output, batch_props)

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if len(train_dataset) > 0:
            train_losses.append(epoch_loss / (len(train_dataset)/batch_size))

        # Валидация
        model.eval()
        val_loss = 0
        with torch.no_grad():
            val_graphs = []
            val_props = []

            for i in range(len(val_dataset)):
                graph, prop = val_dataset[i]
                if graph is not None:
                    val_graphs.append(graph)
                    val_props.append(prop)

            if val_graphs:
                val_props = torch.cat(val_props)
                val_output = model(val_graphs)
                val_loss = criterion(val_output, val_props).item()
                val_losses.append(val_loss)

        if epoch % 10 == 0:
            print(f'Epoch {epoch:03d}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}')

    # Визуализация обучения
    plot_training(epochs, train_losses, val_losses)

    # Тестирование
    model.eval()
    with torch.no_grad():
        test_graphs = []
        test_props = []
        test_smiles = []

        for i in range(len(test_dataset)):
            idx = test_dataset.indices[i]
            graph, prop = dataset[idx]
            if graph is not None:
                test_graphs.append(graph)
                test_props.append(prop.item())
                test_smiles.append(dataset.smiles_list[idx])

        if test_graphs:
            test_output = model(test_graphs).squeeze().numpy()

            r2 = r2_score(test_props, test_output)
            print(f'Test R^2 Score: {r2:.4f}')

            # Визуализация молекул с предсказанными свойствами
            visualize_molecules(test_smiles, test_props, test_output)

if __name__ == '__main__':
    main()

TypeError: loop of ufunc does not support argument 0 of type generator which has no callable log method