In [1]:
from torch_geometric.data import Data
import torch

def mol_to_graph_data_obj(mol, label_vec):
    atom_features = []
    for atom in mol.GetAtoms():
        # Przykładowe cechy atomu: liczba atomowa, stopień, liczba wodorków, hybrydyzacja, ładunek formalny
        atom_features.append([
            atom.GetAtomicNum(),
            atom.GetDegree(),
            atom.GetTotalNumHs(),
            atom.GetHybridization().real,
            atom.GetFormalCharge()
        ])
    x = torch.tensor(atom_features, dtype=torch.float)

    edge_index = []
    edge_attr = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index.append([i, j])
        edge_index.append([j, i])
        # Przykładowe cechy wiązania: typ wiązania (jedno, podwójne, potrójne, aromatyczne)
        bond_type = bond.GetBondType()
        edge_attr.append([bond_type == Chem.rdchem.BondType.SINGLE,
                          bond_type == Chem.rdchem.BondType.DOUBLE,
                          bond_type == Chem.rdchem.BondType.TRIPLE,
                          bond_type == Chem.rdchem.BondType.AROMATIC])
        edge_attr.append([bond_type == Chem.rdchem.BondType.SINGLE,
                          bond_type == Chem.rdchem.BondType.DOUBLE,
                          bond_type == Chem.rdchem.BondType.TRIPLE,
                          bond_type == Chem.rdchem.BondType.AROMATIC])

    if len(edge_index) > 0:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(edge_attr, dtype=torch.float)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.empty((0, 4), dtype=torch.float)

    y = torch.tensor(label_vec, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


In [2]:
from torch_geometric.data import Dataset

class OdorDataset(Dataset):
    def __init__(self, mols, labels):
        super().__init__()
        self.mols = mols
        self.labels = labels

    def len(self):
        return len(self.mols)

    def get(self, idx):
        return mol_to_graph_data_obj(self.mols[idx], self.labels[idx])


In [3]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import MessagePassing, global_mean_pool
from torch_geometric.utils import add_self_loops

class MPNNLayer(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super().__init__(aggr='add')  # sum aggregation
        self.mlp = nn.Sequential(
            nn.Linear(in_channels * 2 + 4, out_channels),
            nn.ReLU(),
            nn.Linear(out_channels, out_channels)
        )

    def forward(self, x, edge_index, edge_attr):
        # Dodaj pętlę własną (self-loop)
        edge_index, edge_attr = add_self_loops(edge_index, edge_attr=edge_attr, fill_value=0, num_nodes=x.size(0))
        return self.propagate(edge_index, x=x, edge_attr=edge_attr)

    def message(self, x_i, x_j, edge_attr):
        # x_i = target node features, x_j = source node features
        tmp = torch.cat([x_i, x_j, edge_attr], dim=-1)
        return self.mlp(tmp)

class SimpleMPNN(nn.Module):
    def __init__(self, num_node_features, num_classes):
        super().__init__()
        self.mp1 = MPNNLayer(num_node_features, 64)
        self.mp2 = MPNNLayer(64, 64)
        self.fc1 = nn.Linear(64, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, data):
        x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
        x = self.mp1(x, edge_index, edge_attr)
        x = F.relu(x)
        x = self.mp2(x, edge_index, edge_attr)
        x = F.relu(x)
        x = global_mean_pool(x, batch)  # pooling po grafach
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [4]:
import os, sys
# Zakładam, że folder 'scripts' jest na tym samym poziomie co 'notebooks'
scripts_path = os.path.abspath(os.path.join('..', 'scripts'))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Teraz możesz importować moduł
from predicting_odor_from_molecular_structure import *
import pandas as pd


df = goodscents()
df.head()

Unnamed: 0,TGSC ID,CID,Concentration %,Solvent,MolecularWeight,IsomericSMILES,IUPACName,name,Labels,mol
0,1000111,7476,100.0,,150.17,CC(=O)C1=CC=C(C=C1)OC,1-(4-methoxyphenyl)ethanone,4'-methoxyacetophenone,"[sweet, vanilla, cherry maraschino cherry, pow...",<rdkit.Chem.rdchem.Mol object at 0x0000023D668...
1,1031871,7478,10.0,dipropylene glycol,152.15,COC1=CC=C(C=C1)C(=O)O,4-methoxybenzoic acid,4-methoxybenzoic acid,"[phenolic, animal, fecal, medicinal]",<rdkit.Chem.rdchem.Mol object at 0x0000023D668...
2,1009281,7501,0.1,triacetin,104.15,C=CC1=CC=CC=C1,styrene,styrene,"[sweet, plastic, floral, balsamic]",<rdkit.Chem.rdchem.Mol object at 0x0000023D668...
3,1001651,244,100.0,,108.14,C1=CC=C(C=C1)CO,phenylmethanol,benzyl alcohol,"[sweet, floral, rose, fruity, phenolic, balsam...",<rdkit.Chem.rdchem.Mol object at 0x0000023D668...
4,1001491,240,10.0,dipropylene glycol,106.12,C1=CC=C(C=C1)C=O,benzaldehyde,benzaldehyde,"[sweet, cherry, cherry maraschino cherry, nutt...",<rdkit.Chem.rdchem.Mol object at 0x0000023D668...


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

classes = label_frequencies(df, labels_column='Labels').Label.to_list()
mlb = MultiLabelBinarizer(classes=classes)
Y = mlb.fit_transform(df['Labels'])

mlb, Y.shape

(MultiLabelBinarizer(classes=['fruity', 'green', 'sweet', 'floral', 'woody',
                              'herbal', 'fatty', 'fresh', 'waxy', 'spicy',
                              'citrus', 'rose', 'sulfurous', 'earthy', 'nutty',
                              'tropical', 'odorless', 'balsamic', 'oily',
                              'minty', 'vegetable', 'powdery', 'musty', 'meaty',
                              'creamy', 'pineapple', 'apple', 'aldehydic',
                              'roasted', 'ethereal', ...]),
 (4626, 667))

In [8]:
from torch_geometric.loader import DataLoader
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = OdorDataset(list(df['mol']), Y)  # Y to macierz binarna etykiet
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

model = SimpleMPNN(num_node_features=5, num_classes=Y.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch.num_graphs
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader.dataset):.4f}")

# Ewaluacja (np. AUROC)
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        out = model(batch)
        probs = torch.sigmoid(out).cpu()
        all_preds.append(probs)
        all_labels.append(batch.y.cpu())
all_preds = torch.cat(all_preds).numpy()
all_labels = torch.cat(all_labels).numpy()

from sklearn.metrics import roc_auc_score
print("Test AUROC macro:", roc_auc_score(all_labels, all_preds, average='macro'))


ValueError: Target size (torch.Size([21344])) must be the same as input size (torch.Size([32, 667]))

In [10]:
out.shape

torch.Size([32, 667])

In [13]:
batch.y.shape

torch.Size([21344])

In [14]:
32*667

21344

In [15]:
out

tensor([[-0.0744, -0.1585, -0.2344,  ..., -0.0558, -0.2167,  0.0912],
        [-0.0662, -0.1641, -0.2109,  ..., -0.0546, -0.1990,  0.1030],
        [-0.0768, -0.1528, -0.2414,  ..., -0.0665, -0.2167,  0.0900],
        ...,
        [-0.0765, -0.1724, -0.2343,  ..., -0.0580, -0.2189,  0.1015],
        [-0.0704, -0.1656, -0.2134,  ..., -0.0620, -0.2072,  0.1104],
        [-0.0557, -0.1330, -0.2108,  ..., -0.0556, -0.1895,  0.0800]],
       device='cuda:0', grad_fn=<AddmmBackward0>)