# Inspiration
- https://diyago.github.io/2020/11/07/graphs-vs-cv.html

smiles,activity   
OC=1C=CC=CC1CNC2=NC=3C=CC=CC3N2,1


The dataset entry provided above consists of two parts:


SMILES (Simplified Molecular Input Line Entry System) notation: "OC=1C=CC=CC1CNC2=NC=3C=CC=CC3N2". SMILES is a string representation of a chemical structure that encodes molecular information such as atom types, bond types, and connectivity. In this case, the SMILES string represents a phenol ring (a benzene ring bearing an OH group) whose neighbouring ring carbon is linked through a CH2–NH bridge to the 2-position of a benzimidazole (a benzene ring fused to an imidazole ring).

Activity: "1". This is the label that you are interested in predicting or analyzing. The notebook itself does not define it, but the data file loaded below is `mpro_xchem.csv` from the `coronavirus_data` repository, so it most likely records the outcome of a screen against the SARS-CoV-2 main protease (Mpro) — confirm this against the dataset's own documentation. In general, such a label denotes some kind of biological activity (e.g., binding to a protein, inhibiting an enzyme, etc.) that has been measured experimentally for this molecule. The specific value of "1" suggests that the molecule is active in this assay or has some positive effect on the biological system being tested.

In [2]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910476 sha256=641b56c51d2704228f22c6022b43962b5df897884ee4586747c3d833824fb639
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Su

In [4]:
!pip install skorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting skorch
  Downloading skorch-0.12.1-py3-none-any.whl (193 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.7/193.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.12.1


In [6]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2023.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.1


In [15]:
from torch_geometric.data import Data

In [17]:
from torch_geometric.nn import GCNConv, JumpingKnowledge, global_add_pool
from torch.nn import functional as F
from torch_geometric import transforms
from skorch import NeuralNetClassifier
import torch
from torch_geometric.data import Batch
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from torch_geometric.data import InMemoryDataset, download_url
from rdkit import Chem
import pandas as pd
import torch
from tqdm import tqdm
from torch_geometric import utils
import matplotlib.pyplot as plt
import networkx as nx


class COVID(InMemoryDataset):
    """Molecular graph dataset built from the ``mpro_xchem.csv`` SMILES/label table.

    Every CSV row (SMILES string, label) becomes one ``Data`` object holding
    only the bond connectivity (``edge_index``), the atom count (``num_nodes``)
    and the label (``y``). No atom or bond features are extracted here.

    Args:
        root: Root directory handed to ``InMemoryDataset``; the raw CSV and the
            processed file are resolved relative to it.
        transform: Optional callable forwarded to ``InMemoryDataset``.
        pre_transform: Optional callable applied to each graph once, before
            the processed file is written.
        pre_filter: Optional predicate; graphs for which it returns a falsy
            value are left out of the processed file.
    """

    url = 'https://github.com/yangkevin2/coronavirus_data/raw/master/data/mpro_xchem.csv'

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        # Load the collated graphs written by process().
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the raw files expected in ``self.raw_dir``."""
        return ['mpro_xchem.csv']

    @property
    def processed_file_names(self):
        """Names of the processed files expected in the processed directory."""
        return ['data.pt']

    def download(self):
        """Fetch the raw CSV from ``self.url`` into ``self.raw_dir``."""
        download_url(self.url, self.raw_dir)

    def process(self):
        """Turn every CSV row into a graph, then collate and save the result.

        Rows whose SMILES string RDKit cannot parse are skipped with a warning
        instead of aborting the whole run. ``pre_filter`` and ``pre_transform``
        are applied when provided.
        """
        import warnings

        df = pd.read_csv(self.raw_paths[0])
        data_list = []
        # The table is read positionally: first column SMILES, second column label.
        for smiles, label in df.itertuples(index=False, name=None):
            mol = Chem.MolFromSmiles(smiles)  # Read the molecule info
            if mol is None:
                # MolFromSmiles signals a parse failure by returning None.
                warnings.warn("Skipping unparsable SMILES: {!r}".format(smiles))
                continue

            adj = Chem.GetAdjacencyMatrix(mol)  # Get molecule structure
            # Non-zero adjacency entries are the bonds; transpose (E, 2) -> (2, E).
            edge_index = torch.tensor(adj).nonzero().t().contiguous()
            # You should extract other features here!
            data = Data(num_nodes=adj.shape[0], edge_index=edge_index, y=label)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            data_list.append(data)

        self.data, self.slices = self.collate(data_list)
        torch.save((self.data, self.slices), self.processed_paths[0])


class SimpleGNN(torch.nn.Module):
    """GCN stack with jumping-knowledge concatenation and sum pooling.

    The module keeps a reference to the dataset and receives *indices* into it
    in ``forward``, building the mini-batch itself.

    Args:
        dataset: Indexable graph dataset exposing ``num_node_features`` and
            ``num_classes``; ``dataset[index]`` must yield the graphs to batch.
        hidden: Width of every hidden layer.
        layers: Number of ``GCNConv`` layers; must be at least 1.

    Raises:
        ValueError: If ``layers`` is smaller than 1.
    """

    def __init__(self, dataset, hidden=64, layers=6):
        super().__init__()
        if layers < 1:
            # One conv is always created below, so jk_lin's input width
            # (hidden * layers) would not match the concatenated features.
            raise ValueError("layers must be >= 1, got {}".format(layers))

        self.dataset = dataset
        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(in_channels=dataset.num_node_features,
                                  out_channels=hidden))

        for _ in range(1, layers):
            self.convs.append(GCNConv(in_channels=hidden, out_channels=hidden))

        # "cat" concatenates the outputs of all conv layers along the feature axis.
        self.jk = JumpingKnowledge(mode="cat")
        self.jk_lin = torch.nn.Linear(
            in_features=hidden*layers, out_features=hidden)
        self.lin_1 = torch.nn.Linear(in_features=hidden, out_features=hidden)
        self.lin_2 = torch.nn.Linear(
            in_features=hidden, out_features=dataset.num_classes)

    def forward(self, index):
        """Return per-graph class probabilities for the graphs at ``index``.

        Args:
            index: Index (or indices) into ``self.dataset`` selecting the
                graphs of this mini-batch.

        Returns:
            Tensor of softmax probabilities, one row per selected graph.
        """
        data = Batch.from_data_list(self.dataset[index])
        # The batch is assembled from the dataset, so move it to wherever the
        # model's parameters live (a no-op when both are already on one device).
        data = data.to(next(self.parameters()).device)

        x = data.x
        xs = []
        for conv in self.convs:
            x = F.relu(conv(x=x, edge_index=data.edge_index))
            xs.append(x)

        x = self.jk(xs)
        x = F.relu(self.jk_lin(x))
        # Sum node embeddings per graph to get one vector per molecule.
        x = global_add_pool(x, batch=data.batch)
        x = F.relu(self.lin_1(x))
        x = F.softmax(self.lin_2(x), dim=-1)
        return x




In [20]:
def _save_graph_images(dataset, labels, indices, out_dir):
    """Draw each selected graph and save it as ``<out_dir>/id_<i>_y_<label>.jpg``.

    Args:
        dataset: Indexable graph dataset; ``dataset[i]`` is drawn.
        labels: Per-graph labels, indexed the same way as ``dataset``.
        indices: Iterable of dataset indices to render.
        out_dir: Target directory; created if it does not exist yet.
    """
    import os

    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    for graph in tqdm(indices):
        idx = int(graph)
        fig = plt.figure(figsize=(6, 6))
        G = utils.to_networkx(dataset[idx])
        nx.draw_kamada_kawai(G)
        fig.savefig("{}/id_{}_y_{}.jpg".format(out_dir, idx, labels[idx]),
                    format="jpg")
        # Release the figure; otherwise one open figure accumulates per molecule.
        plt.close(fig)


if __name__ == "__main__":
    print("Preprocessing data")
    ohd = transforms.OneHotDegree(max_degree=4)
    covid = COVID(root='./data/COVID/', transform=ohd)
    labels = covid.data.y  # read once instead of on every loop iteration

    # Split dataset *indices* (not graphs) so each image can be named by its id.
    X_train, X_test, y_train, y_test = train_test_split(
        torch.arange(len(covid)).long(), labels, test_size=0.3, random_state=42)

    print("Generating for train images")
    _save_graph_images(covid, labels, X_train, "./train")

    print("Generating for test images")
    _save_graph_images(covid, labels, X_test, "./test")

Output hidden; open in https://colab.research.google.com to view.