In [41]:
import deepchem as dc
import pandas as pd
import rdkit

# load_tox21() returns a (tasks, datasets, transformers) tuple; only the
# datasets element (unpacked into train/valid/test further down) is used here.
# The unused elements get real names instead of `_`: assigning to `_` inside an
# IPython kernel shadows the automatic "last output" variable for the session.
tox21_tasks, tox21, tox21_transformers = dc.molnet.load_tox21()

In [42]:
# One row per molecule of the first (training) split; the dataset ids are
# treated as SMILES strings by the cells that follow.
df = pd.DataFrame({"smiles": tox21[0].ids})

In [43]:
# Unpack the three dataset splits obtained from load_tox21().
train_dataset, valid_dataset, test_dataset = tox21
# NOTE: despite the name, later cells treat these values as 0/1 task labels
# (each entry is compared against 1), not as model logits.
logits = train_dataset.y #6264 sets of logits (12 toxicity labels)

# DONE: remove rows from logits (if needed)

In [44]:
# get all adj matrices from smiles + add to dataframe

from rdkit.Chem import MolFromSmiles, rdmolops
from rdkit.Chem import AllChem, AddHs


# Build one adjacency matrix per molecule, in the same order as the rows of `df`.
matrices = []
# Iterate over the SMILES column itself. `df.size` is rows * columns, so using
# it as a row count only works while `df` has exactly one column.
for smile in df["smiles"]:
    mol = MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles returns None for a string it cannot parse; stop with a
        # message that names the offending input.
        raise ValueError(f"RDKit could not parse SMILES: {smile!r}")
    matrices.append(rdmolops.GetAdjacencyMatrix(mol))

# Plain assignment overwrites the column when the cell is re-run, whereas
# insert(..., allow_duplicates=True) would add another "Matrices" column each time.
df["Matrices"] = matrices
df



Unnamed: 0,smiles,Matrices
0,CC(O)(P(=O)(O)O)P(=O)(O)O,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, ..."
1,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,OC[C@H](O)[C@@H](O)[C@H](O)CO,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, ..."
3,CCCCCCCC(=O)[O-].CCCCCCCC(=O)[O-].[Zn+2],"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,CC(C)COC(=O)C(C)C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 1, ..."
...,...,...
6259,CC1CCCCN1CCCOC(=O)c1ccc(OC2CCCCC2)cc1,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
6260,Cc1cc(CCCOc2c(C)cc(-c3noc(C(F)(F)F)n3)cc2C)on1,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
6261,O=C1OC(OC(=O)c2cccnc2Nc2cccc(C(F)(F)F)c2)c2ccc...,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
6262,CC(=O)C1(C)CC2=C(CCCC2(C)C)CC1C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [45]:
# Find the task (label column) that has the most entries equal to 1.
labels = tox21[0].tasks

# Count the positives of every task in one vectorised pass. The number of
# tasks comes from the label array itself instead of a hard-coded 12.
freqs = (logits == 1).sum(axis=0).tolist()

# index(max(...)) returns the first task on ties, which matches a left-to-right
# scan that only replaces the current best on a strictly larger count.
maxFreq = max(freqs)
maxLabel = labels[freqs.index(maxFreq)]

print(f'\nMax label: {maxLabel}')




Max label: SR-ARE


In [46]:
# Extract the label column of the most frequently positive task (SR-ARE).
# The column is located by task name rather than by the magic index 7.
sr_are_index = list(tox21[0].tasks).index("SR-ARE")
labelFreq = logits[:, sr_are_index].tolist()

# Plain assignment keeps the cell re-runnable; insert(..., allow_duplicates=True)
# would add a second "SR-ARE Task" column on every execution, and a later
# df["SR-ARE Task"] lookup would then return a frame instead of a single column.
df["SR-ARE Task"] = labelFreq
df

Unnamed: 0,smiles,Matrices,SR-ARE Task
0,CC(O)(P(=O)(O)O)P(=O)(O)O,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, ...",0.0
1,CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0
2,OC[C@H](O)[C@@H](O)[C@H](O)CO,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, ...",0.0
3,CCCCCCCC(=O)[O-].CCCCCCCC(=O)[O-].[Zn+2],"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0
4,CC(C)COC(=O)C(C)C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 1, ...",0.0
...,...,...,...
6259,CC1CCCCN1CCCOC(=O)c1ccc(OC2CCCCC2)cc1,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0
6260,Cc1cc(CCCOc2c(C)cc(-c3noc(C(F)(F)F)n3)cc2C)on1,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0
6261,O=C1OC(OC(=O)c2cccnc2Nc2cccc(C(F)(F)F)c2)c2ccc...,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0
6262,CC(=O)C1(C)CC2=C(CCCC2(C)C)CC1C,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.0


In [49]:
# Featurize every SMILES string in the frame with a DeepChem featurizer.
# Featurizer reference: https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html

# Alternative considered: RDKit descriptors -- rdkit_featurizer = dc.feat.RDKitDescriptors()
feat = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
# NOTE(review): the recorded outputs further down (a ConvMol bound method and a
# torch TypeError about numpy.object_) suggest this is an object-dtype array of
# ConvMol instances that torch cannot convert directly -- confirm.
featNumpy = feat.featurize(df["smiles"])




numpy.ndarray

In [71]:
# Pull the SR-ARE labels out of the frame as a NumPy array, then show the
# container types of both the labels and the featurized inputs.
numpyY = df["SR-ARE Task"].to_numpy()
print(type(numpyY), type(featNumpy), sep="\n")


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


<bound method ConvMol.get_adjacency_list of <deepchem.feat.mol_graphs.ConvMol object at 0x7f163cc0b040>>

In [61]:
# turn into dataframe --> dataloader
# TODO: figure out how to properly convert featurized numpy array into dataframe/loader
import torch



TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [58]:
# DONE: extract toxicity labels from tox21[0]
    # insert into pandas dataframe -- (if rows need to be deleted, both labels and smiles matrices should be removed)
    # OR collect all row indices that need to be removed --> remove them from input and label data when passing into the data loader

# DONE: determine how to make dataframe --> dataloader (?)
    # identify how to account for different sized adj matrix inputs

# DONE: convert from smiles to adjacency matrices (rdkit? built in function w/ deepchem?)

# DONE: create vectors of atom information

# TODO: ask dr minocceri how to combine feature vector and adjacency matrices
    # DONE? featurizer already collects adjacency matrix as feature

# DONE: visualize atom/molecule input data

In [23]:
# module import
from torch import nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import torchvision


In [24]:
import requests
from pathlib import Path 

# Download helper functions from Learn PyTorch repo (if not already downloaded)
# NOTE(review): this fetches code from the moving `main` branch and a later cell
# imports it; consider pinning the URL to a specific commit.
helper_path = Path("helper_functions.py")
if helper_path.is_file():
    print("helper_functions.py already exists, skipping download")
else:
    print("Downloading helper_functions.py")
    # Note: you need the "raw" GitHub URL for this to work
    request = requests.get(
        "https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py",
        timeout=30,  # do not hang the notebook if the host is unreachable
    )
    # Abort on an HTTP error before writing anything; otherwise an error page
    # would be saved as helper_functions.py and break the import that follows.
    request.raise_for_status()
    helper_path.write_bytes(request.content)

helper_functions.py already exists, skipping download


In [25]:
from helper_functions import accuracy_fn

In [26]:
class FashionMNISTModelV2(nn.Module):
    """Small CNN: two conv blocks followed by a linear classifier.

    Each block is Conv2d -> ReLU -> Conv2d -> ReLU -> MaxPool2d(2). The 3x3
    convolutions use stride 1 and padding 1, so only the two pooling layers
    change the spatial size (each halves height and width).

    Args:
        input_shape: Number of channels of the input images.
        hidden_units: Number of channels produced by every convolution.
        output_shape: Number of output classes (size of the final logits).
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int):
        super().__init__()
        self.block_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape,
                      out_channels=hidden_units,
                      kernel_size=3,  # side length of the sliding square window
                      stride=1,  # default
                      padding=1),  # "valid" (no padding), "same", or an explicit int
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_units,
                      out_channels=hidden_units,
                      kernel_size=3,
                      stride=1,
                      padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,
                         stride=2)  # default stride value is same as kernel_size
        )
        self.block_2 = nn.Sequential(
            nn.Conv2d(hidden_units, hidden_units, 3, padding=1),
            nn.ReLU(),
            nn.Conv2d(hidden_units, hidden_units, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # in_features is hidden_units * 7 * 7 because the two 2x2 poolings
            # halve the spatial size twice; this matches 28x28 inputs
            # (28 -> 14 -> 7). Other input sizes need a different value here.
            nn.Linear(in_features=hidden_units*7*7,
                      out_features=output_shape)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run block_1, block_2 and the classifier in sequence.

        Args:
            x: Batch of images with `input_shape` channels.

        Returns:
            Tensor with one row of `output_shape` values per input image.
        """
        # The attribute is `block_1`; the previous `self.block1` lookup raised
        # AttributeError on every forward pass.
        x = self.block_1(x)
        x = self.block_2(x)
        return self.classifier(x)


In [27]:
# Cross-entropy criterion (also called the loss or cost function) and plain SGD.
# NOTE(review): no visible cell defines `model`, and the recorded run of this
# cell ended in a NameError -- instantiate the network before running it.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

NameError: name 'model' is not defined

In [None]:
epochs = 3

# Training and evaluation loop.
# NOTE(review): `model`, `train_dataloader` and `test_dataloader` are not defined
# in any visible cell -- confirm they exist before running this cell.
for epoch in range(epochs):
    print(f"Epoch: {epoch}\n-------")

    ### Training
    # Switch to training mode once per epoch; the mode does not change between batches.
    model.train()
    train_loss = 0.0
    for batch, (X, y) in enumerate(train_dataloader):
        # 1. Forward pass
        y_pred = model(X)

        # 2. Calculate loss (per batch)
        loss = loss_fn(y_pred, y)
        # Accumulate a plain Python float: adding the tensor itself would keep
        # autograd history alive across the whole epoch.
        train_loss += loss.item()

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

        # Print out how many samples have been seen
        if batch % 400 == 0:
            print(f"Looked at {batch * len(X)}/{len(train_dataloader.dataset)} samples")

    # Average training loss per batch for this epoch
    train_loss /= len(train_dataloader)

    ### Testing
    # Running totals for the evaluation loss and accuracy
    test_loss, test_acc = 0.0, 0.0
    model.eval()
    with torch.inference_mode():
        for X, y in test_dataloader:
            # 1. Forward pass
            test_pred = model(X)

            # 2. Accumulate the batch loss as a float
            test_loss += loss_fn(test_pred, y).item()

            # 3. Accumulate accuracy (predicted class index vs. true label)
            test_acc += accuracy_fn(y_true=y, y_pred=test_pred.argmax(dim=1))

    # The totals are plain numbers now, so averaging can happen outside
    # inference mode. Both are averaged per batch.
    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataloader)

    ## Print out what's happening
    print(f"\nTrain loss: {train_loss:.5f} | Test loss: {test_loss:.5f}, Test acc: {test_acc:.2f}%\n")