In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Read the saved train / validation splits from disk (inputs first, then targets)
X_train, y_train = np.load("../data/splits/X_train.npy"), np.load("../data/splits/y_train.npy")
X_val, y_val = np.load("../data/splits/X_val.npy"), np.load("../data/splits/y_val.npy")

# Turn every array into a float32 tensor, in the same order they were loaded
X_train_t, y_train_t, X_val_t, y_val_t = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train, y_train, X_val, y_val)
)

#Dataset class
class CrisprDataset(Dataset):
    """Map-style dataset that pairs each input sample with its target.

    Args:
        X: Indexable collection of inputs whose first axis enumerates samples.
        y: Indexable collection of targets, one entry per sample in ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # Fail fast on misaligned data: without this check a shorter X only
        # surfaces as an IndexError in the middle of an epoch, and a longer X
        # is silently truncated because __len__ is driven by y alone.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the tensors in datasets, then build batched loaders
# (only the training split is shuffled)
train_dataset, val_dataset = (
    CrisprDataset(X_train_t, y_train_t),
    CrisprDataset(X_val_t, y_val_t),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# Pull a single training batch and display the shapes of its two parts
X_batch, y_batch = next(iter(train_loader))

X_batch.shape, y_batch.shape

(torch.Size([32, 20, 4]), torch.Size([32]))

In [11]:
import torch.nn as nn
import torch.nn.functional as F

class CrisprCNN(nn.Module):
    """1-D convolutional network that regresses one value per input sequence.

    Two un-padded ``Conv1d`` layers are followed by dropout and two fully
    connected layers. The defaults reproduce the original fixed architecture
    (sequence length 20, 4 input channels, dropout 0.3), so ``CrisprCNN()``
    builds the same layers with the same parameter names as before.

    Args:
        seq_len: Length of the sequence axis of the input (default 20).
        in_channels: Number of features per sequence position (default 4).
        dropout: Dropout probability applied before the first linear layer
            (default 0.3).

    Raises:
        ValueError: If ``seq_len`` is too short to survive both convolutions.
    """

    def __init__(self, seq_len=20, in_channels=4, dropout=0.3):
        super().__init__()

        kernel_size = 3
        # Each un-padded, stride-1 convolution trims (kernel_size - 1)
        # positions from the sequence axis; two of them are stacked.
        conv_out_len = seq_len - 2 * (kernel_size - 1)
        if conv_out_len < 1:
            raise ValueError(
                f"seq_len must be at least {2 * (kernel_size - 1) + 1}, got {seq_len}"
            )

        # input: (batch, seq_len, in_channels), convolution step
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=32, kernel_size=kernel_size)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=kernel_size)

        self.dropout = nn.Dropout(dropout)

        # flatten and regress a single value per sample;
        # with the defaults this is 64 * 16 = 1024 input features
        self.fc1 = nn.Linear(64 * conv_out_len, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, in_channels)`` tensor to ``(batch, 1)`` predictions."""
        # Conv1d expects channels first: (batch, seq_len, channels) -> (batch, channels, seq_len)
        x = x.permute(0, 2, 1)

        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))

        # Collapse the channel and position axes into one feature vector per sample
        x = torch.flatten(x, start_dim=1)

        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x
    

In [12]:
# Build the network with its constructor defaults and print the layer summary
model = CrisprCNN()
print(model)

CrisprCNN(
  (conv1): Conv1d(4, 32, kernel_size=(3,), stride=(1,))
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=1024, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


In [13]:
# First forward pass: run the sampled training batch through the model
# NOTE(review): no model.eval() / torch.no_grad() is visible before this call,
# so dropout is presumably active and gradients are tracked — confirm that is
# intended for this check.
predictions = model(X_batch)

# The final layer has a single output unit, so one value per sample is expected
predictions.shape

torch.Size([32, 1])

In [14]:
# Sanity check: first five model outputs shown next to their targets
(
    predictions.detach()[:5].numpy().flatten(),
    y_batch[:5].numpy(),
)


(array([0.10000792, 0.10328935, 0.06724935, 0.11038876, 0.05861484],
       dtype=float32),
 array([0.547, 0.485, 0.848, 0.088, 0.083], dtype=float32))

In [15]:
import os

# Create the output folder if it is missing, then store the model's current weights
os.makedirs(name="../model", exist_ok=True)
torch.save(obj=model.state_dict(), f="../model/crispr_cnn_untrained_day5.pth")