In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import fastai
import math

In [2]:
ROOT = "../../data/protein/classification/full_750/"
DATA_PATH = ROOT+"3_kmers"
EMBEDDING_PATH = "../../data/protein/classification/data_sources/protVec_100d_3grams.csv"
MODEL_PATH = "../../weights/protein/classification/full_750/3_kmers"
SEQUENCE_LENGTH=748

In [3]:
num_workers = 8 # On cloud 8
batch_size = 256

In [4]:
with open(ROOT+"classToIndex.json") as f:
    data = json.load(f)
NUM_CLASSES = max(data.values())+1

In [5]:
class EnzymeDataSet(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, path, seq_length, is_test=True, vocal_size=20, transform=None):
        self.data = np.load(path)
        self.seq_length = seq_length
        self.is_test = is_test
        self.vocal_size = vocal_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        if not self.is_test:
            mutations = int(len(row)*0.05)
            np.put(row, np.random.randint(0, len(row)-1, mutations), np.random.randint(1, vocal_size, mutations))
        end_padding = self.seq_length-len(row[0])
        seq = np.pad(row[0], mode="constant", pad_width=(0,end_padding))
        return np.int64(seq), row[1]

In [6]:
embeddings = np.loadtxt(open(EMBEDDING_PATH, "rb"), delimiter="\t", skiprows=1, usecols=[i for i in range(1,101)])

In [7]:
vocal_size = min(8001, embeddings.shape[0])

In [8]:
train_ds = EnzymeDataSet(ROOT+"3_kmers/train/data.npy", 748, is_test=False, vocal_size=vocal_size)
val_ds = EnzymeDataSet(ROOT+"3_kmers/val/data.npy", 748)
test_ds = EnzymeDataSet(ROOT+"3_kmers/test/data.npy", 748)

In [9]:
def gelu(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

In [10]:
class ResnetIdentityBlock(nn.Module):

    def __init__(self, num_inputs, num_outputs, kernel_size, strides, dilation_rate=1, dropout=0.2, downsample = True, 
                 act=F.relu):
        super(ResnetIdentityBlock, self).__init__()
        self.act = act
        self.conv1 = nn.Conv1d(num_inputs, num_outputs, kernel_size, stride=strides, dilation =1)
        self.conv2 = nn.Conv1d(num_outputs, num_outputs, kernel_size, stride=1, dilation =dilation_rate)
        self.padding = nn.ReflectionPad1d((1,1))
        self.bn1 = nn.BatchNorm1d(num_inputs)
        self.bn2 = nn.BatchNorm1d(num_outputs)
        if downsample:            
            self.downsample = nn.Conv1d(num_inputs, num_outputs, kernel_size, stride=strides, dilation = 1)
        
    def forward(self, x):
        residual = x

        out = self.act(self.bn1(x))
        out = self.conv1(out)
        out = self.padding(out)
        out = self.act(self.bn2(out))
        out = self.conv2(out)
        if self.downsample is not None:
            residual = self.downsample(residual)
        out += residual     
        return out

In [11]:
class EnzymeClassifier(nn.Module):
    def __init__(self):
        super(EnzymeClassifier, self).__init__()
        self.embedding = nn.Embedding(embeddings.shape[0], embeddings.shape[1])
        self.embedding.from_pretrained(torch.from_numpy(embeddings)) 
        self.resnet_block1 = ResnetIdentityBlock(embeddings.shape[1], 128, 3, 2)
        self.resnet_block2 = ResnetIdentityBlock(128, 128, 3, 2)   
        self.resnet_block3 = ResnetIdentityBlock(128, 256, 3, 2)  
#         self.resnet_block4 = ResnetIdentityBlock(512, NUM_CLASSES, 3, 2)
#         self.bn = nn.BatchNorm1d(NUM_CLASSES)
#         self.out = torch.nn.Linear(1024, NUM_CLASSES)
        self.resnet_block4 = ResnetIdentityBlock(256, 512, 3, 2)
        self.bn = nn.BatchNorm1d(NUM_CLASSES)
        self.final_conv = nn.Conv1d(512, NUM_CLASSES, 1)

    def forward(self, x):
        embedded_seq = self.embedding(x)
        embedded_seq.transpose_(1, 2)
        h1 = self.resnet_block1(embedded_seq)
        h2 = self.resnet_block2(h1)    
        h3 = self.resnet_block3(h2)   
        h4 = self.resnet_block4(h3)
        out = self.bn(self.final_conv(F.relu(h4)))
        
        out = torch.mean(out, dim=2)
        
        return out

In [12]:
enzymeClassifier = EnzymeClassifier().cuda()

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(enzymeClassifier.parameters(), lr=0.001)

In [14]:
sum(p.numel() for p in enzymeClassifier.parameters() if p.requires_grad)

3603317

In [15]:
data = fastai.basic_data.DataBunch.create(train_ds=train_ds, valid_ds=val_ds, test_ds=test_ds, bs=batch_size, num_workers=num_workers)
learner = fastai.basic_train.Learner(data, enzymeClassifier, 
        loss_func=criterion, 
        metrics=fastai.accuracy, 
        path=None, 
        model_dir='models')

In [16]:
learner

Learner(data=<fastai.basic_data.DataBunch object at 0x7fce268a2588>, model=EnzymeClassifier(
  (embedding): Embedding(9048, 100)
  (resnet_block1): ResnetIdentityBlock(
    (conv1): Conv1d(100, 128, kernel_size=(3,), stride=(2,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (padding): ReflectionPad1d((1, 1))
    (bn1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Conv1d(100, 128, kernel_size=(3,), stride=(2,))
  )
  (resnet_block2): ResnetIdentityBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(2,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
    (padding): ReflectionPad1d((1, 1))
    (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Conv1d(128, 1

In [None]:
learner.fit_one_cycle(10)

epoch,train_loss,valid_loss,accuracy
1,5.694957,5.442441,0.216990
2,3.693374,3.451525,0.522042
,,,


In [None]:
learner.fit(1)

In [None]:
preds, truth =learner.get_preds(is_test=True)

In [None]:
fastai.accuracy(preds, truth)